Refactor the crawler scripts and add time-filtered Excel export

- Unify the crawl logic and startup script across the five sites
- Add the dls_fresh crawl flow and improve logging
- Add export_lawyers_excel for exporting by time condition
- Default to exporting the last 7 days; support parsing extended fields
- Tidy .gitignore to ignore local data/ and logs/ artifacts
@@ -29,3 +29,8 @@ Thumbs.db
 
 # Local runtime files
 *.log
+logs/
+data/
+
+# accidental local files
+=*
@@ -14,7 +14,49 @@
 
 ```bash
 cd /www/wwwroot/lawyers
-python3 -m pip install -r requirements.txt
-cd common_sites
-./start.sh
+python3 -m venv .venv
+.venv/bin/pip install -r requirements.txt
+./common_sites/start.sh
 ```
+
+## Startup options
+
+`start.sh` launches the five site crawlers in parallel by default (the 大律师 site uses `dls_fresh.py`).
+
+- Log directory: `/www/wwwroot/lawyers/logs`
+- 大律师 JSON output: `/www/wwwroot/lawyers/data/dls_records.jsonl`
+
+Common environment variables:
+
+```bash
+# Run sequentially (default is parallel)
+RUN_MODE=sequential ./common_sites/start.sh
+
+# Limit the 大律师 crawl scope
+DLS_CITY_FILTER=beijing DLS_MAX_CITIES=1 DLS_MAX_PAGES=1 ./common_sites/start.sh
+
+# 大律师 direct connection (no proxy) / export JSON only, skip the DB
+DLS_DIRECT=1 DLS_NO_DB=1 ./common_sites/start.sh
+```
+
+## Exporting to Excel
+
+New export script: `common_sites/export_lawyers_excel.py`
+
+```bash
+# No arguments: export the last 7 days by default (phone/name/law firm/province/city/site name),
+# parsing the params JSON extras (email/address/license no./years in practice/specialties, etc.)
+./.venv/bin/python ./common_sites/export_lawyers_excel.py
+
+# Export by create_time timestamp range
+./.venv/bin/python ./common_sites/export_lawyers_excel.py \
+  --start-ts 1772380000 --end-ts 1772429999 \
+  --output ./data/lawyers_20260302.xlsx
+
+# Export a single site only, with technical fields (url/domain/timestamps)
+./.venv/bin/python ./common_sites/export_lawyers_excel.py \
+  --domain 大律师 --include-extra
+
+# Skip parsing the params JSON extras
+./.venv/bin/python ./common_sites/export_lawyers_excel.py --no-parse-params
+```
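If you need the `--start-ts`/`--end-ts` bounds for a specific calendar day, a small helper like the following (illustrative only, not part of the commit) prints suitable epoch values:

```python
# Illustrative helper (not part of the commit): print create_time bounds
# for one local calendar day, suitable for --start-ts / --end-ts.
from datetime import datetime, timedelta

day = datetime(2026, 3, 2)                               # the day to export
start_ts = int(day.timestamp())                          # 00:00:00 local time
end_ts = int((day + timedelta(days=1)).timestamp()) - 1  # 23:59:59 local time
print(start_ts, end_ts)
```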
+252 -137

@@ -1,9 +1,14 @@
 import json
 import os
+import random
+import re
 import sys
 import time
-import random
-from typing import Dict, Optional
+from typing import Dict, List, Optional, Set, Tuple
+from urllib.parse import urljoin
 
+import urllib3
+from bs4 import BeautifulSoup
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -13,8 +18,7 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
     sys.path.append(project_root)
 
-import urllib3
-from bs4 import BeautifulSoup
+from Db import Db
 from request.requests_client import (
     RequestClientError,
     RequestConnectTimeout,
@@ -22,168 +26,136 @@ from request.requests_client import (
     RequestTimeout,
     RequestsClient,
 )
 
-# Disable SSL warnings
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-from Db import Db
 from utils.rate_limiter import wait_for_request
 
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
 DOMAIN = "大律师"
-LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
-_PROXY_TESTED = False
+SITE_BASE = "https://m.maxlaw.cn"
+LIST_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}"
+PHONE_PATTERN = re.compile(r"1[3-9]\d{9}")
+MAX_PAGES_PER_CITY = int(os.getenv("MAXLAW_MAX_PAGES", "0"))
+PROXY_TESTED = False
 
 
 class DlsSpider:
     def __init__(self, db_connection):
         self.db = db_connection
-        self.client = self._build_session()
+        self.client = self._build_client()
         self.areas = self._load_areas()
 
-    def _build_session(self) -> RequestsClient:
-        """构建带重试机制的 session"""
+    def _build_client(self) -> RequestsClient:
         client = RequestsClient(
             headers={
-                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
+                "User-Agent": (
+                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+                    "Mobile/15E148 Safari/604.1"
+                ),
                 "Host": "m.maxlaw.cn",
                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                 "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                 "Connection": "close",
             },
-            retry_total=3,  # retry 3 times in total
-            retry_backoff_factor=1,  # retry intervals: 1s, 2s, 4s
-            retry_status_forcelist=(429, 500, 502, 503, 504),  # retry on these status codes
+            retry_total=3,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET", "POST"),
         )
         self._proxy_test(client, client.proxies or None)
         return client
 
-    def _refresh_session(self) -> None:
+    def _refresh_client(self) -> None:
         self.client.refresh()
         self._proxy_test(self.client, self.client.proxies or None)
 
     def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None:
-        global _PROXY_TESTED
-        if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
+        global PROXY_TESTED
+        if PROXY_TESTED or not os.getenv("PROXY_TEST"):
             return
-        _PROXY_TESTED = True
+        PROXY_TESTED = True
         if not proxies:
             print("[proxy] test skipped: no proxy configured")
             return
         test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
         timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
         try:
-            resp = client.get_text(
-                test_url,
-                timeout=timeout,
-                headers={"Connection": "close"},
-            )
+            resp = client.get_text(test_url, timeout=timeout, headers={"Connection": "close"})
             print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
         except Exception as exc:
             print(f"[proxy] test failed: {exc}")
 
-    def _load_areas(self):
-        try:
-            return self.db.select_data(
-                "area_new",
-                "province, city, pinyin",
-                "domain='maxlaw'"
-            ) or []
-        except Exception as exc:
-            print(f"加载地区失败: {exc}")
+    def _load_areas(self) -> List[Dict[str, str]]:
+        tables = ("area_new", "area2", "area")
+        last_error = None
+        for table in tables:
+            try:
+                rows = self.db.select_data(table, "province, city, pinyin", "domain='maxlaw'") or []
+            except Exception as exc:
+                last_error = exc
+                continue
+            if rows:
+                missing_pinyin = sum(1 for row in rows if not (row.get("pinyin") or "").strip())
+                print(f"[大律师] 地区来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
+                return rows
+        if last_error:
+            print(f"[大律师] 加载地区失败: {last_error}")
+        print("[大律师] 无地区数据(已尝试 area_new/area2/area)")
         return []
 
-    def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
-        """发送 GET 请求,带重试机制"""
+    def _get(
+        self,
+        url: str,
+        *,
+        headers: Optional[Dict[str, str]] = None,
+        max_retries: int = 3,
+        timeout: Tuple[int, int] = (10, 30),
+    ) -> Optional[str]:
         wait_for_request()
 
         for attempt in range(max_retries):
             try:
-                # Use longer timeouts; set connect and read timeouts separately
-                resp = self.client.get_text(
-                    url,
-                    timeout=(10, 30),  # (connect_timeout, read_timeout)
-                    verify=False,
-                    headers=headers,
-                )
-                status_code = resp.status_code
-                content = resp.text
-                if status_code == 403:
+                resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
+                if resp.status_code == 403:
                     if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
-                        self._refresh_session()
+                        wait_time = (2 ** attempt) + random.uniform(0.3, 1.0)
+                        print(f"请求403,{wait_time:.2f}s后重试 ({attempt + 1}/{max_retries}): {url}")
+                        self._refresh_client()
                         time.sleep(wait_time)
                         continue
                     print(f"请求失败 {url}: 403 Forbidden")
                     return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error: {url}")
-                return content
+                if resp.status_code >= 400:
+                    raise RequestClientError(f"{resp.status_code} Error: {url}")
+                return resp.text
             except RequestConnectTimeout as exc:
                 if attempt < max_retries - 1:
-                    wait_time = 2 ** attempt  # exponential backoff: 2s, 4s, 8s
-                    print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    wait_time = 2 ** attempt
+                    print(f"连接超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
                     time.sleep(wait_time)
-                else:
-                    print(f"连接超时,已达到最大重试次数 {url}: {exc}")
-                    return None
+                    continue
+                print(f"连接超时,已达到最大重试次数 {url}: {exc}")
+                return None
             except RequestTimeout as exc:
                 if attempt < max_retries - 1:
                     wait_time = 2 ** attempt
-                    print(f"请求超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    print(f"请求超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
                     time.sleep(wait_time)
-                else:
-                    print(f"请求超时,已达到最大重试次数 {url}: {exc}")
-                    return None
+                    continue
+                print(f"请求超时,已达到最大重试次数 {url}: {exc}")
+                return None
             except RequestConnectionError as exc:
                 if attempt < max_retries - 1:
                     wait_time = 2 ** attempt
-                    print(f"连接错误,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    print(f"连接错误,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
                    time.sleep(wait_time)
-                else:
-                    print(f"连接错误,已达到最大重试次数 {url}: {exc}")
-                    return None
+                    continue
+                print(f"连接错误,已达到最大重试次数 {url}: {exc}")
+                return None
             except RequestClientError as exc:
                 print(f"请求失败 {url}: {exc}")
                 return None
 
         return None
 
-    def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
-        soup = BeautifulSoup(html, "html.parser")
-        cards = soup.find_all("div", class_="lstx")
-        if not cards:
-            return 0
-
-        inserted = 0
-        for card in cards:
-            link = card.find("a")
-            if not link or not link.get("href"):
-                continue
-            detail = self._parse_detail(link['href'], province, city, list_url)
-            if not detail:
-                continue
-            phone = detail.get("phone")
-            if not phone:
-                continue
-            condition = f"phone='{phone}' and domain='{DOMAIN}'"
-            if self.db.is_data_exist("lawyer", condition):
-                print(f" -- 已存在: {detail['name']} ({phone})")
-                time.sleep(0.3)
-                continue
-            try:
-                self.db.insert_data("lawyer", detail)
-                inserted += 1
-                print(f" -> 新增: {detail['name']} ({phone})")
-            except Exception as exc:
-                print(f" 插入失败: {exc}")
-                time.sleep(1)
-            time.sleep(0.3)
-        # Pause after each list page to lower anti-bot risk
-        time.sleep(0.6)
-        return inserted
-
     def _detail_headers(self, referer: str) -> Dict[str, str]:
         return {
             "Referer": referer,
@@ -194,72 +166,215 @@ class DlsSpider:
             "Upgrade-Insecure-Requests": "1",
         }
 
-    def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
-        url = f"https://m.maxlaw.cn{path}"
-        print(f" 详情: {url}")
-        html = self._get(url, headers=self._detail_headers(list_url))
+    def _extract_detail_urls(self, html: str) -> List[str]:
+        soup = BeautifulSoup(html, "html.parser")
+        urls: List[str] = []
+        seen: Set[str] = set()
+
+        # Primary selector: the site's current list cards
+        for a_tag in soup.select("div.lstx a[href]"):
+            href = (a_tag.get("href") or "").strip()
+            if not href:
+                continue
+            url = urljoin(SITE_BASE, href)
+            if url in seen:
+                continue
+            seen.add(url)
+            urls.append(url)
+
+        # Fallback selector: survive minor page-structure changes
+        if not urls:
+            for a_tag in soup.select("a[href]"):
+                href = (a_tag.get("href") or "").strip()
+                if "/lawyer/" not in href:
+                    continue
+                url = urljoin(SITE_BASE, href)
+                if url in seen:
+                    continue
+                seen.add(url)
+                urls.append(url)
+        return urls
+
+    def _extract_name(self, soup: BeautifulSoup) -> str:
+        for selector in ("h2.lawyerName", "h1.lawyerName", "h1", "h2"):
+            tag = soup.select_one(selector)
+            if tag:
+                name = tag.get_text(strip=True)
+                if name:
+                    return name
+        title = soup.title.get_text(strip=True) if soup.title else ""
+        match = re.search(r"(\S+律师)", title)
+        return match.group(1) if match else ""
+
+    def _extract_law_firm(self, soup: BeautifulSoup) -> str:
+        for selector in ("p.law-firm", "div.law-firm", "p[class*=firm]"):
+            tag = soup.select_one(selector)
+            if tag:
+                text = tag.get_text(strip=True)
+                if text:
+                    return text
+        page_text = soup.get_text(" ", strip=True)
+        match = re.search(r"(执业机构|律所)\s*[::]?\s*([^\s,。,;;]{2,40})", page_text)
+        if match:
+            return match.group(2).strip()
+        return ""
+
+    def _normalize_phone(self, text: str) -> str:
+        compact = re.sub(r"\D", "", text or "")
+        match = PHONE_PATTERN.search(compact)
+        return match.group(0) if match else ""
+
+    def _extract_phone(self, soup: BeautifulSoup) -> str:
+        contact = soup.select_one("ul.contact-content")
+        if contact:
+            phone = self._normalize_phone(contact.get_text(" ", strip=True))
+            if phone:
+                return phone
+        for selector in ("a[href^='tel:']", "span.phone", "p.phone"):
+            tag = soup.select_one(selector)
+            if tag:
+                phone = self._normalize_phone(tag.get_text(" ", strip=True))
+                if phone:
+                    return phone
+        return self._normalize_phone(soup.get_text(" ", strip=True))
+
+    def _parse_detail(self, detail_url: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
+        print(f" 详情: {detail_url}")
+        html = self._get(detail_url, headers=self._detail_headers(list_url))
         if not html:
             return None
 
         soup = BeautifulSoup(html, "html.parser")
-        name_tag = soup.find("h2", class_="lawyerName")
-        law_firm_tag = soup.find("p", class_="law-firm")
-        contact_list = soup.find("ul", class_="contact-content")
-
-        name = name_tag.get_text(strip=True) if name_tag else ""
-        law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
-        phone = ""
-
-        if contact_list:
-            items = contact_list.find_all("li")
-            if len(items) > 2:
-                phone_tag = items[2].find("p")
-                if phone_tag:
-                    phone = phone_tag.get_text(strip=True)
-                    phone = phone.split("咨询请说明来自大律师网")[0].strip()
-
-        phone = phone.replace('-', '').strip()
+        name = self._extract_name(soup)
+        phone = self._extract_phone(soup)
         if not name or not phone:
             print(" 信息不完整,跳过")
             return None
 
-        safe_city = city if city else province
+        safe_city = city or province
         return {
             "name": name,
-            "law_firm": law_firm,
+            "law_firm": self._extract_law_firm(soup),
             "province": province,
             "city": safe_city,
             "phone": phone,
-            "url": url,
+            "url": detail_url,
             "domain": DOMAIN,
             "create_time": int(time.time()),
-            "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
+            "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False),
         }
 
+    def _existing_phones(self, phones: List[str]) -> Set[str]:
+        if not phones:
+            return set()
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            chunk_size = 500
+            for idx in range(0, len(phones), chunk_size):
+                chunk = phones[idx:idx + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    existing.add(row[0])
+        finally:
+            cur.close()
+        return existing
+
+    def _save_lawyers(self, lawyers: List[Dict[str, str]]) -> Tuple[int, int]:
+        if not lawyers:
+            return 0, 0
+        phones = [row["phone"] for row in lawyers if row.get("phone")]
+        existing = self._existing_phones(phones)
+        inserted = 0
+        skipped = 0
+
+        for row in lawyers:
+            phone = row.get("phone", "")
+            if not phone:
+                skipped += 1
+                continue
+            if phone in existing:
+                skipped += 1
+                print(f" -- 已存在: {row.get('name', '')} ({phone})")
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+                print(f" -> 新增: {row.get('name', '')} ({phone})")
+            except Exception as exc:
+                skipped += 1
+                print(f" 插入失败 {row.get('url', '')}: {exc}")
+        return inserted, skipped
+
+    def _crawl_city(self, area: Dict[str, str]) -> Tuple[int, int]:
+        pinyin = (area.get("pinyin") or "").strip()
+        province = area.get("province", "")
+        city = area.get("city", "")
+        if not pinyin:
+            return 0, 0
+
+        total_inserted = 0
+        total_parsed = 0
+        page = 1
+        prev_fingerprint = ""
+
+        while True:
+            if MAX_PAGES_PER_CITY > 0 and page > MAX_PAGES_PER_CITY:
+                print(f"达到分页上限({MAX_PAGES_PER_CITY}),停止 {province}-{city}")
+                break
+
+            list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
+            print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
+            html = self._get(list_url)
+            if not html:
+                break
+
+            detail_urls = self._extract_detail_urls(html)
+            if not detail_urls:
+                print(" 列表为空,结束当前城市")
+                break
+
+            fingerprint = "|".join(detail_urls[:8])
+            if fingerprint and fingerprint == prev_fingerprint:
+                print(" 列表页重复,提前停止当前城市")
+                break
+            prev_fingerprint = fingerprint
+
+            lawyers: List[Dict[str, str]] = []
+            for detail_url in detail_urls:
+                row = self._parse_detail(detail_url, province, city, list_url)
+                if row:
+                    lawyers.append(row)
+                time.sleep(0.25)
+
+            inserted, skipped = self._save_lawyers(lawyers)
+            total_inserted += inserted
+            total_parsed += len(lawyers)
+            print(
+                f" 第 {page} 页完成: 列表{len(detail_urls)}条, "
+                f"解析{len(lawyers)}条, 新增{inserted}条, 跳过{skipped}条"
+            )
+
+            page += 1
+            time.sleep(0.5)
+        return total_inserted, total_parsed
+
     def run(self):
         print("启动大律师采集...")
         if not self.areas:
             print("无地区数据")
             return
 
+        all_inserted = 0
+        all_parsed = 0
         for area in self.areas:
-            pinyin = area.get("pinyin")
-            province = area.get("province", "")
-            city = area.get("city", "")
-            if not pinyin:
-                continue
-            page = 1
-            while True:
-                list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
-                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
-                html = self._get(list_url)
-                if not html:
-                    break
-                inserted = self._parse_list(html, province, city, list_url)
-                if inserted == 0:
-                    break
-                page += 1
-        print("大律师采集完成")
+            inserted, parsed = self._crawl_city(area)
+            all_inserted += inserted
+            all_parsed += parsed
+        print(f"大律师采集完成: 解析{all_parsed}条, 新增{all_inserted}条")
 
 
 if __name__ == "__main__":
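The rewritten `_crawl_city` stops paging when the site keeps serving the same list page: it fingerprints the first eight detail URLs and compares the result against the previous page. A minimal standalone sketch of that check (illustrative, mirroring the diff's logic with made-up URLs):

```python
# Illustrative sketch of the page-fingerprint stop used in _crawl_city:
# when a page serves the same leading detail URLs as the previous page,
# pagination has looped and the city can be closed out early.
def page_fingerprint(detail_urls: list) -> str:
    return "|".join(detail_urls[:8])

prev = ""
pages = [[f"/lawyer/{i}" for i in range(10)],
         [f"/lawyer/{i}" for i in range(10)]]   # second page repeats the first
for page_urls in pages:
    fp = page_fingerprint(page_urls)
    if fp and fp == prev:
        print("repeated list page, stopping early")
        break
    prev = fp
```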
@@ -0,0 +1,621 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import urllib3

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
from Db import Db

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SITE_NAME = "maxlaw"
LEGACY_DOMAIN = "大律师"
SITE_BASE = "https://m.maxlaw.cn"
CITY_API = "https://js.maxlaw.cn/js/ajax/common/getprovice.js"
CITY_DETAIL_API = "https://js.maxlaw.cn/js/ajax/common/getcity_{province_id}.js"
LIST_URL_TEMPLATE = SITE_BASE + "/law/{city_py}?page={page}"

PHONE_RE = re.compile(r"1[3-9]\d{9}")
ANSWER_RE = re.compile(r"已解答\s*(\d+)\s*次")


@dataclass
class CityTarget:
    province_id: int
    province_name: str
    province_py: str
    city_id: int
    city_name: str
    city_py: str


@dataclass
class ListCard:
    detail_url: str
    name: str = ""
    law_firm: str = ""
    specialties: List[str] = field(default_factory=list)
    answered_count: Optional[int] = None


def clean_prefixed_name(value: str) -> str:
    text = (value or "").strip()
    # The API commonly returns names like "B 北京"
    text = re.sub(r"^[A-Za-z]\s*", "", text)
    return text.strip()


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def parse_json_with_bom(text: str) -> Dict:
    cleaned = (text or "").strip().lstrip("\ufeff")
    return json.loads(cleaned)


class DlsFreshCrawler:
    def __init__(
        self,
        max_pages: int = 3,
        sleep_seconds: float = 0.2,
        use_proxy: bool = True,
        db_connection=None,
    ):
        self.max_pages = max_pages
        self.sleep_seconds = max(0.0, sleep_seconds)
        self.db = db_connection
        self.client = RequestsClient(
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                    "Mobile/15E148 Safari/604.1"
                ),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Connection": "close",
            },
            use_proxy=use_proxy,
            retry_total=2,
            retry_backoff_factor=1,
            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET",),
        )

    def _get_text(self, url: str, timeout: int = 20, max_retries: int = 3) -> str:
        last_error: Optional[Exception] = None
        for attempt in range(max_retries):
            wait_for_request()
            try:
                resp = self.client.get_text(url, timeout=timeout, verify=False)
                code = resp.status_code
                if code == 403:
                    if attempt < max_retries - 1:
                        self.client.refresh()
                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                        continue
                    raise RequestClientError(f"{code} Error: {url}")
                if code >= 500 and attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                if code >= 400:
                    raise RequestClientError(f"{code} Error: {url}")
                return resp.text
            except Exception as exc:
                last_error = exc
                if attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                raise
        if last_error is not None:
            raise last_error
        raise RequestClientError(f"Unknown request error: {url}")

    def discover_cities(self) -> List[CityTarget]:
        province_text = self._get_text(CITY_API)
        province_data = parse_json_with_bom(province_text)
        province_rows = province_data.get("ds", []) or []

        cities: List[CityTarget] = []
        seen_py: Set[str] = set()

        for province in province_rows:
            province_id = int(province.get("id"))
            province_name = clean_prefixed_name(province.get("name", ""))
            province_py = (province.get("py_code") or "").strip()
            if not province_py:
                continue

            city_api = CITY_DETAIL_API.format(province_id=province_id)
            try:
                city_text = self._get_text(city_api)
                city_data = parse_json_with_bom(city_text)
            except Exception as exc:
                print(f"[city] 获取失败 pid={province_id}: {exc}")
                continue

            for city in city_data.get("ds", []) or []:
                city_py = (city.get("py_code") or "").strip()
                if not city_py or city_py in seen_py:
                    continue
                seen_py.add(city_py)
                cities.append(
                    CityTarget(
                        province_id=province_id,
                        province_name=province_name,
                        province_py=province_py,
                        city_id=int(city.get("id")),
                        city_name=clean_prefixed_name(city.get("name", "")),
                        city_py=city_py,
                    )
                )

        return cities

    def parse_list_cards(self, html: str) -> List[ListCard]:
        soup = BeautifulSoup(html, "html.parser")
        cards: List[ListCard] = []
        seen: Set[str] = set()

        for item in soup.select("div.lawyer_list ul.lawyer_ul > li"):
            link = item.select_one("div.lstx a[href]")
            if not link:
                continue
            detail_url = urljoin(SITE_BASE, link.get("href", "").strip())
            if not detail_url or detail_url in seen:
                continue
            seen.add(detail_url)

            name = ""
            law_firm = ""
            specialties: List[str] = []
            answered_count = None

            name_tag = item.select_one("p.name")
            if name_tag:
                name = name_tag.get_text(strip=True)

            firm_tag = item.select_one("div.li_r h2")
            if firm_tag:
                law_firm = firm_tag.get_text(strip=True)

            for span in item.select("div.zc span"):
                text = span.get_text(strip=True)
                if text:
                    specialties.append(text)

            distance_text = item.select_one("div.distance i")
            if distance_text:
                match = ANSWER_RE.search(distance_text.get_text(" ", strip=True))
                if match:
                    answered_count = int(match.group(1))

            cards.append(
                ListCard(
                    detail_url=detail_url,
                    name=name,
                    law_firm=law_firm,
                    specialties=specialties,
                    answered_count=answered_count,
                )
            )
        return cards

    def has_next_page(self, html: str) -> bool:
        soup = BeautifulSoup(html, "html.parser")
        return soup.select_one("a.mnext") is not None

    def parse_detail(self, detail_url: str) -> Dict:
        html = self._get_text(detail_url)
        soup = BeautifulSoup(html, "html.parser")

        name = ""
        law_firm = ""
        license_no = ""
        practice_years = None
        phone = ""
        email = ""
        address = ""
        specialties: List[str] = []

        name_tag = soup.select_one("h2.lawyerName")
        if name_tag:
            name = name_tag.get_text(strip=True)

        firm_tag = soup.select_one("p.law-firm")
        if firm_tag:
            law_firm = firm_tag.get_text(strip=True)

        license_tag = soup.select_one("p.card-zyz")
        if license_tag:
            license_no = (
                license_tag.get_text(" ", strip=True)
                .replace("执业证号:", "")
                .replace("执业证号:", "")
                .strip()
            )

        years_tag = soup.select_one("div#practice i")
        if years_tag:
            year_text = years_tag.get_text(strip=True)
            if year_text.isdigit():
                practice_years = int(year_text)

        tel_tag = soup.select_one("a[href^='tel:']")
        if tel_tag:
            phone = normalize_phone(tel_tag.get("href", ""))

        for li in soup.select("ul.contact-content > li"):
            key = li.select_one("i")
            val = li.select_one("p")
            if not key or not val:
                continue
            k = key.get_text(strip=True).replace(":", ":")
            v = val.get_text(" ", strip=True)
            if "电话" in k and not phone:
                phone = normalize_phone(v)
            elif "邮箱" in k and not email:
                email = v.strip()
            elif "地址" in k and not address:
                address = v.strip()

        for node in soup.select("div.exp-main li.on"):
            text = node.get_text(strip=True)
            if text:
                specialties.append(text)

        return {
            "name": name,
            "law_firm": law_firm,
            "license_no": license_no,
            "practice_years": practice_years,
            "phone": phone,
            "email": email,
            "address": address,
            "specialties": specialties,
            "detail_url": detail_url,
        }

    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
        profile = record.get("profile", {}) or {}
        source = record.get("source", {}) or {}

        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            return None

        province = (source.get("province") or "").strip()
        city = (source.get("city") or province).strip()
        return {
            "name": (profile.get("name") or "").strip(),
            "law_firm": (profile.get("law_firm") or "").strip(),
            "province": province,
            "city": city,
            "phone": phone,
            "url": (source.get("detail_url") or "").strip(),
            "domain": LEGACY_DOMAIN,
            "create_time": int(record.get("collected_at") or time.time()),
            "params": json.dumps(record, ensure_ascii=False),
        }

    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
        if not self.db or not phones:
            return set()
        deduped = sorted({p for p in phones if p})
        if not deduped:
            return set()

        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(deduped), chunk_size):
                chunk = deduped[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
        if not self.db:
            return 0, 0

        rows: List[Dict[str, str]] = []
        for record in records:
            row = self._to_legacy_lawyer_row(record)
            if row:
                rows.append(row)
        if not rows:
            return 0, 0

        existing = self._existing_phones_in_db([row["phone"] for row in rows])
        inserted = 0
        skipped = 0

        for row in rows:
            phone = row.get("phone", "")
            if not phone or phone in existing:
                skipped += 1
                continue
            try:
                self.db.insert_data("lawyer", row)
                existing.add(phone)
                inserted += 1
            except Exception as exc:
                skipped += 1
                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
        return inserted, skipped

    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
        # Dedupe within a city so pagination loops do not re-fetch the same lawyers
        seen_detail_urls: Set[str] = set()
        last_page_signature: Tuple[str, ...] = tuple()
        repeated_signature_pages = 0
        no_new_pages = 0

        for page in range(1, self.max_pages + 1):
            list_url = LIST_URL_TEMPLATE.format(city_py=target.city_py, page=page)
            try:
                html = self._get_text(list_url)
            except Exception as exc:
                print(f"[list] 失败 {list_url}: {exc}")
                break

            cards = self.parse_list_cards(html)
            if not cards:
                break

            page_signature = tuple(sorted(card.detail_url for card in cards if card.detail_url))
            if page_signature and page_signature == last_page_signature:
                repeated_signature_pages += 1
            else:
                repeated_signature_pages = 0
            last_page_signature = page_signature

            if repeated_signature_pages >= 2:
                print(
                    f"[list] 城市 {target.city_py} 第{page}页列表签名重复,提前结束,"
                    f"list_url={list_url}"
                )
                break

            fresh_cards: List[ListCard] = []
            for card in cards:
                if not card.detail_url:
                    continue
                if card.detail_url in seen_detail_urls:
                    continue
                seen_detail_urls.add(card.detail_url)
                fresh_cards.append(card)

            if not fresh_cards:
                no_new_pages += 1
                if no_new_pages >= 3:
                    print(
                        f"[list] 城市 {target.city_py} 连续{no_new_pages}页无新增律师,提前结束,"
                        f"list_url={list_url}"
                    )
                    break
            else:
                no_new_pages = 0

            print(
                f"[page] city={target.city_py} page={page} cards={len(cards)} "
                f"fresh={len(fresh_cards)} next={self.has_next_page(html)}"
            )

            for card in fresh_cards:
                try:
                    detail = self.parse_detail(card.detail_url)
                except Exception as exc:
                    print(f"[detail] 失败 {card.detail_url}: {exc}")
                    continue

                now = int(time.time())
                record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()
                yield {
                    "record_id": record_id,
                    "collected_at": now,
                    "source": {
                        "site": SITE_NAME,
                        "list_url": list_url,
                        "detail_url": card.detail_url,
                        "province": target.province_name,
                        "province_py": target.province_py,
                        "city": target.city_name,
                        "city_py": target.city_py,
                        "page": page,
                    },
                    "list_snapshot": {
                        "name": card.name,
                        "law_firm": card.law_firm,
                        "specialties": card.specialties,
                        "answered_count": card.answered_count,
                    },
                    "profile": {
                        "name": detail.get("name") or card.name,
                        "law_firm": detail.get("law_firm") or card.law_firm,
                        "phone": detail.get("phone", ""),
                        "license_no": detail.get("license_no", ""),
                        "practice_years": detail.get("practice_years"),
                        "email": detail.get("email", ""),
                        "address": detail.get("address", ""),
                        "specialties": detail.get("specialties") or card.specialties,
                    },
                }
                if self.sleep_seconds:
                    time.sleep(self.sleep_seconds)

            if not self.has_next_page(html):
                break

    def crawl(
        self,
        output_path: str,
        max_cities: int = 0,
        city_filter: Optional[str] = None,
    ) -> None:
        cities = self.discover_cities()
        print(f"[discover] 共发现城市 {len(cities)} 个")
        if city_filter:
            key = city_filter.strip().lower()
            cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
        if max_cities > 0:
            cities = cities[:max_cities]
            print(f"[discover] 截断城市数 {len(cities)}")

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

        seen_ids: Set[str] = set()
        if os.path.exists(output_path):
            with open(output_path, "r", encoding="utf-8") as old_file:
                for line in old_file:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        item = json.loads(line)
                    except Exception:
                        continue
                    rid = item.get("record_id")
                    if rid:
                        seen_ids.add(rid)
            print(f"[resume] 已有记录 {len(seen_ids)} 条")

        total_new_json = 0
        total_new_db = 0
        total_skip_db = 0
        with open(output_path, "a", encoding="utf-8") as out:
            for idx, target in enumerate(cities, start=1):
                print(
                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
                    f"({target.city_py})"
                )
                city_records = list(self.crawl_city(target))

                city_new_json = 0
                for record in city_records:
                    rid = record["record_id"]
                    if rid in seen_ids:
                        continue
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                    seen_ids.add(rid)
                    city_new_json += 1
                    total_new_json += 1

                city_new_db, city_skip_db = self._write_records_to_db(city_records)
                total_new_db += city_new_db
                total_skip_db += city_skip_db

                print(
                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
                )
        print(
            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
            f"DB跳过{total_skip_db}条, 输出: {output_path}"
        )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="大律师全新采集脚本(新数据结构)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/dls_records_all.jsonl",
        help="输出 jsonl 文件路径",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="最多采集多少个城市,0 表示不限",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="每个城市最多采集多少页",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="按城市拼音或城市名过滤,如 beijing",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.2,
        help="详情页请求间隔秒数",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="直连模式,不使用 proxy_settings.json 代理",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="只输出 JSONL,不写入数据库",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    if args.no_db:
        crawler = DlsFreshCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=None,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        return

    with Db() as db:
        crawler = DlsFreshCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )


if __name__ == "__main__":
    main()
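The crawler resumes by replaying `record_id`s from the existing JSONL file, where `record_id` is the md5 hex digest of the detail URL. A small reader sketch (illustrative; the path is the script's `--output` default):

```python
# Illustrative reader for the crawler's JSONL output (path is the default).
import hashlib
import json

seen = set()
with open("/www/wwwroot/lawyers/data/dls_records_all.jsonl", encoding="utf-8") as fh:
    for line in fh:
        record = json.loads(line)
        # record_id is the md5 hex digest of the detail URL
        url = record["source"]["detail_url"]
        assert record["record_id"] == hashlib.md5(url.encode("utf-8")).hexdigest()
        seen.add(record["record_id"])
print(f"{len(seen)} unique lawyer records")
```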
@@ -0,0 +1,290 @@
#!/usr/bin/env python3
import argparse
import json
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Optional

import pymysql
from openpyxl import Workbook
from openpyxl.styles import Font

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from Db import Db


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
    parser.add_argument(
        "--output",
        default="",
        help="输出 xlsx 文件路径,默认输出到 data/export_lawyers_时间戳.xlsx",
    )
    parser.add_argument(
        "--start-ts",
        type=int,
        default=None,
        help="create_time 起始时间戳(含),不传时默认取最近7天",
    )
    parser.add_argument(
        "--end-ts",
        type=int,
        default=None,
        help="create_time 结束时间戳(含),默认不限制上限",
    )
    parser.add_argument(
        "--domain",
        default="",
        help="按 domain 过滤,例如:大律师 / 找法网 / 华律",
    )
    parser.add_argument(
        "--province",
        default="",
        help="按省份过滤,例如:北京、广东",
    )
    parser.add_argument(
        "--city",
        default="",
        help="按城市过滤,例如:北京、深圳",
    )
    parser.add_argument(
        "--keyword",
        default="",
        help="关键词过滤(匹配姓名/律所/手机号)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="最多导出多少条,0 表示不限",
    )
    parser.add_argument(
        "--include-extra",
        action="store_true",
        help="导出更多扩展字段(url/domain/create_time/site_time 等)",
    )
    parser.add_argument(
        "--no-parse-params",
        action="store_true",
        help="关闭 params JSON 扩展信息解析(默认开启)",
    )
    return parser.parse_args()


def apply_default_time_filter(args: argparse.Namespace) -> None:
    # Default to the last 7 days when no explicit time range is given
    if args.start_ts is None and args.end_ts is None:
        args.start_ts = int(time.time()) - 7 * 24 * 3600
        args.end_ts = 0
        return
    if args.start_ts is None:
        args.start_ts = 0
    if args.end_ts is None:
        args.end_ts = 0


def build_output_path(user_output: str) -> str:
    if user_output:
        return os.path.abspath(user_output)
    ts = int(time.time())
    return os.path.abspath(f"/www/wwwroot/lawyers/data/export_lawyers_{ts}.xlsx")


def ts_to_text(ts_value: Optional[int]) -> str:
    if ts_value in (None, 0, ""):
        return ""
    try:
        return datetime.fromtimestamp(int(ts_value)).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return ""


def build_query(args: argparse.Namespace) -> (str, List):
    where: List[str] = []
    params: List = []

    if args.start_ts > 0:
        where.append("create_time >= %s")
        params.append(args.start_ts)
    if args.end_ts > 0:
        where.append("create_time <= %s")
        params.append(args.end_ts)
    if args.domain.strip():
        where.append("domain = %s")
        params.append(args.domain.strip())
    if args.province.strip():
        where.append("province = %s")
        params.append(args.province.strip())
    if args.city.strip():
        where.append("city = %s")
        params.append(args.city.strip())
    if args.keyword.strip():
        like = f"%{args.keyword.strip()}%"
        where.append("(name LIKE %s OR law_firm LIKE %s OR phone LIKE %s)")
        params.extend([like, like, like])

    where_sql = f"WHERE {' AND '.join(where)}" if where else ""
    limit_sql = f"LIMIT {int(args.limit)}" if args.limit and args.limit > 0 else ""
    sql = (
        "SELECT id, name, phone, law_firm, province, city, url, domain, "
        "create_time, site_time, params "
        f"FROM lawyer {where_sql} ORDER BY id ASC {limit_sql}"
    )
    return sql, params


def parse_params(params_text: str) -> Dict[str, str]:
    if not params_text:
        return {}
    try:
        data = json.loads(params_text)
    except Exception:
        return {}
    if not isinstance(data, dict):
        return {}

    profile = data.get("profile") or {}
    source = data.get("source") or {}
    if not isinstance(profile, dict):
        profile = {}
    if not isinstance(source, dict):
        source = {}
    specialties = profile.get("specialties")
    if isinstance(specialties, list):
        specialties_text = ",".join(str(x) for x in specialties if x)
    else:
        specialties_text = ""

    return {
        "email": str(profile.get("email") or ""),
        "address": str(profile.get("address") or ""),
        "license_no": str(profile.get("license_no") or ""),
        "practice_years": str(profile.get("practice_years") or ""),
        "specialties": specialties_text,
        "source_site": str(source.get("site") or ""),
        "detail_url": str(source.get("detail_url") or ""),
        "list_url": str(source.get("list_url") or ""),
    }


def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, parse_params_flag: bool) -> int:
    wb = Workbook()
    ws = wb.active
    ws.title = "lawyers"

    headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"]
    if include_extra:
        headers.extend(
            [
                "URL",
                "站点",
                "create_time",
                "create_time_text",
                "site_time",
                "site_time_text",
                "ID",
            ]
        )
    if parse_params_flag:
        headers.extend(
            [
                "邮箱",
                "地址",
                "执业证号",
                "执业年限",
                "擅长领域",
                "source_site",
                "detail_url",
                "list_url",
            ]
        )

    ws.append(headers)
    for cell in ws[1]:
        cell.font = Font(bold=True)

    exported = 0
    for row in rows:
        info = parse_params(row.get("params", "") or "") if parse_params_flag else {}
        site_name = info.get("source_site") or (row.get("domain", "") or "")
        line = [
            row.get("phone", "") or "",
            row.get("name", "") or "",
            row.get("law_firm", "") or "",
            row.get("province", "") or "",
            row.get("city", "") or "",
            site_name,
            row.get("domain", "") or "",
        ]

        if include_extra:
            line.extend(
                [
                    row.get("url", "") or "",
                    row.get("domain", "") or "",
                    row.get("create_time", "") or "",
                    ts_to_text(row.get("create_time")),
                    row.get("site_time", "") or "",
                    ts_to_text(row.get("site_time")),
                    row.get("id", "") or "",
                ]
            )

        if parse_params_flag:
            line.extend(
                [
                    info.get("email", ""),
                    info.get("address", ""),
                    info.get("license_no", ""),
                    info.get("practice_years", ""),
                    info.get("specialties", ""),
                    info.get("source_site", ""),
                    info.get("detail_url", ""),
                    info.get("list_url", ""),
                ]
            )

        ws.append(line)
        exported += 1

    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    wb.save(output_path)
    return exported


def main() -> None:
    args = parse_args()
    apply_default_time_filter(args)
    output_path = build_output_path(args.output)
    sql, sql_params = build_query(args)

    with Db() as db:
        cursor = db.db.cursor(pymysql.cursors.DictCursor)
        try:
            cursor.execute(sql, sql_params)
            rows = cursor.fetchall()
        finally:
            cursor.close()

    count = export_to_excel(
        rows=rows,
        output_path=output_path,
        include_extra=args.include_extra,
        parse_params_flag=not args.no_parse_params,
    )

    print(f"[export] 导出完成,共 {count} 条")
    print(f"[export] 文件路径: {output_path}")
    print(
        f"[export] 时间筛选 create_time: start={args.start_ts or '-'} end={args.end_ts or '-'}"
    )


if __name__ == "__main__":
    main()
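To sanity-check an export, the workbook can be reopened with openpyxl (illustrative; the filename matches the README example):

```python
# Illustrative check of an exported workbook (filename is an example).
from openpyxl import load_workbook

wb = load_workbook("./data/lawyers_20260302.xlsx", read_only=True)
ws = wb["lawyers"]
print(f"{ws.max_row - 1} data rows")    # minus the bold header row
for row in ws.iter_rows(min_row=2, max_row=4, values_only=True):
    print(row[:3])                       # phone, name, law firm columns
```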
+420
-150
@@ -1,9 +1,16 @@
+import argparse
+import ast
+import hashlib
 import json
 import os
+import random
+import re
 import sys
 import time
-import random
-from typing import Dict, List, Set, Optional
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+
+import urllib3
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -13,21 +20,50 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
     sys.path.append(project_root)
 
-from request.requests_client import RequestClientError, RequestSSLError, RequestsClient
 from Db import Db
+from request.requests_client import RequestClientError, RequestsClient
+from utils.rate_limiter import wait_for_request
 
-DOMAIN = "找法网"
-LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+SITE_NAME = "findlaw"
+LEGACY_DOMAIN = "找法网"
+SITE_BASE = "https://m.findlaw.cn"
+CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
+LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
+
+PHONE_RE = re.compile(r"1[3-9]\d{9}")
+
+
-class FindlawSpider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    province_id: str
+    province_name: str
+    province_py: str
+    city_id: str
+    city_name: str
+    city_py: str
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
+class FindlawCrawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.1,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.cities = self._load_cities()
-
-    def _build_session(self) -> RequestsClient:
-        return RequestsClient(headers={
+        self.client = RequestsClient(
+            headers={
                 "User-Agent": (
                     "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                     "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
@@ -36,174 +72,408 @@ class FindlawSpider:
                 "Accept": "application/json, text/javascript, */*; q=0.01",
                 "X-Requested-With": "XMLHttpRequest",
                 "Connection": "close",
-        })
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET",),
+        )
 
-    def _refresh_session(self) -> None:
-        self.client.refresh()
-
-    def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
+    def _get_text(
+        self,
+        url: str,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
         headers = {"Referer": referer}
+        last_error: Optional[Exception] = None
+
         for attempt in range(max_retries):
+            wait_for_request()
             try:
-                resp = self.client.get_text(url, timeout=15, verify=verify, headers=headers)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
+                resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
+                code = resp.status_code
+                if code == 403:
                     if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
-                        self._refresh_session()
-                        time.sleep(wait_time)
+                        self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                         continue
-                    print(f"请求失败 {url}: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error: {url}")
-                return text
-            except RequestSSLError:
-                if verify:
-                    return self._get(url, referer, verify=False, max_retries=max_retries)
-                print(f"SSL错误 {url}")
-                return None
-            except RequestClientError as exc:
-                print(f"请求失败 {url}: {exc}")
-                return None
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise
+
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")
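The `_get_text` rewrite centralizes the 403-refresh and exponential-backoff behavior that the old `_get` spread across branches. A standalone sketch of that loop, with `fetch` and `refresh_session` as stand-in callables (they are not part of the repo):

```python
# Sketch of the retry loop: fetch(url) -> (status, text) and refresh_session()
# are hypothetical stand-ins for the RequestsClient calls used above.
import random
import time

def get_with_retry(fetch, refresh_session, url, max_retries=3):
    last_error = None
    for attempt in range(max_retries):
        try:
            status, text = fetch(url)
            if status == 403 and attempt < max_retries - 1:
                refresh_session()  # rotate the session before retrying
                time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                continue
            if status >= 400:
                raise RuntimeError(f"{status} Error: {url}")
            return text
        except Exception as exc:
            last_error = exc
            if attempt < max_retries - 1:
                time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                continue
            raise
    raise last_error or RuntimeError(f"Unknown request error: {url}")
```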
+
+    def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
+        pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
+        match = re.search(pattern, script_text)
+        if not match:
+            return []
+        raw = match.group(1)
+        try:
+            rows = ast.literal_eval(raw)
+            return rows if isinstance(rows, list) else []
+        except Exception:
+            return []
+
+    def discover_cities(self) -> List[CityTarget]:
+        js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
+        provinces = self._parse_city_js_array(js_text, "iosProvinces")
+        cities = self._parse_city_js_array(js_text, "iosCitys")
+
+        province_map: Dict[str, Dict] = {}
+        for item in provinces:
+            pid = str(item.get("id") or "").strip()
+            if pid:
+                province_map[pid] = item
+
+        results: List[CityTarget] = []
+        seen_py: Set[str] = set()
+        for city in cities:
+            city_py = str(city.get("pinyin") or "").strip()
+            city_name = str(city.get("value") or "").strip()
+            city_id = str(city.get("id") or "").strip()
+            province_id = str(city.get("parentId") or "").strip()
+            if not city_py or not city_name or not city_id:
+                continue
+            if city_py in seen_py:
+                continue
+            seen_py.add(city_py)
+
+            province_row = province_map.get(province_id, {})
+            province_name = str(province_row.get("value") or city_name).strip()
+            province_py = str(province_row.get("pinyin") or city_py).strip()
+
+            results.append(
+                CityTarget(
+                    province_id=province_id,
+                    province_name=province_name,
+                    province_py=province_py,
+                    city_id=city_id,
+                    city_name=city_name,
+                    city_py=city_py,
+                )
+            )
+        return results
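`_parse_city_js_array` leans on the fact that the site's area bundle serializes JSON-compatible literals, so `ast.literal_eval` can parse what the regex captures. A toy run with an invented JS snippet:

```python
# The script content here is made up for illustration; the regex mirrors
# _parse_city_js_array above with var_name fixed to "iosCitys".
import ast
import re

script = 'var iosCitys = [{"id": "110100", "value": "北京", "pinyin": "beijing", "parentId": "110000"}];'

match = re.search(r"var\s+iosCitys\s*=\s*(\[[\s\S]*?\]);", script)
if match:
    rows = ast.literal_eval(match.group(1))  # JSON-like literals are valid Python
    print(rows[0]["pinyin"])  # -> beijing
```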
+
+    def _parse_list_payload(self, text: str) -> Dict:
+        cleaned = (text or "").strip().lstrip("\ufeff")
+        try:
+            return json.loads(cleaned)
+        except ValueError:
+            start = cleaned.find("{")
+            end = cleaned.rfind("}")
+            if start == -1 or end == -1:
+                return {}
+            return json.loads(cleaned[start:end + 1])
+
+    def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
+        list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
+        referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
+        text = self._get_text(list_url, referer=referer)
+        payload = self._parse_list_payload(text)
+        if payload.get("errcode") != 0:
+            return [], False, list_url
+
+        data = payload.get("data", {}) or {}
+        items = data.get("lawyer_list", []) or []
+        has_more = str(data.get("has_more", "0")) == "1"
+        return items, has_more, list_url
+
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        for page in range(1, self.max_pages + 1):
+            try:
+                items, has_more, list_url = self.fetch_list_page(target.city_py, page)
+            except Exception as exc:
+                print(f"[list] 失败 {target.city_py} p{page}: {exc}")
+                break
+
+            if not items:
+                break
+
+            for item in items:
+                detail_url = item.get("siteask_m") or item.get("site_url") or ""
+                detail_url = str(detail_url).strip()
+                if not detail_url.startswith("http"):
+                    detail_url = list_url
+
+                phone = normalize_phone(item.get("mobile", ""))
+                profile = {
+                    "uid": str(item.get("uid") or ""),
+                    "name": str(item.get("username") or "").strip(),
+                    "law_firm": str(item.get("lawyer_lawroom") or "").strip(),
+                    "phone": phone,
+                    "lawyer_year": item.get("lawyer_year"),
+                    "service_area": str(item.get("service_area") or "").strip(),
+                    "address": str(item.get("addr") or "").strip(),
+                    "specialties": item.get("professionArr") or [],
+                    "answer_count": item.get("ansnum"),
+                    "comment_count": item.get("askcommentnum"),
+                }
+
+                now = int(time.time())
+                uid = profile.get("uid", "")
+                record_key = uid or detail_url
+                record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()
+
+                area = item.get("areaInfo", {}) or {}
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "list_url": list_url,
+                        "detail_url": detail_url,
+                        "province": str(area.get("province") or target.province_name),
+                        "province_py": target.province_py,
+                        "city": str(area.get("city") or target.city_name),
+                        "city_py": target.city_py,
+                        "page": page,
+                    },
+                    "list_snapshot": {
+                        "uid": uid,
+                        "name": profile["name"],
+                        "law_firm": profile["law_firm"],
+                        "answer_count": profile["answer_count"],
+                        "comment_count": profile["comment_count"],
+                    },
+                    "profile": profile,
+                    "raw": item,
+                }
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)
+
+            if not has_more:
+                break
+
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
             return None
 
-    def _existing_phones(self, phones: List[str]) -> Set[str]:
-        if not phones:
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
+        }
+
+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
             return set()
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
         existing: Set[str] = set()
         cur = self.db.db.cursor()
         try:
             chunk_size = 500
-            for i in range(0, len(phones), chunk_size):
-                chunk = phones[i:i + chunk_size]
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
                 placeholders = ",".join(["%s"] * len(chunk))
                 sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
-                cur.execute(sql, [DOMAIN, *chunk])
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                 for row in cur.fetchall():
                     existing.add(row[0])
         finally:
             cur.close()
         return existing
 
-    def _load_cities(self):
-        condition = "domain='findlaw' AND level=2"
-        tables = ("area_new", "area2", "area")
-        last_error = None
-        for table in tables:
-            try:
-                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
-            except Exception as exc:
-                last_error = exc
-                continue
-            if rows:
-                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
-                print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
-                return rows
-        if last_error:
-            print(f"[找法网] 加载地区数据失败: {last_error}")
-        print("[找法网] 无城市数据(已尝试 area_new/area2/area)")
-        for table in tables:
-            try:
-                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
-                c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
-                print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+        return inserted, skipped
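`_existing_phones_in_db` batches the lookup so the `IN` list stays bounded at 500 entries per query. The same idea in isolation (any DB-API cursor works; table and column names match the code above):

```python
# Sketch: cur is any DB-API cursor; domain corresponds to LEGACY_DOMAIN above.
def existing_phones(cur, domain, phones, chunk_size=500):
    deduped = sorted({p for p in phones if p})
    found = set()
    for i in range(0, len(deduped), chunk_size):
        chunk = deduped[i:i + chunk_size]
        placeholders = ",".join(["%s"] * len(chunk))
        cur.execute(
            f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})",
            [domain, *chunk],
        )
        found.update(row[0] for row in cur.fetchall())
    return found
```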
-            except Exception:
-                pass
-        return []
 
+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现城市 {len(cities)} 个")
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
+            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断城市数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+        print(f"[resume] 已有记录 {len(seen_ids)} 条")
+
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0
+
-    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
-        text = self._get(url, referer, verify=True)
-        if not text:
-            return []
-        try:
-            # 某些返回体前会携带 BOM 或包装脚本,此处做兼容
-            text = text.strip().lstrip("\ufeff")
-            try:
-                data = json.loads(text)
-            except ValueError:
-                json_start = text.find('{')
-                json_end = text.rfind('}')
-                if json_start == -1 or json_end == -1:
-                    print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
-                    return []
-                cleaned = text[json_start:json_end + 1]
-                data = json.loads(cleaned)
-                if isinstance(data, str):
-                    try:
-                        data = json.loads(data)
-                    except ValueError:
-                        print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}")
-                        return []
-        except ValueError as exc:
-            print(f"解析JSON失败 {url}: {exc}")
-            return []
-
-        items = data.get("data", {}).get("lawyer_list", [])
-        parsed = []
-        for item in items:
-            phone = (item.get("mobile") or "").replace("-", "")
-            parsed.append({
-                "name": item.get("username", ""),
-                "law_firm": item.get("lawyer_lawroom", ""),
-                "province": item.get("areaInfo", {}).get("province", ""),
-                "city": item.get("areaInfo", {}).get("city", ""),
-                "phone": phone,
-                "url": url,
-                "domain": DOMAIN,
-                "create_time": int(time.time()),
-                "params": json.dumps(item, ensure_ascii=False)
-            })
-        return parsed
-
-    def run(self):
-        print("启动找法网采集...")
-        if not self.cities:
-            print("无城市数据")
-            return
-        for city in self.cities:
-            pinyin = city.get("pinyin")
-            province = city.get("province", "")
-            city_name = city.get("city", "")
-            if not pinyin:
-                continue
-            print(f"采集 {province}-{city_name}")
-            page = 1
-            while True:
-                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
-                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
-                print(f"  第 {page} 页: {url}")
-                items = self._fetch_page(url, referer)
-                if not items:
-                    break
-                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
-                existing = self._existing_phones(phones)
-                for entry in items:
-                    phone = entry.get("phone")
-                    if not phone:
-                        continue
-                    if phone in existing:
-                        print(f"  -- 已存在: {entry['name']} ({phone})")
-                        continue
-                    try:
-                        self.db.insert_data("lawyer", entry)
-                        print(f"  -> 新增: {entry['name']} ({phone})")
-                    except Exception as exc:
-                        print(f"  插入失败: {exc}")
-                page += 1
-        print("找法网采集完成")
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"({target.city_py})"
+                )
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)")
+    parser.add_argument(
+        "--output",
+        default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
+        help="输出 jsonl 文件路径",
+    )
+    parser.add_argument(
+        "--max-cities",
+        type=int,
+        default=0,
+        help="最多采集多少个城市,0 表示不限",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=9999,
+        help="每个城市最多采集多少页",
+    )
+    parser.add_argument(
+        "--city-filter",
+        default="",
+        help="按城市拼音或城市名过滤,如 beijing",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.1,
+        help="每条记录采集间隔秒数",
+    )
+    parser.add_argument(
+        "--direct",
+        action="store_true",
+        help="直连模式,不使用 proxy_settings.json 代理",
+    )
+    parser.add_argument(
+        "--no-db",
+        action="store_true",
+        help="只输出 JSONL,不写入数据库",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if args.no_db:
+        crawler = FindlawCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=None,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
         return
+
+    with Db() as db:
+        crawler = FindlawCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=db,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
 
 if __name__ == "__main__":
-    with Db() as db:
-        spider = FindlawSpider(db)
-        spider.run()
+    main()
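Restart safety in `crawl()` comes from two pieces: a deterministic `record_id` (md5 of a stable key) and a JSONL scan that rebuilds the seen-set before appending. A condensed sketch of both:

```python
# Sketch: the JSONL path is illustrative; record layout matches the yield above.
import hashlib
import json
import os

def load_seen(path):
    seen = set()
    if not os.path.exists(path):
        return seen
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except ValueError:
                continue
            rid = item.get("record_id") if isinstance(item, dict) else None
            if rid:
                seen.add(rid)
    return seen

# record_id stays stable across runs: md5 of the lawyer uid (or detail URL)
record_id = hashlib.md5("12345".encode("utf-8")).hexdigest()
print(record_id in load_seen("/tmp/findlaw_records_all.jsonl"))
```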
+609
-275
@@ -1,10 +1,18 @@
+import argparse
+import ast
+import hashlib
 import json
 import os
+import random
 import re
 import sys
 import time
-import random
-from typing import Dict, Optional
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+from urllib.parse import urljoin
+
+import urllib3
+from bs4 import BeautifulSoup
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -14,312 +22,638 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
     sys.path.append(project_root)
 
-from bs4 import BeautifulSoup
-from request.requests_client import RequestClientError, RequestsClient
-
 from Db import Db
-from config import HEADERS
+from request.requests_client import RequestClientError, RequestsClient
+from utils.rate_limiter import wait_for_request
 
-LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
-DOMAIN = "华律"
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+SITE_NAME = "hualv"
+LEGACY_DOMAIN = "华律"
+SITE_BASE = "https://m.66law.cn"
+CITY_DATA_URL = "https://cache.66law.cn/dist/main-v2.0.js"
+LIST_API_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
+
+PHONE_RE = re.compile(r"1[3-9]\d{9}")
+EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
+YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
+
+
-class HualvSpider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    province_id: int
+    province_name: str
+    city_id: int
+    city_name: str
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
+def strip_html_tags(text: str) -> str:
+    return re.sub(r"<[^>]+>", "", text or "").strip()
+
+
+class HualvCrawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.15,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.areas = self._load_areas()
-
-    def _build_session(self) -> RequestsClient:
-        custom_headers = HEADERS.copy()
-        custom_headers['User-Agent'] = (
-            'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
-            'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
-            'Mobile/15E148 Safari/604.1'
-        )
-        custom_headers["Connection"] = "close"
-        return RequestsClient(headers=custom_headers)
-
-    def _refresh_session(self) -> None:
-        self.client.refresh()
+        self.client = RequestsClient(
+            headers={
+                "User-Agent": (
+                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+                    "Mobile/15E148 Safari/604.1"
+                ),
+                "Accept": "application/json, text/javascript, */*; q=0.01",
+                "X-Requested-With": "XMLHttpRequest",
+                "Connection": "close",
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET", "POST"),
+        )
 
-    def _load_areas(self):
-        tables = ("area_new", "area2", "area")
-        last_error = None
-        for table in tables:
-            try:
-                provinces = self.db.select_data(
-                    table,
-                    "code, province, pinyin, id",
-                    "domain='66law' AND level=1"
-                ) or []
-                cities = self.db.select_data(
-                    table,
-                    "code, city, province, pid",
-                    "domain='66law' AND level=2"
-                ) or []
-            except Exception as exc:
-                last_error = exc
-                continue
-            if not cities:
-                continue
-            province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces}
-            city_map = {}
-            for city in cities:
-                province_info = province_map.get(city.get('pid'), {}) or {}
-                province_code = province_info.get('code')
-                city_map[city.get('code')] = {
-                    "name": city.get('city'),
-                    "province": city.get('province'),
-                    "province_code": province_code,
-                }
-            print(f"[华律] 城市来源表: {table}, 城市数: {len(cities)}")
-            return city_map
-        if last_error:
-            print(f"[华律] 加载地区数据失败: {last_error}")
-        print("[华律] 无城市数据(已尝试 area_new/area2/area)")
-        return {}
-
-    def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
-        for attempt in range(max_retries):
-            try:
-                resp = self.client.post_text(LIST_URL, data=data, timeout=20, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        self._refresh_session()
-                        time.sleep(wait_time)
-                        continue
-                    print("请求失败: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error")
-                try:
-                    return json.loads(text)
-                except ValueError as exc:
-                    print(f"解析JSON失败: {exc}")
-                    return None
-            except RequestClientError as exc:
-                print(f"请求失败: {exc}")
-                return None
-        return None
+    def _request_text(
+        self,
+        method: str,
+        url: str,
+        *,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+        data: Optional[Dict] = None,
+    ) -> str:
+        headers = {"Referer": referer}
+        last_error: Optional[Exception] = None
+
+        for attempt in range(max_retries):
+            wait_for_request()
+            try:
+                if method.upper() == "POST":
+                    resp = self.client.post_text(
+                        url,
+                        timeout=timeout,
+                        verify=False,
+                        headers=headers,
+                        data=data,
+                    )
+                else:
+                    resp = self.client.get_text(
+                        url,
+                        timeout=timeout,
+                        verify=False,
+                        headers=headers,
+                    )
+
+                code = resp.status_code
+                if code == 403:
+                    if attempt < max_retries - 1:
+                        self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                        continue
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise
+
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")
+
+    def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
+        return self._request_text(
+            "GET",
+            url,
+            timeout=timeout,
+            max_retries=max_retries,
+            referer=referer,
+        )
+
+    def _post_text(
+        self,
+        url: str,
+        *,
+        data: Dict,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
+        return self._request_text(
+            "POST",
+            url,
+            timeout=timeout,
+            max_retries=max_retries,
+            referer=referer,
+            data=data,
+        )
+
+    def _extract_spc_location(self, script_text: str) -> List:
+        # main-v2.js 内置了 sPCLocation=new Array(...),后面紧跟 cateinfo 数组
+        marker = "sPCLocation = new Array("
+        start = script_text.find(marker)
+        if start == -1:
+            marker = "sPCLocation=new Array("
+            start = script_text.find(marker)
+        if start == -1:
+            return []
+        start += len(marker)
+
+        next_marker = script_text.find("cateinfo = new Array(", start)
+        if next_marker == -1:
+            next_marker = script_text.find("cateinfo=new Array(", start)
+
+        if next_marker != -1:
+            end = script_text.rfind(");", start, next_marker)
+        else:
+            end = script_text.find(");", start)
+
+        if end == -1 or end <= start:
+            return []
+
+        raw = "[" + script_text[start:end] + "]"
+        try:
+            data = ast.literal_eval(raw)
+        except Exception:
+            return []
+        return data if isinstance(data, list) else []
+
+    def discover_cities(self) -> List[CityTarget]:
+        script_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyer/")
+        rows = self._extract_spc_location(script_text)
+
+        targets: List[CityTarget] = []
+        seen: Set[Tuple[int, int]] = set()
+
+        for province in rows:
+            if not isinstance(province, list) or len(province) < 3:
+                continue
+            try:
+                province_id = int(province[0])
+            except Exception:
+                continue
+            province_name = str(province[1] or "").strip()
+            city_rows = province[2] if isinstance(province[2], list) else []
+
+            for city in city_rows:
+                if not isinstance(city, list) or len(city) < 2:
+                    continue
+                try:
+                    city_id = int(city[0])
+                except Exception:
+                    continue
+                city_name = str(city[1] or "").strip()
+                if city_id <= 0 or not city_name:
+                    continue
+
+                key = (province_id, city_id)
+                if key in seen:
+                    continue
+                seen.add(key)
+
+                targets.append(
+                    CityTarget(
+                        province_id=province_id,
+                        province_name=province_name,
+                        city_id=city_id,
+                        city_name=city_name,
+                    )
+                )
+        return targets
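`_extract_spc_location` slices the province/city array straight out of the JS bundle and hands it to `ast.literal_eval`. The same slicing on an invented snippet:

```python
# The script content is made up; the markers match _extract_spc_location above.
import ast

script = "var sPCLocation = new Array([1, '北京', [[72, '北京']]], [2, '上海', [[2664, '上海']]]); var cateinfo = new Array();"

marker = "sPCLocation = new Array("
start = script.find(marker) + len(marker)
end = script.find(");", start)
rows = ast.literal_eval("[" + script[start:end] + "]")
for province_id, province_name, cities in rows:
    for city_id, city_name in cities:
        print(province_id, province_name, city_id, city_name)
```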
+
+    def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[Dict], int]:
+        payload = {
+            "pid": str(target.province_id),
+            "cid": str(target.city_id),
+            "page": str(page),
+        }
+        text = self._post_text(
+            LIST_API_URL,
+            data=payload,
+            referer=SITE_BASE + "/findlawyer/",
+        )
+        data = json.loads((text or "").strip().lstrip("\ufeff") or "{}")
+        items = data.get("lawyerList") or data.get("queryLawyerList") or []
+        if not isinstance(items, list):
+            items = []
+
+        page_count = 0
+        try:
+            page_count = int((data.get("lawyerItems") or {}).get("pageCount") or 0)
+        except Exception:
+            page_count = 0
+        return items, page_count
+
-    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
-        contact_url = f"{url}lawyer_contact.aspx"
-        print(f"  详情: {contact_url}")
-        existing = self.db.select_data(
-            "lawyer",
-            "id, avatar_url",
-            f"domain='{DOMAIN}' AND url='{contact_url}'"
-        )
-        existing_id = None
-        if existing:
-            existing_id = existing[0].get("id")
-            avatar = (existing[0].get("avatar_url") or "").strip()
-            if avatar:
-                print("  -- 已存在且头像已补全,跳过")
-                return None
-
-        html = self._get_detail(contact_url)
-        if not html:
-            return None
-        soup = BeautifulSoup(html, "html.parser")
-        info_list = soup.find("ul", class_="information-list")
-        if not info_list:
-            return None
-        phone = ""
-        law_firm = ""
-        for li in info_list.find_all("li"):
-            text = li.get_text(strip=True)
-            if "手机号" in text:
-                cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
-                match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
-                if match:
-                    phone = match.group(0)
-            if "执业单位" in text:
-                law_firm = text.replace("执业单位", "").strip()
-
-        name = ""
-        breadcrumb = soup.find("div", class_="weizhi")
-        if breadcrumb:
-            links = breadcrumb.find_all("a")
-            if len(links) > 2:
-                name = links[2].get_text(strip=True)
-
-        phone = phone.replace('-', '').strip()
-        if not phone or not re.fullmatch(r"1\d{10}", phone):
-            print("  无手机号,跳过")
-            return None
+    def parse_detail(self, detail_url: str) -> Dict:
+        contact_url = detail_url.rstrip("/") + "/lawyer_contact.aspx"
+        html = self._get_text(contact_url, referer=detail_url)
+        soup = BeautifulSoup(html, "html.parser")
+        full_text = soup.get_text(" ", strip=True)
+
+        name = ""
+        law_firm = ""
+        phone = ""
+        email = ""
+        address = ""
+        license_no = ""
+        practice_years: Optional[int] = None
+
+        name_tag = soup.select_one(".logo-box .title b")
+        if name_tag:
+            name = name_tag.get_text(strip=True).replace("律师", "").strip()
+        if not name and soup.title:
+            match = re.search(r"([^\s,,。_]+?)律师", soup.title.get_text(" ", strip=True))
+            if match:
+                name = match.group(1).strip()
+
+        phone_candidates = [
+            soup.select_one(".logo-box .r-bar .tel").get_text(" ", strip=True)
+            if soup.select_one(".logo-box .r-bar .tel")
+            else "",
+            soup.select_one(".lawyer-show ul.info").get_text(" ", strip=True)
+            if soup.select_one(".lawyer-show ul.info")
+            else "",
+            full_text,
+        ]
+        for candidate in phone_candidates:
+            phone = normalize_phone(candidate)
+            if phone:
+                break
+
+        for li in soup.select(".lawyer-show ul.info li"):
+            li_text = li.get_text(" ", strip=True)
+            if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
+                law_firm = li_text
+
+        if not law_firm:
+            match = re.search(r'"affiliation":\{"@type":"Organization","name":"([^"]+)"', html)
+            if match:
+                law_firm = match.group(1).strip()
+
+        match = re.search(r'"identifier":"([^"]+)"', html)
+        if match:
+            license_no = match.group(1).strip()
+
+        match = re.search(r'"streetAddress":"([^"]+)"', html)
+        if match:
+            address = match.group(1).strip()
+
+        email_match = EMAIL_RE.search(html)
+        if email_match:
+            email = email_match.group(0).strip()
+
+        year_match = YEAR_RE.search(full_text)
+        if year_match:
+            try:
+                practice_years = int(year_match.group(1))
+            except Exception:
+                practice_years = None
+
+        specialties = [node.get_text(strip=True) for node in soup.select(".tag-h38 span")]
+        specialties = [x for x in specialties if x]
+
+        return {
+            "name": name,
+            "law_firm": law_firm,
+            "phone": phone,
+            "email": email,
+            "address": address,
+            "license_no": license_no,
+            "practice_years": practice_years,
+            "specialties": specialties,
+            "detail_url": detail_url,
+            "contact_url": contact_url,
+        }
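`parse_detail` combines CSS selectors with regex fallbacks. A self-contained miniature against stand-in HTML (the selectors are copied from the code above; the markup itself is invented):

```python
# Sketch: the HTML is a fabricated fragment for illustration only.
import re
from bs4 import BeautifulSoup

PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")

html = (
    '<div class="logo-box"><div class="title"><b>张三律师</b></div></div>'
    '<p>执业 12 年,咨询电话 138-0000-0000</p>'
)
soup = BeautifulSoup(html, "html.parser")
full_text = soup.get_text(" ", strip=True)

name_tag = soup.select_one(".logo-box .title b")
name = name_tag.get_text(strip=True).replace("律师", "").strip() if name_tag else ""

match = PHONE_RE.search(re.sub(r"\D", "", full_text))  # digits only, then match
phone = match.group(0) if match else ""

year_match = YEAR_RE.search(full_text)
years = int(year_match.group(1)) if year_match else None

print(name, phone, years)  # -> 张三 13800000000 12
```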
 
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        seen_details: Set[str] = set()
+
+        for page in range(1, self.max_pages + 1):
+            try:
+                items, page_count = self.fetch_list_page(target, page)
+            except Exception as exc:
+                print(f"[list] 失败 pid={target.province_id} cid={target.city_id} p{page}: {exc}")
+                break
+
+            if not items:
+                break
+
+            for item in items:
+                detail_url = str(item.get("lawyerUrl") or "").strip()
+                if not detail_url:
+                    continue
+                if detail_url.startswith("//"):
+                    detail_url = "https:" + detail_url
+                if not detail_url.startswith("http"):
+                    detail_url = urljoin(SITE_BASE, detail_url)
+
+                if detail_url in seen_details:
+                    continue
+                seen_details.add(detail_url)
+
+                try:
+                    detail = self.parse_detail(detail_url)
+                except Exception as exc:
+                    print(f"[detail] 失败 {detail_url}: {exc}")
+                    continue
+
+                now = int(time.time())
+                uid = str(item.get("lawyerId") or item.get("globalUserId") or detail_url)
+                record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
+
+                list_name = str(item.get("name") or "").replace("律师", "").strip()
+                category_text = str(item.get("categoryNames") or "").strip()
+                category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()]
+
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "province_id": target.province_id,
+                        "province": target.province_name,
+                        "city_id": target.city_id,
+                        "city": target.city_name,
+                        "page": page,
+                        "detail_url": detail_url,
+                        "contact_url": detail.get("contact_url", ""),
+                    },
+                    "list_snapshot": {
+                        "lawyer_id": item.get("lawyerId"),
+                        "name": list_name,
+                        "category_names": category_arr,
+                        "help_count": strip_html_tags(str(item.get("helpCount") or "")),
+                        "comment_score": strip_html_tags(str(item.get("commentScore") or "")),
+                        "response_time": str(item.get("responseTime") or "").strip(),
+                        "year": item.get("year"),
+                        "is_adv": bool(item.get("isAdv")),
+                    },
+                    "profile": {
+                        "name": detail.get("name") or list_name,
+                        "law_firm": detail.get("law_firm") or "",
+                        "phone": detail.get("phone") or "",
+                        "email": detail.get("email") or "",
+                        "address": detail.get("address") or "",
+                        "license_no": detail.get("license_no") or "",
+                        "practice_years": detail.get("practice_years"),
+                        "specialties": detail.get("specialties") or category_arr,
+                    },
+                    "raw": item,
+                }
+
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)
+
+            if page_count > 0 and page >= page_count:
+                break
+
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
+            return None
+
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": (source.get("contact_url") or source.get("detail_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
+        }
+
-        avatar_url, site_time = self._extract_avatar_and_time(soup)
-        data = {
-            "phone": phone,
-            "province": province,
-            "city": city,
-            "law_firm": law_firm,
-            "url": contact_url,
-            "avatar_url": avatar_url,
-            "create_time": int(time.time()),
-            "site_time": site_time,
-            "domain": DOMAIN,
-            "name": name,
-            "params": json.dumps({"source": url}, ensure_ascii=False)
-        }
-        if existing_id:
-            update_data = {
-                "avatar_url": avatar_url,
-                "site_time": site_time,
-            }
-            if name:
-                update_data["name"] = name
-            if law_firm:
-                update_data["law_firm"] = law_firm
-            if province:
-                update_data["province"] = province
-            if city:
-                update_data["city"] = city
-            if phone:
-                update_data["phone"] = phone
-            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
-            try:
-                self.db.update_data("lawyer", update_data, f"id={existing_id}")
-                print("  -- 已存在,已补全头像/时间")
-            except Exception as exc:
-                print(f"  更新失败: {exc}")
-            return None
-        # 若手机号已存在,则更新头像/时间,不再插入新记录
-        existing_phone = self.db.select_data(
-            "lawyer",
-            "id, avatar_url, url",
-            f"domain='{DOMAIN}' AND phone='{phone}'"
-        )
-        if existing_phone:
-            existing_row = existing_phone[0]
-            avatar = (existing_row.get("avatar_url") or "").strip()
-            if avatar:
-                print("  -- 已存在手机号且头像已补全,跳过")
-                return None
-            update_data = {
-                "avatar_url": avatar_url,
-                "site_time": site_time,
-            }
-            if name:
-                update_data["name"] = name
-            if law_firm:
-                update_data["law_firm"] = law_firm
-            if province:
-                update_data["province"] = province
-            if city:
-                update_data["city"] = city
-            if phone:
-                update_data["phone"] = phone
-            if not existing_row.get("url"):
-                update_data["url"] = contact_url
-            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
-            try:
-                self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
-                print("  -- 已存在手机号,已补全头像/时间")
-            except Exception as exc:
-                print(f"  更新失败: {exc}")
-            return None
-        return data
-
-    def _extract_avatar_and_time(self, soup: BeautifulSoup) -> (str, Optional[int]):
-        avatar_url = ""
-        site_time = None
-        img_tag = soup.select_one(
-            "div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
-        )
-        if img_tag:
-            src = (img_tag.get("src") or "").strip()
-            if src:
-                if src.startswith("//"):
-                    avatar_url = f"https:{src}"
-                else:
-                    avatar_url = src
-                match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
-                if match:
-                    site_time = int(f"{match.group(1)}{match.group(2)}")
-                else:
-                    match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
-                    if match:
-                        site_time = int(f"{match.group(1)}{match.group(2)}")
-        return avatar_url, site_time
-
-    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
-        for attempt in range(max_retries):
-            try:
-                resp = self.client.get_text(url, timeout=15, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"  403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        self._refresh_session()
-                        time.sleep(wait_time)
-                        continue
-                    print("  请求失败: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error")
-                return text
-            except RequestClientError as exc:
-                print(f"  请求失败: {exc}")
-                return None
-        return None
+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
+            return set()
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            chunk_size = 500
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    existing.add(row[0])
+        finally:
+            cur.close()
+
+        return existing
+
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+
+        return inserted, skipped
 
-    def run(self):
-        print("启动华律网采集...")
-        if not self.areas:
-            print("无城市数据")
-            return
-        for city_code, city_info in self.areas.items():
-            province_code = city_info.get("province_code")
-            if not province_code:
-                continue
-            province_name = city_info.get("province", "")
-            city_name = city_info.get("name", "")
-            print(f"采集 {province_name}-{city_name}")
-            page = 1
-            while True:
-                payload = {"pid": province_code, "cid": city_code, "page": str(page)}
-                data = self._post(payload)
-                if not data or not data.get("lawyerList"):
-                    break
-                for item in data["lawyerList"]:
-                    result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
-                    if not result:
-                        continue
-                    try:
-                        self.db.insert_data("lawyer", result)
-                        print(f"  -> 新增: {result['name']} ({result['phone']})")
-                    except Exception as exc:
-                        print(f"  插入失败: {exc}")
-                    time.sleep(1)
-                page_count = data.get("lawyerItems", {}).get("pageCount", page)
-                if page >= page_count:
-                    break
-                page += 1
-                time.sleep(2)
-            time.sleep(1)
-        print("华律网采集完成")
+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现城市 {len(cities)} 个")
+
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [
+                c for c in cities
+                if key in c.city_name.lower() or key in str(c.city_id).lower()
+            ]
+            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
+
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断城市数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+        print(f"[resume] 已有记录 {len(seen_ids)} 条")
+
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0
+
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"(pid={target.province_id}, cid={target.city_id})"
+                )
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="华律网全新采集脚本(站点数据直采)")
+    parser.add_argument(
+        "--output",
+        default="/www/wwwroot/lawyers/data/hualv_records_all.jsonl",
+        help="输出 jsonl 文件路径",
+    )
+    parser.add_argument(
+        "--max-cities",
+        type=int,
+        default=0,
+        help="最多采集多少个城市,0 表示不限",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=9999,
+        help="每个城市最多采集多少页",
+    )
+    parser.add_argument(
+        "--city-filter",
+        default="",
+        help="按城市名称或城市编码过滤,如 beijing / 110100",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.15,
+        help="详情页请求间隔秒数",
+    )
+    parser.add_argument(
+        "--direct",
+        action="store_true",
+        help="直连模式,不使用 proxy_settings.json 代理",
+    )
+    parser.add_argument(
+        "--no-db",
+        action="store_true",
+        help="只输出 JSONL,不写入数据库",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if args.no_db:
+        crawler = HualvCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=None,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
         return
+
+    with Db() as db:
+        crawler = HualvCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=db,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
 
 if __name__ == "__main__":
-    with Db() as db:
-        spider = HualvSpider(db)
-        spider.run()
+    main()
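The hualv list API wraps counters in markup and joins specialty categories with mixed delimiters; `crawl_city` normalizes both before building `list_snapshot`. The pair in isolation:

```python
# strip_html_tags and the category split, exactly as used in crawl_city;
# the sample strings are invented.
import re

def strip_html_tags(text):
    return re.sub(r"<[^>]+>", "", text or "").strip()

category_text = "婚姻家庭、合同纠纷,劳动工伤"
category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()]

print(strip_html_tags("<em>1,024</em>人帮助"))  # -> 1,024人帮助
print(category_arr)                             # -> ['婚姻家庭', '合同纠纷', '劳动工伤']
```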
+585
-216
@@ -1,13 +1,16 @@
|
|||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import random
|
from dataclasses import dataclass, field
|
||||||
from typing import Dict, Optional, List, Set
|
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
||||||
from urllib.parse import urljoin
|
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
import urllib3
|
||||||
import threading
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
project_root = os.path.dirname(current_dir)
|
project_root = os.path.dirname(current_dir)
|
||||||
@@ -17,262 +20,628 @@ if request_dir not in sys.path:
|
|||||||
if project_root not in sys.path:
|
if project_root not in sys.path:
|
||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
import urllib3
|
from Db import Db
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from request.requests_client import RequestClientError, RequestsClient
|
from request.requests_client import RequestClientError, RequestsClient
|
||||||
|
from utils.rate_limiter import wait_for_request
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
from Db import Db
|
SITE_NAME = "lawtime"
|
||||||
from config import LAWTIME_CONFIG
|
LEGACY_DOMAIN = "法律快车"
|
||||||
|
SITE_BASE = "https://www.lawtime.cn"
|
||||||
|
PROVINCE_API = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode=0"
|
||||||
|
CITY_API_TEMPLATE = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode={province_id}"
|
||||||
|
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/lawyer"
|
||||||
|
|
||||||
LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
|
PHONE_RE = re.compile(r"1[3-9]\d{9}")
|
||||||
DETAIL_BASE = "https://m.lawtime.cn"
|
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
|
||||||
DOMAIN = "法律快车"
|
|
||||||
|
|
||||||
|
|
||||||
-class LawtimeSpider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    province_id: str
+    province_name: str
+    province_py: str
+    city_id: str
+    city_name: str
+    city_py: str
+
+
+@dataclass
+class ListCard:
+    detail_url: str
+    name: str
+    phone: str
+    address: str = ""
+    specialties: List[str] = field(default_factory=list)
+    metric_text: str = ""
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
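`normalize_phone` is the shared sanitiser the fresh scripts rely on: it strips every non-digit before matching, so separators and labels around the number don't matter. A quick standalone check (regex copied from the constant above):

```python
import re

PHONE_RE = re.compile(r"1[3-9]\d{9}")


def normalize_phone(text: str) -> str:
    # Drop every non-digit first, then look for a mainland mobile number.
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


assert normalize_phone("电话:138-0013-8000") == "13800138000"
assert normalize_phone("010-12345678") == ""  # landline: no match
assert normalize_phone(None) == ""
```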
+class LawtimeCrawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.1,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
-        self._tls = threading.local()
+        self.client = RequestsClient(
+            headers={
+                "User-Agent": (
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/122.0.0.0 Safari/537.36"
+                ),
+                "Accept": "text/html,application/json,*/*;q=0.8",
+                "Connection": "close",
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET",),
+        )

-    def _build_session(self) -> RequestsClient:
-        headers = LAWTIME_CONFIG.get("HEADERS", {})
-        custom_headers = dict(headers) if headers else {}
-        custom_headers.setdefault("Connection", "close")
-        return RequestsClient(headers=custom_headers)
-
-    def _refresh_session(self) -> None:
+    def _get_text(
+        self,
+        url: str,
+        *,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
+        headers = {"Referer": referer}
+        last_error: Optional[Exception] = None
+
+        for attempt in range(max_retries):
+            wait_for_request()
+            try:
+                resp = self.client.get_text(
+                    url,
+                    timeout=timeout,
+                    verify=False,
+                    headers=headers,
+                )
+                code = resp.status_code
+                if code == 403:
+                    if attempt < max_retries - 1:
                         self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                        continue
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise

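The retry loop above is plain exponential backoff with jitter: wait 2^attempt seconds plus a random fraction so parallel workers don't retry in lockstep. The same shape, pulled out as a generic helper — names here are illustrative, not from the codebase:

```python
import random
import time


def retry_with_backoff(fn, max_retries=3, base=2, jitter=(0.2, 0.8)):
    """Call fn(); on failure sleep base**attempt + U(jitter), then retry."""
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep((base ** attempt) + random.uniform(*jitter))


# Demo: fails twice, succeeds on the third call (sleeps ~3s total).
calls = {"n": 0}

def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient")
    return "ok"

print(retry_with_backoff(flaky))  # -> ok
```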
-    def _get_thread_session(self) -> RequestsClient:
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            return s
-        s = self.client.clone()
-        self._tls.session = s
-        return s
-
-    def _refresh_thread_session(self) -> None:
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            s.close()
-        self._tls.session = None
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")
+
+    def _get_json(self, url: str, *, referer: str) -> List[Dict]:
+        text = self._get_text(url, referer=referer)
+        cleaned = (text or "").strip().lstrip("\ufeff")
+        if not cleaned or cleaned.startswith("<"):
+            return []
+        try:
+            data = json.loads(cleaned)
+        except ValueError:
+            return []
+        return data if isinstance(data, list) else []

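`_get_json` strips a possible UTF-8 BOM before parsing because `json.loads` treats `\ufeff` as garbage, and bails out early when the endpoint returns an HTML error page instead of JSON. The failure mode is easy to reproduce:

```python
import json

raw = "\ufeff[{\"id\": 1}]"  # BOM-prefixed payload, as some endpoints return

try:
    json.loads(raw)
except json.JSONDecodeError as exc:
    print("unparsed:", exc)  # json.loads rejects the leading BOM

cleaned = raw.strip().lstrip("\ufeff")
print(json.loads(cleaned))   # -> [{'id': 1}]
```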
-    def _existing_phones(self, phones: List[str]) -> Set[str]:
-        if not phones:
+    def discover_cities(self) -> List[CityTarget]:
+        provinces = self._get_json(PROVINCE_API, referer=SITE_BASE)
+        if not provinces:
+            print("[discover] 地区接口未返回有效数据")
+            return []
+
+        results: List[CityTarget] = []
+        seen_py: Set[str] = set()
+
+        for province in provinces:
+            province_id = str(province.get("id") or "").strip()
+            province_name = str(province.get("province") or province.get("city") or "").strip()
+            province_py = str(province.get("pinyin") or "").strip()
+            if not province_id or not province_name:
+                continue
+
+            city_api = CITY_API_TEMPLATE.format(province_id=province_id)
+            try:
+                cities = self._get_json(city_api, referer=LIST_URL_TEMPLATE.format(city_py=province_py or ""))
+            except Exception as exc:
+                print(f"[city] 获取失败 province={province_id}: {exc}")
+                continue
+
+            if not cities:
+                cities = [
+                    {
+                        "id": province_id,
+                        "province": province_name,
+                        "city": province_name,
+                        "pinyin": province_py,
+                    }
+                ]
+
+            for city in cities:
+                city_id = str(city.get("id") or "").strip()
+                city_name = str(city.get("city") or city.get("province") or "").strip()
+                city_py = str(city.get("pinyin") or "").strip()
+                if not city_id or not city_name or not city_py:
+                    continue
+                if city_py in seen_py:
+                    continue
+                seen_py.add(city_py)
+
+                results.append(
+                    CityTarget(
+                        province_id=province_id,
+                        province_name=province_name,
+                        province_py=province_py,
+                        city_id=city_id,
+                        city_name=city_name,
+                        city_py=city_py,
+                    )
+                )
+
+        return results

+    def _build_list_url(self, city_py: str, page: int) -> str:
+        base = LIST_URL_TEMPLATE.format(city_py=city_py)
+        if page <= 1:
+            return base
+        return f"{base}?page={page}"
+
+    def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[ListCard], bool, str]:
+        list_url = self._build_list_url(target.city_py, page)
+        html = self._get_text(list_url, referer=SITE_BASE + "/")
+
+        cards = self.parse_list_cards(html)
+
+        soup = BeautifulSoup(html, "html.parser")
+        next_link = soup.select_one(f"div.page a[href*='page={page + 1}']")
+        has_next = next_link is not None
+
+        return cards, has_next, list_url

+    def parse_list_cards(self, html: str) -> List[ListCard]:
+        soup = BeautifulSoup(html, "html.parser")
+        cards: List[ListCard] = []
+        seen: Set[str] = set()
+
+        for item in soup.select("li.lawyer-item-card"):
+            link_tag = item.select_one("a.name[href]") or item.select_one("a.lawyer-img-box[href]")
+            if not link_tag:
+                continue
+            detail_url = (link_tag.get("href") or "").strip()
+            if not detail_url.startswith("http"):
+                continue
+            if detail_url in seen:
+                continue
+            seen.add(detail_url)
+
+            name = link_tag.get_text(strip=True)
+            phone = ""
+            phone_tag = item.select_one("div.phone")
+            if phone_tag:
+                phone = normalize_phone(phone_tag.get_text(" ", strip=True))
+
+            address = ""
+            addr_tag = item.select_one("div.location .txt")
+            if addr_tag:
+                address = addr_tag.get_text(" ", strip=True)
+
+            specialties: List[str] = []
+            prof_tag = item.select_one("div.prof .txt")
+            if prof_tag:
+                specialties = [
+                    x.strip() for x in re.split(r"[、,,]", prof_tag.get_text(" ", strip=True)) if x.strip()
+                ]
+
+            metric_text = ""
+            metric_tag = item.select_one("div.num-msg")
+            if metric_tag:
+                metric_text = metric_tag.get_text(" ", strip=True)
+
+            cards.append(
+                ListCard(
+                    detail_url=detail_url,
+                    name=name,
+                    phone=phone,
+                    address=address,
+                    specialties=specialties,
+                    metric_text=metric_text,
+                )
+            )
+
+        return cards

+    def parse_detail(self, detail_url: str) -> Dict:
+        html = self._get_text(detail_url, referer=SITE_BASE)
+        if "网站防火墙" in html or "访问被拒绝" in html or "/ipfilter/verify" in html:
+            raise RequestClientError(f"firewall blocked: {detail_url}")
+
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text(" ", strip=True)
+
+        name = ""
+        law_firm = ""
+        phone = ""
+        address = ""
+        practice_years: Optional[int] = None
+        specialties: List[str] = []
+
+        if soup.title:
+            title = soup.title.get_text(" ", strip=True)
+            match = re.search(r"([^\s_,,。]+?)律师", title)
+            if match:
+                name = match.group(1).strip()
+
+        phone_candidates = [
+            soup.select_one(".data-w .tel-b b").get_text(" ", strip=True)
+            if soup.select_one(".data-w .tel-b b")
+            else "",
+            soup.select_one(".law-info-b .item .two-r.b").get_text(" ", strip=True)
+            if soup.select_one(".law-info-b .item .two-r.b")
+            else "",
+            text,
+        ]
+        for candidate in phone_candidates:
+            phone = normalize_phone(candidate)
+            if phone:
+                break
+
+        law_firm_tag = soup.select_one(".law-info-b .item .two-nowrap")
+        if law_firm_tag:
+            law_firm = law_firm_tag.get_text(" ", strip=True)
+
+        for li in soup.select(".law-info-b .item"):
+            li_text = li.get_text(" ", strip=True)
+            if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
+                law_firm = li_text
+
+        addr_tag = soup.select_one(".law-info-b .item .two-r[title]")
+        if addr_tag:
+            addr_value = (addr_tag.get("title") or "").strip()
+            if len(addr_value) > 8:
+                address = addr_value
+
+        if not address:
+            addr_tag = soup.select_one(".law-info-b .item .two-r")
+            if addr_tag:
+                addr_value = addr_tag.get_text(" ", strip=True)
+                if len(addr_value) > 8 and "律师" not in addr_value:
+                    address = addr_value
+
+        year_match = YEAR_RE.search(text)
+        if year_match:
+            try:
+                practice_years = int(year_match.group(1))
+            except Exception:
+                practice_years = None
+
+        specialties = [x.get_text(strip=True) for x in soup.select(".profession-b .item") if x.get_text(strip=True)]
+
+        return {
+            "name": name,
+            "law_firm": law_firm,
+            "phone": phone,
+            "address": address,
+            "practice_years": practice_years,
+            "specialties": specialties,
+            "detail_url": detail_url,
+        }

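Phone extraction walks an ordered candidate list — two narrow CSS selectors first, the whole page text last — and keeps the first candidate the sanitiser accepts. The pattern in isolation (inputs are made up for the demo):

```python
import re

PHONE_RE = re.compile(r"1[3-9]\d{9}")


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def first_phone(candidates):
    # Keep candidate order: most specific source first, full-page text last.
    for candidate in candidates:
        phone = normalize_phone(candidate)
        if phone:
            return phone
    return ""


# A narrow selector missed; a broader one carries a labelled number.
print(first_phone(["", "咨询热线 139 0000 0000", "page text ..."]))
# -> 13900000000
```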
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        seen_details: Set[str] = set()
+
+        for page in range(1, self.max_pages + 1):
+            try:
+                cards, has_next, list_url = self.fetch_list_page(target, page)
+            except Exception as exc:
+                print(f"[list] 失败 {target.city_py} p{page}: {exc}")
+                break
+
+            if not cards:
+                break
+
+            for card in cards:
+                if card.detail_url in seen_details:
+                    continue
+                seen_details.add(card.detail_url)
+
+                detail: Dict = {}
+                try:
+                    detail = self.parse_detail(card.detail_url)
+                except Exception as exc:
+                    print(f"[detail] 失败 {card.detail_url}: {exc}")
+
+                phone = normalize_phone(detail.get("phone") or card.phone)
+                profile_name = (detail.get("name") or card.name).replace("律师", "").strip()
+
+                now = int(time.time())
+                record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()
+
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "province_id": target.province_id,
+                        "province": target.province_name,
+                        "province_py": target.province_py,
+                        "city_id": target.city_id,
+                        "city": target.city_name,
+                        "city_py": target.city_py,
+                        "page": page,
+                        "list_url": list_url,
+                        "detail_url": card.detail_url,
+                    },
+                    "list_snapshot": {
+                        "name": card.name,
+                        "phone": card.phone,
+                        "address": card.address,
+                        "specialties": card.specialties,
+                        "metric_text": card.metric_text,
+                    },
+                    "profile": {
+                        "name": profile_name,
+                        "law_firm": (detail.get("law_firm") or "").strip(),
+                        "phone": phone,
+                        "address": (detail.get("address") or card.address or "").strip(),
+                        "practice_years": detail.get("practice_years"),
+                        "specialties": detail.get("specialties") or card.specialties,
+                    },
+                }
+
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)
+
+            if not has_next:
+                break

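Because `crawl_city` is a generator, callers decide what to do per record: stream to JSONL, batch into the DB, or both. A hypothetical consumer — the `record_id` key is real, the crawler construction is sketched:

```python
import json


def dump_city(crawler, target, out_path):
    """Stream one city's records to a JSONL file, deduplicating by record_id."""
    seen = set()
    with open(out_path, "a", encoding="utf-8") as out:
        for record in crawler.crawl_city(target):
            rid = record["record_id"]
            if rid in seen:
                continue
            seen.add(rid)
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
    return len(seen)
```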
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
+            return None
+
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
+        }

+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
             return set()
+
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
         existing: Set[str] = set()
         cur = self.db.db.cursor()
         try:
             chunk_size = 500
-            for i in range(0, len(phones), chunk_size):
-                chunk = phones[i:i + chunk_size]
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
                 placeholders = ",".join(["%s"] * len(chunk))
                 sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
-                cur.execute(sql, [DOMAIN, *chunk])
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                 for row in cur.fetchall():
                     existing.add(row[0])
         finally:
             cur.close()

         return existing

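The dedup query chunks phones into batches of 500 because one `IN (...)` with thousands of placeholders can exceed server limits. The chunking itself is trivial to verify without a database:

```python
def chunks(items, size=500):
    # Yield fixed-size slices; the last one may be shorter.
    for i in range(0, len(items), size):
        yield items[i:i + size]


phones = [f"139{n:08d}" for n in range(1201)]
sizes = [len(c) for c in chunks(phones)]
print(sizes)  # -> [500, 500, 201]
```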
-    def _load_areas(self):
-        condition = "level = 2 and domain='法律快车'"
-        tables = ("area_new", "area", "area2")
-        last_error = None
-        for table in tables:
-            try:
-                rows = self.db.select_data(table, "pinyin, province, city", condition) or []
-            except Exception as exc:
-                last_error = exc
-                continue
-            if rows:
-                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
-                print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
-                return rows
-
-        if last_error:
-            print(f"[法律快车] 加载地区数据失败: {last_error}")
-        print("[法律快车] 无城市数据(已尝试 area_new/area/area2)")
-        return []
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+
+        return inserted, skipped

-    def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
-        return self._get_with_session(self.client, url, max_retries=max_retries, is_thread=False)
-
-    def _get_with_session(self, session: RequestsClient, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
-        for attempt in range(max_retries):
-            try:
-                resp = session.get_text(url, timeout=15, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"请求失败 {url}: 403,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        if is_thread:
-                            self._refresh_thread_session()
-                            session = self._get_thread_session()
-                        else:
-                            self._refresh_session()
-                            session = self.client
-                        time.sleep(wait_time)
-                        continue
-                    print(f"请求失败 {url}: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error: {url}")
-                return text
-            except RequestClientError as exc:
-                print(f"请求失败 {url}: {exc}")
-                return None
-        return None
-
-    def _parse_list(self, html: str, province: str, city: str) -> int:
-        soup = BeautifulSoup(html, "html.parser")
-        links = [a.get("href", "") for a in soup.select("a.hide_link")]
-        links = [link.replace("lll", "int") for link in links if link]
-        if not links:
-            return 0
-
-        detail_urls = [urljoin(DETAIL_BASE, link) for link in links]
-
-        results: List[Dict[str, str]] = []
-        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
-            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
-            for fut in as_completed(futs):
-                try:
-                    data = fut.result()
-                except Exception as exc:
-                    print(f" 详情解析异常: {exc}")
-                    continue
-                if data and data.get("phone"):
-                    results.append(data)
-
-        if not results:
-            return len(detail_urls)
-
-        phones = [d["phone"] for d in results if d.get("phone")]
-        existing = self._existing_phones(phones)
-
-        for data in results:
-            phone = data.get("phone")
-            if not phone:
-                continue
-            if phone in existing:
-                print(f" -- 已存在: {data['name']} ({phone})")
-                continue
-            try:
-                self.db.insert_data("lawyer", data)
-                print(f" -> 新增: {data['name']} ({phone})")
-            except Exception as exc:
-                print(f" 插入失败 {data.get('url')}: {exc}")
-
-        return len(detail_urls)
-
-    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
-        html = None
-        sess = self._get_thread_session()
-        html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
-        if not html:
-            return None
-
-        soup = BeautifulSoup(html, "html.parser")
-        text = soup.get_text(" ")
-
-        name = ""
-        title_tag = soup.find("title")
-        if title_tag:
-            match = re.search(r"(\S+)律师", title_tag.get_text())
-            if match:
-                name = match.group(1)
-        if not name:
-            intl_div = soup.find("div", class_="intl")
-            if intl_div:
-                match = re.search(r"(\S+)律师", intl_div.get_text())
-                if match:
-                    name = match.group(1)
-
-        phone = ""
-        phone_pattern = r"1[3-9]\d{9}"
-        for item in soup.select("div.item.flex"):
-            label = item.find("div", class_="label")
-            desc = item.find("div", class_="desc")
-            if not label or not desc:
-                continue
-            label_text = label.get_text()
-            desc_text = desc.get_text().replace("-", "")
-            if "联系电话" in label_text or "电话" in label_text:
-                matches = re.findall(phone_pattern, desc_text)
-                if matches:
-                    phone = matches[0]
-                    break
-        if not phone:
-            matches = re.findall(phone_pattern, text.replace("-", ""))
-            if matches:
-                phone = matches[0]
-        if not phone:
-            print(f" 无手机号: {url}")
-            return None
-
-        law_firm = ""
-        for item in soup.select("div.item.flex"):
-            label = item.find("div", class_="label")
-            desc = item.find("div", class_="desc")
-            if not label or not desc:
-                continue
-            if "执业律所" in label.get_text() or "律所" in label.get_text():
-                law_firm = desc.get_text(strip=True).replace("已认证", "")
-                break
-
-        params = {
-            "list_url": url,
-            "province": province,
-            "city": city,
-        }
-
-        return {
-            "name": name or "",
-            "law_firm": law_firm,
-            "province": province,
-            "city": city,
-            "phone": phone,
-            "url": url,
-            "domain": DOMAIN,
-            "create_time": int(time.time()),
-            "params": json.dumps(params, ensure_ascii=False)
-        }
-
-    def run(self):
-        print("启动法律快车采集...")
-        areas = self._load_areas()
-        if not areas:
-            print("无地区数据")
+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现城市 {len(cities)} 个")
+
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [
+                c for c in cities
+                if key in c.city_py.lower() or key in c.city_name.lower()
+            ]
+            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
+
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断城市数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+            print(f"[resume] 已有记录 {len(seen_ids)} 条")
+
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0
+
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"({target.city_py})"
+                )
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )

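Restartability comes from the JSONL file itself: on startup the crawler re-reads its output, collects every `record_id`, and skips those when appending. The same idea reduced to a standalone helper (paths and data here are illustrative):

```python
import json
import os


def load_seen_ids(path):
    """Rebuild the dedup set from a JSONL file written by earlier runs."""
    seen = set()
    if not os.path.exists(path):
        return seen
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rid = json.loads(line).get("record_id")
            except Exception:
                continue  # tolerate a truncated last line from a killed run
            if rid:
                seen.add(rid)
    return seen


def append_record(path, record, seen):
    if record["record_id"] in seen:
        return False
    with open(path, "a", encoding="utf-8") as out:
        out.write(json.dumps(record, ensure_ascii=False) + "\n")
    seen.add(record["record_id"])
    return True
```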
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="法律快车全新采集脚本(站点数据直采)")
+    parser.add_argument(
+        "--output",
+        default="/www/wwwroot/lawyers/data/lawtime_records_all.jsonl",
+        help="输出 jsonl 文件路径",
+    )
+    parser.add_argument(
+        "--max-cities",
+        type=int,
+        default=0,
+        help="最多采集多少个城市,0 表示不限",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=9999,
+        help="每个城市最多采集多少页",
+    )
+    parser.add_argument(
+        "--city-filter",
+        default="",
+        help="按城市拼音或城市名过滤,如 beijing",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.1,
+        help="详情页请求间隔秒数",
+    )
+    parser.add_argument(
+        "--direct",
+        action="store_true",
+        help="直连模式,不使用 proxy_settings.json 代理",
+    )
+    parser.add_argument(
+        "--no-db",
+        action="store_true",
+        help="只输出 JSONL,不写入数据库",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if args.no_db:
+        crawler = LawtimeCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=None,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
         return

-        for area in areas:
-            pinyin = area.get("pinyin")
-            province = area.get("province", "")
-            city = area.get("city", "")
-            if not pinyin:
-                continue
-            page = 1
-            while True:
-                list_url = LIST_BASE.format(pinyin=pinyin, page=page)
-                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
-                html = self._get(list_url)
-                if not html:
-                    break
-                link_count = self._parse_list(html, province, city)
-                if link_count == 0:
-                    break
-                page += 1
-        print("法律快车采集完成")
+    with Db() as db:
+        crawler = LawtimeCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=db,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )


 if __name__ == "__main__":
-    with Db() as db:
-        spider = LawtimeSpider(db)
-        spider.run()
+    main()
+604 -240
@@ -1,11 +1,17 @@
+import argparse
+import hashlib
 import json
 import os
+import random
+import re
 import sys
 import time
-import random
-from typing import Dict, Optional, List, Set
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+from urllib.parse import urljoin
+
+import urllib3
+from bs4 import BeautifulSoup

 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -15,146 +21,237 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
     sys.path.append(project_root)

-import urllib3
-from bs4 import BeautifulSoup
+from Db import Db
 from request.requests_client import RequestClientError, RequestsClient
+from utils.rate_limiter import wait_for_request

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-from Db import Db
-
-DOMAIN = "律图"
-LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
+SITE_NAME = "64365"
+LEGACY_DOMAIN = "律图"
+SITE_BASE = "https://m.64365.com"
+AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js"
+LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
+
+PHONE_RE = re.compile(r"1[3-9]\d{9}")
+YEAR_RE = re.compile(r"(\d+)\s*年")

-class Six4365Spider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    area_id: str
+    province_id: str
+    province_name: str
+    province_py: str
+    city_name: str
+    city_py: str
+
+
+@dataclass
+class ListCard:
+    detail_url: str
+    name: str
+    specialties: List[str]
+    score_text: str
+    service_text: str
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
+class Six4365Crawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.1,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
-        self._tls = threading.local()
-        self.cities = self._load_cities()
-
-    def _build_session(self) -> RequestsClient:
-        return RequestsClient(headers={
+        self.client = RequestsClient(
+            headers={
                 "User-Agent": (
                     "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                     "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                     "Mobile/15E148 Safari/604.1"
                 ),
+                "Accept": "text/html, */*; q=0.01",
                 "Connection": "close",
-        })
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET", "POST"),
+        )

-    def _refresh_session(self) -> None:
-        self.client.refresh()
-
-    def _get_thread_session(self) -> RequestsClient:
-        """每个线程使用独立请求客户端(共享相同 headers/代理配置)。"""
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            return s
-        s = self.client.clone()
-        self._tls.session = s
-        return s
-
-    def _refresh_thread_session(self) -> None:
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            s.close()
-        self._tls.session = None
-
-    def _existing_urls(self, urls: List[str]) -> Set[str]:
-        """批量查重,减少 N 次 is_data_exist"""
-        if not urls:
-            return set()
-        existing: Set[str] = set()
-        cur = self.db.db.cursor()
-        try:
-            # IN 参数过多会失败,分批
-            chunk_size = 500
-            for i in range(0, len(urls), chunk_size):
-                chunk = urls[i:i + chunk_size]
-                placeholders = ",".join(["%s"] * len(chunk))
-                sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
-                cur.execute(sql, chunk)
-                for row in cur.fetchall():
-                    # pymysql 默认返回 tuple
-                    existing.add(row[0])
-        finally:
-            cur.close()
-        return existing
-
-    def _load_cities(self):
-        tables = ("area_new", "area2", "area")
-        last_error = None
-        for table in tables:
-            try:
-                provinces = self.db.select_data(
-                    table,
-                    "id, code, province",
-                    "domain='64365' AND level=1"
-                ) or []
-                cities = self.db.select_data(
-                    table,
-                    "code, city, province, pid",
-                    "domain='64365' AND level=2"
-                ) or []
-            except Exception as exc:
-                last_error = exc
-                continue
-
-            if not cities:
-                continue
-
-            province_map = {row.get('id'): row for row in provinces}
-            data = {}
-            for city in cities:
-                province_row = province_map.get(city.get('pid'), {}) or {}
-                data[str(city.get('code'))] = {
-                    "name": city.get('city'),
-                    "province": city.get('province'),
-                    "province_name": province_row.get('province', city.get('province')),
-                }
-            print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
-            return data
-
-        if last_error:
-            print(f"[律图] 加载地区数据失败: {last_error}")
-        print("[律图] 无城市数据(已尝试 area_new/area2/area)")
-        return {}
-
-    def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
-        for attempt in range(max_retries):
-            try:
-                resp = self.client.post_text(LIST_URL, data=payload, timeout=10, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        self._refresh_session()
-                        time.sleep(wait_time)
-                        continue
-                    print("请求失败: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error")
-                return text
-            except RequestClientError as exc:
-                print(f"请求失败: {exc}")
-                return None
-        return None
-
-    def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
+    def _request_text(
+        self,
+        method: str,
+        url: str,
+        *,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+        data: Optional[Dict] = None,
+    ) -> str:
+        headers = {"Referer": referer}
+        last_error: Optional[Exception] = None
+
+        for attempt in range(max_retries):
+            wait_for_request()
+            try:
+                if method.upper() == "POST":
+                    resp = self.client.post_text(
+                        url,
+                        timeout=timeout,
+                        verify=False,
+                        headers=headers,
+                        data=data,
+                    )
+                else:
+                    resp = self.client.get_text(
+                        url,
+                        timeout=timeout,
+                        verify=False,
+                        headers=headers,
+                    )
+
+                code = resp.status_code
+                if code == 403:
+                    if attempt < max_retries - 1:
+                        self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                        continue
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise
+
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")

+    def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
+        return self._request_text(
+            "GET",
+            url,
+            timeout=timeout,
+            max_retries=max_retries,
+            referer=referer,
+        )
+
+    def _post_text(
+        self,
+        url: str,
+        *,
+        data: Dict,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
+        return self._request_text(
+            "POST",
+            url,
+            timeout=timeout,
+            max_retries=max_retries,
+            referer=referer,
+            data=data,
+        )
+
+    def _extract_area_data(self, text: str) -> List[Dict]:
+        match = re.search(
+            r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
+            text,
+            re.S,
+        )
+        if not match:
+            return []
+
+        raw = match.group(1)
+        try:
+            data = json.loads(raw)
+        except Exception:
+            return []
+        return data if isinstance(data, list) else []

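`_extract_area_data` pulls a JSON array straight out of a JavaScript asset by anchoring the regex between two known assignments. The technique, reduced to a standalone demo with a fake JS payload:

```python
import json
import re

# Fabricated stand-in for the real area-cate-data.js asset.
js = """
lvtuData.areaData = [{"id": "110000", "name": "北京", "py": "beijing"}];
lvtuData.categroyData = [];
"""

match = re.search(
    r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
    js,
    re.S,
)
data = json.loads(match.group(1)) if match else []
print(data[0]["py"])  # -> beijing
```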
+    def discover_cities(self) -> List[CityTarget]:
+        text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/")
+        provinces = self._extract_area_data(text)
+
+        targets: List[CityTarget] = []
+        seen_area: Set[str] = set()
+
+        for province in provinces:
+            province_id = str(province.get("id") or "").strip()
+            province_name = str(province.get("name") or "").strip()
+            province_py = str(province.get("py") or "").strip()
+            child_rows = province.get("child") or []
+
+            # 常规省份 child 是地级市;直辖市 child 是区县,此时使用省级 id 抓取
+            if child_rows and any((row.get("child") or []) for row in child_rows):
+                for city in child_rows:
+                    area_id = str(city.get("id") or "").strip()
+                    city_name = str(city.get("name") or "").strip()
+                    city_py = str(city.get("py") or "").strip()
+                    if not area_id or not city_name:
+                        continue
+                    if area_id in seen_area:
+                        continue
+                    seen_area.add(area_id)
+                    targets.append(
+                        CityTarget(
+                            area_id=area_id,
+                            province_id=province_id,
+                            province_name=province_name,
+                            province_py=province_py,
+                            city_name=city_name,
+                            city_py=city_py,
+                        )
+                    )
+            else:
+                if not province_id or not province_name:
+                    continue
+                if province_id in seen_area:
+                    continue
+                seen_area.add(province_id)
+                targets.append(
+                    CityTarget(
+                        area_id=province_id,
+                        province_id=province_id,
+                        province_name=province_name,
+                        province_py=province_py,
+                        city_name=province_name,
+                        city_py=province_py,
+                    )
+                )
+
+        return targets

+    def _build_payload(self, area_id: str, page: int) -> Dict[str, str]:
+        ua = self.client.headers.get("User-Agent", "")
         return {
             "AdCode": "",
-            "RegionId": str(city_code),
+            "RegionId": str(area_id),
             "CategoryId": "",
             "MaxNumber": "",
             "OnlyData": "true",
             "IgnoreButton": "",
-            "LawyerRecommendRequest[AreaId]": str(city_code),
+            "LawyerRecommendRequest[AreaId]": str(area_id),
             "LawyerRecommendRequest[LawCategoryIds]": "",
             "LawyerRecommendRequest[LawFirmPersonCount]": "",
             "LawyerRecommendRequest[LawFirmScale]": "",
@@ -171,162 +268,429 @@ class Six4365Spider:
             "LawyerRecommendRequest[RefferUrl]": "",
             "LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
             "LawyerRecommendRequest[resource_type_name]": "",
-            "LawyerRecommendRequest[UserAgent]": self.client.headers["User-Agent"],
+            "LawyerRecommendRequest[UserAgent]": ua,
             "LawyerRecommendRequest[AddLawyerWithNoData]": "false",
             "ShowCaseButton": "true",
         }

-    def _parse_list(self, html: str, province: str, city: str) -> int:
-        soup = BeautifulSoup(html, "html.parser")
-        lawyers = soup.find_all("a", class_="lawyer")
-        if not lawyers:
-            return 0
-
-        detail_urls: List[str] = []
-        for lawyer in lawyers:
-            href = lawyer.get("href")
-            if not href:
-                continue
-            detail_urls.append(f"{href.rstrip('/')}/info/")
-
-        if not detail_urls:
-            return 0
-
-        results: List[Dict[str, str]] = []
-        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
-            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
-            for fut in as_completed(futs):
-                try:
-                    data = fut.result()
-                except Exception as exc:
-                    print(f" 详情解析异常: {exc}")
-                    continue
-                if data:
-                    results.append(data)
-
-        if not results:
-            return len(detail_urls)
-
-        existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
-        for data in results:
-            if not data:
-                continue
-            url = data.get("url", "")
-            if not url:
-                continue
-            if url in existing:
-                print(f" -- 已存在URL: {url}")
-                continue
-            try:
-                self.db.insert_data("lawyer", data)
-                print(f" -> 新增: {data['name']} ({data['phone']})")
-            except Exception as exc:
-                print(f" 插入失败 {url}: {exc}")
-
-        return len(detail_urls)
-
-    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
-        html = self._get_detail(url)
-        if not html:
-            return None
+    def fetch_list_html(self, target: CityTarget, page: int) -> str:
+        payload = self._build_payload(target.area_id, page)
+        return self._post_text(
+            LIST_API_URL,
+            data=payload,
+            referer=SITE_BASE + "/findlawyer/",
+        )
+
+    def parse_list_cards(self, html: str) -> List[ListCard]:
+        soup = BeautifulSoup(html, "html.parser")
+        cards: List[ListCard] = []
+        seen: Set[str] = set()
+
+        for anchor in soup.select("a.lawyer[href]"):
+            href = (anchor.get("href") or "").strip()
+            if not href:
+                continue
+            detail_url = urljoin(SITE_BASE, href)
+            if detail_url in seen:
+                continue
+            seen.add(detail_url)
+
+            name = ""
+            name_tag = anchor.select_one("b.name")
+            if name_tag:
+                name = name_tag.get_text(strip=True)
+
+            specialties: List[str] = []
+            skill_tag = anchor.select_one("div.skill")
+            if skill_tag:
+                raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "")
+                specialties = [x.strip() for x in re.split(r"[、,,]", raw) if x.strip()]
+
+            score_text = ""
+            score_tag = anchor.select_one("div.info span[title='评分'] em")
+            if score_tag:
+                score_text = score_tag.get_text(strip=True)
+
+            service_text = ""
+            service_tag = anchor.select_one("div.info")
+            if service_tag:
+                service_text = service_tag.get_text(" ", strip=True)
+
+            cards.append(
+                ListCard(
+                    detail_url=detail_url,
+                    name=name,
+                    specialties=specialties,
+                    score_text=score_text,
+                    service_text=service_text,
+                )
+            )
+
+        return cards

+    def parse_detail(self, detail_url: str) -> Dict:
+        info_url = detail_url.rstrip("/") + "/info/"
+        html = self._get_text(info_url, referer=detail_url)
         soup = BeautifulSoup(html, "html.parser")
-        base_info = soup.find("ul", class_="intro-basic-bar")
-        if not base_info:
-            return None

         name = ""
         law_firm = ""
         phone = ""
+        practice_years: Optional[int] = None
+        office_area = ""
+        address = ""
+        specialties: List[str] = []

-        for li in base_info.find_all("li"):
-            label = li.find("span", class_="label")
-            txt = li.find("div", class_="txt")
-            if not label or not txt:
-                continue
-            label_text = label.get_text(strip=True)
-            if "姓名" in label_text:
-                name = txt.get_text(strip=True)
-            if "执业律所" in label_text:
-                law_firm = txt.get_text(strip=True)
-
-        more_section = soup.find("div", class_="more-intro-basic")
-        if more_section:
-            phone_ul = more_section.find("ul", class_="intro-basic-bar")
-            if phone_ul:
-                for li in phone_ul.find_all("li"):
-                    label = li.find("span", class_="label")
-                    txt = li.find("div", class_="txt")
-                    if label and txt and "联系电话" in label.get_text(strip=True):
-                        phone = txt.get_text(strip=True).replace(" ", "")
+        for li in soup.select("ul.intro-basic-bar li"):
+            label_tag = li.select_one("span.label")
+            value_tag = li.select_one("div.txt")
+            if not label_tag or not value_tag:
+                continue
+            label = label_tag.get_text(" ", strip=True).replace(":", "")
+            value = value_tag.get_text(" ", strip=True)
+
+            if "姓名" in label and not name:
+                name = value
+            elif "执业律所" in label and not law_firm:
+                law_firm = value
+            elif "联系电话" in label and not phone:
+                phone = normalize_phone(value)
+            elif "执业年限" in label and practice_years is None:
+                year_match = YEAR_RE.search(value)
+                if year_match:
+                    try:
+                        practice_years = int(year_match.group(1))
+                    except Exception:
+                        practice_years = None
+            elif "办公地区" in label and not office_area:
+                office_area = value
+            elif "办公地址" in label and not address:
+                address = value
+
+        text = soup.get_text(" ", strip=True)
+        if not phone:
+            phone = normalize_phone(text)
+
+        if not name and soup.title:
+            title = soup.title.get_text(" ", strip=True)
+            match = re.search(r"([^\s_,,。]+?)律师", title)
+            if match:
+                name = match.group(1).strip()
+
+        skill_match = re.search(r"擅长:([^\n]+)", text)
+        if skill_match:
+            specialties = [x.strip() for x in re.split(r"[、,,]", skill_match.group(1)) if x.strip()]
+
+        return {
+            "name": name,
+            "law_firm": law_firm,
+            "phone": phone,
+            "practice_years": practice_years,
+            "office_area": office_area,
+            "address": address,
+            "specialties": specialties,
+            "detail_url": detail_url,
+            "info_url": info_url,
+        }

-                        break
-
-        phone = phone.replace('-', '').strip()
-        if not name or not phone:
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        seen_detail_urls: Set[str] = set()
+        page_first_seen: Set[str] = set()
+
+        for page in range(1, self.max_pages + 1):
+            try:
+                html = self.fetch_list_html(target, page)
+            except Exception as exc:
+                print(f"[list] 失败 area={target.area_id} p{page}: {exc}")
+                break
+
+            cards = self.parse_list_cards(html)
+            if not cards:
+                break
+
+            first_url = cards[0].detail_url
+            if first_url in page_first_seen:
+                break
+            page_first_seen.add(first_url)
+
+            for card in cards:
+                if card.detail_url in seen_detail_urls:
+                    continue
+                seen_detail_urls.add(card.detail_url)
+
+                try:
+                    detail = self.parse_detail(card.detail_url)
+                except Exception as exc:
+                    print(f"[detail] 失败 {card.detail_url}: {exc}")
+                    continue
+
+                now = int(time.time())
+                uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url)
+                uid = uid_match.group(1) if uid_match else card.detail_url
+                record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
+
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "province_id": target.province_id,
+                        "province": target.province_name,
+                        "province_py": target.province_py,
+                        "area_id": target.area_id,
+                        "city": target.city_name,
+                        "city_py": target.city_py,
+                        "page": page,
+                        "detail_url": card.detail_url,
+                        "info_url": detail.get("info_url", ""),
+                    },
+                    "list_snapshot": {
+                        "name": card.name,
+                        "specialties": card.specialties,
+                        "score_text": card.score_text,
+                        "service_text": card.service_text,
+                    },
+                    "profile": {
+                        "name": detail.get("name") or card.name,
+                        "law_firm": detail.get("law_firm") or "",
+                        "phone": detail.get("phone") or "",
+                        "practice_years": detail.get("practice_years"),
+                        "office_area": detail.get("office_area") or "",
+                        "address": detail.get("address") or "",
+                        "specialties": detail.get("specialties") or card.specialties,
+                    },
+                }
+
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)

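`crawl_city` remembers each page's first detail URL and stops as soon as one repeats — presumably because the recommend endpoint keeps serving the final page when asked for one past the end. The guard in isolation (data and names are illustrative):

```python
def paginate(fetch_page, max_pages=9999):
    """Stop when a page repeats its first item (server clamps past the end)."""
    first_seen = set()
    for page in range(1, max_pages + 1):
        items = fetch_page(page)
        if not items:
            break
        if items[0] in first_seen:  # same page served again -> done
            break
        first_seen.add(items[0])
        yield from items


pages = {1: ["a", "b"], 2: ["c", "d"], 3: ["c", "d"]}  # page 3 repeats page 2
print(list(paginate(lambda p: pages.get(p, []))))      # -> ['a', 'b', 'c', 'd']
```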
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
             return None
+
-        data = {
-            "phone": phone,
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
             "province": province,
             "city": city,
-            "law_firm": law_firm,
-            "url": url,
-            "domain": DOMAIN,
-            "name": name,
-            "create_time": int(time.time()),
-            "params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
+            "phone": phone,
+            "url": (source.get("info_url") or source.get("detail_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
         }
-        return data
-
-    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
-        session = self._get_thread_session()
-        for attempt in range(max_retries):
-            try:
-                resp = session.get_text(url, timeout=10, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        self._refresh_thread_session()
-                        session = self._get_thread_session()
-                        time.sleep(wait_time)
-                        continue
-                    print(" 请求失败: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error")
-                return text
-            except RequestClientError as exc:
-                print(f" 请求失败: {exc}")
-                return None
-        return None
-
-    def run(self):
-        print("启动律图采集...")
-        if not self.cities:
-            print("无城市数据")
+
+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
+            return set()
+
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            chunk_size = 500
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    existing.add(row[0])
+        finally:
+            cur.close()
+
+        return existing
+
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+
+        return inserted, skipped

+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现地区 {len(cities)} 个")
+
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [
+                c for c in cities
+                if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id
+            ]
+            print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}")
+
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断地区数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+            print(f"[resume] 已有记录 {len(seen_ids)} 条")
+
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0
+
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"(area={target.area_id})"
+                )
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )

|
def parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)")
|
||||||
|
parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl",
|
||||||
|
help="输出 jsonl 文件路径",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-cities",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="最多采集多少个地区,0 表示不限",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-pages",
|
||||||
|
type=int,
|
||||||
|
default=9999,
|
||||||
|
help="每个地区最多采集多少页",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--city-filter",
|
||||||
|
default="",
|
||||||
|
help="按城市名称/拼音/编码过滤",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--sleep",
|
||||||
|
type=float,
|
||||||
|
default=0.1,
|
||||||
|
help="详情页请求间隔秒数",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--direct",
|
||||||
|
action="store_true",
|
||||||
|
help="直连模式,不使用 proxy_settings.json 代理",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-db",
|
||||||
|
action="store_true",
|
||||||
|
help="只输出 JSONL,不写入数据库",
|
||||||
|
)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
if args.no_db:
|
||||||
|
crawler = Six4365Crawler(
|
||||||
|
max_pages=args.max_pages,
|
||||||
|
sleep_seconds=args.sleep,
|
||||||
|
use_proxy=not args.direct,
|
||||||
|
db_connection=None,
|
||||||
|
)
|
||||||
|
crawler.crawl(
|
||||||
|
output_path=args.output,
|
||||||
|
max_cities=args.max_cities,
|
||||||
|
city_filter=args.city_filter or None,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
for city_code, info in self.cities.items():
|
with Db() as db:
|
||||||
province = info.get("province_name", "")
|
crawler = Six4365Crawler(
|
||||||
city = info.get("name", "")
|
max_pages=args.max_pages,
|
||||||
print(f"采集 {province}-{city}")
|
sleep_seconds=args.sleep,
|
||||||
page = 1
|
use_proxy=not args.direct,
|
||||||
while True:
|
db_connection=db,
|
||||||
payload = self._build_payload(city_code, page)
|
)
|
||||||
html = self._post(payload)
|
crawler.crawl(
|
||||||
if not html:
|
output_path=args.output,
|
||||||
break
|
max_cities=args.max_cities,
|
||||||
link_count = self._parse_list(html, province, city)
|
city_filter=args.city_filter or None,
|
||||||
if link_count == 0:
|
)
|
||||||
break
|
|
||||||
page += 1
|
|
||||||
print("律图采集完成")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
with Db() as db:
|
main()
|
||||||
spider = Six4365Spider(db)
|
|
||||||
spider.run()
|
|
||||||
|
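The `_existing_phones_in_db` helper is cut off above: only the final `cur.execute` / `fetchall` lines of its batching loop fall inside this hunk. A sketch of the surrounding chunked `IN (...)` query, where the chunk size, the cursor API on the `Db` wrapper, and the `LEGACY_DOMAIN` value are all assumptions:

```python
from typing import List, Set

LEGACY_DOMAIN = "64365"  # assumed site key; the real value is defined elsewhere in the file


def existing_phones_in_db(db, phones: List[str], chunk_size: int = 500) -> Set[str]:
    """Query already-stored phones in batches so the IN (...) list stays bounded."""
    existing: Set[str] = set()
    cur = db.cursor()  # assumes the Db wrapper exposes a DB-API cursor
    try:
        for i in range(0, len(phones), chunk_size):
            chunk = phones[i:i + chunk_size]
            placeholders = ", ".join(["%s"] * len(chunk))
            sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
            cur.execute(sql, [LEGACY_DOMAIN, *chunk])
            for row in cur.fetchall():
                existing.add(row[0])
    finally:
        cur.close()
    return existing
```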
common_sites/start.sh (+75 -8)
@@ -1,13 +1,80 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-# 切换到脚本所在目录,确保相对路径正确
-cd "$(dirname "$0")"
-
-echo "使用 request/proxy_settings.json 读取代理配置"
-
-nohup python3 dls.py > dls.log 2>&1 & # 大律师
-nohup python3 findlaw.py > findlaw.log 2>&1 & # 找法网
-nohup python3 lawtime.py > lawtime.log 2>&1 & # 法律快车
-nohup python3 six4365.py > six4365.log 2>&1 & # 律图
-nohup python3 hualv.py > hualv.log 2>&1 & # 华律
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+LOG_DIR="${PROJECT_ROOT}/logs"
+DATA_DIR="${PROJECT_ROOT}/data"
+
+mkdir -p "${LOG_DIR}" "${DATA_DIR}"
+
+if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
+    PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
+else
+    PYTHON_BIN="python3"
+fi
+
+RUN_MODE="${RUN_MODE:-parallel}" # parallel | sequential
+
+echo "[start] project=${PROJECT_ROOT}"
+echo "[start] python=${PYTHON_BIN}"
+echo "[start] mode=${RUN_MODE}"
+echo "[start] proxy=request/proxy_settings.json"
+
+# 大律师(新结构采集 + 写库)可通过环境变量控制
+DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
+DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
+DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
+DLS_SLEEP="${DLS_SLEEP:-0.2}"
+DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"
+DLS_EXTRA_ARGS=()
+
+if [[ "${DLS_MAX_CITIES}" != "0" ]]; then
+    DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
+fi
+if [[ "${DLS_MAX_PAGES}" != "0" ]]; then
+    DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
+fi
+if [[ -n "${DLS_CITY_FILTER}" ]]; then
+    DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
+fi
+DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")
+
+if [[ "${DLS_DIRECT:-0}" == "1" ]]; then
+    DLS_EXTRA_ARGS+=(--direct)
+fi
+if [[ "${DLS_NO_DB:-0}" == "1" ]]; then
+    DLS_EXTRA_ARGS+=(--no-db)
+fi
+
+run_bg() {
+    local name="$1"
+    shift
+    local logfile="${LOG_DIR}/${name}.log"
+    nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 &
+    echo "[start] ${name} pid=$! log=${logfile}"
+}
+
+run_fg() {
+    local name="$1"
+    shift
+    local logfile="${LOG_DIR}/${name}.log"
+    echo "[start] ${name} fg log=${logfile}"
+    env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1
+}
+
+if [[ "${RUN_MODE}" == "sequential" ]]; then
+    run_fg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
+    run_fg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
+    run_fg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
+    run_fg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
+    run_fg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
+    echo "[done] sequential completed"
+else
+    run_bg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
+    run_bg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
+    run_bg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
+    run_bg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
+    run_bg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
+    echo "[done] all crawlers started in background"
+fi
request/requests_client.py
@@ -51,6 +51,7 @@ class RequestsClient:
         self,
         headers: Optional[Mapping[str, str]] = None,
         *,
+        use_proxy: bool = True,
         retry_total: int = 0,
         retry_backoff_factor: float = 0.0,
         retry_status_forcelist: Optional[Iterable[int]] = None,
@@ -58,6 +59,7 @@ class RequestsClient:
         default_timeout: Optional[TimeoutType] = None,
     ) -> None:
         self._base_headers: Dict[str, str] = dict(headers or {})
+        self.use_proxy = bool(use_proxy)
         self.retry_total = int(retry_total)
         self.retry_backoff_factor = float(retry_backoff_factor)
         self.retry_status_forcelist = tuple(retry_status_forcelist or ())
@@ -67,8 +69,13 @@ class RequestsClient:
 
     def _build_session(self) -> requests.Session:
         session = requests.Session()
-        # 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
-        apply_proxy(session)
+        if self.use_proxy:
+            # 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
+            apply_proxy(session)
+        else:
+            # 强制直连:不读取环境代理,不走配置文件代理
+            session.trust_env = False
+            session.proxies.clear()
         if self.retry_total > 0:
             # 适配器级重试:主要处理连接波动与指定状态码的瞬时失败
             retries = Retry(
@@ -109,6 +116,7 @@ class RequestsClient:
         # 线程场景建议 clone:复用同配置,但使用独立连接池
         clone_client = RequestsClient(
             headers=dict(self.headers),
+            use_proxy=self.use_proxy,
             retry_total=self.retry_total,
             retry_backoff_factor=self.retry_backoff_factor,
             retry_status_forcelist=self.retry_status_forcelist,
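With the new `use_proxy` switch, callers choose between the proxied and the direct session at construction time. A minimal usage sketch (header values are placeholders):

```python
from request.requests_client import RequestsClient

# default: sessions are built with apply_proxy(), i.e. proxy_settings.json applies
proxied = RequestsClient(headers={"User-Agent": "Mozilla/5.0"})

# direct mode: trust_env=False plus an empty proxy map, so neither environment
# variables nor the config file can inject a proxy
direct = RequestsClient(headers={"User-Agent": "Mozilla/5.0"}, use_proxy=False)
```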
requirements.txt
@@ -3,3 +3,4 @@ requests>=2.28.0
 beautifulsoup4>=4.11.0
 urllib3>=1.26.0
 lxml>=4.9.0
+openpyxl>=3.1.0
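`openpyxl` is the Excel writer behind the new `export_lawyers_excel.py`. The core of producing an `.xlsx` from rows is short; a minimal sketch with made-up column names and values:

```python
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(["phone", "name", "law_firm", "province", "city"])  # header row (example fields)
ws.append(["13800000000", "张三", "某某律师事务所", "北京", "朝阳区"])  # one data row (fabricated)
wb.save("lawyers_demo.xlsx")
```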