Compare commits
10 commits: 19cf9ce901 … f67cb30f0d

| SHA1 |
|---|
| f67cb30f0d |
| ba04fe42fc |
| ff5e04d986 |
| 7d5f5b1054 |
| 38e7c284e8 |
| c2b77975c1 |
| e10437cd90 |
| 86cf933913 |
| a96b9a50e4 |
| bc4a2aa4d5 |
+5 −35
@@ -1,36 +1,6 @@
-# Python
-__pycache__/
-*.py[cod]
-*$py.class
-
-# Build / packaging
-build/
-dist/
-*.egg-info/
-.eggs/
-
-# Virtual environments
-.venv/
-venv/
-env/
-
-# Test / type caches
-.pytest_cache/
-.mypy_cache/
-.ruff_cache/
-
-# IDE
-.vscode/
-.idea/
-
-# OS
-.DS_Store
-Thumbs.db
-
-# Local runtime files
-*.log
-logs/
-data/
-
-# accidental local files
-=*
+__pycache__/
+*.pyc
+common_sites/*.log
+logs/*
+data/*
@@ -0,0 +1,5 @@
{
    "launchOptions": {
        "chromiumSandbox": false
    }
}
@@ -1,62 +1,70 @@
-# lawyers
+# lawyers-common-sites

-Standalone `common_sites` collection project.
+Standalone `common_sites` project extracted from `/www/wwwroot/lawyer`.

## Directory layout

-- `common_sites/`: five collection scripts (大律师, 找法网, 法律快车, 律图, 华律)
-- `request/proxy_config.py`: proxy-configuration loading logic
-- `request/proxy_settings.json`: proxy configuration file
-- `Db.py`: database connection and basic operations
-- `config.py`: database and request-header configuration
+- `common_sites/`: per-site collection scripts
+- `request/`: proxy configuration
+- `utils/`: shared utilities
+- `Db.py`: database wrapper
+- `config.py`: project configuration

-## Running
+## Quick start

```bash
cd /www/wwwroot/lawyers
-python3 -m venv .venv
-.venv/bin/pip install -r requirements.txt
-./common_sites/start.sh
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+bash common_sites/start.sh
```

-## Launch options
+## Split runs (direct vs. proxy)

-`start.sh` starts all five site collectors in parallel by default (大律师 uses `dls_fresh.py`).
+The repository supports forcing the proxy on or off for a single run via the `PROXY_ENABLED` environment variable (a resolution sketch follows this list):

-- Log directory: `/www/wwwroot/lawyers/logs`
-- 大律师 JSON output: `/www/wwwroot/lawyers/data/dls_records.jsonl`
+- **Direct**: `PROXY_ENABLED=0` (no proxy IPs)
+- **Proxy**: `PROXY_ENABLED=1` (force the proxy settings from `request/proxy_settings.json`)
+- **Default**: unset (follow the `enabled` field in `request/proxy_settings.json`)
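As a sanity check on the tri-state behaviour, here is a minimal sketch of how such an override is typically resolved. This is illustrative only — it is not the actual `request/proxy_config.py`, and the JSON field names (`enabled`, `http`, `https`) are assumptions:

```python
import json
import os


def get_proxies(path="request/proxy_settings.json"):
    """Resolve PROXY_ENABLED against the config file's `enabled` flag (sketch)."""
    override = os.getenv("PROXY_ENABLED")
    with open(path, encoding="utf-8") as fh:
        cfg = json.load(fh)
    if override == "0":
        return {}  # forced direct connection
    if override != "1" and not cfg.get("enabled"):
        return {}  # no override: follow the config file
    # Proxy URL field names below are assumed for illustration.
    return {"http": cfg["http"], "https": cfg["https"]}
```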

-Common environment variables:
+Two entry scripts are provided, one per mode:

```bash
-# Run sequentially (default: parallel)
-RUN_MODE=sequential ./common_sites/start.sh
+# Direct connection (covers 大律师 / 大律师PC / 找法网 / 法律快车 by default)
+bash common_sites/start_direct_twice_weekly.sh

-# Restrict the 大律师 crawl scope
-DLS_CITY_FILTER=beijing DLS_MAX_CITIES=1 DLS_MAX_PAGES=1 ./common_sites/start.sh

-# 大律师 direct connection (no proxy) / export JSON only, no DB writes
-DLS_DIRECT=1 DLS_NO_DB=1 ./common_sites/start.sh
+# Proxy (covers 华律 / 律图 by default)
+bash common_sites/start_proxy_weekly.sh
```

-## Excel export
+## cron example (direct twice a week + proxy once a week)

-New export script: `common_sites/export_lawyers_excel.py`
+> These are examples only; adjust the times to your machine's load. Logs go to `common_sites/*.log`.

```bash
-# No arguments: export the last 7 days by default (phone / name / law firm / province / city / site name)
-# and parse the `params` extras by default (email / address / licence number / years in practice / specialties, etc.)
-./.venv/bin/python ./common_sites/export_lawyers_excel.py
+# Edit the crontab
+crontab -e

-# Export a create_time timestamp range
-./.venv/bin/python ./common_sites/export_lawyers_excel.py \
-    --start-ts 1772380000 --end-ts 1772429999 \
-    --output ./data/lawyers_20260302.xlsx
+# Direct run every Tuesday and Friday at 02:10
+10 2 * * 2,5 cd /www/wwwroot/lawyers && bash common_sites/start_direct_twice_weekly.sh

-# Export one site only, with technical fields (url / domain / timestamps, etc.)
-./.venv/bin/python ./common_sites/export_lawyers_excel.py \
-    --domain 大律师 --include-extra

-# If the `params` extras are not needed
-./.venv/bin/python ./common_sites/export_lawyers_excel.py --no-parse-params
+# Proxy run every Sunday at 03:20 (proxy IPs are renewed manually)
+20 3 * * 0 cd /www/wwwroot/lawyers && bash common_sites/start_proxy_weekly.sh
```

+### Common knobs (optional)

+```bash
+# Rate limit (shared across processes); direct runs can go higher, keep proxy runs conservative
+export PROXY_MAX_REQUESTS_PER_SECOND=8

+# Proxy connectivity output (some scripts print test info)
+export PROXY_TEST=1
+```
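`utils/rate_limiter` itself does not appear in this diff. For orientation, a minimal sketch of a cross-process limiter honouring `PROXY_MAX_REQUESTS_PER_SECOND` could look like the following — the lock-file path, the default rate, and everything except the env-var name are assumptions:

```python
import fcntl
import os
import time
from contextlib import contextmanager

_LOCK_PATH = "/tmp/lawyers_rate.lock"  # assumed location


@contextmanager
def request_slot():
    """Hold an exclusive slot so all worker processes share one request rate."""
    min_interval = 1.0 / float(os.getenv("PROXY_MAX_REQUESTS_PER_SECOND", "4"))
    with open(_LOCK_PATH, "a+") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)  # serialize across processes
        try:
            fh.seek(0)
            last = float(fh.read() or 0)
            delay = last + min_interval - time.time()
            if delay > 0:
                time.sleep(delay)
            fh.seek(0)
            fh.truncate()
            fh.write(str(time.time()))
            fh.flush()
            yield  # the HTTP call runs while the slot is held
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)
```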

## Notes

- The project directly reuses the original project's database and proxy configuration.
- Collection depends on the original database's `lawyer`, `area_new`, `area`, `area2` (etc.) tables.
- Logs go to `common_sites/*.log` by default.
@@ -0,0 +1,473 @@
#!/usr/bin/env python3
import argparse
import json
import os
import sys
import time
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urlencode

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from Db import Db
from request.proxy_config import get_proxies, report_proxy_status


DOMAIN = "百度法行宝"
BASE_URL = "https://lvlin.baidu.com"
CITY_API = f"{BASE_URL}/pc/api/law/sync/city"
LIST_API = f"{BASE_URL}/pc/api/law/api/lawyerlist"
DETAIL_API = f"{BASE_URL}/pc/api/law/api/lawyerhome"
DEFAULT_PAGE_SIZE = 16
DEFAULT_MAX_PAGES = 30
DEFAULT_STOP_ZERO_NEW_PAGES = 3
DEFAULT_SLEEP_SECONDS = 0.1


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="采集百度法行宝律师信息并落库")
    parser.add_argument("--province", default="", help="仅采集指定省份,例如:山东")
    parser.add_argument("--city", default="", help="仅采集指定城市,例如:聊城 / 聊城市")
    parser.add_argument(
        "--areas",
        default="",
        help="指定案件类型,逗号分隔;不传时自动发现顶级类型并追加不限",
    )
    parser.add_argument(
        "--limit-cities",
        type=int,
        default=0,
        help="仅处理前 N 个城市,0 表示不限",
    )
    parser.add_argument(
        "--page-size",
        type=int,
        default=DEFAULT_PAGE_SIZE,
        help=f"每次列表请求条数,默认 {DEFAULT_PAGE_SIZE}",
    )
    parser.add_argument(
        "--max-pages-per-query",
        type=int,
        default=DEFAULT_MAX_PAGES,
        help=f"单城市单类型最大翻页数,默认 {DEFAULT_MAX_PAGES}",
    )
    parser.add_argument(
        "--stop-zero-new-pages",
        type=int,
        default=DEFAULT_STOP_ZERO_NEW_PAGES,
        help=f"连续多少页无新增就停止当前查询,默认 {DEFAULT_STOP_ZERO_NEW_PAGES}",
    )
    parser.add_argument(
        "--sleep-seconds",
        type=float,
        default=DEFAULT_SLEEP_SECONDS,
        help=f"请求间隔秒数,默认 {DEFAULT_SLEEP_SECONDS}",
    )
    return parser.parse_args()


class BaiduLvlinSpider:
    def __init__(self, db_connection: Db, args: argparse.Namespace):
        self.db = db_connection
        self.args = args
        self.page_size = max(1, int(args.page_size or DEFAULT_PAGE_SIZE))
        self.max_pages_per_query = max(1, int(args.max_pages_per_query or DEFAULT_MAX_PAGES))
        self.stop_zero_new_pages = max(1, int(args.stop_zero_new_pages or DEFAULT_STOP_ZERO_NEW_PAGES))
        self.sleep_seconds = max(0.0, float(args.sleep_seconds or 0.0))
        self.proxy_enabled = False
        self.session = self._build_session()
        self.existing_urls = self._load_existing_urls()
        self.cities = self._load_cities()
        self.areas = self._load_areas()
        self.inserted_count = 0

    def _build_session(self) -> requests.Session:
        report_proxy_status()
        session = requests.Session()
        session.trust_env = False
        proxies = get_proxies()
        if proxies:
            session.proxies.update(proxies)
            self.proxy_enabled = True
        else:
            session.proxies.clear()
            self.proxy_enabled = False

        retries = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=(429, 500, 502, 503, 504),
            allowed_methods=frozenset(["GET"]),
            raise_on_status=False,
        )
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/123.0.0.0 Safari/537.36"
                ),
                "Accept": "application/json, text/plain, */*",
                "Referer": f"{BASE_URL}/pc/r?vn=law",
                "Connection": "close",
            }
        )
        return session

    def _disable_proxy(self) -> None:
        if not self.proxy_enabled:
            return
        self.session.proxies.clear()
        self.proxy_enabled = False
        print(f"[{DOMAIN}] 代理不可用,已切换直连")

    def _sleep(self) -> None:
        if self.sleep_seconds > 0:
            time.sleep(self.sleep_seconds)

    def _get_json(self, url: str, params: Optional[Dict[str, object]] = None, referer: str = "") -> Dict:
        headers = {}
        if referer:
            headers["Referer"] = referer
        try:
            resp = self.session.get(url, params=params or {}, timeout=20, headers=headers)
        except requests.exceptions.ProxyError:
            self._disable_proxy()
            resp = self.session.get(url, params=params or {}, timeout=20, headers=headers)
        try:
            resp.raise_for_status()
            return resp.json()
        finally:
            resp.close()

    def _load_existing_urls(self) -> Set[str]:
        urls: Set[str] = set()
        cursor = self.db.db.cursor()
        try:
            cursor.execute("SELECT url FROM lawyer WHERE domain=%s AND url IS NOT NULL", (DOMAIN,))
            for row in cursor.fetchall():
                url = (row[0] or "").strip()
                if url:
                    urls.add(url)
        finally:
            cursor.close()
        print(f"[{DOMAIN}] 已存在 URL 数: {len(urls)}")
        return urls

    def _normalize_city_name(self, city_name: str) -> str:
        text = str(city_name or "").strip()
        if text.endswith("市"):
            return text[:-1]
        return text

    def _city_matches(self, expected_city: str, actual_city: str) -> bool:
        left = self._normalize_city_name(expected_city)
        right = self._normalize_city_name(actual_city)
        if not left or not right:
            return False
        return left == right

    def _load_cities(self) -> List[Dict[str, str]]:
        payload = self._get_json(CITY_API, params={"vn": "law"}, referer=f"{BASE_URL}/pc/r?vn=law")
        all_city_list = payload.get("data", {}).get("AllCityList", []) or []
        cities: List[Dict[str, str]] = []
        province_filter = self.args.province.strip()
        city_filter = self._normalize_city_name(self.args.city)

        for block in all_city_list:
            for item in block.get("cityList", []) or []:
                city_name = str(item.get("name") or "").strip()
                province = str(item.get("province") or "").strip()
                city_code = str(item.get("code") or "").strip()
                if not city_name or not province or not city_code:
                    continue
                if province_filter and province != province_filter:
                    continue
                if city_filter and self._normalize_city_name(city_name) != city_filter:
                    continue
                cities.append(
                    {
                        "province": province,
                        "city": city_name,
                        "city_code": city_code,
                    }
                )

        cities.sort(key=lambda item: (item["province"], item["city"]))
        if self.args.limit_cities and self.args.limit_cities > 0:
            cities = cities[: self.args.limit_cities]
        print(f"[{DOMAIN}] 本次待采城市数: {len(cities)}")
        return cities

    def _discover_top_level_areas(self) -> List[str]:
        sample_city = self.cities[0]["city"] if self.cities else "北京"
        payload = self._get_json(
            LIST_API,
            params={
                "city_name": sample_city,
                "page_num": 1,
                "page_size": self.page_size,
                "ts": int(time.time()),
                "clientType": "pc",
                "list_type": 1,
            },
            referer=f"{BASE_URL}/pc/r?vn=law",
        )
        filters = payload.get("data", {}).get("filters", []) or []
        areas: List[str] = ["不限"]
        seen = {"不限"}
        for item in filters:
            if item.get("key") != "type":
                continue
            for option in item.get("options", []) or []:
                value = str(option.get("value") or "").strip()
                if not value or value in seen:
                    continue
                seen.add(value)
                areas.append(value)
        return areas

    def _load_areas(self) -> List[str]:
        if self.args.areas.strip():
            areas = [part.strip() for part in self.args.areas.split(",") if part.strip()]
            unique: List[str] = []
            seen = set()
            for area in areas:
                if area not in seen:
                    seen.add(area)
                    unique.append(area)
            print(f"[{DOMAIN}] 使用指定案件类型: {unique}")
            return unique

        areas = self._discover_top_level_areas()
        print(f"[{DOMAIN}] 自动发现案件类型: {areas}")
        return areas

    def _build_pc_detail_url(self, qc_no: str, rs_id: str) -> str:
        return f"{BASE_URL}/pc/lawyer?vn=law&qc_no={qc_no}&rs_id={rs_id}"

    def _build_list_page_url(self, city_name: str, area_name: str) -> str:
        params = {"city": city_name, "vn": "law"}
        if area_name and area_name != "不限":
            params["expertiseArea"] = area_name
        return f"{BASE_URL}/pc/r?{urlencode(params)}"

    def _fetch_list(self, city_name: str, area_name: str, page_num: int) -> List[Dict]:
        params: Dict[str, object] = {
            "city_name": city_name,
            "page_num": page_num,
            "page_size": self.page_size,
            "ts": int(time.time()),
            "clientType": "pc",
            "list_type": 1,
        }
        if area_name and area_name != "不限":
            params["expertiseArea"] = area_name
        payload = self._get_json(
            LIST_API,
            params=params,
            referer=self._build_list_page_url(city_name, area_name),
        )
        return payload.get("data", {}).get("lawyer_list", []) or []

    def _fetch_detail(self, qc_no: str, rs_id: str) -> Dict:
        payload = self._get_json(
            DETAIL_API,
            params={"vn": "law", "qc_no": qc_no, "rs_id": rs_id},
            referer=self._build_pc_detail_url(qc_no, rs_id),
        )
        return payload.get("data", {}).get("lawyer", {}) or {}

    def _extract_phone(self, detail: Dict) -> Optional[str]:
        for service in detail.get("lawyer_service", []) or []:
            phone = str(service.get("phone_num") or "").strip()
            if phone:
                return phone
        for service in detail.get("lawyer_service_new", []) or []:
            phone = str(service.get("phone_num") or "").strip()
            if phone:
                return phone
        return None

    def _safe_json(self, payload: Dict) -> str:
        return json.dumps(payload, ensure_ascii=False)

    def _build_record(
        self,
        city_info: Dict[str, str],
        area_name: str,
        page_num: int,
        list_item: Dict,
        detail: Dict,
    ) -> Dict[str, object]:
        qc_no = str(list_item.get("qc_no") or detail.get("qc_no") or "").strip()
        rs_id = str(list_item.get("rs_id") or detail.get("rs_id") or "").strip()
        detail_url = self._build_pc_detail_url(qc_no, rs_id)
        name = str(detail.get("lawyer_name") or list_item.get("lawyer_name") or "").strip()
        law_firm = str(detail.get("practice_company") or list_item.get("practice_company") or "").strip()
        city_name = str(list_item.get("city") or city_info.get("city") or "").strip()
        avatar_url = str(detail.get("lawyer_avatar_big") or detail.get("lawyer_avatar") or list_item.get("lawyer_avatar_big") or list_item.get("lawyer_avatar") or "").strip()
        phone = self._extract_phone(detail)

        params = {
            "source": {
                "site": "baidu_lvlin",
                "city_name": city_info.get("city"),
                "city_code": city_info.get("city_code"),
                "province": city_info.get("province"),
                "expertise_area": area_name,
                "page_num": page_num,
                "list_url": self._build_list_page_url(city_info.get("city", ""), area_name),
                "detail_url": detail_url,
                "list_api": LIST_API,
                "detail_api": DETAIL_API,
            },
            "list_item": list_item,
            "detail": detail,
        }

        return {
            "name": name or None,
            "phone": phone or None,
            "law_firm": law_firm or None,
            "province": city_info.get("province") or None,
            "city": city_name or city_info.get("city") or None,
            "url": detail_url,
            "avatar_url": avatar_url or None,
            "domain": DOMAIN,
            "create_time": int(time.time()),
            "site_time": None,
            "params": self._safe_json(params),
        }

    def _insert_record(self, record: Dict[str, object]) -> bool:
        url = str(record.get("url") or "").strip()
        if not url or url in self.existing_urls:
            return False
        self.db.insert_data("lawyer", record)
        self.existing_urls.add(url)
        self.inserted_count += 1
        return True

    def _iter_city_area(self, city_info: Dict[str, str], area_name: str) -> Tuple[int, int]:
        inserted = 0
        pages = 0
        zero_new_pages = 0
        city_name = city_info["city"]

        for page_num in range(1, self.max_pages_per_query + 1):
            pages = page_num
            try:
                items = self._fetch_list(city_name, area_name, page_num)
            except Exception as exc:
                print(f"[{DOMAIN}] 列表请求失败 {city_name}-{area_name}-p{page_num}: {exc}")
                break

            if not items:
                print(f"[{DOMAIN}] {city_name}-{area_name} 第 {page_num} 页无数据,停止")
                break

            page_new = 0
            for item in items:
                qc_no = str(item.get("qc_no") or "").strip()
                rs_id = str(item.get("rs_id") or "").strip()
                actual_city = str(item.get("city") or "").strip()
                if not qc_no or not rs_id:
                    continue
                if actual_city and not self._city_matches(city_name, actual_city):
                    continue

                detail_url = self._build_pc_detail_url(qc_no, rs_id)
                if detail_url in self.existing_urls:
                    continue

                detail: Dict = {}
                try:
                    detail = self._fetch_detail(qc_no, rs_id)
                except Exception as exc:
                    print(f"[{DOMAIN}] 详情请求失败 {qc_no}-{rs_id}: {exc}")

                record = self._build_record(city_info, area_name, page_num, item, detail)
                try:
                    if self._insert_record(record):
                        page_new += 1
                        inserted += 1
                        print(
                            f"[{DOMAIN}] -> 新增 {record.get('name') or qc_no} "
                            f"| {city_name} | {area_name} | p{page_num}"
                        )
                except Exception as exc:
                    print(f"[{DOMAIN}] 插入失败 {record.get('url')}: {exc}")
                self._sleep()

            print(
                f"[{DOMAIN}] {city_name} | {area_name} | p{page_num} "
                f"| 列表 {len(items)} | 新增 {page_new}"
            )

            if len(items) < self.page_size:
                break
            if page_new == 0:
                zero_new_pages += 1
                if zero_new_pages >= self.stop_zero_new_pages:
                    print(
                        f"[{DOMAIN}] {city_name}-{area_name} 连续 {zero_new_pages} 页无新增,停止"
                    )
                    break
            else:
                zero_new_pages = 0

            self._sleep()

        return inserted, pages

    def run(self) -> None:
        print(f"[{DOMAIN}] 启动采集")
        if not self.cities:
            print(f"[{DOMAIN}] 无可采城市")
            return
        if not self.areas:
            print(f"[{DOMAIN}] 无可采案件类型")
            return

        total_queries = len(self.cities) * len(self.areas)
        query_index = 0
        for city_info in self.cities:
            city_inserted = 0
            for area_name in self.areas:
                query_index += 1
                print(
                    f"[{DOMAIN}] 进度 {query_index}/{total_queries} | "
                    f"{city_info['province']}-{city_info['city']} | {area_name}"
                )
                inserted, pages = self._iter_city_area(city_info, area_name)
                city_inserted += inserted
                print(
                    f"[{DOMAIN}] 完成 {city_info['city']} | {area_name} "
                    f"| 翻页 {pages} | 新增 {inserted}"
                )
            print(
                f"[{DOMAIN}] 城市完成 {city_info['province']}-{city_info['city']} "
                f"| 本城新增 {city_inserted} | 总新增 {self.inserted_count}"
            )
        print(f"[{DOMAIN}] 采集完成,总新增 {self.inserted_count}")


if __name__ == "__main__":
    cli_args = parse_args()
    with Db() as db:
        spider = BaiduLvlinSpider(db, cli_args)
        spider.run()
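The spider above is driven entirely by its argparse flags, so a cheap smoke test can pin the crawl to a single page of a single city. A sketch only — it assumes the file is importable as `baidu_lvlin` (the actual filename is not shown in this diff) and that `Db()` can reach the configured MySQL instance:

```python
import argparse

from Db import Db
from baidu_lvlin import BaiduLvlinSpider  # assumed module name

# One city, one page: keeps the smoke test cheap.
args = argparse.Namespace(
    province="山东", city="聊城", areas="不限",
    limit_cities=1, page_size=16,
    max_pages_per_query=1, stop_zero_new_pages=3, sleep_seconds=0.1,
)

with Db() as db:
    BaiduLvlinSpider(db, args).run()
```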
+186 −287
@@ -1,14 +1,9 @@
import json
import os
-import random
-import re
import sys
import time
-from typing import Dict, List, Optional, Set, Tuple
-from urllib.parse import urljoin
-
-import urllib3
-from bs4 import BeautifulSoup
+import random
+from typing import Dict, Optional

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -18,144 +13,191 @@ if request_dir not in sys.path:
if project_root not in sys.path:
    sys.path.append(project_root)

-from Db import Db
-from request.requests_client import (
-    RequestClientError,
-    RequestConnectTimeout,
-    RequestConnectionError,
-    RequestTimeout,
-    RequestsClient,
-)
-from utils.rate_limiter import wait_for_request
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+import urllib3
+from bs4 import BeautifulSoup
+from request.proxy_config import get_proxies, report_proxy_status

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

+from Db import Db
+from utils.rate_limiter import request_slot

DOMAIN = "大律师"
-SITE_BASE = "https://m.maxlaw.cn"
-LIST_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}"
-PHONE_PATTERN = re.compile(r"1[3-9]\d{9}")
-MAX_PAGES_PER_CITY = int(os.getenv("MAXLAW_MAX_PAGES", "0"))
-PROXY_TESTED = False
+LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
+_PROXY_TESTED = False


class DlsSpider:
    def __init__(self, db_connection):
        self.db = db_connection
-        self.client = self._build_client()
+        self.session = self._build_session()
        self.areas = self._load_areas()

-    def _build_client(self) -> RequestsClient:
-        client = RequestsClient(
-            headers={
-                "User-Agent": (
-                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
-                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
-                    "Mobile/15E148 Safari/604.1"
-                ),
-                "Host": "m.maxlaw.cn",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-                "Connection": "close",
-            },
-            retry_total=3,
-            retry_backoff_factor=1,
-            retry_status_forcelist=(429, 500, 502, 503, 504),
-            retry_allowed_methods=("GET", "POST"),
+    def _build_session(self) -> requests.Session:
+        """Build a session with retry support."""
+        report_proxy_status()
+        s = requests.Session()
+        s.trust_env = False
+        proxies = get_proxies()
+        if proxies:
+            s.proxies.update(proxies)
+        else:
+            s.proxies.clear()
+        self._proxy_test(s, proxies)
+        # Configure the retry policy
+        retries = Retry(
+            total=3,  # up to 3 retries in total
+            backoff_factor=1,  # retry intervals: 1s, 2s, 4s
+            status_forcelist=(429, 500, 502, 503, 504),  # retry on these status codes
+            allowed_methods=frozenset(["GET", "POST"]),
+            raise_on_status=False  # don't raise immediately; let the caller handle it
        )
-        self._proxy_test(client, client.proxies or None)
-        return client
+        adapter = HTTPAdapter(max_retries=retries)
+        s.mount("https://", adapter)
+        s.mount("http://", adapter)
+        s.headers.update({
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
+            "Host": "m.maxlaw.cn",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "close",
+        })
+        return s

-    def _refresh_client(self) -> None:
-        self.client.refresh()
-        self._proxy_test(self.client, self.client.proxies or None)
+    def _refresh_session(self) -> None:
+        try:
+            self.session.close()
+        except Exception:
+            pass
+        self.session = self._build_session()

-    def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None:
-        global PROXY_TESTED
-        if PROXY_TESTED or not os.getenv("PROXY_TEST"):
+    def _proxy_test(self, session: requests.Session, proxies: Optional[Dict[str, str]]) -> None:
+        global _PROXY_TESTED
+        if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
            return
-        PROXY_TESTED = True
+        _PROXY_TESTED = True
        if not proxies:
            print("[proxy] test skipped: no proxy configured")
            return
        test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
        timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
        try:
-            resp = client.get_text(test_url, timeout=timeout, headers={"Connection": "close"})
+            with request_slot():
+                resp = session.get(
+                    test_url,
+                    timeout=timeout,
+                    headers={"Connection": "close"},
+                )
            print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
        except Exception as exc:
            print(f"[proxy] test failed: {exc}")

-    def _load_areas(self) -> List[Dict[str, str]]:
-        tables = ("area_new", "area2", "area")
-        last_error = None
-        for table in tables:
-            try:
-                rows = self.db.select_data(table, "province, city, pinyin", "domain='maxlaw'") or []
-            except Exception as exc:
-                last_error = exc
-                continue
-            if rows:
-                missing_pinyin = sum(1 for row in rows if not (row.get("pinyin") or "").strip())
-                print(f"[大律师] 地区来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
-                return rows
-        if last_error:
-            print(f"[大律师] 加载地区失败: {last_error}")
-        print("[大律师] 无地区数据(已尝试 area_new/area2/area)")
-        return []
+    def _load_areas(self):
+        try:
+            return self.db.select_data(
+                "area_new",
+                "province, city, pinyin",
+                "domain='maxlaw'"
+            ) or []
+        except Exception as exc:
+            print(f"加载地区失败: {exc}")
+            return []

-    def _get(
-        self,
-        url: str,
-        *,
-        headers: Optional[Dict[str, str]] = None,
-        max_retries: int = 3,
-        timeout: Tuple[int, int] = (10, 30),
-    ) -> Optional[str]:
-        wait_for_request()
+    def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
+        """Send a GET request with retries."""
        for attempt in range(max_retries):
            try:
-                resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
-                if resp.status_code == 403:
+                # Use a longer timeout, setting connect and read timeouts separately
+                with request_slot():
+                    resp = self.session.get(
+                        url,
+                        timeout=(10, 30),  # (connect_timeout, read_timeout)
+                        verify=False,
+                        headers=headers,
+                    )
+                status_code = resp.status_code
+                content = resp.text
+                resp.close()
+                if status_code == 403:
                    if attempt < max_retries - 1:
-                        wait_time = (2 ** attempt) + random.uniform(0.3, 1.0)
-                        print(f"请求403,{wait_time:.2f}s后重试 ({attempt + 1}/{max_retries}): {url}")
-                        self._refresh_client()
+                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(f"请求失败 {url}: 403 Forbidden")
                    return None
-                if resp.status_code >= 400:
-                    raise RequestClientError(f"{resp.status_code} Error: {url}")
-                return resp.text
-            except RequestConnectTimeout as exc:
-                if attempt < max_retries - 1:
-                    wait_time = 2 ** attempt
-                    print(f"连接超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
-                    time.sleep(wait_time)
-                    continue
-                print(f"连接超时,已达到最大重试次数 {url}: {exc}")
-                return None
-            except RequestTimeout as exc:
-                if attempt < max_retries - 1:
-                    wait_time = 2 ** attempt
-                    print(f"请求超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
-                    time.sleep(wait_time)
-                    continue
-                print(f"请求超时,已达到最大重试次数 {url}: {exc}")
-                return None
-            except RequestConnectionError as exc:
-                if attempt < max_retries - 1:
-                    wait_time = 2 ** attempt
-                    print(f"连接错误,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
-                    time.sleep(wait_time)
-                    continue
-                print(f"连接错误,已达到最大重试次数 {url}: {exc}")
-                return None
-            except RequestClientError as exc:
+                if status_code >= 400:
+                    raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
+                return content
+            except requests.exceptions.ConnectTimeout as exc:
+                if attempt < max_retries - 1:
+                    wait_time = 2 ** attempt  # exponential backoff: 2s, 4s, 8s
+                    print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    time.sleep(wait_time)
+                else:
+                    print(f"连接超时,已达到最大重试次数 {url}: {exc}")
+                    return None
+            except requests.exceptions.Timeout as exc:
+                if attempt < max_retries - 1:
+                    wait_time = 2 ** attempt
+                    print(f"请求超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    time.sleep(wait_time)
+                    continue
+                else:
+                    print(f"请求超时,已达到最大重试次数 {url}: {exc}")
+                    return None
+            except requests.exceptions.ConnectionError as exc:
+                if attempt < max_retries - 1:
+                    wait_time = 2 ** attempt
+                    print(f"连接错误,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    time.sleep(wait_time)
+                    continue
+                else:
+                    print(f"连接错误,已达到最大重试次数 {url}: {exc}")
+                    return None
+            except requests.exceptions.RequestException as exc:
                print(f"请求失败 {url}: {exc}")
                return None

        return None

+    def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
+        soup = BeautifulSoup(html, "html.parser")
+        cards = soup.find_all("div", class_="lstx")
+        if not cards:
+            return 0
+
+        inserted = 0
+        for card in cards:
+            link = card.find("a")
+            if not link or not link.get("href"):
+                continue
+            detail = self._parse_detail(link['href'], province, city, list_url)
+            if not detail:
+                continue
+            phone = detail.get("phone")
+            if not phone:
+                continue
+            condition = f"phone='{phone}' and domain='{DOMAIN}'"
+            if self.db.is_data_exist("lawyer", condition):
+                print(f"  -- 已存在: {detail['name']} ({phone})")
+                time.sleep(0.3)
+                continue
+            try:
+                self.db.insert_data("lawyer", detail)
+                inserted += 1
+                print(f"  -> 新增: {detail['name']} ({phone})")
+            except Exception as exc:
+                print(f"  插入失败: {exc}")
+                time.sleep(1)
+            time.sleep(0.3)
+        # Pause once the list page is done, to ease anti-bot pressure
+        time.sleep(0.6)
+        return inserted

    def _detail_headers(self, referer: str) -> Dict[str, str]:
        return {
            "Referer": referer,
@@ -166,215 +208,72 @@ class DlsSpider:
            "Upgrade-Insecure-Requests": "1",
        }

-    def _extract_detail_urls(self, html: str) -> List[str]:
-        soup = BeautifulSoup(html, "html.parser")
-        urls: List[str] = []
-        seen: Set[str] = set()
-
-        # Primary selector: the site's current list cards
-        for a_tag in soup.select("div.lstx a[href]"):
-            href = (a_tag.get("href") or "").strip()
-            if not href:
-                continue
-            url = urljoin(SITE_BASE, href)
-            if url in seen:
-                continue
-            seen.add(url)
-            urls.append(url)
-
-        # Fallback selector: stay alive through minor page-structure changes
-        if not urls:
-            for a_tag in soup.select("a[href]"):
-                href = (a_tag.get("href") or "").strip()
-                if "/lawyer/" not in href:
-                    continue
-                url = urljoin(SITE_BASE, href)
-                if url in seen:
-                    continue
-                seen.add(url)
-                urls.append(url)
-        return urls
-
-    def _extract_name(self, soup: BeautifulSoup) -> str:
-        for selector in ("h2.lawyerName", "h1.lawyerName", "h1", "h2"):
-            tag = soup.select_one(selector)
-            if tag:
-                name = tag.get_text(strip=True)
-                if name:
-                    return name
-        title = soup.title.get_text(strip=True) if soup.title else ""
-        match = re.search(r"(\S+律师)", title)
-        return match.group(1) if match else ""
-
-    def _extract_law_firm(self, soup: BeautifulSoup) -> str:
-        for selector in ("p.law-firm", "div.law-firm", "p[class*=firm]"):
-            tag = soup.select_one(selector)
-            if tag:
-                text = tag.get_text(strip=True)
-                if text:
-                    return text
-        page_text = soup.get_text(" ", strip=True)
-        match = re.search(r"(执业机构|律所)\s*[::]?\s*([^\s,。,;;]{2,40})", page_text)
-        if match:
-            return match.group(2).strip()
-        return ""
-
-    def _normalize_phone(self, text: str) -> str:
-        compact = re.sub(r"\D", "", text or "")
-        match = PHONE_PATTERN.search(compact)
-        return match.group(0) if match else ""
-
-    def _extract_phone(self, soup: BeautifulSoup) -> str:
-        contact = soup.select_one("ul.contact-content")
-        if contact:
-            phone = self._normalize_phone(contact.get_text(" ", strip=True))
-            if phone:
-                return phone
-        for selector in ("a[href^='tel:']", "span.phone", "p.phone"):
-            tag = soup.select_one(selector)
-            if tag:
-                phone = self._normalize_phone(tag.get_text(" ", strip=True))
-                if phone:
-                    return phone
-        return self._normalize_phone(soup.get_text(" ", strip=True))
-
-    def _parse_detail(self, detail_url: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
-        print(f"  详情: {detail_url}")
-        html = self._get(detail_url, headers=self._detail_headers(list_url))
+    def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
+        url = f"https://m.maxlaw.cn{path}"
+        print(f"  详情: {url}")
+        html = self._get(url, headers=self._detail_headers(list_url))
        if not html:
            return None

        soup = BeautifulSoup(html, "html.parser")
-        name = self._extract_name(soup)
-        phone = self._extract_phone(soup)
+        name_tag = soup.find("h2", class_="lawyerName")
+        law_firm_tag = soup.find("p", class_="law-firm")
+        contact_list = soup.find("ul", class_="contact-content")
+
+        name = name_tag.get_text(strip=True) if name_tag else ""
+        law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
+        phone = ""
+
+        if contact_list:
+            items = contact_list.find_all("li")
+            if len(items) > 2:
+                phone_tag = items[2].find("p")
+                if phone_tag:
+                    phone = phone_tag.get_text(strip=True)
+                    phone = phone.split("咨询请说明来自大律师网")[0].strip()
+
+        phone = phone.replace('-', '').strip()
        if not name or not phone:
            print("  信息不完整,跳过")
            return None

-        safe_city = city or province
+        safe_city = city if city else province
        return {
            "name": name,
-            "law_firm": self._extract_law_firm(soup),
+            "law_firm": law_firm,
            "province": province,
            "city": safe_city,
            "phone": phone,
-            "url": detail_url,
+            "url": url,
            "domain": DOMAIN,
            "create_time": int(time.time()),
-            "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False),
+            "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
        }

-    def _existing_phones(self, phones: List[str]) -> Set[str]:
-        if not phones:
-            return set()
-        existing: Set[str] = set()
-        cur = self.db.db.cursor()
-        try:
-            chunk_size = 500
-            for idx in range(0, len(phones), chunk_size):
-                chunk = phones[idx:idx + chunk_size]
-                placeholders = ",".join(["%s"] * len(chunk))
-                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
-                cur.execute(sql, [DOMAIN, *chunk])
-                for row in cur.fetchall():
-                    existing.add(row[0])
-        finally:
-            cur.close()
-        return existing
-
-    def _save_lawyers(self, lawyers: List[Dict[str, str]]) -> Tuple[int, int]:
-        if not lawyers:
-            return 0, 0
-        phones = [row["phone"] for row in lawyers if row.get("phone")]
-        existing = self._existing_phones(phones)
-        inserted = 0
-        skipped = 0
-
-        for row in lawyers:
-            phone = row.get("phone", "")
-            if not phone:
-                skipped += 1
-                continue
-            if phone in existing:
-                skipped += 1
-                print(f"  -- 已存在: {row.get('name', '')} ({phone})")
-                continue
-            try:
-                self.db.insert_data("lawyer", row)
-                existing.add(phone)
-                inserted += 1
-                print(f"  -> 新增: {row.get('name', '')} ({phone})")
-            except Exception as exc:
-                skipped += 1
-                print(f"  插入失败 {row.get('url', '')}: {exc}")
-        return inserted, skipped
-
-    def _crawl_city(self, area: Dict[str, str]) -> Tuple[int, int]:
-        pinyin = (area.get("pinyin") or "").strip()
-        province = area.get("province", "")
-        city = area.get("city", "")
-        if not pinyin:
-            return 0, 0
-
-        total_inserted = 0
-        total_parsed = 0
-        page = 1
-        prev_fingerprint = ""
-
-        while True:
-            if MAX_PAGES_PER_CITY > 0 and page > MAX_PAGES_PER_CITY:
-                print(f"达到分页上限({MAX_PAGES_PER_CITY}),停止 {province}-{city}")
-                break
-
-            list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
-            print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
-            html = self._get(list_url)
-            if not html:
-                break
-
-            detail_urls = self._extract_detail_urls(html)
-            if not detail_urls:
-                print("  列表为空,结束当前城市")
-                break
-
-            fingerprint = "|".join(detail_urls[:8])
-            if fingerprint and fingerprint == prev_fingerprint:
-                print("  列表页重复,提前停止当前城市")
-                break
-            prev_fingerprint = fingerprint
-
-            lawyers: List[Dict[str, str]] = []
-            for detail_url in detail_urls:
-                row = self._parse_detail(detail_url, province, city, list_url)
-                if row:
-                    lawyers.append(row)
-                time.sleep(0.25)
-
-            inserted, skipped = self._save_lawyers(lawyers)
-            total_inserted += inserted
-            total_parsed += len(lawyers)
-            print(
-                f"  第 {page} 页完成: 列表{len(detail_urls)}条, "
-                f"解析{len(lawyers)}条, 新增{inserted}条, 跳过{skipped}条"
-            )
-
-            page += 1
-            time.sleep(0.5)
-        return total_inserted, total_parsed

    def run(self):
        print("启动大律师采集...")
        if not self.areas:
            print("无地区数据")
            return

-        all_inserted = 0
-        all_parsed = 0
        for area in self.areas:
-            inserted, parsed = self._crawl_city(area)
-            all_inserted += inserted
-            all_parsed += parsed
-        print(f"大律师采集完成: 解析{all_parsed}条, 新增{all_inserted}条")
+            pinyin = area.get("pinyin")
+            province = area.get("province", "")
+            city = area.get("city", "")
+            if not pinyin:
+                continue
+            page = 1
+            while True:
+                list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
+                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
+                html = self._get(list_url)
+                if not html:
+                    break
+                inserted = self._parse_list(html, province, city, list_url)
+                if inserted == 0:
+                    break
+                page += 1
+        print("大律师采集完成")


if __name__ == "__main__":
@@ -22,7 +22,7 @@ if project_root not in sys.path:
    sys.path.append(project_root)

from request.requests_client import RequestClientError, RequestsClient
-from utils.rate_limiter import wait_for_request
+from utils.rate_limiter import request_slot
from Db import Db

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -107,9 +107,9 @@ class DlsFreshCrawler:
    def _get_text(self, url: str, timeout: int = 20, max_retries: int = 3) -> str:
        last_error: Optional[Exception] = None
        for attempt in range(max_retries):
-            wait_for_request()
            try:
-                resp = self.client.get_text(url, timeout=timeout, verify=False)
+                with request_slot():
+                    resp = self.client.get_text(url, timeout=timeout, verify=False)
                code = resp.status_code
                if code == 403:
                    if attempt < max_retries - 1:
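The `wait_for_request()` → `request_slot()` change in this hunk moves from a pre-request gate to a slot that is held for the request's duration, so the shared rate cap also covers the time a request is in flight. Schematically (semantics assumed from the call sites above):

```python
# Before (assumed semantics): a pre-request gate only — N workers can all
# pass the gate and fire their requests at the same instant.
wait_for_request()
resp = client.get_text(url, timeout=timeout, verify=False)

# After: the slot stays held while the request runs, so other workers
# cannot start theirs until this one completes.
with request_slot():
    resp = client.get_text(url, timeout=timeout, verify=False)
```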
@@ -0,0 +1,438 @@
import json
import os
import random
import re
import sys
import time
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import urllib3
from bs4 import BeautifulSoup

from request.proxy_config import get_proxies, report_proxy_status
from utils.rate_limiter import request_slot
from Db import Db

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


DOMAIN = "大律师"
SITE_BASE = "https://www.maxlaw.cn"
LIST_URL_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}"
PROVINCE_API = "https://js.maxlaw.cn/js/ajax/common/getprovice.js"
CITY_API_TEMPLATE = "https://js.maxlaw.cn/js/ajax/common/getcity_{province_id}.js"

PHONE_RE = re.compile(r"1[3-9]\d{9}")
REPLY_RE = re.compile(r"已回复[::]?\s*(\d+)")
AREA_PREFIX_RE = re.compile(r"^[A-Za-z]\s*")


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def clean_area_name(text: str) -> str:
    value = AREA_PREFIX_RE.sub("", (text or "").strip())
    return value.strip()


def normalize_region_text(text: str) -> str:
    value = (text or "").strip()
    value = value.replace("\xa0", " ")
    value = value.replace("－", "-").replace("—", "-").replace("–", "-")
    value = re.sub(r"\s*-\s*", "-", value)
    value = re.sub(r"\s+", "", value)
    return value


class DlsPcSpider:
    def __init__(self, db_connection):
        self.db = db_connection
        self.session = self._build_session()
        self.max_pages = int(os.getenv("MAXLAW_PC_MAX_PAGES", "100"))
        self.areas = self._load_areas()

    def _build_session(self) -> requests.Session:
        report_proxy_status()
        session = requests.Session()
        session.trust_env = False
        proxies = get_proxies()
        if proxies:
            session.proxies.update(proxies)
        else:
            session.proxies.clear()

        retries = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=(429, 500, 502, 503, 504),
            allowed_methods=frozenset(["GET"]),
            raise_on_status=False,
        )
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/136.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
        })
        return session

    def _refresh_session(self) -> None:
        try:
            self.session.close()
        except Exception:
            pass
        self.session = self._build_session()

    def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
        for attempt in range(max_retries):
            try:
                with request_slot():
                    resp = self.session.get(url, timeout=(10, 25), verify=False, headers=headers)
                status_code = resp.status_code
                text = resp.text
                resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"403被拦截,{wait_time:.1f}秒后重试 ({attempt + 1}/{max_retries}): {url}")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(f"请求失败 {url}: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
                return text
            except requests.exceptions.RequestException as exc:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                    print(f"请求失败,{wait_time:.1f}秒后重试 ({attempt + 1}/{max_retries}): {url} -> {exc}")
                    time.sleep(wait_time)
                    continue
                print(f"请求失败 {url}: {exc}")
                return None
        return None

    def _get_json(self, url: str) -> Optional[Dict]:
        text = self._get(url)
        if not text:
            return None
        try:
            return json.loads(text.strip().lstrip("\ufeff"))
        except ValueError as exc:
            print(f"解析JSON失败 {url}: {exc}")
            return None

    def _load_areas(self) -> List[Dict[str, str]]:
        areas = self._load_areas_from_site()
        if areas:
            print(f"[大律师PC] 地区来源: site, 地区数: {len(areas)}")
            return areas

        areas = self._load_areas_from_db()
        if areas:
            print(f"[大律师PC] 地区来源: db, 地区数: {len(areas)}")
            return areas

        print("[大律师PC] 无地区数据")
        return []

    def _load_areas_from_site(self) -> List[Dict[str, str]]:
        data = self._get_json(PROVINCE_API)
        if not data or str(data.get("status")) != "1":
            return []

        result: List[Dict[str, str]] = []
        seen_pinyin: Set[str] = set()

        for province in data.get("ds", []) or []:
            province_id = province.get("id")
            province_name = clean_area_name(province.get("name", ""))
            province_pinyin = (province.get("py_code") or "").strip()

            city_rows = []
            if province_id:
                city_data = self._get_json(CITY_API_TEMPLATE.format(province_id=province_id))
                if city_data and str(city_data.get("status")) == "1":
                    city_rows = city_data.get("ds", []) or []

            if not city_rows and province_pinyin and province_pinyin not in seen_pinyin:
                seen_pinyin.add(province_pinyin)
                result.append({
                    "province": province_name,
                    "city": province_name,
                    "pinyin": province_pinyin,
                })
                continue

            for city in city_rows:
                city_name = clean_area_name(city.get("name", ""))
                city_pinyin = (city.get("py_code") or "").strip()
                if not city_pinyin or city_pinyin in seen_pinyin:
                    continue
                seen_pinyin.add(city_pinyin)
                result.append({
                    "province": province_name,
                    "city": city_name or province_name,
                    "pinyin": city_pinyin,
                })

        return result

    def _load_areas_from_db(self) -> List[Dict[str, str]]:
        tables = ("area_new", "area", "area2")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(
                    table,
                    "province, city, pinyin",
                    "domain='maxlaw' AND level=2",
                ) or []
            except Exception as exc:
                last_error = exc
                continue

            if rows:
                return rows

        if last_error:
            print(f"[大律师PC] 加载数据库地区失败: {last_error}")
        return []

    def _existing_phones(self, phones: List[str]) -> Set[str]:
        if not phones:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(phones), chunk_size):
                chunk = phones[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

    def _build_list_url(self, pinyin: str, page: int) -> str:
        return LIST_URL_TEMPLATE.format(pinyin=pinyin, page=page)

    def _parse_location_line(
        self,
        text: str,
        fallback_province: str,
        fallback_city: str,
    ) -> Tuple[str, str, str]:
        raw = (text or "").replace("\xa0", " ")
        raw = re.sub(r"\s+", " ", raw).strip()
        if not raw:
            return fallback_province, fallback_city or fallback_province, ""

        parts = raw.split(" ", 1)
        area_text = parts[0].strip()
        law_firm = parts[1].strip() if len(parts) > 1 else ""

        province = fallback_province
        city = fallback_city or fallback_province
        if "-" in area_text:
            area_parts = [item.strip() for item in area_text.split("-", 1)]
            if area_parts[0]:
                province = area_parts[0]
            if len(area_parts) > 1 and area_parts[1]:
                city = area_parts[1]
        elif area_text:
            province = area_text
            city = area_text

        return province, city, law_firm

    def _extract_page_region(self, soup: BeautifulSoup) -> str:
        button = soup.select_one(".filter .filter-btn")
        if button:
            return normalize_region_text(button.get_text(" ", strip=True))
        title = soup.select_one(".findLawyer-title h1")
        if title:
            return normalize_region_text(title.get_text(strip=True).replace("律师", ""))
        return ""

    def _page_matches_area(self, soup: BeautifulSoup, province: str, city: str) -> Tuple[bool, str]:
        current_region = self._extract_page_region(soup)
        if not current_region:
            return True, current_region
        if "全国" in current_region:
            return False, current_region

        norm_province = normalize_region_text(province)
        norm_city = normalize_region_text(city or province)

        if norm_city and norm_city != norm_province:
            matched = norm_province in current_region and norm_city in current_region
        else:
            matched = norm_province in current_region

        if matched:
            return True, current_region

        title = soup.select_one(".findLawyer-title h1")
        title_text = ""
        if title:
            title_text = normalize_region_text(title.get_text(strip=True).replace("律师", ""))

        if norm_city and norm_city != norm_province:
            matched = norm_city in title_text
        else:
            matched = norm_province in title_text

        return matched, current_region or title_text

    def _parse_list(self, html: str, province: str, city: str, list_url: str, area_pinyin: str) -> Tuple[bool, int, int]:
        soup = BeautifulSoup(html, "html.parser")
        matched, current_region = self._page_matches_area(soup, province, city)
        if not matched:
            print(f"  页面地区不匹配,停止分页: 目标={province}-{city} 当前={current_region or '未知'}")
            return False, 0, 0

        cards = []
        seen_page_phone: Set[str] = set()

        for item in soup.select("ul.findLawyer-list > li.clearfix"):
            name_link = item.select_one(".findLawyer-list-detail-name a[href]")
            phone_tag = item.select_one(".findLawyer-list-detail-name span")
            if not name_link or not phone_tag:
                continue

            phone = normalize_phone(phone_tag.get_text(" ", strip=True))
            if not phone or phone in seen_page_phone:
                continue
            seen_page_phone.add(phone)

            name = name_link.get_text(strip=True)
            detail_url = urljoin(SITE_BASE, name_link.get("href", "").strip())

            location_tag = item.select_one(".findLawyer-list-detail-the")
            card_province, card_city, law_firm = self._parse_location_line(
                location_tag.get_text(" ", strip=True) if location_tag else "",
                province,
                city,
            )

            specialties = []
            for dd in item.select(".findLawyer-list-detail-fields dd"):
                text = dd.get_text(strip=True)
                if text:
                    specialties.append(text)

            reply_count = None
            reply_tag = item.select_one(".findLawyer-list-detail-other a")
            if reply_tag:
                match = REPLY_RE.search(reply_tag.get_text(" ", strip=True))
                if match:
                    reply_count = int(match.group(1))

            cards.append({
                "name": name,
                "law_firm": law_firm,
                "province": card_province or province,
                "city": card_city or city or province,
                "phone": phone,
                "url": detail_url,
                "domain": DOMAIN,
                "create_time": int(time.time()),
                "params": json.dumps({
                    "area_pinyin": area_pinyin,
                    "source": list_url,
                    "specialties": specialties,
                    "reply_count": reply_count,
                }, ensure_ascii=False),
            })

        if not cards:
            return True, 0, 0

        phones = [item["phone"] for item in cards if item.get("phone")]
        existing = self._existing_phones(phones)
        inserted = 0

        for item in cards:
            phone = item.get("phone")
            if not phone:
                continue
            if phone in existing:
                print(f"  -- 已存在: {item['name']} ({phone})")
                continue
            try:
                self.db.insert_data("lawyer", item)
                inserted += 1
                print(f"  -> 新增: {item['name']} ({phone})")
            except Exception as exc:
                print(f"  插入失败 {item.get('url')}: {exc}")

        return True, inserted, len(cards)

    def run(self):
        print("启动大律师 PC 站采集...")
        if not self.areas:
            print("无地区数据")
            return

        for area in self.areas:
            province = (area.get("province") or "").strip()
            city = (area.get("city") or province).strip()
            pinyin = (area.get("pinyin") or "").strip()
            if not province or not pinyin:
                continue

            area_label = province if not city or city == province else f"{province}-{city}"
            print(f"采集地区: {area_label} ({pinyin})")

            for page in range(1, self.max_pages + 1):
                list_url = self._build_list_url(pinyin, page)
                print(f"  第 {page} 页: {list_url}")
                html = self._get(list_url, headers={"Referer": SITE_BASE + "/law"})
                if not html:
                    break

                page_ok, inserted, parsed_count = self._parse_list(html, province, city, list_url, pinyin)
                if not page_ok:
                    break
                if parsed_count == 0:
                    print("  当前页无律师卡片,停止")
                    break

                if inserted == 0:
                    print("  当前页无新增数据")

                time.sleep(0.5)

        print("大律师 PC 站采集完成")


if __name__ == "__main__":
    with Db() as db:
        spider = DlsPcSpider(db)
        spider.run()
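A quick illustration of what the normalization helpers above buy (the input values are invented for the example; the fullwidth dash and the letter prefix mirror what the site emits):

```python
# clean_area_name drops the alphabetical index prefix used in the city picker
assert clean_area_name("B 北京") == "北京"

# normalize_region_text unifies dash variants and strips whitespace, so the
# page's "山东 - 聊城" region label compares equal to the target "山东-聊城"
assert normalize_region_text("山东 - 聊城") == "山东-聊城"
assert normalize_region_text("山东—聊城") == "山东-聊城"
```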
@@ -19,6 +19,9 @@ if project_root not in sys.path:
from Db import Db


+DEFAULT_EXPORT_START_TS = 1772932103


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
    parser.add_argument(
@@ -30,7 +33,10 @@ def parse_args() -> argparse.Namespace:
        "--start-ts",
        type=int,
        default=None,
-        help="create_time 起始时间戳(含),不传时默认取最近7天",
+        help=(
+            "create_time 起始时间戳(含),"
+            f"不传时默认取 {DEFAULT_EXPORT_START_TS} 之后的数据"
+        ),
    )
    parser.add_argument(
        "--end-ts",
@@ -43,6 +49,11 @@ def parse_args() -> argparse.Namespace:
        default="",
        help="按 domain 过滤,例如:大律师 / 找法网 / 华律",
    )
+    parser.add_argument(
+        "--exclude-domain",
+        default="",
+        help="排除指定 domain,例如:高德地图",
+    )
    parser.add_argument(
        "--province",
        default="",
@@ -74,13 +85,18 @@ def parse_args() -> argparse.Namespace:
        action="store_true",
        help="关闭 params JSON 扩展信息解析(默认开启)",
    )
+    parser.add_argument(
+        "--douyin-only",
+        action="store_true",
+        help="仅导出抖音采集数据(domain=抖音),并追加抖音专用字段",
+    )
    return parser.parse_args()


def apply_default_time_filter(args: argparse.Namespace) -> None:
-    # When no explicit time range is passed, default to the last 7 days
+    # When no explicit time range is passed, default to everything after the fixed timestamp
    if args.start_ts is None and args.end_ts is None:
-        args.start_ts = int(time.time()) - 7 * 24 * 3600
+        args.start_ts = DEFAULT_EXPORT_START_TS
        args.end_ts = 0
        return
    if args.start_ts is None:
@@ -109,15 +125,23 @@ def build_query(args: argparse.Namespace) -> (str, List):
|
||||
where: List[str] = []
|
||||
params: List = []
|
||||
|
||||
if args.douyin_only:
|
||||
target_domain = args.domain.strip() or "抖音"
|
||||
where.append("domain = %s")
|
||||
params.append(target_domain)
|
||||
|
||||
if args.start_ts > 0:
|
||||
where.append("create_time >= %s")
|
||||
params.append(args.start_ts)
|
||||
if args.end_ts > 0:
|
||||
where.append("create_time <= %s")
|
||||
params.append(args.end_ts)
|
||||
if args.domain.strip():
|
||||
if args.domain.strip() and not args.douyin_only:
|
||||
where.append("domain = %s")
|
||||
params.append(args.domain.strip())
|
||||
if args.exclude_domain.strip():
|
||||
where.append("domain <> %s")
|
||||
params.append(args.exclude_domain.strip())
|
||||
if args.province.strip():
|
||||
where.append("province = %s")
|
||||
params.append(args.province.strip())
|
||||
@@ -161,6 +185,13 @@ def parse_params(params_text: str) -> Dict[str, str]:
|
||||
else:
|
||||
specialties_text = ""
|
||||
|
||||
user_info = data.get("user_info") or {}
|
||||
if not isinstance(user_info, dict):
|
||||
user_info = {}
|
||||
|
||||
sec_uid = str(data.get("sec_uid") or "")
|
||||
douyin_url = f"https://www.douyin.com/user/{sec_uid}" if sec_uid else ""
|
||||
|
||||
return {
|
||||
"email": str(profile.get("email") or ""),
|
||||
"address": str(profile.get("address") or ""),
|
||||
@@ -170,19 +201,34 @@ def parse_params(params_text: str) -> Dict[str, str]:
|
||||
"source_site": str(source.get("site") or ""),
|
||||
"detail_url": str(source.get("detail_url") or ""),
|
||||
"list_url": str(source.get("list_url") or ""),
|
||||
"api_source": str(data.get("api_source") or ""),
|
||||
"api_url": str(data.get("api_url") or ""),
|
||||
"city_index": str(data.get("city_index") or ""),
|
||||
"captured_at": str(data.get("captured_at") or ""),
|
||||
"sec_uid": sec_uid,
|
||||
"douyin_uid": str(user_info.get("uid") or ""),
|
||||
"douyin_unique_id": str(user_info.get("unique_id") or ""),
|
||||
"douyin_signature": str(user_info.get("signature") or ""),
|
||||
"douyin_nickname": str(user_info.get("nickname") or ""),
|
||||
"douyin_url": douyin_url,
|
||||
}
|
||||
|
||||
|
||||
def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, parse_params_flag: bool) -> int:
|
||||
def export_to_excel(
|
||||
rows: List[Dict],
|
||||
output_path: str,
|
||||
include_extra: bool,
|
||||
parse_params_flag: bool,
|
||||
douyin_only: bool,
|
||||
) -> int:
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "lawyers"
|
||||
|
||||
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"]
|
||||
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain", "URL"]
|
||||
if include_extra:
|
||||
headers.extend(
|
||||
[
|
||||
"URL",
|
||||
"站点",
|
||||
"create_time",
|
||||
"create_time_text",
|
||||
@@ -204,6 +250,22 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
|
||||
"list_url",
|
||||
]
|
||||
)
|
||||
if parse_params_flag and douyin_only:
|
||||
headers.extend(
|
||||
[
|
||||
"sec_uid",
|
||||
"抖音uid",
|
||||
"抖音号",
|
||||
"抖音昵称",
|
||||
"抖音简介",
|
||||
"抖音主页URL",
|
||||
"api_source",
|
||||
"api_url",
|
||||
"city_index",
|
||||
"captured_at",
|
||||
"captured_at_text",
|
||||
]
|
||||
)
|
||||
|
||||
ws.append(headers)
|
||||
for cell in ws[1]:
|
||||
@@ -221,12 +283,12 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
|
||||
row.get("city", "") or "",
|
||||
site_name,
|
||||
row.get("domain", "") or "",
|
||||
row.get("url", "") or "",
|
||||
]
|
||||
|
||||
if include_extra:
|
||||
line.extend(
|
||||
[
|
||||
row.get("url", "") or "",
|
||||
row.get("domain", "") or "",
|
||||
row.get("create_time", "") or "",
|
||||
ts_to_text(row.get("create_time")),
|
||||
@@ -250,6 +312,29 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
|
||||
]
|
||||
)
|
||||
|
||||
if parse_params_flag and douyin_only:
|
||||
captured_at_text = ""
|
||||
try:
|
||||
captured_at_text = ts_to_text(int(info.get("captured_at", "") or 0))
|
||||
except Exception:
|
||||
captured_at_text = ""
|
||||
|
||||
line.extend(
|
||||
[
|
||||
info.get("sec_uid", ""),
|
||||
info.get("douyin_uid", ""),
|
||||
info.get("douyin_unique_id", ""),
|
||||
info.get("douyin_nickname", ""),
|
||||
info.get("douyin_signature", ""),
|
||||
info.get("douyin_url", ""),
|
||||
info.get("api_source", ""),
|
||||
info.get("api_url", ""),
|
||||
info.get("city_index", ""),
|
||||
info.get("captured_at", ""),
|
||||
captured_at_text,
|
||||
]
|
||||
)
|
||||
|
||||
ws.append(line)
|
||||
exported += 1
|
||||
|
||||
@@ -277,6 +362,7 @@ def main() -> None:
|
||||
output_path=output_path,
|
||||
include_extra=args.include_extra,
|
||||
parse_params_flag=not args.no_parse_params,
|
||||
douyin_only=args.douyin_only,
|
||||
)
|
||||
|
||||
print(f"[export] 导出完成,共 {count} 条")
|
||||
|
||||
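The interplay of the new fixed default window and `--douyin-only` is easier to see in isolation. A minimal sketch of the time-window resolution, reusing `DEFAULT_EXPORT_START_TS` from the hunk above; the one-sided branches are an assumption, since the tail of `apply_default_time_filter` falls outside this hunk:

```python
DEFAULT_EXPORT_START_TS = 1772932103  # constant introduced in the diff above


def resolve_time_window(start_ts, end_ts):
    # No explicit bounds: export everything from the fixed default onward
    # (0 means "unconstrained" for the matching build_query branch).
    if start_ts is None and end_ts is None:
        return DEFAULT_EXPORT_START_TS, 0
    # Assumed behaviour for one-sided input: leave the other bound open.
    return start_ts or 0, end_ts or 0


assert resolve_time_window(None, None) == (DEFAULT_EXPORT_START_TS, 0)
assert resolve_time_window(1772000000, None) == (1772000000, 0)
```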
+176
-429
@@ -1,16 +1,9 @@
|
||||
import argparse
|
||||
import ast
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
||||
|
||||
import urllib3
|
||||
import random
|
||||
from typing import Dict, List, Set, Optional
|
||||
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(current_dir)
|
||||
@@ -20,460 +13,214 @@ if request_dir not in sys.path:
|
||||
if project_root not in sys.path:
|
||||
sys.path.append(project_root)
|
||||
|
||||
import requests
|
||||
from request.proxy_config import get_proxies, report_proxy_status
|
||||
from Db import Db
|
||||
from request.requests_client import RequestClientError, RequestsClient
|
||||
from utils.rate_limiter import wait_for_request
|
||||
from utils.rate_limiter import request_slot
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
SITE_NAME = "findlaw"
|
||||
LEGACY_DOMAIN = "找法网"
|
||||
SITE_BASE = "https://m.findlaw.cn"
|
||||
CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
|
||||
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
|
||||
|
||||
PHONE_RE = re.compile(r"1[3-9]\d{9}")
|
||||
DOMAIN = "找法网"
|
||||
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
|
||||
|
||||
|
||||
@dataclass
|
||||
class CityTarget:
|
||||
province_id: str
|
||||
province_name: str
|
||||
province_py: str
|
||||
city_id: str
|
||||
city_name: str
|
||||
city_py: str
|
||||
|
||||
|
||||
def normalize_phone(text: str) -> str:
|
||||
compact = re.sub(r"\D", "", text or "")
|
||||
match = PHONE_RE.search(compact)
|
||||
return match.group(0) if match else ""
|
||||
|
||||
|
||||
class FindlawCrawler:
|
||||
def __init__(
|
||||
self,
|
||||
max_pages: int = 9999,
|
||||
sleep_seconds: float = 0.1,
|
||||
use_proxy: bool = True,
|
||||
db_connection=None,
|
||||
):
|
||||
self.max_pages = max_pages
|
||||
self.sleep_seconds = max(0.0, sleep_seconds)
|
||||
class FindlawSpider:
|
||||
def __init__(self, db_connection):
|
||||
self.db = db_connection
|
||||
self.client = RequestsClient(
|
||||
headers={
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
||||
"Mobile/15E148 Safari/604.1"
|
||||
),
|
||||
"Accept": "application/json, text/javascript, */*; q=0.01",
|
||||
"X-Requested-With": "XMLHttpRequest",
|
||||
"Connection": "close",
|
||||
},
|
||||
use_proxy=use_proxy,
|
||||
retry_total=2,
|
||||
retry_backoff_factor=1,
|
||||
retry_status_forcelist=(429, 500, 502, 503, 504),
|
||||
retry_allowed_methods=("GET",),
|
||||
)
|
||||
self.session = self._build_session()
|
||||
self.cities = self._load_cities()
|
||||
|
||||
def _get_text(
|
||||
self,
|
||||
url: str,
|
||||
timeout: int = 20,
|
||||
max_retries: int = 3,
|
||||
referer: str = SITE_BASE,
|
||||
) -> str:
|
||||
headers = {"Referer": referer}
|
||||
last_error: Optional[Exception] = None
|
||||
def _build_session(self) -> requests.Session:
|
||||
report_proxy_status()
|
||||
session = requests.Session()
|
||||
session.trust_env = False
|
||||
proxies = get_proxies()
|
||||
if proxies:
|
||||
session.proxies.update(proxies)
|
||||
else:
|
||||
session.proxies.clear()
|
||||
session.headers.update({
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
||||
"Mobile/15E148 Safari/604.1"
|
||||
),
|
||||
"Accept": "application/json, text/javascript, */*; q=0.01",
|
||||
"X-Requested-With": "XMLHttpRequest",
|
||||
"Connection": "close",
|
||||
})
|
||||
return session
|
||||
|
||||
for attempt in range(max_retries):
|
||||
wait_for_request()
|
||||
try:
|
||||
resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
|
||||
code = resp.status_code
|
||||
if code == 403:
|
||||
if attempt < max_retries - 1:
|
||||
self.client.refresh()
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
raise RequestClientError(f"{code} Error: {url}")
|
||||
if code >= 500 and attempt < max_retries - 1:
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
if code >= 400:
|
||||
raise RequestClientError(f"{code} Error: {url}")
|
||||
return resp.text
|
||||
except Exception as exc:
|
||||
last_error = exc
|
||||
if attempt < max_retries - 1:
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
raise
|
||||
|
||||
if last_error is not None:
|
||||
raise last_error
|
||||
raise RequestClientError(f"Unknown request error: {url}")
|
||||
|
||||
def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
|
||||
pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
|
||||
match = re.search(pattern, script_text)
|
||||
if not match:
|
||||
return []
|
||||
raw = match.group(1)
|
||||
def _refresh_session(self) -> None:
|
||||
try:
|
||||
rows = ast.literal_eval(raw)
|
||||
return rows if isinstance(rows, list) else []
|
||||
self.session.close()
|
||||
except Exception:
|
||||
return []
|
||||
pass
|
||||
self.session = self._build_session()
|
||||
|
||||
def discover_cities(self) -> List[CityTarget]:
|
||||
js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
|
||||
provinces = self._parse_city_js_array(js_text, "iosProvinces")
|
||||
cities = self._parse_city_js_array(js_text, "iosCitys")
|
||||
|
||||
province_map: Dict[str, Dict] = {}
|
||||
for item in provinces:
|
||||
pid = str(item.get("id") or "").strip()
|
||||
if pid:
|
||||
province_map[pid] = item
|
||||
|
||||
results: List[CityTarget] = []
|
||||
seen_py: Set[str] = set()
|
||||
for city in cities:
|
||||
city_py = str(city.get("pinyin") or "").strip()
|
||||
city_name = str(city.get("value") or "").strip()
|
||||
city_id = str(city.get("id") or "").strip()
|
||||
province_id = str(city.get("parentId") or "").strip()
|
||||
if not city_py or not city_name or not city_id:
|
||||
continue
|
||||
if city_py in seen_py:
|
||||
continue
|
||||
seen_py.add(city_py)
|
||||
|
||||
province_row = province_map.get(province_id, {})
|
||||
province_name = str(province_row.get("value") or city_name).strip()
|
||||
province_py = str(province_row.get("pinyin") or city_py).strip()
|
||||
|
||||
results.append(
|
||||
CityTarget(
|
||||
province_id=province_id,
|
||||
province_name=province_name,
|
||||
province_py=province_py,
|
||||
city_id=city_id,
|
||||
city_name=city_name,
|
||||
city_py=city_py,
|
||||
)
|
||||
)
|
||||
return results
|
||||
|
||||
def _parse_list_payload(self, text: str) -> Dict:
|
||||
cleaned = (text or "").strip().lstrip("\ufeff")
|
||||
try:
|
||||
return json.loads(cleaned)
|
||||
except ValueError:
|
||||
start = cleaned.find("{")
|
||||
end = cleaned.rfind("}")
|
||||
if start == -1 or end == -1:
|
||||
return {}
|
||||
return json.loads(cleaned[start:end + 1])
|
||||
|
||||
def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
|
||||
list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
|
||||
referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
|
||||
text = self._get_text(list_url, referer=referer)
|
||||
payload = self._parse_list_payload(text)
|
||||
if payload.get("errcode") != 0:
|
||||
return [], False, list_url
|
||||
|
||||
data = payload.get("data", {}) or {}
|
||||
items = data.get("lawyer_list", []) or []
|
||||
has_more = str(data.get("has_more", "0")) == "1"
|
||||
return items, has_more, list_url
|
||||
|
||||
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
|
||||
for page in range(1, self.max_pages + 1):
|
||||
def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
|
||||
headers = {"Referer": referer}
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
items, has_more, list_url = self.fetch_list_page(target.city_py, page)
|
||||
except Exception as exc:
|
||||
print(f"[list] 失败 {target.city_py} p{page}: {exc}")
|
||||
break
|
||||
with request_slot():
|
||||
resp = self.session.get(url, timeout=15, verify=verify, headers=headers)
|
||||
status_code = resp.status_code
|
||||
text = resp.text
|
||||
resp.close()
|
||||
if status_code == 403:
|
||||
if attempt < max_retries - 1:
|
||||
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
||||
self._refresh_session()
|
||||
time.sleep(wait_time)
|
||||
continue
|
||||
print(f"请求失败 {url}: 403 Forbidden")
|
||||
return None
|
||||
if status_code >= 400:
|
||||
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
|
||||
return text
|
||||
except requests.exceptions.SSLError:
|
||||
if verify:
|
||||
return self._get(url, referer, verify=False, max_retries=max_retries)
|
||||
print(f"SSL错误 {url}")
|
||||
return None
|
||||
except requests.exceptions.RequestException as exc:
|
||||
print(f"请求失败 {url}: {exc}")
|
||||
return None
|
||||
return None
|
||||
|
||||
if not items:
|
||||
break
|
||||
|
||||
for item in items:
|
||||
detail_url = item.get("siteask_m") or item.get("site_url") or ""
|
||||
detail_url = str(detail_url).strip()
|
||||
if not detail_url.startswith("http"):
|
||||
detail_url = list_url
|
||||
|
||||
phone = normalize_phone(item.get("mobile", ""))
|
||||
profile = {
|
||||
"uid": str(item.get("uid") or ""),
|
||||
"name": str(item.get("username") or "").strip(),
|
||||
"law_firm": str(item.get("lawyer_lawroom") or "").strip(),
|
||||
"phone": phone,
|
||||
"lawyer_year": item.get("lawyer_year"),
|
||||
"service_area": str(item.get("service_area") or "").strip(),
|
||||
"address": str(item.get("addr") or "").strip(),
|
||||
"specialties": item.get("professionArr") or [],
|
||||
"answer_count": item.get("ansnum"),
|
||||
"comment_count": item.get("askcommentnum"),
|
||||
}
|
||||
|
||||
now = int(time.time())
|
||||
uid = profile.get("uid", "")
|
||||
record_key = uid or detail_url
|
||||
record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()
|
||||
|
||||
area = item.get("areaInfo", {}) or {}
|
||||
yield {
|
||||
"record_id": record_id,
|
||||
"collected_at": now,
|
||||
"source": {
|
||||
"site": SITE_NAME,
|
||||
"list_url": list_url,
|
||||
"detail_url": detail_url,
|
||||
"province": str(area.get("province") or target.province_name),
|
||||
"province_py": target.province_py,
|
||||
"city": str(area.get("city") or target.city_name),
|
||||
"city_py": target.city_py,
|
||||
"page": page,
|
||||
},
|
||||
"list_snapshot": {
|
||||
"uid": uid,
|
||||
"name": profile["name"],
|
||||
"law_firm": profile["law_firm"],
|
||||
"answer_count": profile["answer_count"],
|
||||
"comment_count": profile["comment_count"],
|
||||
},
|
||||
"profile": profile,
|
||||
"raw": item,
|
||||
}
|
||||
if self.sleep_seconds:
|
||||
time.sleep(self.sleep_seconds)
|
||||
|
||||
if not has_more:
|
||||
break
|
||||
|
||||
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
|
||||
source = record.get("source", {}) or {}
|
||||
profile = record.get("profile", {}) or {}
|
||||
phone = normalize_phone(profile.get("phone", ""))
|
||||
if not phone:
|
||||
return None
|
||||
|
||||
province = (source.get("province") or "").strip()
|
||||
city = (source.get("city") or province).strip()
|
||||
return {
|
||||
"name": (profile.get("name") or "").strip(),
|
||||
"law_firm": (profile.get("law_firm") or "").strip(),
|
||||
"province": province,
|
||||
"city": city,
|
||||
"phone": phone,
|
||||
"url": (source.get("detail_url") or source.get("list_url") or "").strip(),
|
||||
"domain": LEGACY_DOMAIN,
|
||||
"create_time": int(record.get("collected_at") or time.time()),
|
||||
"params": json.dumps(record, ensure_ascii=False),
|
||||
}
|
||||
|
||||
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
|
||||
if not self.db or not phones:
|
||||
def _existing_phones(self, phones: List[str]) -> Set[str]:
|
||||
if not phones:
|
||||
return set()
|
||||
deduped = sorted({p for p in phones if p})
|
||||
if not deduped:
|
||||
return set()
|
||||
|
||||
existing: Set[str] = set()
|
||||
cur = self.db.db.cursor()
|
||||
try:
|
||||
chunk_size = 500
|
||||
for i in range(0, len(deduped), chunk_size):
|
||||
chunk = deduped[i:i + chunk_size]
|
||||
for i in range(0, len(phones), chunk_size):
|
||||
chunk = phones[i:i + chunk_size]
|
||||
placeholders = ",".join(["%s"] * len(chunk))
|
||||
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
||||
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
|
||||
cur.execute(sql, [DOMAIN, *chunk])
|
||||
for row in cur.fetchall():
|
||||
existing.add(row[0])
|
||||
finally:
|
||||
cur.close()
|
||||
return existing
|
||||
|
||||
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
|
||||
if not self.db:
|
||||
return 0, 0
|
||||
|
||||
rows: List[Dict[str, str]] = []
|
||||
for record in records:
|
||||
row = self._to_legacy_lawyer_row(record)
|
||||
if row:
|
||||
rows.append(row)
|
||||
if not rows:
|
||||
return 0, 0
|
||||
|
||||
existing = self._existing_phones_in_db([row["phone"] for row in rows])
|
||||
inserted = 0
|
||||
skipped = 0
|
||||
for row in rows:
|
||||
phone = row.get("phone", "")
|
||||
if not phone or phone in existing:
|
||||
skipped += 1
|
||||
continue
|
||||
def _load_cities(self):
|
||||
condition = "domain='findlaw' AND level=2"
|
||||
tables = ("area_new", "area2", "area")
|
||||
last_error = None
|
||||
for table in tables:
|
||||
try:
|
||||
self.db.insert_data("lawyer", row)
|
||||
existing.add(phone)
|
||||
inserted += 1
|
||||
rows = self.db.select_data(table, "city, province, pinyin", condition) or []
|
||||
except Exception as exc:
|
||||
skipped += 1
|
||||
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
|
||||
return inserted, skipped
|
||||
last_error = exc
|
||||
continue
|
||||
if rows:
|
||||
missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
|
||||
print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
|
||||
return rows
|
||||
|
||||
def crawl(
|
||||
self,
|
||||
output_path: str,
|
||||
max_cities: int = 0,
|
||||
city_filter: Optional[str] = None,
|
||||
) -> None:
|
||||
cities = self.discover_cities()
|
||||
print(f"[discover] 共发现城市 {len(cities)} 个")
|
||||
if city_filter:
|
||||
key = city_filter.strip().lower()
|
||||
cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
|
||||
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
|
||||
if max_cities > 0:
|
||||
cities = cities[:max_cities]
|
||||
print(f"[discover] 截断城市数 {len(cities)}")
|
||||
if last_error:
|
||||
print(f"[找法网] 加载地区数据失败: {last_error}")
|
||||
print("[找法网] 无城市数据(已尝试 area_new/area2/area)")
|
||||
for table in tables:
|
||||
try:
|
||||
cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
|
||||
c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
|
||||
print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
|
||||
except Exception:
|
||||
pass
|
||||
return []
|
||||
|
||||
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
|
||||
def _fetch_page(self, url: str, referer: str) -> List[Dict]:
|
||||
text = self._get(url, referer, verify=True)
|
||||
if not text:
|
||||
return []
|
||||
|
||||
seen_ids: Set[str] = set()
|
||||
if os.path.exists(output_path):
|
||||
with open(output_path, "r", encoding="utf-8") as old_file:
|
||||
for line in old_file:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
try:
|
||||
# 某些返回体前会携带 BOM 或包装脚本,此处做兼容
|
||||
text = text.strip().lstrip("\ufeff")
|
||||
try:
|
||||
data = json.loads(text)
|
||||
except ValueError:
|
||||
json_start = text.find('{')
|
||||
json_end = text.rfind('}')
|
||||
if json_start == -1 or json_end == -1:
|
||||
print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
|
||||
return []
|
||||
cleaned = text[json_start:json_end + 1]
|
||||
data = json.loads(cleaned)
|
||||
if isinstance(data, str):
|
||||
try:
|
||||
data = json.loads(data)
|
||||
except ValueError:
|
||||
print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}")
|
||||
return []
|
||||
except ValueError as exc:
|
||||
print(f"解析JSON失败 {url}: {exc}")
|
||||
return []
|
||||
|
||||
items = data.get("data", {}).get("lawyer_list", [])
|
||||
parsed = []
|
||||
for item in items:
|
||||
phone = (item.get("mobile") or "").replace("-", "")
|
||||
parsed.append({
|
||||
"name": item.get("username", ""),
|
||||
"law_firm": item.get("lawyer_lawroom", ""),
|
||||
"province": item.get("areaInfo", {}).get("province", ""),
|
||||
"city": item.get("areaInfo", {}).get("city", ""),
|
||||
"phone": phone,
|
||||
"url": url,
|
||||
"domain": DOMAIN,
|
||||
"create_time": int(time.time()),
|
||||
"params": json.dumps(item, ensure_ascii=False)
|
||||
})
|
||||
return parsed
|
||||
|
||||
def run(self):
|
||||
print("启动找法网采集...")
|
||||
if not self.cities:
|
||||
print("无城市数据")
|
||||
return
|
||||
|
||||
for city in self.cities:
|
||||
pinyin = city.get("pinyin")
|
||||
province = city.get("province", "")
|
||||
city_name = city.get("city", "")
|
||||
if not pinyin:
|
||||
continue
|
||||
print(f"采集 {province}-{city_name}")
|
||||
page = 1
|
||||
while True:
|
||||
url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
|
||||
referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
|
||||
print(f" 第 {page} 页: {url}")
|
||||
items = self._fetch_page(url, referer)
|
||||
if not items:
|
||||
break
|
||||
|
||||
phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
|
||||
existing = self._existing_phones(phones)
|
||||
|
||||
for entry in items:
|
||||
phone = entry.get("phone")
|
||||
if not phone:
|
||||
continue
|
||||
if phone in existing:
|
||||
print(f" -- 已存在: {entry['name']} ({phone})")
|
||||
continue
|
||||
try:
|
||||
item = json.loads(line)
|
||||
except Exception:
|
||||
continue
|
||||
rid = item.get("record_id")
|
||||
if rid:
|
||||
seen_ids.add(rid)
|
||||
print(f"[resume] 已有记录 {len(seen_ids)} 条")
|
||||
self.db.insert_data("lawyer", entry)
|
||||
print(f" -> 新增: {entry['name']} ({phone})")
|
||||
except Exception as exc:
|
||||
print(f" 插入失败: {exc}")
|
||||
|
||||
total_new_json = 0
|
||||
total_new_db = 0
|
||||
total_skip_db = 0
|
||||
page += 1
|
||||
|
||||
with open(output_path, "a", encoding="utf-8") as out:
|
||||
for idx, target in enumerate(cities, start=1):
|
||||
print(
|
||||
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
|
||||
f"({target.city_py})"
|
||||
)
|
||||
city_records = list(self.crawl_city(target))
|
||||
|
||||
city_new_json = 0
|
||||
for record in city_records:
|
||||
rid = record["record_id"]
|
||||
if rid in seen_ids:
|
||||
continue
|
||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
||||
seen_ids.add(rid)
|
||||
city_new_json += 1
|
||||
total_new_json += 1
|
||||
|
||||
city_new_db, city_skip_db = self._write_records_to_db(city_records)
|
||||
total_new_db += city_new_db
|
||||
total_skip_db += city_skip_db
|
||||
print(
|
||||
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
|
||||
f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
|
||||
)
|
||||
|
||||
print(
|
||||
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
|
||||
f"DB跳过{total_skip_db}条, 输出: {output_path}"
|
||||
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
|
||||
help="输出 jsonl 文件路径",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-cities",
|
||||
type=int,
|
||||
default=0,
|
||||
help="最多采集多少个城市,0 表示不限",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-pages",
|
||||
type=int,
|
||||
default=9999,
|
||||
help="每个城市最多采集多少页",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--city-filter",
|
||||
default="",
|
||||
help="按城市拼音或城市名过滤,如 beijing",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sleep",
|
||||
type=float,
|
||||
default=0.1,
|
||||
help="每条记录采集间隔秒数",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--direct",
|
||||
action="store_true",
|
||||
help="直连模式,不使用 proxy_settings.json 代理",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-db",
|
||||
action="store_true",
|
||||
help="只输出 JSONL,不写入数据库",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
if args.no_db:
|
||||
crawler = FindlawCrawler(
|
||||
max_pages=args.max_pages,
|
||||
sleep_seconds=args.sleep,
|
||||
use_proxy=not args.direct,
|
||||
db_connection=None,
|
||||
)
|
||||
crawler.crawl(
|
||||
output_path=args.output,
|
||||
max_cities=args.max_cities,
|
||||
city_filter=args.city_filter or None,
|
||||
)
|
||||
return
|
||||
|
||||
with Db() as db:
|
||||
crawler = FindlawCrawler(
|
||||
max_pages=args.max_pages,
|
||||
sleep_seconds=args.sleep,
|
||||
use_proxy=not args.direct,
|
||||
db_connection=db,
|
||||
)
|
||||
crawler.crawl(
|
||||
output_path=args.output,
|
||||
max_cities=args.max_cities,
|
||||
city_filter=args.city_filter or None,
|
||||
)
|
||||
print("找法网采集完成")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
with Db() as db:
|
||||
spider = FindlawSpider(db)
|
||||
spider.run()
|
||||
|
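Both rewritten spiders share the same 403 recovery loop: exponential backoff with jitter, plus a full session rebuild so the retry goes out through a freshly drawn proxy. A condensed sketch of that pattern, with `session_factory` standing in for `_build_session`:

```python
import random
import time

import requests


def get_with_backoff(session_factory, url: str, max_retries: int = 3) -> str:
    """Fetch `url`, rebuilding the session (and hence the proxy) on 403."""
    session = session_factory()
    for attempt in range(max_retries):
        try:
            resp = session.get(url, timeout=15)
            if resp.status_code == 403 and attempt < max_retries - 1:
                # Blocked: back off with jitter, then start over on a new session.
                time.sleep(2 ** attempt + random.uniform(0.3, 1.0))
                session.close()
                session = session_factory()
                continue
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt + random.uniform(0.3, 1.0))
    raise requests.RequestException(f"exhausted retries: {url}")
```

The production versions additionally gate each attempt through utils.rate_limiter, which this sketch omits.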
+291
-606
@@ -1,18 +1,10 @@
import argparse
import ast
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin

import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -22,638 +14,331 @@ if request_dir not in sys.path:
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status

from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
from config import HEADERS
from utils.rate_limiter import request_slot

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SITE_NAME = "hualv"
LEGACY_DOMAIN = "华律"
SITE_BASE = "https://m.66law.cn"
CITY_DATA_URL = "https://cache.66law.cn/dist/main-v2.0.js"
LIST_API_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"

PHONE_RE = re.compile(r"1[3-9]\d{9}")
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
DOMAIN = "华律"


@dataclass
class CityTarget:
    province_id: int
    province_name: str
    city_id: int
    city_name: str


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def strip_html_tags(text: str) -> str:
    return re.sub(r"<[^>]+>", "", text or "").strip()


class HualvCrawler:
    def __init__(
        self,
        max_pages: int = 9999,
        sleep_seconds: float = 0.15,
        use_proxy: bool = True,
        db_connection=None,
    ):
        self.max_pages = max_pages
        self.sleep_seconds = max(0.0, sleep_seconds)
class HualvSpider:
    def __init__(self, db_connection):
        self.db = db_connection
        self.client = RequestsClient(
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                    "Mobile/15E148 Safari/604.1"
                ),
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Connection": "close",
            },
            use_proxy=use_proxy,
            retry_total=2,
            retry_backoff_factor=1,
            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET", "POST"),
        self.session = self._build_session()
        self.areas = self._load_areas()

    def _build_session(self) -> requests.Session:
        report_proxy_status()
        session = requests.Session()
        session.trust_env = False
        proxies = get_proxies()
        if proxies:
            session.proxies.update(proxies)
        else:
            session.proxies.clear()
        custom_headers = HEADERS.copy()
        custom_headers['User-Agent'] = (
            'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
            'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
            'Mobile/15E148 Safari/604.1'
        )
        custom_headers["Connection"] = "close"
        session.headers.update(custom_headers)
        return session

    def _request_text(
        self,
        method: str,
        url: str,
        *,
        timeout: int = 20,
        max_retries: int = 3,
        referer: str = SITE_BASE,
        data: Optional[Dict] = None,
    ) -> str:
        headers = {"Referer": referer}
        last_error: Optional[Exception] = None
    def _refresh_session(self) -> None:
        try:
            self.session.close()
        except Exception:
            pass
        self.session = self._build_session()

        for attempt in range(max_retries):
            wait_for_request()
    def _load_areas(self):
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                if method.upper() == "POST":
                    resp = self.client.post_text(
                        url,
                        timeout=timeout,
                        verify=False,
                        headers=headers,
                        data=data,
                    )
                else:
                    resp = self.client.get_text(
                        url,
                        timeout=timeout,
                        verify=False,
                        headers=headers,
                    )

                code = resp.status_code
                if code == 403:
                    if attempt < max_retries - 1:
                        self.client.refresh()
                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                        continue
                    raise RequestClientError(f"{code} Error: {url}")
                if code >= 500 and attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                if code >= 400:
                    raise RequestClientError(f"{code} Error: {url}")
                return resp.text
                provinces = self.db.select_data(
                    table,
                    "code, province, pinyin, id",
                    "domain='66law' AND level=1"
                ) or []
                cities = self.db.select_data(
                    table,
                    "code, city, province, pid",
                    "domain='66law' AND level=2"
                ) or []
            except Exception as exc:
                last_error = exc
                if attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                raise

        if last_error is not None:
            raise last_error
        raise RequestClientError(f"Unknown request error: {url}")

    def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
        return self._request_text(
            "GET",
            url,
            timeout=timeout,
            max_retries=max_retries,
            referer=referer,
        )

    def _post_text(
        self,
        url: str,
        *,
        data: Dict,
        timeout: int = 20,
        max_retries: int = 3,
        referer: str = SITE_BASE,
    ) -> str:
        return self._request_text(
            "POST",
            url,
            timeout=timeout,
            max_retries=max_retries,
            referer=referer,
            data=data,
        )

    def _extract_spc_location(self, script_text: str) -> List:
        # main-v2.js inlines sPCLocation=new Array(...), immediately followed by the cateinfo array
        marker = "sPCLocation = new Array("
        start = script_text.find(marker)
        if start == -1:
            marker = "sPCLocation=new Array("
            start = script_text.find(marker)
            if start == -1:
                return []
        start += len(marker)

        next_marker = script_text.find("cateinfo = new Array(", start)
        if next_marker == -1:
            next_marker = script_text.find("cateinfo=new Array(", start)

        if next_marker != -1:
            end = script_text.rfind(");", start, next_marker)
        else:
            end = script_text.find(");", start)

        if end == -1 or end <= start:
            return []

        raw = "[" + script_text[start:end] + "]"
        try:
            data = ast.literal_eval(raw)
        except Exception:
            return []
        return data if isinstance(data, list) else []

    def discover_cities(self) -> List[CityTarget]:
        script_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyer/")
        rows = self._extract_spc_location(script_text)

        targets: List[CityTarget] = []
        seen: Set[Tuple[int, int]] = set()

        for province in rows:
            if not isinstance(province, list) or len(province) < 3:
                continue
            try:
                province_id = int(province[0])
            except Exception:

            if not cities:
                continue
            province_name = str(province[1] or "").strip()
            city_rows = province[2] if isinstance(province[2], list) else []

            for city in city_rows:
                if not isinstance(city, list) or len(city) < 2:
                    continue
                try:
                    city_id = int(city[0])
                except Exception:
                    continue
                city_name = str(city[1] or "").strip()
                if city_id <= 0 or not city_name:
                    continue

                key = (province_id, city_id)
                if key in seen:
                    continue
                seen.add(key)

                targets.append(
                    CityTarget(
                        province_id=province_id,
                        province_name=province_name,
                        city_id=city_id,
                        city_name=city_name,
                    )
                )
        return targets

    def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[Dict], int]:
        payload = {
            "pid": str(target.province_id),
            "cid": str(target.city_id),
            "page": str(page),
        }
        text = self._post_text(
            LIST_API_URL,
            data=payload,
            referer=SITE_BASE + "/findlawyer/",
        )
        data = json.loads((text or "").strip().lstrip("\ufeff") or "{}")
        items = data.get("lawyerList") or data.get("queryLawyerList") or []
        if not isinstance(items, list):
            items = []

        page_count = 0
        try:
            page_count = int((data.get("lawyerItems") or {}).get("pageCount") or 0)
        except Exception:
            page_count = 0
        return items, page_count

    def parse_detail(self, detail_url: str) -> Dict:
        contact_url = detail_url.rstrip("/") + "/lawyer_contact.aspx"
        html = self._get_text(contact_url, referer=detail_url)
        soup = BeautifulSoup(html, "html.parser")
        full_text = soup.get_text(" ", strip=True)

        name = ""
        law_firm = ""
        phone = ""
        email = ""
        address = ""
        license_no = ""
        practice_years: Optional[int] = None

        name_tag = soup.select_one(".logo-box .title b")
        if name_tag:
            name = name_tag.get_text(strip=True).replace("律师", "").strip()
        if not name and soup.title:
            match = re.search(r"([^\s,,。_]+?)律师", soup.title.get_text(" ", strip=True))
            if match:
                name = match.group(1).strip()

        phone_candidates = [
            soup.select_one(".logo-box .r-bar .tel").get_text(" ", strip=True)
            if soup.select_one(".logo-box .r-bar .tel")
            else "",
            soup.select_one(".lawyer-show ul.info").get_text(" ", strip=True)
            if soup.select_one(".lawyer-show ul.info")
            else "",
            full_text,
        ]
        for candidate in phone_candidates:
            phone = normalize_phone(candidate)
            if phone:
                break

        for li in soup.select(".lawyer-show ul.info li"):
            li_text = li.get_text(" ", strip=True)
            if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
                law_firm = li_text

        if not law_firm:
            match = re.search(r'"affiliation":\{"@type":"Organization","name":"([^"]+)"', html)
            if match:
                law_firm = match.group(1).strip()

        match = re.search(r'"identifier":"([^"]+)"', html)
        if match:
            license_no = match.group(1).strip()

        match = re.search(r'"streetAddress":"([^"]+)"', html)
        if match:
            address = match.group(1).strip()

        email_match = EMAIL_RE.search(html)
        if email_match:
            email = email_match.group(0).strip()

        year_match = YEAR_RE.search(full_text)
        if year_match:
            try:
                practice_years = int(year_match.group(1))
            except Exception:
                practice_years = None

        specialties = [node.get_text(strip=True) for node in soup.select(".tag-h38 span")]
        specialties = [x for x in specialties if x]

        return {
            "name": name,
            "law_firm": law_firm,
            "phone": phone,
            "email": email,
            "address": address,
            "license_no": license_no,
            "practice_years": practice_years,
            "specialties": specialties,
            "detail_url": detail_url,
            "contact_url": contact_url,
        }

    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
        seen_details: Set[str] = set()

        for page in range(1, self.max_pages + 1):
            try:
                items, page_count = self.fetch_list_page(target, page)
            except Exception as exc:
                print(f"[list] 失败 pid={target.province_id} cid={target.city_id} p{page}: {exc}")
                break

            if not items:
                break

            for item in items:
                detail_url = str(item.get("lawyerUrl") or "").strip()
                if not detail_url:
                    continue
                if detail_url.startswith("//"):
                    detail_url = "https:" + detail_url
                if not detail_url.startswith("http"):
                    detail_url = urljoin(SITE_BASE, detail_url)

                if detail_url in seen_details:
                    continue
                seen_details.add(detail_url)

                try:
                    detail = self.parse_detail(detail_url)
                except Exception as exc:
                    print(f"[detail] 失败 {detail_url}: {exc}")
                    continue

                now = int(time.time())
                uid = str(item.get("lawyerId") or item.get("globalUserId") or detail_url)
                record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()

                list_name = str(item.get("name") or "").replace("律师", "").strip()
                category_text = str(item.get("categoryNames") or "").strip()
                category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()]

                yield {
                    "record_id": record_id,
                    "collected_at": now,
                    "source": {
                        "site": SITE_NAME,
                        "province_id": target.province_id,
                        "province": target.province_name,
                        "city_id": target.city_id,
                        "city": target.city_name,
                        "page": page,
                        "detail_url": detail_url,
                        "contact_url": detail.get("contact_url", ""),
                    },
                    "list_snapshot": {
                        "lawyer_id": item.get("lawyerId"),
                        "name": list_name,
                        "category_names": category_arr,
                        "help_count": strip_html_tags(str(item.get("helpCount") or "")),
                        "comment_score": strip_html_tags(str(item.get("commentScore") or "")),
                        "response_time": str(item.get("responseTime") or "").strip(),
                        "year": item.get("year"),
                        "is_adv": bool(item.get("isAdv")),
                    },
                    "profile": {
                        "name": detail.get("name") or list_name,
                        "law_firm": detail.get("law_firm") or "",
                        "phone": detail.get("phone") or "",
                        "email": detail.get("email") or "",
                        "address": detail.get("address") or "",
                        "license_no": detail.get("license_no") or "",
                        "practice_years": detail.get("practice_years"),
                        "specialties": detail.get("specialties") or category_arr,
                    },
                    "raw": item,
        province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces}
        city_map = {}
        for city in cities:
            province_info = province_map.get(city.get('pid'), {}) or {}
            province_code = province_info.get('code')
            city_map[city.get('code')] = {
                "name": city.get('city'),
                "province": city.get('province'),
                "province_code": province_code,
            }
            print(f"[华律] 城市来源表: {table}, 城市数: {len(cities)}")
            return city_map

                if self.sleep_seconds:
                    time.sleep(self.sleep_seconds)
        if last_error:
            print(f"[华律] 加载地区数据失败: {last_error}")
        print("[华律] 无城市数据(已尝试 area_new/area2/area)")
        return {}

            if page_count > 0 and page >= page_count:
                break
    def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
        for attempt in range(max_retries):
            try:
                with request_slot():
                    resp = self.session.post(LIST_URL, data=data, timeout=20, verify=False)
                status_code = resp.status_code
                text = resp.text
                resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print("请求失败: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error")
                try:
                    return json.loads(text)
                except ValueError as exc:
                    print(f"解析JSON失败: {exc}")
                    return None
            except requests.exceptions.RequestException as exc:
                print(f"请求失败: {exc}")
                return None
        return None

    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
        source = record.get("source", {}) or {}
        profile = record.get("profile", {}) or {}
    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
        contact_url = f"{url}lawyer_contact.aspx"
        print(f" 详情: {contact_url}")
        existing = self.db.select_data(
            "lawyer",
            "id, avatar_url",
            f"domain='{DOMAIN}' AND url='{contact_url}'"
        )
        existing_id = None
        if existing:
            existing_id = existing[0].get("id")
            avatar = (existing[0].get("avatar_url") or "").strip()
            if avatar:
                print(" -- 已存在且头像已补全,跳过")
                return None

        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
        html = self._get_detail(contact_url)
        if not html:
            return None

        province = (source.get("province") or "").strip()
        city = (source.get("city") or province).strip()
        return {
            "name": (profile.get("name") or "").strip(),
            "law_firm": (profile.get("law_firm") or "").strip(),
        soup = BeautifulSoup(html, "html.parser")
        info_list = soup.find("ul", class_="information-list")
        if not info_list:
            return None

        phone = ""
        law_firm = ""
        for li in info_list.find_all("li"):
            text = li.get_text(strip=True)
            if "手机号" in text:
                cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
                match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
                if match:
                    phone = match.group(0)
            if "执业单位" in text:
                law_firm = text.replace("执业单位", "").strip()

        name = ""
        breadcrumb = soup.find("div", class_="weizhi")
        if breadcrumb:
            links = breadcrumb.find_all("a")
            if len(links) > 2:
                name = links[2].get_text(strip=True)

        phone = phone.replace('-', '').strip()
        if not phone or not re.fullmatch(r"1\d{10}", phone):
            print(" 无手机号,跳过")
            return None

        avatar_url, site_time = self._extract_avatar_and_time(soup)
        data = {
            "phone": phone,
            "province": province,
            "city": city,
            "phone": phone,
            "url": (source.get("contact_url") or source.get("detail_url") or "").strip(),
            "domain": LEGACY_DOMAIN,
            "create_time": int(record.get("collected_at") or time.time()),
            "params": json.dumps(record, ensure_ascii=False),
            "law_firm": law_firm,
            "url": contact_url,
            "avatar_url": avatar_url,
            "create_time": int(time.time()),
            "site_time": site_time,
            "domain": DOMAIN,
            "name": name,
            "params": json.dumps({"source": url}, ensure_ascii=False)
        }

    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
        if not self.db or not phones:
            return set()

        deduped = sorted({p for p in phones if p})
        if not deduped:
            return set()

        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(deduped), chunk_size):
                chunk = deduped[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()

        return existing

    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
        if not self.db:
            return 0, 0

        rows: List[Dict[str, str]] = []
        for record in records:
            row = self._to_legacy_lawyer_row(record)
            if row:
                rows.append(row)
        if not rows:
            return 0, 0

        existing = self._existing_phones_in_db([row["phone"] for row in rows])
        inserted = 0
        skipped = 0

        for row in rows:
            phone = row.get("phone", "")
            if not phone or phone in existing:
                skipped += 1
                continue
        if existing_id:
            update_data = {
                "avatar_url": avatar_url,
                "site_time": site_time,
            }
            if name:
                update_data["name"] = name
            if law_firm:
                update_data["law_firm"] = law_firm
            if province:
                update_data["province"] = province
            if city:
                update_data["city"] = city
            if phone:
                update_data["phone"] = phone
            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
            try:
                self.db.insert_data("lawyer", row)
                existing.add(phone)
                inserted += 1
                self.db.update_data("lawyer", update_data, f"id={existing_id}")
                print(" -- 已存在,已补全头像/时间")
            except Exception as exc:
                skipped += 1
                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
                print(f" 更新失败: {exc}")
            return None
        # If the phone number already exists, update avatar/time instead of inserting a new record
        existing_phone = self.db.select_data(
            "lawyer",
            "id, avatar_url, url",
            f"domain='{DOMAIN}' AND phone='{phone}'"
        )
        if existing_phone:
            existing_row = existing_phone[0]
            avatar = (existing_row.get("avatar_url") or "").strip()
            if avatar:
                print(" -- 已存在手机号且头像已补全,跳过")
                return None
            update_data = {
                "avatar_url": avatar_url,
                "site_time": site_time,
            }
            if name:
                update_data["name"] = name
            if law_firm:
                update_data["law_firm"] = law_firm
            if province:
                update_data["province"] = province
            if city:
                update_data["city"] = city
            if phone:
                update_data["phone"] = phone
            if not existing_row.get("url"):
                update_data["url"] = contact_url
            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
            try:
                self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
                print(" -- 已存在手机号,已补全头像/时间")
            except Exception as exc:
                print(f" 更新失败: {exc}")
            return None
        return data

        return inserted, skipped
    def _extract_avatar_and_time(self, soup: BeautifulSoup) -> (str, Optional[int]):
        avatar_url = ""
        site_time = None
        img_tag = soup.select_one(
            "div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
        )
        if img_tag:
            src = (img_tag.get("src") or "").strip()
            if src:
                if src.startswith("//"):
                    avatar_url = f"https:{src}"
                else:
                    avatar_url = src
                match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
                if match:
                    site_time = int(f"{match.group(1)}{match.group(2)}")
                else:
                    match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
                    if match:
                        site_time = int(f"{match.group(1)}{match.group(2)}")
        return avatar_url, site_time

    def crawl(
        self,
        output_path: str,
        max_cities: int = 0,
        city_filter: Optional[str] = None,
    ) -> None:
        cities = self.discover_cities()
        print(f"[discover] 共发现城市 {len(cities)} 个")
    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
        for attempt in range(max_retries):
            try:
                with request_slot():
                    resp = self.session.get(url, timeout=15, verify=False)
                status_code = resp.status_code
                text = resp.text
                resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(" 请求失败: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error")
                return text
            except requests.exceptions.RequestException as exc:
                print(f" 请求失败: {exc}")
                return None
        return None

        if city_filter:
            key = city_filter.strip().lower()
            cities = [
                c for c in cities
                if key in c.city_name.lower() or key in str(c.city_id).lower()
            ]
            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
    def run(self):
        print("启动华律网采集...")
        if not self.areas:
            print("无城市数据")
            return

        if max_cities > 0:
            cities = cities[:max_cities]
            print(f"[discover] 截断城市数 {len(cities)}")
        for city_code, city_info in self.areas.items():
            province_code = city_info.get("province_code")
            if not province_code:
                continue
            province_name = city_info.get("province", "")
            city_name = city_info.get("name", "")
            print(f"采集 {province_name}-{city_name}")

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
            page = 1
            while True:
                payload = {"pid": province_code, "cid": city_code, "page": str(page)}
                data = self._post(payload)
                if not data or not data.get("lawyerList"):
                    break

        seen_ids: Set[str] = set()
        if os.path.exists(output_path):
            with open(output_path, "r", encoding="utf-8") as old_file:
                for line in old_file:
                    line = line.strip()
                    if not line:
                for item in data["lawyerList"]:
                    result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
                    if not result:
                        continue
                    try:
                        item = json.loads(line)
                    except Exception:
                        continue
                    rid = item.get("record_id")
                    if rid:
                        seen_ids.add(rid)
        print(f"[resume] 已有记录 {len(seen_ids)} 条")
                        self.db.insert_data("lawyer", result)
                        print(f" -> 新增: {result['name']} ({result['phone']})")
                    except Exception as exc:
                        print(f" 插入失败: {exc}")
                    time.sleep(1)

        total_new_json = 0
        total_new_db = 0
        total_skip_db = 0
                page_count = data.get("lawyerItems", {}).get("pageCount", page)
                if page >= page_count:
                    break
                page += 1
                time.sleep(2)

        with open(output_path, "a", encoding="utf-8") as out:
            for idx, target in enumerate(cities, start=1):
                print(
                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
                    f"(pid={target.province_id}, cid={target.city_id})"
                )
                city_records = list(self.crawl_city(target))

                city_new_json = 0
                for record in city_records:
                    rid = record["record_id"]
                    if rid in seen_ids:
                        continue
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                    seen_ids.add(rid)
                    city_new_json += 1
                    total_new_json += 1

                city_new_db, city_skip_db = self._write_records_to_db(city_records)
                total_new_db += city_new_db
                total_skip_db += city_skip_db

                print(
                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
                )

        print(
            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
            f"DB跳过{total_skip_db}条, 输出: {output_path}"
        )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="华律网全新采集脚本(站点数据直采)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/hualv_records_all.jsonl",
        help="输出 jsonl 文件路径",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="最多采集多少个城市,0 表示不限",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="每个城市最多采集多少页",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="按城市名称或城市编码过滤,如 beijing / 110100",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.15,
        help="详情页请求间隔秒数",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="直连模式,不使用 proxy_settings.json 代理",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="只输出 JSONL,不写入数据库",
    )
    return parser.parse_args()


def main():
    args = parse_args()

    if args.no_db:
        crawler = HualvCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=None,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        return

    with Db() as db:
        crawler = HualvCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
    time.sleep(1)
    print("华律网采集完成")


if __name__ == "__main__":
    main()
    with Db() as db:
        spider = HualvSpider(db)
        spider.run()

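`_extract_avatar_and_time` above folds the avatar's CDN path into a YYYYMM `site_time` stamp. The same logic as a standalone helper, with an illustrative (made-up) URL in the check:

```python
import re
from typing import Optional


def site_time_from_avatar(avatar_url: str) -> Optional[int]:
    """Derive a YYYYMM integer from an avatar URL.

    Prefer a /YYYYMM/ path segment, then fall back to any YYYYMMDD run,
    mirroring HualvSpider._extract_avatar_and_time.
    """
    match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
    if not match:
        match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
    if match:
        return int(f"{match.group(1)}{match.group(2)}")
    return None


# Hypothetical URL, shaped like the site's avatar paths:
assert site_time_from_avatar("https://example.com/upload/202403/15/a.jpg") == 202403
```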
||||
+238
-586
@@ -1,16 +1,13 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Set, Tuple

import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional, List, Set
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -20,628 +17,283 @@ if request_dir not in sys.path:
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
import requests
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SITE_NAME = "lawtime"
LEGACY_DOMAIN = "法律快车"
SITE_BASE = "https://www.lawtime.cn"
PROVINCE_API = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode=0"
CITY_API_TEMPLATE = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode={province_id}"
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/lawyer"
from Db import Db
from config import LAWTIME_CONFIG
from utils.rate_limiter import request_slot

PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
DETAIL_BASE = "https://m.lawtime.cn"
DOMAIN = "法律快车"


@dataclass
class CityTarget:
    province_id: str
    province_name: str
    province_py: str
    city_id: str
    city_name: str
    city_py: str


@dataclass
class ListCard:
    detail_url: str
    name: str
    phone: str
    address: str = ""
    specialties: List[str] = field(default_factory=list)
    metric_text: str = ""


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""

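`normalize_phone` strips every non-digit before matching, so numbers hyphenated or spaced for display still collapse to one canonical form, while landline numbers fail the mobile pattern and come back empty. A quick illustrative check (the sample inputs are made up):

```python
import re

PHONE_RE = re.compile(r"1[3-9]\d{9}")

def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""

# Display formats collapse to the same canonical number.
assert normalize_phone("138-0013-8000") == "13800138000"
assert normalize_phone("电话:138 0013 8000") == "13800138000"
# Landlines and empty input yield "".
assert normalize_phone("010-12345678") == ""
assert normalize_phone(None) == ""
```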
class LawtimeCrawler:
    def __init__(
        self,
        max_pages: int = 9999,
        sleep_seconds: float = 0.1,
        use_proxy: bool = True,
        db_connection=None,
    ):
        self.max_pages = max_pages
        self.sleep_seconds = max(0.0, sleep_seconds)
class LawtimeSpider:
    def __init__(self, db_connection):
        self.db = db_connection
        self.client = RequestsClient(
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/122.0.0.0 Safari/537.36"
                ),
                "Accept": "text/html,application/json,*/*;q=0.8",
                "Connection": "close",
            },
            use_proxy=use_proxy,
            retry_total=2,
            retry_backoff_factor=1,
            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET",),
        )
        self.session = self._build_session()
        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
        self._tls = threading.local()

    def _get_text(
        self,
        url: str,
        *,
        timeout: int = 20,
        max_retries: int = 3,
        referer: str = SITE_BASE,
    ) -> str:
        headers = {"Referer": referer}
        last_error: Optional[Exception] = None
    def _build_session(self) -> requests.Session:
        report_proxy_status()
        session = requests.Session()
        session.trust_env = False
        proxies = get_proxies()
        if proxies:
            session.proxies.update(proxies)
        else:
            session.proxies.clear()
        headers = LAWTIME_CONFIG.get("HEADERS", {})
        if headers:
            session.headers.update(headers)
        session.headers.setdefault("Connection", "close")
        return session

        for attempt in range(max_retries):
            wait_for_request()
            try:
                resp = self.client.get_text(
                    url,
                    timeout=timeout,
                    verify=False,
                    headers=headers,
                )
                code = resp.status_code
                if code == 403:
                    if attempt < max_retries - 1:
                        self.client.refresh()
                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                        continue
                    raise RequestClientError(f"{code} Error: {url}")
                if code >= 500 and attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                if code >= 400:
                    raise RequestClientError(f"{code} Error: {url}")
                return resp.text
            except Exception as exc:
                last_error = exc
                if attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                raise

        if last_error is not None:
            raise last_error
        raise RequestClientError(f"Unknown request error: {url}")
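Both request paths above retry 403s and 5xx responses with exponential backoff plus jitter, and rebuild the session on 403 so the next attempt can go out through a fresh proxy. A condensed sketch of that policy, assuming a `build_session()` factory like the ones in this diff (the function itself is illustrative, not the repo's API):

```python
import random
import time

import requests

def fetch_with_backoff(url: str, build_session, max_retries: int = 3) -> str:
    """GET with exponential backoff; rebuild the session on 403 to rotate proxies."""
    session = build_session()
    for attempt in range(max_retries):
        try:
            resp = session.get(url, timeout=15)
            if resp.status_code == 403 and attempt < max_retries - 1:
                session.close()
                session = build_session()  # new session -> potentially new proxy IP
            elif resp.status_code >= 400:
                resp.raise_for_status()
            else:
                return resp.text
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
        # 1s, 2s, 4s... plus jitter so parallel workers don't retry in lockstep
        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
    raise RuntimeError(f"exhausted retries: {url}")
```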
    def _get_json(self, url: str, *, referer: str) -> List[Dict]:
        text = self._get_text(url, referer=referer)
        cleaned = (text or "").strip().lstrip("\ufeff")
        if not cleaned or cleaned.startswith("<"):
            return []
    def _refresh_session(self) -> None:
        try:
            data = json.loads(cleaned)
        except ValueError:
            return []
        return data if isinstance(data, list) else []
            self.session.close()
        except Exception:
            pass
        self.session = self._build_session()

    def discover_cities(self) -> List[CityTarget]:
        provinces = self._get_json(PROVINCE_API, referer=SITE_BASE)
        if not provinces:
            print("[discover] 地区接口未返回有效数据")
            return []
    def _get_thread_session(self) -> requests.Session:
        s = getattr(self._tls, "session", None)
        if s is not None:
            return s
        s = self._build_session()
        s.headers.update(dict(self.session.headers))
        self._tls.session = s
        return s

        results: List[CityTarget] = []
        seen_py: Set[str] = set()

        for province in provinces:
            province_id = str(province.get("id") or "").strip()
            province_name = str(province.get("province") or province.get("city") or "").strip()
            province_py = str(province.get("pinyin") or "").strip()
            if not province_id or not province_name:
                continue

            city_api = CITY_API_TEMPLATE.format(province_id=province_id)
    def _refresh_thread_session(self) -> None:
        s = getattr(self._tls, "session", None)
        if s is not None:
            try:
                cities = self._get_json(city_api, referer=LIST_URL_TEMPLATE.format(city_py=province_py or ""))
            except Exception as exc:
                print(f"[city] 获取失败 province={province_id}: {exc}")
                continue

            if not cities:
                cities = [
                    {
                        "id": province_id,
                        "province": province_name,
                        "city": province_name,
                        "pinyin": province_py,
                    }
                ]

            for city in cities:
                city_id = str(city.get("id") or "").strip()
                city_name = str(city.get("city") or city.get("province") or "").strip()
                city_py = str(city.get("pinyin") or "").strip()
                if not city_id or not city_name or not city_py:
                    continue
                if city_py in seen_py:
                    continue
                seen_py.add(city_py)

                results.append(
                    CityTarget(
                        province_id=province_id,
                        province_name=province_name,
                        province_py=province_py,
                        city_id=city_id,
                        city_name=city_name,
                        city_py=city_py,
                    )
                )

        return results

    def _build_list_url(self, city_py: str, page: int) -> str:
        base = LIST_URL_TEMPLATE.format(city_py=city_py)
        if page <= 1:
            return base
        return f"{base}?page={page}"

    def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[ListCard], bool, str]:
        list_url = self._build_list_url(target.city_py, page)
        html = self._get_text(list_url, referer=SITE_BASE + "/")

        cards = self.parse_list_cards(html)

        soup = BeautifulSoup(html, "html.parser")
        next_link = soup.select_one(f"div.page a[href*='page={page + 1}']")
        has_next = next_link is not None

        return cards, has_next, list_url

    def parse_list_cards(self, html: str) -> List[ListCard]:
        soup = BeautifulSoup(html, "html.parser")
        cards: List[ListCard] = []
        seen: Set[str] = set()

        for item in soup.select("li.lawyer-item-card"):
            link_tag = item.select_one("a.name[href]") or item.select_one("a.lawyer-img-box[href]")
            if not link_tag:
                continue
            detail_url = (link_tag.get("href") or "").strip()
            if not detail_url.startswith("http"):
                continue
            if detail_url in seen:
                continue
            seen.add(detail_url)

            name = link_tag.get_text(strip=True)
            phone = ""
            phone_tag = item.select_one("div.phone")
            if phone_tag:
                phone = normalize_phone(phone_tag.get_text(" ", strip=True))

            address = ""
            addr_tag = item.select_one("div.location .txt")
            if addr_tag:
                address = addr_tag.get_text(" ", strip=True)

            specialties: List[str] = []
            prof_tag = item.select_one("div.prof .txt")
            if prof_tag:
                specialties = [
                    x.strip() for x in re.split(r"[、,,]", prof_tag.get_text(" ", strip=True)) if x.strip()
                ]

            metric_text = ""
            metric_tag = item.select_one("div.num-msg")
            if metric_tag:
                metric_text = metric_tag.get_text(" ", strip=True)

            cards.append(
                ListCard(
                    detail_url=detail_url,
                    name=name,
                    phone=phone,
                    address=address,
                    specialties=specialties,
                    metric_text=metric_text,
                )
            )

        return cards

    def parse_detail(self, detail_url: str) -> Dict:
        html = self._get_text(detail_url, referer=SITE_BASE)
        if "网站防火墙" in html or "访问被拒绝" in html or "/ipfilter/verify" in html:
            raise RequestClientError(f"firewall blocked: {detail_url}")

        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(" ", strip=True)

        name = ""
        law_firm = ""
        phone = ""
        address = ""
        practice_years: Optional[int] = None
        specialties: List[str] = []

        if soup.title:
            title = soup.title.get_text(" ", strip=True)
            match = re.search(r"([^\s_,,。]+?)律师", title)
            if match:
                name = match.group(1).strip()

        phone_candidates = [
            soup.select_one(".data-w .tel-b b").get_text(" ", strip=True)
            if soup.select_one(".data-w .tel-b b")
            else "",
            soup.select_one(".law-info-b .item .two-r.b").get_text(" ", strip=True)
            if soup.select_one(".law-info-b .item .two-r.b")
            else "",
            text,
        ]
        for candidate in phone_candidates:
            phone = normalize_phone(candidate)
            if phone:
                break

        law_firm_tag = soup.select_one(".law-info-b .item .two-nowrap")
        if law_firm_tag:
            law_firm = law_firm_tag.get_text(" ", strip=True)

        for li in soup.select(".law-info-b .item"):
            li_text = li.get_text(" ", strip=True)
            if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
                law_firm = li_text

        addr_tag = soup.select_one(".law-info-b .item .two-r[title]")
        if addr_tag:
            addr_value = (addr_tag.get("title") or "").strip()
            if len(addr_value) > 8:
                address = addr_value

        if not address:
            addr_tag = soup.select_one(".law-info-b .item .two-r")
            if addr_tag:
                addr_value = addr_tag.get_text(" ", strip=True)
                if len(addr_value) > 8 and "律师" not in addr_value:
                    address = addr_value

        year_match = YEAR_RE.search(text)
        if year_match:
            try:
                practice_years = int(year_match.group(1))
                s.close()
            except Exception:
                practice_years = None
                pass
        self._tls.session = None

        specialties = [x.get_text(strip=True) for x in soup.select(".profession-b .item") if x.get_text(strip=True)]

        return {
            "name": name,
            "law_firm": law_firm,
            "phone": phone,
            "address": address,
            "practice_years": practice_years,
            "specialties": specialties,
            "detail_url": detail_url,
        }

    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
        seen_details: Set[str] = set()

        for page in range(1, self.max_pages + 1):
            try:
                cards, has_next, list_url = self.fetch_list_page(target, page)
            except Exception as exc:
                print(f"[list] 失败 {target.city_py} p{page}: {exc}")
                break

            if not cards:
                break

            for card in cards:
                if card.detail_url in seen_details:
                    continue
                seen_details.add(card.detail_url)

                detail: Dict = {}
                try:
                    detail = self.parse_detail(card.detail_url)
                except Exception as exc:
                    print(f"[detail] 失败 {card.detail_url}: {exc}")

                phone = normalize_phone(detail.get("phone") or card.phone)
                profile_name = (detail.get("name") or card.name).replace("律师", "").strip()

                now = int(time.time())
                record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()

                yield {
                    "record_id": record_id,
                    "collected_at": now,
                    "source": {
                        "site": SITE_NAME,
                        "province_id": target.province_id,
                        "province": target.province_name,
                        "province_py": target.province_py,
                        "city_id": target.city_id,
                        "city": target.city_name,
                        "city_py": target.city_py,
                        "page": page,
                        "list_url": list_url,
                        "detail_url": card.detail_url,
                    },
                    "list_snapshot": {
                        "name": card.name,
                        "phone": card.phone,
                        "address": card.address,
                        "specialties": card.specialties,
                        "metric_text": card.metric_text,
                    },
                    "profile": {
                        "name": profile_name,
                        "law_firm": (detail.get("law_firm") or "").strip(),
                        "phone": phone,
                        "address": (detail.get("address") or card.address or "").strip(),
                        "practice_years": detail.get("practice_years"),
                        "specialties": detail.get("specialties") or card.specialties,
                    },
                }

                if self.sleep_seconds:
                    time.sleep(self.sleep_seconds)

            if not has_next:
                break

    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
        source = record.get("source", {}) or {}
        profile = record.get("profile", {}) or {}

        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            return None

        province = (source.get("province") or "").strip()
        city = (source.get("city") or province).strip()
        return {
            "name": (profile.get("name") or "").strip(),
            "law_firm": (profile.get("law_firm") or "").strip(),
            "province": province,
            "city": city,
            "phone": phone,
            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
            "domain": LEGACY_DOMAIN,
            "create_time": int(record.get("collected_at") or time.time()),
            "params": json.dumps(record, ensure_ascii=False),
        }

    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
        if not self.db or not phones:
    def _existing_phones(self, phones: List[str]) -> Set[str]:
        if not phones:
            return set()

        deduped = sorted({p for p in phones if p})
        if not deduped:
            return set()

        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(deduped), chunk_size):
                chunk = deduped[i:i + chunk_size]
            for i in range(0, len(phones), chunk_size):
                chunk = phones[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                cur.execute(sql, [DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()

        return existing
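`_existing_phones_in_db` batches the `IN (...)` lookup 500 values at a time because a single placeholder list covering thousands of phones can exceed statement limits. The shape of that query, sketched against a plain DB-API cursor (table and column names follow the diff; connection handling is assumed):

```python
from typing import Iterable, List, Set

def existing_phones(conn, domain: str, phones: Iterable[str], chunk_size: int = 500) -> Set[str]:
    """Return the subset of phones already stored for this domain."""
    deduped: List[str] = sorted({p for p in phones if p})
    existing: Set[str] = set()
    cur = conn.cursor()
    try:
        for i in range(0, len(deduped), chunk_size):
            chunk = deduped[i:i + chunk_size]
            placeholders = ",".join(["%s"] * len(chunk))
            cur.execute(
                f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})",
                [domain, *chunk],
            )
            existing.update(row[0] for row in cur.fetchall())
    finally:
        cur.close()
    return existing
```

One round trip per 500 keys replaces one existence query per record, which matters once a city yields hundreds of lawyers.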
    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
        if not self.db:
            return 0, 0
    def _load_areas(self):
        condition = "level = 2 and domain='法律快车'"
        tables = ("area_new", "area", "area2")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "pinyin, province, city", condition) or []
            except Exception as exc:
                last_error = exc
                continue
            if rows:
                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
                print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
                return rows

        rows: List[Dict[str, str]] = []
        for record in records:
            row = self._to_legacy_lawyer_row(record)
            if row:
                rows.append(row)
        if not rows:
            return 0, 0
        if last_error:
            print(f"[法律快车] 加载地区数据失败: {last_error}")
        print("[法律快车] 无城市数据(已尝试 area_new/area/area2)")
        return []

        existing = self._existing_phones_in_db([row["phone"] for row in rows])
        inserted = 0
        skipped = 0
    def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
        return self._get_with_session(self.session, url, max_retries=max_retries, is_thread=False)

        for row in rows:
            phone = row.get("phone", "")
            if not phone or phone in existing:
                skipped += 1
    def _get_with_session(self, session: requests.Session, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
        for attempt in range(max_retries):
            try:
                with request_slot():
                    resp = session.get(url, timeout=15, verify=False)
                status_code = resp.status_code
                text = resp.text
                resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"请求失败 {url}: 403,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
                        if is_thread:
                            self._refresh_thread_session()
                            session = self._get_thread_session()
                        else:
                            self._refresh_session()
                            session = self.session
                        time.sleep(wait_time)
                        continue
                    print(f"请求失败 {url}: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
                return text
            except requests.exceptions.RequestException as exc:
                print(f"请求失败 {url}: {exc}")
                return None
        return None

    def _parse_list(self, html: str, province: str, city: str) -> int:
        soup = BeautifulSoup(html, "html.parser")
        links = [a.get("href", "") for a in soup.select("a.hide_link")]
        links = [link.replace("lll", "int") for link in links if link]
        if not links:
            return 0

        detail_urls = [urljoin(DETAIL_BASE, link) for link in links]

        results: List[Dict[str, str]] = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
            for fut in as_completed(futs):
                try:
                    data = fut.result()
                except Exception as exc:
                    print(f" 详情解析异常: {exc}")
                    continue
                if data and data.get("phone"):
                    results.append(data)

        if not results:
            return len(detail_urls)

        phones = [d["phone"] for d in results if d.get("phone")]
        existing = self._existing_phones(phones)

        for data in results:
            phone = data.get("phone")
            if not phone:
                continue
            if phone in existing:
                print(f" -- 已存在: {data['name']} ({phone})")
                continue
            try:
                self.db.insert_data("lawyer", row)
                existing.add(phone)
                inserted += 1
                self.db.insert_data("lawyer", data)
                print(f" -> 新增: {data['name']} ({phone})")
            except Exception as exc:
                skipped += 1
                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
                print(f" 插入失败 {data.get('url')}: {exc}")

        return inserted, skipped
        return len(detail_urls)

    def crawl(
        self,
        output_path: str,
        max_cities: int = 0,
        city_filter: Optional[str] = None,
    ) -> None:
        cities = self.discover_cities()
        print(f"[discover] 共发现城市 {len(cities)} 个")
    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
        html = None
        sess = self._get_thread_session()
        html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
        if not html:
            return None

        if city_filter:
            key = city_filter.strip().lower()
            cities = [
                c for c in cities
                if key in c.city_py.lower() or key in c.city_name.lower()
            ]
            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(" ")

        if max_cities > 0:
            cities = cities[:max_cities]
            print(f"[discover] 截断城市数 {len(cities)}")
        name = ""
        title_tag = soup.find("title")
        if title_tag:
            match = re.search(r"(\S+)律师", title_tag.get_text())
            if match:
                name = match.group(1)
        if not name:
            intl_div = soup.find("div", class_="intl")
            if intl_div:
                match = re.search(r"(\S+)律师", intl_div.get_text())
                if match:
                    name = match.group(1)

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        phone = ""
        phone_pattern = r"1[3-9]\d{9}"
        for item in soup.select("div.item.flex"):
            label = item.find("div", class_="label")
            desc = item.find("div", class_="desc")
            if not label or not desc:
                continue
            label_text = label.get_text()
            desc_text = desc.get_text().replace("-", "")
            if "联系电话" in label_text or "电话" in label_text:
                matches = re.findall(phone_pattern, desc_text)
                if matches:
                    phone = matches[0]
                    break
        if not phone:
            matches = re.findall(phone_pattern, text.replace("-", ""))
            if matches:
                phone = matches[0]
        if not phone:
            print(f" 无手机号: {url}")
            return None

        seen_ids: Set[str] = set()
        if os.path.exists(output_path):
            with open(output_path, "r", encoding="utf-8") as old_file:
                for line in old_file:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        item = json.loads(line)
                    except Exception:
                        continue
                    rid = item.get("record_id")
                    if rid:
                        seen_ids.add(rid)
        print(f"[resume] 已有记录 {len(seen_ids)} 条")
        law_firm = ""
        for item in soup.select("div.item.flex"):
            label = item.find("div", class_="label")
            desc = item.find("div", class_="desc")
            if not label or not desc:
                continue
            if "执业律所" in label.get_text() or "律所" in label.get_text():
                law_firm = desc.get_text(strip=True).replace("已认证", "")
                break

        total_new_json = 0
        total_new_db = 0
        total_skip_db = 0
        params = {
            "list_url": url,
            "province": province,
            "city": city,
        }

        with open(output_path, "a", encoding="utf-8") as out:
            for idx, target in enumerate(cities, start=1):
                print(
                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
                    f"({target.city_py})"
                )
                city_records = list(self.crawl_city(target))
        return {
            "name": name or "",
            "law_firm": law_firm,
            "province": province,
            "city": city,
            "phone": phone,
            "url": url,
            "domain": DOMAIN,
            "create_time": int(time.time()),
            "params": json.dumps(params, ensure_ascii=False)
        }

                city_new_json = 0
                for record in city_records:
                    rid = record["record_id"]
                    if rid in seen_ids:
                        continue
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                    seen_ids.add(rid)
                    city_new_json += 1
                    total_new_json += 1
    def run(self):
        print("启动法律快车采集...")
        areas = self._load_areas()
        if not areas:
            print("无地区数据")
            return

                city_new_db, city_skip_db = self._write_records_to_db(city_records)
                total_new_db += city_new_db
                total_skip_db += city_skip_db

                print(
                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
                )

        print(
            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
            f"DB跳过{total_skip_db}条, 输出: {output_path}"
        )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="法律快车全新采集脚本(站点数据直采)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/lawtime_records_all.jsonl",
        help="输出 jsonl 文件路径",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="最多采集多少个城市,0 表示不限",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="每个城市最多采集多少页",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="按城市拼音或城市名过滤,如 beijing",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.1,
        help="详情页请求间隔秒数",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="直连模式,不使用 proxy_settings.json 代理",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="只输出 JSONL,不写入数据库",
    )
    return parser.parse_args()


def main():
    args = parse_args()

    if args.no_db:
        crawler = LawtimeCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=None,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        return

    with Db() as db:
        crawler = LawtimeCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        for area in areas:
            pinyin = area.get("pinyin")
            province = area.get("province", "")
            city = area.get("city", "")
            if not pinyin:
                continue
            page = 1
            while True:
                list_url = LIST_BASE.format(pinyin=pinyin, page=page)
                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
                html = self._get(list_url)
                if not html:
                    break
                link_count = self._parse_list(html, province, city)
                if link_count == 0:
                    break
                page += 1
        print("法律快车采集完成")


if __name__ == "__main__":
    main()
    with Db() as db:
        spider = LawtimeSpider(db)
        spider.run()
+267
-608
@@ -1,17 +1,11 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin

import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional, List, Set
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -21,237 +15,167 @@ if request_dir not in sys.path:
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
import requests
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SITE_NAME = "64365"
LEGACY_DOMAIN = "律图"
SITE_BASE = "https://m.64365.com"
AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js"
LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
from Db import Db
from utils.rate_limiter import request_slot

PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"(\d+)\s*年")
DOMAIN = "律图"
LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"


@dataclass
class CityTarget:
    area_id: str
    province_id: str
    province_name: str
    province_py: str
    city_name: str
    city_py: str


@dataclass
class ListCard:
    detail_url: str
    name: str
    specialties: List[str]
    score_text: str
    service_text: str


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


class Six4365Crawler:
    def __init__(
        self,
        max_pages: int = 9999,
        sleep_seconds: float = 0.1,
        use_proxy: bool = True,
        db_connection=None,
    ):
        self.max_pages = max_pages
        self.sleep_seconds = max(0.0, sleep_seconds)
class Six4365Spider:
    def __init__(self, db_connection):
        self.db = db_connection
        self.client = RequestsClient(
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                    "Mobile/15E148 Safari/604.1"
                ),
                "Accept": "text/html, */*; q=0.01",
                "Connection": "close",
            },
            use_proxy=use_proxy,
            retry_total=2,
            retry_backoff_factor=1,
            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET", "POST"),
        )
        self.session = self._build_session()
        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
        self._tls = threading.local()
        self.cities = self._load_cities()

    def _request_text(
        self,
        method: str,
        url: str,
        *,
        timeout: int = 20,
        max_retries: int = 3,
        referer: str = SITE_BASE,
        data: Optional[Dict] = None,
    ) -> str:
        headers = {"Referer": referer}
        last_error: Optional[Exception] = None
    def _build_session(self) -> requests.Session:
        report_proxy_status()
        session = requests.Session()
        session.trust_env = False
        proxies = get_proxies()
        if proxies:
            session.proxies.update(proxies)
        else:
            session.proxies.clear()
        session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Connection": "close",
        })
        return session

        for attempt in range(max_retries):
            wait_for_request()
    def _refresh_session(self) -> None:
        try:
            self.session.close()
        except Exception:
            pass
        self.session = self._build_session()
    def _get_thread_session(self) -> requests.Session:
        """requests.Session is not strictly thread-safe: give each thread its own session (sharing the same proxies/headers)."""
        s = getattr(self._tls, "session", None)
        if s is not None:
            return s
        s = self._build_session()
        s.headers.update(dict(self.session.headers))
        self._tls.session = s
        return s
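Because `requests.Session` makes no thread-safety guarantee, the spider keeps one session per worker thread in `threading.local()` and rebuilds it lazily after a block. A minimal standalone version of that pattern (the `make_session` factory is a stand-in for the project's `_build_session`):

```python
import threading

import requests

_tls = threading.local()

def make_session() -> requests.Session:
    # Stand-in for the project's _build_session(): headers/proxies would go here.
    return requests.Session()

def get_thread_session() -> requests.Session:
    """Return this thread's session, creating it lazily on first use."""
    session = getattr(_tls, "session", None)
    if session is None:
        session = make_session()
        _tls.session = session
    return session

def refresh_thread_session() -> None:
    """Drop this thread's session (e.g. after a 403) so the next call rebuilds it."""
    session = getattr(_tls, "session", None)
    if session is not None:
        try:
            session.close()
        except Exception:
            pass
    _tls.session = None
```

Each `ThreadPoolExecutor` worker then calls `get_thread_session()` instead of sharing one global session across threads.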
    def _refresh_thread_session(self) -> None:
        s = getattr(self._tls, "session", None)
        if s is not None:
            try:
                if method.upper() == "POST":
                    resp = self.client.post_text(
                        url,
                        timeout=timeout,
                        verify=False,
                        headers=headers,
                        data=data,
                    )
                else:
                    resp = self.client.get_text(
                        url,
                        timeout=timeout,
                        verify=False,
                        headers=headers,
                    )
                s.close()
            except Exception:
                pass
        self._tls.session = None

                code = resp.status_code
                if code == 403:
                    if attempt < max_retries - 1:
                        self.client.refresh()
                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                        continue
                    raise RequestClientError(f"{code} Error: {url}")
                if code >= 500 and attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                if code >= 400:
                    raise RequestClientError(f"{code} Error: {url}")
                return resp.text
    def _existing_urls(self, urls: List[str]) -> Set[str]:
        """Dedup in bulk, avoiding N separate is_data_exist calls."""
        if not urls:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            # Too many IN parameters can fail the statement, so query in batches
            chunk_size = 500
            for i in range(0, len(urls), chunk_size):
                chunk = urls[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
                cur.execute(sql, chunk)
                for row in cur.fetchall():
                    # pymysql returns tuples by default
                    existing.add(row[0])
        finally:
            cur.close()
        return existing
    def _load_cities(self):
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                provinces = self.db.select_data(
                    table,
                    "id, code, province",
                    "domain='64365' AND level=1"
                ) or []
                cities = self.db.select_data(
                    table,
                    "code, city, province, pid",
                    "domain='64365' AND level=2"
                ) or []
            except Exception as exc:
                last_error = exc
                if attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                raise
                continue

            if last_error is not None:
                raise last_error
            raise RequestClientError(f"Unknown request error: {url}")
            if not cities:
                continue

    def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
        return self._request_text(
            "GET",
            url,
            timeout=timeout,
            max_retries=max_retries,
            referer=referer,
        )
            province_map = {row.get('id'): row for row in provinces}
            data = {}
            for city in cities:
                province_row = province_map.get(city.get('pid'), {}) or {}
                data[str(city.get('code'))] = {
                    "name": city.get('city'),
                    "province": city.get('province'),
                    "province_name": province_row.get('province', city.get('province')),
                }
            print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
            return data

    def _post_text(
        self,
        url: str,
        *,
        data: Dict,
        timeout: int = 20,
        max_retries: int = 3,
        referer: str = SITE_BASE,
    ) -> str:
        return self._request_text(
            "POST",
            url,
            timeout=timeout,
            max_retries=max_retries,
            referer=referer,
            data=data,
        )
        if last_error:
            print(f"[律图] 加载地区数据失败: {last_error}")
        print("[律图] 无城市数据(已尝试 area_new/area2/area)")
        return {}

    def _extract_area_data(self, text: str) -> List[Dict]:
        match = re.search(
            r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
            text,
            re.S,
        )
        if not match:
            return []

        raw = match.group(1)
        try:
            data = json.loads(raw)
        except Exception:
            return []
        return data if isinstance(data, list) else []
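`_extract_area_data` pulls a JSON array out of a JavaScript asset by anchoring the regex between two known assignments and handing the captured literal to `json.loads`. A reduced sketch of the same trick, generalized to any prefix/suffix pair; note it only works when the embedded literal happens to be strict JSON, and the sample input below is made up:

```python
import json
import re
from typing import Dict, List

def extract_js_array(text: str, prefix: str, suffix: str) -> List[Dict]:
    """Extract `prefix = [...] ; suffix` from a JS file, if the array is valid JSON."""
    match = re.search(
        re.escape(prefix) + r"\s*=\s*(\[[\s\S]*?\])\s*;\s*" + re.escape(suffix),
        text,
    )
    if not match:
        return []
    try:
        data = json.loads(match.group(1))
    except ValueError:
        return []
    return data if isinstance(data, list) else []

js = 'lvtuData.areaData = [{"id": "110000", "name": "北京"}]; lvtuData.categroyData = []'
print(extract_js_array(js, "lvtuData.areaData", "lvtuData.categroyData"))
# -> [{'id': '110000', 'name': '北京'}]
```

The non-greedy `[\s\S]*?` plus the trailing anchor keeps the capture from running past the array into the next assignment.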
    def discover_cities(self) -> List[CityTarget]:
        text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/")
        provinces = self._extract_area_data(text)

        targets: List[CityTarget] = []
        seen_area: Set[str] = set()

        for province in provinces:
            province_id = str(province.get("id") or "").strip()
            province_name = str(province.get("name") or "").strip()
            province_py = str(province.get("py") or "").strip()
            child_rows = province.get("child") or []

            # For normal provinces, child holds prefecture-level cities; for
            # municipalities it holds districts, so crawl with the province-level id instead
            if child_rows and any((row.get("child") or []) for row in child_rows):
                for city in child_rows:
                    area_id = str(city.get("id") or "").strip()
                    city_name = str(city.get("name") or "").strip()
                    city_py = str(city.get("py") or "").strip()
                    if not area_id or not city_name:
    def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
        for attempt in range(max_retries):
            try:
                with request_slot():
                    resp = self.session.post(LIST_URL, data=payload, timeout=10, verify=False)
                status_code = resp.status_code
                text = resp.text
                resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    if area_id in seen_area:
                        continue
                    seen_area.add(area_id)
                    targets.append(
                        CityTarget(
                            area_id=area_id,
                            province_id=province_id,
                            province_name=province_name,
                            province_py=province_py,
                            city_name=city_name,
                            city_py=city_py,
                        )
                    )
            else:
                if not province_id or not province_name:
                    continue
                if province_id in seen_area:
                    continue
                seen_area.add(province_id)
                targets.append(
                    CityTarget(
                        area_id=province_id,
                        province_id=province_id,
                        province_name=province_name,
                        province_py=province_py,
                        city_name=province_name,
                        city_py=province_py,
                    )
                )
                    print("请求失败: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error")
                return text
            except requests.exceptions.RequestException as exc:
                print(f"请求失败: {exc}")
                return None
        return None

        return targets

    def _build_payload(self, area_id: str, page: int) -> Dict[str, str]:
        ua = self.client.headers.get("User-Agent", "")
    def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
        return {
            "AdCode": "",
            "RegionId": str(area_id),
            "RegionId": str(city_code),
            "CategoryId": "",
            "MaxNumber": "",
            "OnlyData": "true",
            "IgnoreButton": "",
            "LawyerRecommendRequest[AreaId]": str(area_id),
            "LawyerRecommendRequest[AreaId]": str(city_code),
            "LawyerRecommendRequest[LawCategoryIds]": "",
            "LawyerRecommendRequest[LawFirmPersonCount]": "",
            "LawyerRecommendRequest[LawFirmScale]": "",
@@ -268,429 +192,164 @@ class Six4365Crawler:
            "LawyerRecommendRequest[RefferUrl]": "",
            "LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
            "LawyerRecommendRequest[resource_type_name]": "",
            "LawyerRecommendRequest[UserAgent]": ua,
            "LawyerRecommendRequest[UserAgent]": self.session.headers["User-Agent"],
            "LawyerRecommendRequest[AddLawyerWithNoData]": "false",
            "ShowCaseButton": "true",
        }

    def fetch_list_html(self, target: CityTarget, page: int) -> str:
        payload = self._build_payload(target.area_id, page)
        return self._post_text(
            LIST_API_URL,
            data=payload,
            referer=SITE_BASE + "/findlawyer/",
        )

    def parse_list_cards(self, html: str) -> List[ListCard]:
    def _parse_list(self, html: str, province: str, city: str) -> int:
        soup = BeautifulSoup(html, "html.parser")
        cards: List[ListCard] = []
        seen: Set[str] = set()
        lawyers = soup.find_all("a", class_="lawyer")
        if not lawyers:
            return 0

        for anchor in soup.select("a.lawyer[href]"):
            href = (anchor.get("href") or "").strip()
        detail_urls: List[str] = []
        for lawyer in lawyers:
            href = lawyer.get("href")
            if not href:
                continue
            detail_url = urljoin(SITE_BASE, href)
            if detail_url in seen:
            detail_urls.append(f"{href.rstrip('/')}/info/")

        if not detail_urls:
            return 0

        results: List[Dict[str, str]] = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
            for fut in as_completed(futs):
                try:
                    data = fut.result()
                except Exception as exc:
                    print(f" 详情解析异常: {exc}")
                    continue
                if data:
                    results.append(data)

        if not results:
            return len(detail_urls)

        existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
        for data in results:
            if not data:
                continue
            seen.add(detail_url)
            url = data.get("url", "")
            if not url:
                continue
            if url in existing:
                print(f" -- 已存在URL: {url}")
                continue
            try:
                self.db.insert_data("lawyer", data)
                print(f" -> 新增: {data['name']} ({data['phone']})")
            except Exception as exc:
                print(f" 插入失败 {url}: {exc}")

            name = ""
            name_tag = anchor.select_one("b.name")
            if name_tag:
                name = name_tag.get_text(strip=True)
        return len(detail_urls)

            specialties: List[str] = []
            skill_tag = anchor.select_one("div.skill")
            if skill_tag:
                raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "")
                specialties = [x.strip() for x in re.split(r"[、,,]", raw) if x.strip()]
    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
        html = self._get_detail(url)
        if not html:
            return None

            score_text = ""
            score_tag = anchor.select_one("div.info span[title='评分'] em")
            if score_tag:
                score_text = score_tag.get_text(strip=True)

            service_text = ""
            service_tag = anchor.select_one("div.info")
            if service_tag:
                service_text = service_tag.get_text(" ", strip=True)

            cards.append(
                ListCard(
                    detail_url=detail_url,
                    name=name,
                    specialties=specialties,
                    score_text=score_text,
                    service_text=service_text,
                )
            )

        return cards

    def parse_detail(self, detail_url: str) -> Dict:
        info_url = detail_url.rstrip("/") + "/info/"
        html = self._get_text(info_url, referer=detail_url)
        soup = BeautifulSoup(html, "html.parser")
        base_info = soup.find("ul", class_="intro-basic-bar")
        if not base_info:
            return None

        name = ""
        law_firm = ""
        phone = ""
        practice_years: Optional[int] = None
        office_area = ""
        address = ""
        specialties: List[str] = []

        for li in soup.select("ul.intro-basic-bar li"):
            label_tag = li.select_one("span.label")
            value_tag = li.select_one("div.txt")
            if not label_tag or not value_tag:
        for li in base_info.find_all("li"):
            label = li.find("span", class_="label")
            txt = li.find("div", class_="txt")
            if not label or not txt:
                continue
            label_text = label.get_text(strip=True)
            if "姓名" in label_text:
                name = txt.get_text(strip=True)
            if "执业律所" in label_text:
                law_firm = txt.get_text(strip=True)

            label = label_tag.get_text(" ", strip=True).replace(":", "")
            value = value_tag.get_text(" ", strip=True)
        more_section = soup.find("div", class_="more-intro-basic")
        if more_section:
            phone_ul = more_section.find("ul", class_="intro-basic-bar")
            if phone_ul:
                for li in phone_ul.find_all("li"):
                    label = li.find("span", class_="label")
                    txt = li.find("div", class_="txt")
                    if label and txt and "联系电话" in label.get_text(strip=True):
                        phone = txt.get_text(strip=True).replace(" ", "")
                        break

            if "姓名" in label and not name:
                name = value
            elif "执业律所" in label and not law_firm:
                law_firm = value
            elif "联系电话" in label and not phone:
                phone = normalize_phone(value)
            elif "执业年限" in label and practice_years is None:
                year_match = YEAR_RE.search(value)
                if year_match:
                    try:
                        practice_years = int(year_match.group(1))
                    except Exception:
                        practice_years = None
            elif "办公地区" in label and not office_area:
                office_area = value
            elif "办公地址" in label and not address:
                address = value

        text = soup.get_text(" ", strip=True)
        if not phone:
            phone = normalize_phone(text)

        if not name and soup.title:
            title = soup.title.get_text(" ", strip=True)
            match = re.search(r"([^\s_,,。]+?)律师", title)
            if match:
                name = match.group(1).strip()

        skill_match = re.search(r"擅长:([^\n]+)", text)
        if skill_match:
            specialties = [x.strip() for x in re.split(r"[、,,]", skill_match.group(1)) if x.strip()]

        return {
            "name": name,
            "law_firm": law_firm,
            "phone": phone,
            "practice_years": practice_years,
            "office_area": office_area,
            "address": address,
            "specialties": specialties,
            "detail_url": detail_url,
            "info_url": info_url,
        }

    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
        seen_detail_urls: Set[str] = set()
        page_first_seen: Set[str] = set()

        for page in range(1, self.max_pages + 1):
            try:
                html = self.fetch_list_html(target, page)
            except Exception as exc:
                print(f"[list] 失败 area={target.area_id} p{page}: {exc}")
                break

            cards = self.parse_list_cards(html)
            if not cards:
                break

            first_url = cards[0].detail_url
            if first_url in page_first_seen:
                break
            page_first_seen.add(first_url)

            for card in cards:
                if card.detail_url in seen_detail_urls:
                    continue
                seen_detail_urls.add(card.detail_url)

                try:
                    detail = self.parse_detail(card.detail_url)
                except Exception as exc:
                    print(f"[detail] 失败 {card.detail_url}: {exc}")
                    continue

                now = int(time.time())
                uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url)
                uid = uid_match.group(1) if uid_match else card.detail_url
                record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()

                yield {
                    "record_id": record_id,
                    "collected_at": now,
                    "source": {
                        "site": SITE_NAME,
                        "province_id": target.province_id,
                        "province": target.province_name,
                        "province_py": target.province_py,
                        "area_id": target.area_id,
                        "city": target.city_name,
                        "city_py": target.city_py,
                        "page": page,
                        "detail_url": card.detail_url,
                        "info_url": detail.get("info_url", ""),
                    },
                    "list_snapshot": {
                        "name": card.name,
                        "specialties": card.specialties,
                        "score_text": card.score_text,
                        "service_text": card.service_text,
                    },
                    "profile": {
                        "name": detail.get("name") or card.name,
                        "law_firm": detail.get("law_firm") or "",
                        "phone": detail.get("phone") or "",
                        "practice_years": detail.get("practice_years"),
                        "office_area": detail.get("office_area") or "",
                        "address": detail.get("address") or "",
                        "specialties": detail.get("specialties") or card.specialties,
                    },
                }

                if self.sleep_seconds:
                    time.sleep(self.sleep_seconds)
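`crawl_city` stops paging when the first card on a page repeats one already seen: the recommend API keeps serving its last page instead of returning empty, so a repeat is the only reliable end signal. A stripped-down version of that guard, with a stubbed `fetch_page` standing in for the real list request:

```python
from typing import Callable, Iterator, List, Set

def paginate(fetch_page: Callable[[int], List[str]], max_pages: int = 9999) -> Iterator[str]:
    """Yield items page by page; stop on an empty page or a repeated first item."""
    first_item_seen: Set[str] = set()
    for page in range(1, max_pages + 1):
        items = fetch_page(page)
        if not items:
            break
        if items[0] in first_item_seen:
            break  # the API is repeating itself: we've run off the end
        first_item_seen.add(items[0])
        yield from items

# A fake 2-page API that repeats page 2 forever.
pages = {1: ["a", "b"], 2: ["c"]}
print(list(paginate(lambda p: pages.get(p, pages[2]))))  # -> ['a', 'b', 'c']
```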
    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
        source = record.get("source", {}) or {}
        profile = record.get("profile", {}) or {}

        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
        phone = phone.replace('-', '').strip()
        if not name or not phone:
            return None

        province = (source.get("province") or "").strip()
        city = (source.get("city") or province).strip()
        return {
            "name": (profile.get("name") or "").strip(),
            "law_firm": (profile.get("law_firm") or "").strip(),
        data = {
            "phone": phone,
            "province": province,
            "city": city,
            "phone": phone,
            "url": (source.get("info_url") or source.get("detail_url") or "").strip(),
            "domain": LEGACY_DOMAIN,
            "create_time": int(record.get("collected_at") or time.time()),
            "params": json.dumps(record, ensure_ascii=False),
            "law_firm": law_firm,
            "url": url,
            "domain": DOMAIN,
            "name": name,
            "create_time": int(time.time()),
            "params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
        }
        return data

    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
        if not self.db or not phones:
            return set()

        deduped = sorted({p for p in phones if p})
        if not deduped:
            return set()

        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(deduped), chunk_size):
                chunk = deduped[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()

        return existing

    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
        if not self.db:
            return 0, 0

        rows: List[Dict[str, str]] = []
        for record in records:
            row = self._to_legacy_lawyer_row(record)
            if row:
                rows.append(row)
        if not rows:
            return 0, 0

        existing = self._existing_phones_in_db([row["phone"] for row in rows])
        inserted = 0
        skipped = 0

        for row in rows:
            phone = row.get("phone", "")
            if not phone or phone in existing:
                skipped += 1
                continue
    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
        session = self._get_thread_session()
        for attempt in range(max_retries):
            try:
                self.db.insert_data("lawyer", row)
                existing.add(phone)
                inserted += 1
            except Exception as exc:
                skipped += 1
                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")

        return inserted, skipped

    def crawl(
        self,
        output_path: str,
        max_cities: int = 0,
        city_filter: Optional[str] = None,
    ) -> None:
        cities = self.discover_cities()
        print(f"[discover] 共发现地区 {len(cities)} 个")

        if city_filter:
            key = city_filter.strip().lower()
            cities = [
                c for c in cities
                if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id
            ]
            print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}")

        if max_cities > 0:
            cities = cities[:max_cities]
            print(f"[discover] 截断地区数 {len(cities)}")

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

        seen_ids: Set[str] = set()
        if os.path.exists(output_path):
            with open(output_path, "r", encoding="utf-8") as old_file:
                for line in old_file:
                    line = line.strip()
                    if not line:
                with request_slot():
                    resp = session.get(url, timeout=10, verify=False)
                status_code = resp.status_code
                text = resp.text
                resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
                        self._refresh_thread_session()
                        session = self._get_thread_session()
                        time.sleep(wait_time)
                        continue
                    try:
                        item = json.loads(line)
                    except Exception:
                        continue
                    rid = item.get("record_id")
                    if rid:
                        seen_ids.add(rid)
        print(f"[resume] 已有记录 {len(seen_ids)} 条")
                    print(" 请求失败: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error")
                return text
            except requests.exceptions.RequestException as exc:
                print(f" 请求失败: {exc}")
                return None
        return None

        total_new_json = 0
        total_new_db = 0
        total_skip_db = 0
    def run(self):
        print("启动律图采集...")
        if not self.cities:
            print("无城市数据")
            return

        with open(output_path, "a", encoding="utf-8") as out:
            for idx, target in enumerate(cities, start=1):
                print(
                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
                    f"(area={target.area_id})"
                )
                city_records = list(self.crawl_city(target))

                city_new_json = 0
                for record in city_records:
                    rid = record["record_id"]
                    if rid in seen_ids:
                        continue
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                    seen_ids.add(rid)
                    city_new_json += 1
                    total_new_json += 1

                city_new_db, city_skip_db = self._write_records_to_db(city_records)
                total_new_db += city_new_db
                total_skip_db += city_skip_db

                print(
                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
                )

        print(
            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
            f"DB跳过{total_skip_db}条, 输出: {output_path}"
        )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl",
        help="输出 jsonl 文件路径",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="最多采集多少个地区,0 表示不限",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="每个地区最多采集多少页",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="按城市名称/拼音/编码过滤",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.1,
        help="详情页请求间隔秒数",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="直连模式,不使用 proxy_settings.json 代理",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="只输出 JSONL,不写入数据库",
    )
    return parser.parse_args()


def main():
    args = parse_args()

    if args.no_db:
        crawler = Six4365Crawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=None,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        return

    with Db() as db:
        crawler = Six4365Crawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        for city_code, info in self.cities.items():
            province = info.get("province_name", "")
            city = info.get("name", "")
            print(f"采集 {province}-{city}")
            page = 1
            while True:
                payload = self._build_payload(city_code, page)
                html = self._post(payload)
                if not html:
                    break
                link_count = self._parse_list(html, province, city)
                if link_count == 0:
                    break
                page += 1
        print("律图采集完成")


if __name__ == "__main__":
    main()
    with Db() as db:
        spider = Six4365Spider(db)
        spider.run()
@@ -0,0 +1,220 @@
import json
import os
import sys
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests

from request.proxy_config import get_proxies, report_proxy_status


@dataclass
class CheckResult:
    site: str
    url: str
    method: str
    ok: bool
    status_code: Optional[int]
    error: str
    hint: str
    elapsed_ms: int


def _now_ms() -> int:
    return int(time.time() * 1000)


def _short_hint(text: str) -> str:
    s = (text or "").strip().lower()
    flags = []
    for key, label in [
        ("403", "403"),
        ("429", "429"),
        ("captcha", "captcha"),
        ("验证码", "captcha_cn"),
        ("人机", "bot_check_cn"),
        ("access denied", "access_denied"),
        ("forbidden", "forbidden"),
        ("too many requests", "rate_limited"),
        ("cloudflare", "cloudflare"),
        ("challenge", "challenge"),
    ]:
        if key in s:
            flags.append(label)
    return ",".join(flags)[:120]


def _build_session() -> requests.Session:
    report_proxy_status()
    s = requests.Session()
    s.trust_env = False
    proxies = get_proxies()
    if proxies:
        s.proxies.update(proxies)
    else:
        s.proxies.clear()
    s.headers.update(
        {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/136.0.0.0 Safari/537.36"
            ),
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
        }
    )
    return s


def _check(
    session: requests.Session,
    *,
    site: str,
    method: str,
    url: str,
    timeout: Tuple[float, float] = (10.0, 15.0),
    headers: Optional[Dict[str, str]] = None,
    data: Optional[Dict[str, Any]] = None,
) -> CheckResult:
    start = _now_ms()
    try:
        resp = session.request(
            method=method,
            url=url,
            timeout=timeout,
            verify=False,
            headers=headers,
            data=data,
        )
        text = resp.text or ""
        status = resp.status_code
        hint = _short_hint(text[:1200])
        ok = 200 <= status < 400
        return CheckResult(
            site=site,
            url=url,
            method=method,
            ok=ok,
            status_code=status,
            error="",
            hint=hint,
            elapsed_ms=_now_ms() - start,
        )
    except Exception as exc:
        return CheckResult(
            site=site,
            url=url,
            method=method,
            ok=False,
            status_code=None,
            error=str(exc)[:200],
            hint="",
            elapsed_ms=_now_ms() - start,
        )
    finally:
        try:
            resp.close()  # type: ignore[name-defined]
        except Exception:
            pass


def _tests() -> List[Dict[str, Any]]:
    # Pick one representative list/API per site as a smoke test:
    # it quickly surfaces 403s, captchas, and rate limiting.
    return [
        {
            "site": "大律师(m站)",
            "method": "GET",
            "url": "https://m.maxlaw.cn/",
        },
        {
            "site": "大律师(PC站)",
            "method": "GET",
            "url": "https://www.maxlaw.cn/law/beijing?page=1",
            "headers": {"Referer": "https://www.maxlaw.cn/"},
        },
        {
            "site": "找法网(m站)",
            "method": "GET",
            "url": "https://m.findlaw.cn/beijing/q_lawyer/p1?ajax=1&order=0&sex=-1",
            "headers": {
                "Referer": "https://m.findlaw.cn/beijing/q_lawyer/",
                "X-Requested-With": "XMLHttpRequest",
                "Accept": "application/json, text/javascript, */*; q=0.01",
            },
        },
        {
            "site": "法律快车(m站)",
            "method": "GET",
            "url": "https://m.lawtime.cn/beijing/lawyer/?page=1",
        },
        {
            "site": "律图(m站)",
            "method": "POST",
            "url": "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/",
            "data": {
                "RegionId": "110100",  # Beijing (city)
                "OnlyData": "true",
                "LawyerRecommendRequest[AreaId]": "110100",
                "LawyerRecommendRequest[PageIndex]": "1",
                "LawyerRecommendRequest[PageSize]": "10",
                "LawyerRecommendRequest[OrderType]": "0",
                "LawyerRecommendRequest[Type]": "1",
            },
        },
        {
            "site": "华律(m站)",
            "method": "POST",
            "url": "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/",
            "data": {
                "pid": "110000",  # Beijing (province)
                "cid": "110100",  # Beijing (city)
                "page": "1",
            },
        },
    ]


def main() -> int:
    mode = os.getenv("PROXY_ENABLED")
    print(f"[smoke] PROXY_ENABLED={mode!r}")
    s = _build_session()
    results: List[CheckResult] = []
    for item in _tests():
        res = _check(
            s,
            site=item["site"],
            method=item["method"],
            url=item["url"],
            headers=item.get("headers"),
            data=item.get("data"),
        )
        results.append(res)
        print(
            f"[smoke] {res.site} {res.method} {res.status_code} ok={res.ok} "
            f"{res.elapsed_ms}ms hint={res.hint or '-'} err={res.error or '-'}"
        )
        time.sleep(0.3)

    summary = {
        "proxy_enabled": mode,
        "results": [res.__dict__ for res in results],
    }
    print("[smoke] summary_json=" + json.dumps(summary, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
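The smoke script ends each run with a machine-readable `[smoke] summary_json=` line. A minimal sketch of pulling failed sites back out of a captured log (the `common_sites/smoke.log` path is an assumption for illustration):

```python
import json

# Scan a smoke-test log and report the sites whose check did not pass
with open("common_sites/smoke.log", encoding="utf-8") as f:  # hypothetical log path
    for line in f:
        if line.startswith("[smoke] summary_json="):
            summary = json.loads(line.split("=", 1)[1])
            failed = [r["site"] for r in summary["results"] if not r["ok"]]
            print("proxy:", summary["proxy_enabled"], "failed:", failed or "none")
```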
+31
-71
@@ -1,80 +1,40 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/logs"
DATA_DIR="${PROJECT_ROOT}/data"
# Switch to the script's directory so relative paths resolve correctly
cd "$(dirname "$0")"

mkdir -p "${LOG_DIR}" "${DATA_DIR}"
echo "使用 request/proxy_settings.json 读取代理配置"
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"

if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
    PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
else
    PYTHON_BIN="python3"
fi

RUN_MODE="${RUN_MODE:-parallel}"  # parallel | sequential

echo "[start] project=${PROJECT_ROOT}"
echo "[start] python=${PYTHON_BIN}"
echo "[start] mode=${RUN_MODE}"
echo "[start] proxy=request/proxy_settings.json"

# 大律师 (new-structure crawl + DB writes) can be tuned via environment variables
DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
DLS_SLEEP="${DLS_SLEEP:-0.2}"
DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"
DLS_EXTRA_ARGS=()

if [[ "${DLS_MAX_CITIES}" != "0" ]]; then
    DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
fi
if [[ "${DLS_MAX_PAGES}" != "0" ]]; then
    DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
fi
if [[ -n "${DLS_CITY_FILTER}" ]]; then
    DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
fi
DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")

if [[ "${DLS_DIRECT:-0}" == "1" ]]; then
    DLS_EXTRA_ARGS+=(--direct)
fi
if [[ "${DLS_NO_DB:-0}" == "1" ]]; then
    DLS_EXTRA_ARGS+=(--no-db)
fi

run_bg() {
    local name="$1"
    shift
    local logfile="${LOG_DIR}/${name}.log"
    nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 &
    echo "[start] ${name} pid=$! log=${logfile}"
is_job_running() {
    local script="$1"
    local script_regex="${script//./\\.}"
    pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
}

run_fg() {
    local name="$1"
    shift
    local logfile="${LOG_DIR}/${name}.log"
    echo "[start] ${name} fg log=${logfile}"
    env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1
start_job() {
    local script="$1"
    local log_file="$2"
    local label="$3"
    local existing

    existing="$(is_job_running "${script}")"
    if [[ -n "${existing}" ]]; then
        echo "跳过 ${label}: ${script} 已在运行"
        echo "${existing}" | head -n 1
        return 0
    fi

    nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
    echo "启动 ${label}: ${script} -> ${log_file}"
    sleep 1
}

if [[ "${RUN_MODE}" == "sequential" ]]; then
    run_fg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
    run_fg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
    run_fg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
    run_fg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
    run_fg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
    echo "[done] sequential completed"
else
    run_bg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
    run_bg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
    run_bg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
    run_bg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
    run_bg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
    echo "[done] all crawlers started in background"
fi
start_job "dls.py" "dls.log" "大律师"
start_job "dls_pc.py" "dls_pc.log" "大律师PC站"
start_job "findlaw.py" "findlaw.log" "找法网"
start_job "lawtime.py" "lawtime.log" "法律快车"
start_job "six4365.py" "six4365.log" "律图"
start_job "hualv.py" "hualv.log" "华律"

Executable
+48
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
set -euo pipefail

# Switch to the script's directory so relative paths resolve correctly
cd "$(dirname "$0")"

# Force direct connections: do not use proxy IPs
export PROXY_ENABLED=0

# Stay conservative in direct mode to reduce the chance of temporary risk-control blocks
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"

is_job_running() {
    local script="$1"
    local script_regex="${script//./\\.}"
    pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
}

start_job() {
    local script="$1"
    local log_file="$2"
    local label="$3"
    local existing

    existing="$(is_job_running "${script}")"
    if [[ -n "${existing}" ]]; then
        echo "跳过 ${label}: ${script} 已在运行"
        echo "${existing}" | head -n 1
        return 0
    fi

    nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
    echo "启动 ${label}: ${script} -> ${log_file}"
    sleep 1
}

echo "直连模式(PROXY_ENABLED=0),每周两次建议用 cron 调度"
echo "当前归入直连组:大律师(m/PC)、华律、律图"

# Direct-connection candidates:
# - 大律师 (m/PC): currently reachable directly, no obvious hard risk control
# - 华律: currently reachable directly, no obvious hard risk control
# - 律图: currently reachable directly, no obvious hard risk control
start_job "dls.py" "direct_dls.log" "大律师(直连)"
start_job "dls_pc.py" "direct_dls_pc.log" "大律师PC站(直连)"
start_job "hualv.py" "direct_hualv.log" "华律(直连)"
start_job "six4365.py" "direct_six4365.log" "律图(直连)"
Executable
+53
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
set -euo pipefail

# Switch to the script's directory so relative paths resolve correctly
cd "$(dirname "$0")"

# Force the proxy on: for sites that are easily rate-limited or blocked
export PROXY_ENABLED=1

# Stay conservative in proxy mode to avoid overloading the proxy pool or tripping risk control
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"

# Optional: enable proxy-connectivity test output (some scripts print test/proxy status)
export PROXY_TEST="${PROXY_TEST:-0}"

is_job_running() {
    local script="$1"
    local script_regex="${script//./\\.}"
    pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
}

start_job() {
    local script="$1"
    local log_file="$2"
    local label="$3"
    local existing

    existing="$(is_job_running "${script}")"
    if [[ -n "${existing}" ]]; then
        echo "跳过 ${label}: ${script} 已在运行"
        echo "${existing}" | head -n 1
        return 0
    fi

    nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
    echo "启动 ${label}: ${script} -> ${log_file}"
    sleep 1
}

echo "代理模式(PROXY_ENABLED=1),每周一次建议用 cron 调度"
echo "代理配置读取自 request/proxy_settings.json"
echo "每周一次代理任务 = 全量采集所有站点"

# The weekly proxy run does a full crawl:
# - sites with stronger risk control: 找法网, 法律快车
# - the remaining sites run as well, guaranteeing at least one full refresh per week
start_job "dls.py" "proxy_dls.log" "大律师(代理全量)"
start_job "dls_pc.py" "proxy_dls_pc.log" "大律师PC站(代理全量)"
start_job "findlaw.py" "proxy_findlaw.log" "找法网(代理)"
start_job "lawtime.py" "proxy_lawtime.log" "法律快车(代理)"
start_job "hualv.py" "proxy_hualv.log" "华律(代理全量)"
start_job "six4365.py" "proxy_six4365.log" "律图(代理全量)"
@@ -0,0 +1,565 @@
#!/usr/bin/env python3
import argparse
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

import pymysql
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Font

from config import DB_CONFIG


@dataclass(frozen=True)
class LawyerRecord:
    id: int
    name: str
    phone: str
    law_firm: str
    province: str
    city: str
    domain: str
    create_time: int


@dataclass(frozen=True)
class PhoneBackfill:
    matched_phones: List[str]
    records: List[LawyerRecord]
    best_name: str
    best_law_firm: str
    best_domain: str
    candidate_names: List[str]
    candidate_firms: List[str]
    candidate_domains: List[str]


DOMAIN_PRIORITY = {
    "华律": 90,
    "大律师": 85,
    "找法网": 82,
    "法律快车": 80,
    "律图": 72,
    "众法利单页": 68,
    "众法利": 66,
    "六四三六五": 64,
    "智飞律师在线": 40,
    "高德地图": 10,
}

GENERIC_FIRMS = {"高德搜索"}


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="按律所名从数据库补手机号并导出对比表")
    parser.add_argument("--input", default="man.xlsx", help="原始 xlsx 文件路径")
    parser.add_argument(
        "--output",
        default="man_firm_phone_compare.xlsx",
        help="输出 xlsx 文件路径",
    )
    return parser.parse_args()


def normalize_text(value: object) -> str:
    text = str(value or "").strip()
    # Normalize fullwidth parentheses to ASCII so names compare consistently
    text = text.replace("(", "(").replace(")", ")")
    text = re.sub(r"\s+", "", text)
    return text


def normalize_firm(value: object) -> str:
    text = normalize_text(value)
    text = text.replace("本地大所", "").replace("特色律所", "")
    return text


def normalize_name(value: object) -> str:
    text = normalize_text(value)
    return text.replace("律师", "")


def normalize_province(value: object) -> str:
    text = str(value or "").strip()
    mapping = {
        "北京市": "北京",
        "天津市": "天津",
        "上海市": "上海",
        "重庆市": "重庆",
        "内蒙古自治区": "内蒙古",
        "广西壮族自治区": "广西",
        "宁夏回族自治区": "宁夏",
        "新疆维吾尔自治区": "新疆",
        "西藏自治区": "西藏",
        "香港特别行政区": "香港",
        "澳门特别行政区": "澳门",
        "新疆生产建设兵团": "新疆",
    }
    if text in mapping:
        return mapping[text]
    if text.endswith("省") and len(text) > 1:
        return text[:-1]
    return text


def normalize_city(value: object) -> str:
    text = str(value or "").strip()
    for suffix in ("市", "地区", "盟"):
        if text.endswith(suffix) and len(text) > len(suffix):
            return text[: -len(suffix)]
    return text


def split_phones(value: object) -> List[str]:
    return re.findall(r"1\d{10}", str(value or ""))

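A quick illustration of `split_phones` with made-up values: only 11-digit numbers beginning with 1 are extracted, so landline fragments are ignored.

```python
# Hypothetical input: one landline and two mobiles
print(split_phones("0755-1234567;13800138000 / 13900139000"))
# -> ['13800138000', '13900139000']
```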
def unique_phones(records: Sequence[LawyerRecord]) -> List[str]:
    output: List[str] = []
    seen = set()
    for record in sorted(records, key=lambda item: (item.create_time, item.id), reverse=True):
        if record.phone and record.phone not in seen:
            seen.add(record.phone)
            output.append(record.phone)
    return output


def unique_values(records: Sequence[LawyerRecord], attr: str) -> List[str]:
    output: List[str] = []
    seen = set()
    for record in sorted(records, key=lambda item: (item.create_time, item.id), reverse=True):
        value = getattr(record, attr, "")
        if value and value not in seen:
            seen.add(value)
            output.append(value)
    return output


def phone_record_sort_key(
    record: LawyerRecord,
    target_name: object,
    target_province: object,
    target_city: object,
) -> Tuple[int, int, int]:
    score = 0
    normalized_target_name = normalize_name(target_name)
    normalized_target_province = normalize_province(target_province)
    normalized_target_city = normalize_city(target_city)

    if normalized_target_name:
        if normalize_name(record.name) == normalized_target_name:
            score += 400
        elif record.name:
            score -= 40

    if record.law_firm and record.law_firm not in GENERIC_FIRMS:
        score += 220
    elif record.law_firm:
        score += 40

    if record.name:
        score += 100

    if normalized_target_city:
        if normalize_city(record.city) == normalized_target_city:
            score += 45
        elif record.city:
            score -= 10

    if normalized_target_province:
        if normalize_province(record.province) == normalized_target_province:
            score += 25
        elif record.province:
            score -= 5

    score += DOMAIN_PRIORITY.get(record.domain, 50)
    return score, record.create_time, record.id


def compare_result(original_phones: Sequence[str], candidate_phones: Sequence[str]) -> str:
    if not candidate_phones:
        return "未匹配"
    if not original_phones:
        return "原手机号为空"

    original_set = set(original_phones)
    candidate_set = set(candidate_phones)
    if original_set == candidate_set:
        return "完全一致"
    if original_set & candidate_set:
        return "候选包含原手机号"
    return "不包含原手机号"

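For reference, each return value of `compare_result` corresponds to inputs like these (the numbers are made up):

```python
assert compare_result(["13800138000"], []) == "未匹配"
assert compare_result([], ["13800138000"]) == "原手机号为空"
assert compare_result(["13800138000"], ["13800138000"]) == "完全一致"
assert compare_result(["13800138000"], ["13800138000", "13900139000"]) == "候选包含原手机号"
assert compare_result(["13800138000"], ["13900139000"]) == "不包含原手机号"
```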
def infer_firm_from_address(address: object, ordered_firms: Sequence[str]) -> str:
    normalized_address = normalize_text(address)
    if not normalized_address:
        return ""
    for firm in ordered_firms:
        if len(firm) < 4:
            continue
        if firm in normalized_address:
            return firm
    return ""


def load_db_indexes() -> Tuple[Dict[str, List[LawyerRecord]], List[str], Dict[str, List[LawyerRecord]]]:
    conn = pymysql.connect(**DB_CONFIG)
    firm_index: Dict[str, List[LawyerRecord]] = defaultdict(list)
    phone_index: Dict[str, List[LawyerRecord]] = defaultdict(list)
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT id, name, phone, law_firm, province, city, domain, create_time
                FROM lawyer
                WHERE phone IS NOT NULL
                  AND phone <> ''
                """
            )
            for row in cur.fetchall():
                record = LawyerRecord(
                    id=int(row[0]),
                    name=str(row[1] or "").strip(),
                    phone=str(row[2] or "").strip(),
                    law_firm=str(row[3] or "").strip(),
                    province=str(row[4] or "").strip(),
                    city=str(row[5] or "").strip(),
                    domain=str(row[6] or "").strip(),
                    create_time=int(row[7] or 0),
                )
                phone_index[record.phone].append(record)
                normalized_firm = normalize_firm(record.law_firm)
                if normalized_firm:
                    firm_index[normalized_firm].append(record)
    finally:
        conn.close()

    ordered_firms = sorted(firm_index.keys(), key=len, reverse=True)
    return firm_index, ordered_firms, phone_index


def build_phone_backfill(
    original_phone: object,
    name: object,
    province: object,
    city: object,
    phone_index: Dict[str, List[LawyerRecord]],
) -> PhoneBackfill:
    def pick_best_name(records: Sequence[LawyerRecord], target_name: object) -> str:
        normalized_target_name = normalize_name(target_name)
        if normalized_target_name:
            for item in records:
                if item.name and normalize_name(item.name) == normalized_target_name:
                    return item.name
        for item in records:
            if item.name:
                return item.name
        return ""

    records: List[LawyerRecord] = []
    seen_ids = set()
    for phone in split_phones(original_phone):
        for record in phone_index.get(phone, []):
            if record.id in seen_ids:
                continue
            seen_ids.add(record.id)
            records.append(record)

    sorted_records = sorted(
        records,
        key=lambda item: phone_record_sort_key(item, name, province, city),
        reverse=True,
    )
    candidate_names = unique_values(sorted_records, "name")
    candidate_firms = unique_values(
        [item for item in sorted_records if item.law_firm and item.law_firm not in GENERIC_FIRMS],
        "law_firm",
    )
    if not candidate_firms:
        candidate_firms = unique_values(
            [item for item in sorted_records if item.law_firm],
            "law_firm",
        )
    candidate_domains = unique_values(sorted_records, "domain")
    matched_phones = unique_values(sorted_records, "phone")

    best_name = pick_best_name(sorted_records, name)
    best_law_firm = ""
    best_domain = ""
    preferred_name = normalize_name(name) or normalize_name(best_name)

    for record in sorted_records:
        if not record.law_firm or record.law_firm in GENERIC_FIRMS:
            continue
        if preferred_name and normalize_name(record.name) != preferred_name:
            continue
        best_law_firm = record.law_firm
        best_domain = record.domain
        break

    if not best_law_firm:
        for record in sorted_records:
            if record.law_firm and record.law_firm not in GENERIC_FIRMS:
                best_law_firm = record.law_firm
                best_domain = record.domain
                break

    if not best_domain and sorted_records:
        best_domain = sorted_records[0].domain

    return PhoneBackfill(
        matched_phones=matched_phones,
        records=sorted_records,
        best_name=best_name,
        best_law_firm=best_law_firm,
        best_domain=best_domain,
        candidate_names=candidate_names,
        candidate_firms=candidate_firms,
        candidate_domains=candidate_domains,
    )


def match_row(
    name: object,
    original_phone: object,
    law_firm: object,
    province: object,
    city: object,
    address: object,
    phone_backfill: PhoneBackfill,
    firm_index: Dict[str, List[LawyerRecord]],
    ordered_firms: Sequence[str],
) -> Tuple[str, str, List[LawyerRecord]]:
    def add_method(part: str, method_parts: List[str]) -> None:
        if part and part not in method_parts:
            method_parts.append(part)

    matched_firm = normalize_firm(law_firm)
    used_phone_backfill_firm = False
    inferred_from_address = False
    if not matched_firm:
        matched_firm = normalize_firm(phone_backfill.best_law_firm)
        used_phone_backfill_firm = bool(matched_firm)
    if not matched_firm:
        matched_firm = infer_firm_from_address(address, ordered_firms)
        inferred_from_address = bool(matched_firm)
    if not matched_firm:
        return "", "无可用律所名", []

    candidates = firm_index.get(matched_firm, [])
    if not candidates:
        return matched_firm, "数据库无此律所", []

    method_parts = ["律所"]
    chosen = list(candidates)

    normalized_name = normalize_name(name)
    if not normalized_name:
        normalized_name = normalize_name(phone_backfill.best_name)
    if normalized_name:
        name_filtered = [item for item in chosen if normalize_name(item.name) == normalized_name]
        if name_filtered:
            chosen = name_filtered
            add_method("姓名", method_parts)

    if len(unique_phones(chosen)) != 1:
        normalized_province = normalize_province(province)
        normalized_city = normalize_city(city)

        if normalized_province and normalized_city:
            province_city_filtered = [
                item
                for item in chosen
                if normalize_province(item.province) == normalized_province
                and normalize_city(item.city) == normalized_city
            ]
            if province_city_filtered:
                chosen = province_city_filtered
                add_method("省份", method_parts)
                add_method("城市", method_parts)

        if len(unique_phones(chosen)) != 1 and normalized_city:
            city_filtered = [
                item for item in chosen if normalize_city(item.city) == normalized_city
            ]
            if city_filtered:
                chosen = city_filtered
                add_method("城市", method_parts)

        if len(unique_phones(chosen)) != 1 and normalized_province:
            province_filtered = [
                item
                for item in chosen
                if normalize_province(item.province) == normalized_province
            ]
            if province_filtered:
                chosen = province_filtered
                add_method("省份", method_parts)

    method = "+".join(method_parts)
    if used_phone_backfill_firm:
        method = "手机号回填律所|" + method
    elif inferred_from_address:
        method = "地址推断律所|" + method
    return matched_firm, method, chosen


def autosize_columns(ws) -> None:
    for column_cells in ws.columns:
        values = [str(cell.value or "") for cell in column_cells]
        max_length = min(max((len(value) for value in values), default=0), 60)
        column_letter = column_cells[0].column_letter
        ws.column_dimensions[column_letter].width = max_length + 2


def iter_input_rows(ws) -> Iterable[Tuple[int, List[object]]]:
    for row_idx in range(1, ws.max_row + 1):
        yield row_idx, [ws.cell(row_idx, col_idx).value for col_idx in range(1, 8)]


def build_output(input_path: str, output_path: str) -> Dict[str, int]:
    workbook = load_workbook(input_path)
    source_ws = workbook.active

    firm_index, ordered_firms, phone_index = load_db_indexes()

    out_wb = Workbook()
    out_ws = out_wb.active
    out_ws.title = "firm_phone_compare"
    headers = [
        "原始行号",
        "原姓名",
        "原手机号",
        "原律所",
        "原省份",
        "原城市",
        "原地址",
        "原备注",
        "手机号命中记录数",
        "手机号命中手机号",
        "手机号补全姓名",
        "手机号补全律所",
        "手机号补全来源",
        "手机号候选姓名",
        "手机号候选律所",
        "用于匹配的律所",
        "匹配方式",
        "数据库候选手机号",
        "候选数量",
        "原手机号对比",
        "数据库候选姓名",
        "数据库候选省市",
        "数据库来源",
    ]
    out_ws.append(headers)
    for cell in out_ws[1]:
        cell.font = Font(bold=True)

    stats = defaultdict(int)
    for row_idx, row in iter_input_rows(source_ws):
        name, original_phone, law_firm, province, city, address, remark = row
        needs_phone_completion = not normalize_firm(law_firm)
        phone_backfill = build_phone_backfill(
            original_phone=original_phone,
            name=name,
            province=province,
            city=city,
            phone_index=phone_index,
        )
        matched_firm, method, matched_records = match_row(
            name=name,
            original_phone=original_phone,
            law_firm=law_firm,
            province=province,
            city=city,
            address=address,
            phone_backfill=phone_backfill,
            firm_index=firm_index,
            ordered_firms=ordered_firms,
        )
        candidate_phones = unique_phones(matched_records)
        compare = compare_result(split_phones(original_phone), candidate_phones)
        candidate_names = unique_values(matched_records, "name")
        candidate_domains = unique_values(matched_records, "domain")
        city_province_pairs = []
        seen_pairs = set()
        for record in matched_records:
            pair = f"{record.province}-{record.city}".strip("-")
            if pair and pair not in seen_pairs:
                seen_pairs.add(pair)
                city_province_pairs.append(pair)

        out_ws.append(
            [
                row_idx,
                name or "",
                original_phone or "",
                law_firm or "",
                province or "",
                city or "",
                address or "",
                remark or "",
                len(phone_backfill.records) if needs_phone_completion else "",
                " / ".join(phone_backfill.matched_phones) if needs_phone_completion else "",
                phone_backfill.best_name if needs_phone_completion else "",
                phone_backfill.best_law_firm if needs_phone_completion else "",
                phone_backfill.best_domain if needs_phone_completion else "",
                " / ".join(phone_backfill.candidate_names) if needs_phone_completion else "",
                " / ".join(phone_backfill.candidate_firms) if needs_phone_completion else "",
                matched_firm or "",
                method or "",
                " / ".join(candidate_phones) or "",
                len(candidate_phones),
                compare,
                " / ".join(candidate_names) or "",
                " / ".join(city_province_pairs) or "",
                " / ".join(candidate_domains) or "",
            ]
        )

        if needs_phone_completion and phone_backfill.records:
            stats["phone_backfill_hit_rows"] += 1
        if needs_phone_completion and phone_backfill.best_name:
            stats["phone_backfill_name_rows"] += 1
        if needs_phone_completion and phone_backfill.best_law_firm:
            stats["phone_backfill_firm_rows"] += 1
        if needs_phone_completion and method.startswith("手机号回填律所|"):
            stats["phone_backfill_used_for_match_rows"] += 1

        if candidate_phones:
            stats["matched_rows"] += 1
            if len(candidate_phones) == 1:
                stats["unique_rows"] += 1
            else:
                stats["multi_rows"] += 1
        else:
            stats["unmatched_rows"] += 1

        if compare == "完全一致":
            stats["same_rows"] += 1
        elif compare == "候选包含原手机号":
            stats["contains_rows"] += 1
        elif compare == "不包含原手机号":
            stats["diff_rows"] += 1
        elif compare == "原手机号为空":
            stats["blank_phone_rows"] += 1

    out_ws.freeze_panes = "A2"
    autosize_columns(out_ws)
    out_wb.save(output_path)
    return dict(stats)


def main() -> None:
    args = parse_args()
    stats = build_output(args.input, args.output)
    print(f"已生成: {args.output}")
    for key in sorted(stats):
        print(f"{key}={stats[key]}")


if __name__ == "__main__":
    main()
@@ -1,22 +1,114 @@
# Standalone project config for common_sites

# Database connection config
DB_CONFIG = {
    "host": "8.134.219.222",
    "user": "lawyer",
    "password": "CTxr8yGwsSX3NdfJ",
    "database": "lawyer",
    "host": "8.134.219.222",  # database host
    "user": "lawyer",  # database user
    "password": "CTxr8yGwsSX3NdfJ",  # database password
    "database": "lawyer",  # database name
    "charset": "utf8mb4",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
    "X-Requested-With": "XMLHttpRequest",
# Amap (Gaode Maps) API config
GAODE_CONFIG = {
    "API_KEY": "f261575fb28003761c433f6c9379e89d",
}

# WeChat-crawler-specific config
WEIXIN_CONFIG = {
    "TOKEN": "553117235",  # your token
    "FINGERPRINT": "3c02c35093184e9a9a668ac3c81e53f9",
    "COOKIE": {
        "appmsglist_action_3258147150": "card",
        "_qimei_uuid42": "1a302160d051008226aec905b63f99ff3989f30009",
        "_qimei_i_3": "63b22b84c15204dfc595ac6452d722b1f0bdf0f6145b568ae68a7c0e70947438686637943989e2a1d792",
        "_qimei_h38": "215986ce26aec905b63f99ff0200000e81a302",
        "ua_id": "S7gglu0eZh9NkAzLAAAAADH8dynpnFZVN29lxm7BQo0=",
        "wxuin": "73074968761097",
        "mm_lang": "zh_CN",
        "eas_sid": "91X7I7K4K5k364U2z3k2I980F5",
        "_qimei_q36": "",
        "_qimei_fingerprint": "d895c46d5fda98cab67d9daec00068ed",
        "_qimei_i_1": "4dc76680945f59d3c7c4ab325dd526b3feeea1a31458558bbdd97e582493206c6163629d39d8e1dcd4b2c28f",
        "pgv_pvid": "6923507145",
        "ts_uid": "9585717820",
        "_t_qbtool_uid": "aaaa2vn5byd280l00iglw701zci788cb",
        "_ga": "GA1.1.1323926288.1775838938",
        "_ga_TPFW0KPXC1": "GS2.1.s1775841484$o2$g1$t1775841485$j59$l0$h0",
        "uuid": "20d1cfb540221c6e7b6d665ab1d4a8f7",
        "rand_info": "CAESIA8LYV6dvWh5dYrgQLPhZb8TXwUJoWdcdDzN0TTdztSj",
        "slave_bizuin": "3258147150",
        "data_bizuin": "3258147150",
        "bizuin": "3258147150",
        "data_ticket": "dgLFmSrI8f1q6JnYOd2Y/sKJIWjh6YlLSau1n1+Mv5iOTR5hgsm1qjNLypWflGd6",
        "slave_sid": "VGVnNmM5NmFpV19ESElmVlZOTGZfVVJfWE5HanlHNjN0WEswZVkxVk9vc2FTenQzVGRsWUxDT0xGQVBJRVZzU0JNVV9RckRJVE9jSVUwbjl4Z2VHaEZKSzE5WVc3THRCRW96T0Z1V1VwbnBLSnkxSWdKaHdaN1dYdzI1SmdpZ0IyOFJtUE45OTR2Q2NvM1FB",
        "slave_user": "gh_fe76760560d0",
        "xid": "4893c62dc8518b6a1628fd34bc9aa276",
        "_clck": "3258147150|1|g5g|0",
        "_clsk": "1p4oo3h|1776957001796|5|1|mp.weixin.qq.com/weheat-agent/payload/record"
    },
    "COUNT": 20,  # items per page
    "REQUESTS_PER_SECOND": 8,  # max requests per second (higher is faster but riskier)
    "PAGE_DELAY": 0.8,  # seconds to wait after each page
    "CITY_DELAY": 0.3,  # seconds to wait after each city
}

# Common request headers
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    'X-Requested-With': 'XMLHttpRequest',
}

# 法律快车 crawler config
LAWTIME_CONFIG = {
    "HEADERS": {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
    }
}

# Redis config - crawl indexing and checkpoint/resume
REDIS_CONFIG = {
    "host": "127.0.0.1",
    "port": 6379,
    "password": "",
    "db": 0,  # use database 0
    "decode_responses": True,  # decode responses automatically
    "socket_timeout": 5,  # socket timeout
    "socket_connect_timeout": 5,  # connection-establishment timeout
    "health_check_interval": 30,  # health-check interval
    "retry_on_timeout": True,  # retry on timeout
    "max_connections": 20,  # max pool connections
}

# Redis key-name config
REDIS_KEYS = {
    "spider_progress": "lawyer:spider:progress:{spider_name}",  # spider progress
    "url_processed": "lawyer:url:processed:{spider_name}",  # processed URL set
    "url_failed": "lawyer:url:failed:{spider_name}",  # failed URL set
    "spider_stats": "lawyer:stats:{spider_name}",  # per-spider stats
    "global_stats": "lawyer:global:stats",  # global stats
    "session_info": "lawyer:session:{session_id}",  # session info
    "url_queue": "lawyer:queue:{spider_name}",  # URL queue
    "duplicate_filter": "lawyer:duplicate:{spider_name}",  # dedup filter
}

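The values above are `str.format` templates keyed by spider name; a minimal usage sketch:

```python
# Expand a template into the concrete Redis key for one spider
key = REDIS_KEYS["spider_progress"].format(spider_name="findlaw")
print(key)  # lawyer:spider:progress:findlaw
```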
# MongoDB config - log storage
MONGO_CONFIG = {
    "uri": "mongodb://127.0.0.1:27017/",
    "database": "lawyer",
    "collections": {
        "logs": "logs",  # general logs
        "spider_logs": "spider_logs",  # spider logs
        "error_logs": "error_logs",  # error logs
        "system_logs": "system_logs",  # system logs
        "performance_logs": "performance_logs"  # performance logs
    },
    "options": {
        "maxPoolSize": 10,  # max pool connections
        "minPoolSize": 1,  # min pool connections
        "maxIdleTimeMS": 30000,  # max idle time
        "serverSelectionTimeoutMS": 5000,  # server-selection timeout
        "connectTimeoutMS": 10000,  # connect timeout
        "socketTimeoutMS": 30000,  # socket timeout
    }
}

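A minimal sketch of opening this log store with pymongo (assumes `pip install pymongo`; every key in "options" is a standard MongoClient keyword argument):

```python
from pymongo import MongoClient

# Connect using the config above and grab the spider-log collection
client = MongoClient(MONGO_CONFIG["uri"], **MONGO_CONFIG["options"])
log_db = client[MONGO_CONFIG["database"]]
spider_logs = log_db[MONGO_CONFIG["collections"]["spider_logs"]]
spider_logs.insert_one({"level": "INFO", "msg": "smoke", "ts": 0})  # sample document
```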
@@ -0,0 +1,14 @@
services:
  mongodb:
    image: mongo:7
    container_name: lawyers_mongodb
    restart: always
    ports:
      - "27017:27017"
    volumes:
      - mongodb_data:/data/db
    environment:
      MONGO_INITDB_DATABASE: lawyer

volumes:
  mongodb_data:
@@ -0,0 +1,401 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import time
import json
import math
import logging
import re
from typing import Dict, List, Optional
from urllib.parse import urlencode

import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException, HTTPError, ConnectionError, Timeout
from urllib3.util.retry import Retry

# Add the project root to sys.path (keeps the original layout working)
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db  # project DB wrapper
import config as project_config

# logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)


class GaodeSpider:
    """Amap (Gaode Maps) API merchant phone-number crawler, refactored."""

    def __init__(
        self,
        db_connection,
        api_key: Optional[str] = None,
        offset: int = 20,
        max_pages_per_city: int = 10,
        sleep_between_pages: float = 2.0,
        sleep_between_cities: float = 3.0,
    ):
        self.db = db_connection
        config_api_key = ""
        gaode_config = getattr(project_config, "GAODE_CONFIG", None)
        if isinstance(gaode_config, dict):
            config_api_key = str(gaode_config.get("API_KEY", "")).strip()

        self.api_key = (api_key or os.environ.get("AMAP_API_KEY", "") or config_api_key).strip()
        if not self.api_key:
            raise ValueError("高德 API Key 未配置,请在 config.py 的 GAODE_CONFIG.API_KEY 或环境变量 AMAP_API_KEY 中填写")
        self.api_base = "https://restapi.amap.com/v3/place/text"
        self.offset = offset
        self.session = self._build_session()
        self.max_pages_per_city = max_pages_per_city
        self.sleep_between_pages = sleep_between_pages
        self.sleep_between_cities = sleep_between_cities

        # Load area data
        self.cities = self._load_area_data()

    def _build_session(self) -> requests.Session:
        s = requests.Session()
        # Retry for idempotent errors (GET) and some server errors
        retries = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=(429, 500, 502, 503, 504),
            allowed_methods=frozenset(["GET", "POST"])
        )
        adapter = HTTPAdapter(max_retries=retries)
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        })
        return s

    def _load_area_data(self) -> Dict[int, Dict]:
        """Load area data from the database (compatible with the existing table schema).

        Requires the area_new table to contain: id, code, city, province, pid, pinyin, domain, level.
        Only loads cities with domain='maxlaw' and level=2.
        Returns a dict: { city_id: {code, name, province, pid, pinyin} }
        """
        try:
            rows = self.db.select_data("area_new", "id, code, city, province, pid, pinyin", "domain='maxlaw' AND level=2")
            result = {}
            for r in rows:
                cid = r.get("id")
                result[cid] = {
                    "code": r.get("code") or "",
                    "name": r.get("city") or "",
                    "province": r.get("province") or "",
                    "pid": r.get("pid"),
                    "pinyin": r.get("pinyin") or ""
                }
            logger.info("加载城市数量: %d", len(result))
            return result
        except Exception as e:
            logger.exception("从数据库加载地区数据失败: %s", e)
            return {}

    def _search_gaode_api(self, keywords: str, city: str, page: int = 1) -> Dict:
        """Call the Amap search API and return the full JSON response (or an empty dict)."""
        params = {
            "keywords": keywords,
            "city": city,
            "offset": self.offset,
            "page": page,
            "key": self.api_key,
            "extensions": "all"
        }
        # Log at debug level rather than print(): params contains the API key
        logger.debug("请求参数: %s", params)
        try:
            resp = self.session.get(self.api_base, params=params, timeout=15)
            resp.raise_for_status()
            data = resp.json()
            return data
        except (HTTPError, ConnectionError, Timeout) as e:
            logger.warning("高德 API 请求失败(%s %s page=%s): %s", keywords, city, page, e)
            return {}
        except ValueError as e:
            logger.error("高德 API 返回非 JSON 数据: %s", e)
            return {}
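For orientation, a sketch of the place/text response shape that the code above and below reads; the field names mirror the parsing logic, the values are invented:

```python
# Minimal shape of a successful Amap place/text response (values are made up)
sample_resp = {
    "status": "1",   # "1" means success
    "info": "OK",
    "count": "42",   # total result count, returned as a string
    "pois": [
        {
            "name": "某某律师事务所",
            "address": "某区某路1号",
            "location": "113.26,23.13",
            "business": {"tel": "13800138000;0755-88888888"},
        }
    ],
}
```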
    def _split_and_clean_phones(self, raw_tel: str) -> List[str]:
        """Split a raw tel string into candidate numbers and clean them."""
        if not raw_tel:
            return []

        logger.debug("原始电话号码: %s", raw_tel)

        # Common separators: ; / , 、 | and whitespace
        parts = re.split(r"[;,/,、\|]+|\s+", raw_tel.strip())
        cleaned = []

        for p in parts:
            if not p:
                continue

            original_p = p
            # Strip bracketed content (fullwidth and ASCII parentheses) and
            # everything that is not a digit, hyphen, or plus sign
            p = re.sub(r"(.*?)|\(.*?\)|[^\d\-+]", "", p)
            # Some numbers carry the +86 country code
            p = p.lstrip("+")
            # Drop a leading 86 when 11 digits follow
            if p.startswith("86") and len(p) > 11:
                p = p[2:]
            # Finally remove hyphens
            p = p.replace("-", "")

            if p:
                cleaned.append(p)
                logger.debug("清洗后号码: %s -> %s", original_p, p)

        logger.debug("清洗后共 %d 个号码: %s", len(cleaned), cleaned)
        return cleaned

    def _is_valid_phone(self, phone: str) -> bool:
        """Validate a mobile number: must be 11 digits starting with 1."""
        if not phone:
            return False
        # Hard requirement: an 11-digit mobile number starting with 1
        if re.fullmatch(r"1\d{10}", phone):
            return True
        return False
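A standalone re-run of the cleaning chain above, using the same regexes as `_split_and_clean_phones` plus the `_is_valid_phone` check (the numbers are made up):

```python
import re

raw_tel = "13800138000;0755-88888888 +8613900139000"
parts = re.split(r"[;,/,、\|]+|\s+", raw_tel.strip())
valid = []
for p in parts:
    # Drop bracketed content and non-digit/hyphen/plus characters, then the + prefix
    p = re.sub(r"(.*?)|\(.*?\)|[^\d\-+]", "", p).lstrip("+")
    if p.startswith("86") and len(p) > 11:  # strip a leading country code
        p = p[2:]
    p = p.replace("-", "")
    if re.fullmatch(r"1\d{10}", p):  # keep only 11-digit mobiles starting with 1
        valid.append(p)
print(valid)  # ['13800138000', '13900139000'] - the landline is filtered out
```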
    def _extract_phones_from_poi(self, poi: Dict) -> List[str]:
        """
        Extract all candidate phone numbers from a POI record.

        Prefers poi['business']['tel'] and stays compatible with older
        fields such as tel/phone. Returns the deduplicated list of numbers
        that pass validation.
        """
        candidates = []

        # 1) Prefer the business.tel field (the main phone field in the Amap API)
        business = poi.get("business") or {}
        tel = business.get("tel")
        if tel:
            logger.debug("从 business.tel 提取: %s", tel)
            candidates.extend(self._split_and_clean_phones(str(tel)))

        # 2) Backwards compatibility: top-level tel/phone/contact fields
        for key in ("tel", "phone", "contact", "business_area"):
            v = poi.get(key)
            if v:
                logger.debug("从 %s 提取: %s", key, v)
                candidates.extend(self._split_and_clean_phones(str(v)))

        # 3) Cautious fallback: some extension fields may also contain phones
        for nested_key in ("biz_ext", "ext", "attributes"):
            nested = poi.get(nested_key) or {}
            if isinstance(nested, dict):
                for subkey in ("tel", "phone", "contact"):
                    if nested.get(subkey):
                        logger.debug("从 %s.%s 提取: %s", nested_key, subkey, nested.get(subkey))
                        candidates.extend(self._split_and_clean_phones(str(nested.get(subkey))))

        # Deduplicate and keep only numbers that pass _is_valid_phone
        unique = []
        for c in candidates:
            if c not in unique and self._is_valid_phone(c):
                unique.append(c)
                logger.debug("有效电话号码: %s", c)

        logger.debug("POI %s 提取到 %d 个有效电话号码", poi.get("name", ""), len(unique))
        return unique

    def _is_duplicate(self, phone: str) -> bool:
        """Check whether a phone already exists (domain='高德地图')."""
        try:
            condition = f"phone='{phone}' AND domain='高德地图'"
            exists = self.db.is_data_exist("lawyer", condition)
            if exists:
                logger.debug("手机号已存在: %s (domain=高德地图)", phone)
            return exists
        except Exception as e:
            logger.exception("去重检查失败: %s", e)
            # On failure, return True to avoid duplicate inserts or dirty data
            return True

    def _parse_poi_to_record(self, poi: Dict, city_info: Dict, province_info: Dict, used_phone: str) -> Dict:
        """Convert one POI into a database record for the chosen used_phone."""
        # shopinfo may be a string or a dict; handle it defensively
        shopinfo = poi.get("shopinfo")
        if isinstance(shopinfo, dict):
            law_firm = shopinfo.get("shop_name", "高德搜索")
        else:
            law_firm = "高德搜索"

        record = {
            "name": poi.get("name", "").strip(),
            "phone": used_phone,
            "law_firm": law_firm,
            "province": province_info.get("name", ""),
            "city": city_info.get("name", ""),
            "url": poi.get("website", "") or "",
            "domain": "高德地图",
            "create_time": int(time.time()),
            "params": json.dumps({
                "address": poi.get("address", ""),
                "location": poi.get("location", ""),
                "type": poi.get("type", ""),
                "business_area": poi.get("business_area", ""),
                "raw_tel": poi.get("tel", "") or "",
                "raw_poi": poi
            }, ensure_ascii=False)
        }
        return record

    def _save_lawyer(self, record: Dict) -> bool:
        """Store one lawyer record in the database."""
        try:
            self.db.insert_data("lawyer", record)
            logger.info("新增商户: %s (%s)", record.get("name"), record.get("phone"))
            return True
        except Exception as e:
            logger.exception("存储失败: %s %s", record.get("name"), record.get("phone"))
            return False

    def _search_city(self, keywords: str, city_info: Dict, province_info: Dict) -> int:
        """Search within one city and store results; returns the number of new rows."""
        # The city parameter accepts a city code or a city name; the stored name is used here
        # city_code = city_info.get("code") or city_info.get("name")
        city_code = city_info.get("name")
        total_added = 0

        # Request the first page to obtain the total count
        page = 1
        first_resp = self._search_gaode_api(keywords, city_code, page)
        if not first_resp:
            logger.info(" 未获取到第一页数据: %s", keywords)
            return 0

        status = first_resp.get("status")
        if str(status) != "1":
            logger.warning(" 高德返回错误: %s", first_resp.get("info"))
            return 0

        try:
            count = int(first_resp.get("count", 0))
        except Exception:
            count = 0
        # Compute the total number of pages
        total_pages = math.ceil(count / self.offset) if count else 1
        total_pages = min(total_pages, self.max_pages_per_city)

        logger.info(" 城市 %s 搜索到 count=%s, pages=%s (限制 %s)", city_code, count, total_pages, self.max_pages_per_city)

        # Process the first page of POIs
        def process_page(page_num: int, page_data: Dict) -> int:
            """Process one page of results; returns the number of new rows."""
            nonlocal total_added
            if not page_data:
                logger.info(" page %s 未返回数据", page_num)
                return 0
            if str(page_data.get("status")) != "1":
                logger.warning(" page %s 返回状态非1: %s", page_num, page_data.get("info"))
                return 0

            pois = page_data.get("pois") or []
            page_added = 0
            for poi in pois:
                name = (poi.get("name") or "").strip()
                if not name:
                    continue
                phones = self._extract_phones_from_poi(poi)
                if not phones:
                    logger.debug(" 跳过无电话: %s", name)
                    continue
                for ph in phones:
                    # Skip numbers that already exist
                    if self._is_duplicate(ph):
                        logger.debug(" 跳过已存在号码: %s (%s)", name, ph)
                        continue
                    rec = self._parse_poi_to_record(poi, city_info, province_info, ph)
                    ok = self._save_lawyer(rec)
                    if ok:
                        page_added += 1
                        total_added += 1
                    # If this POI has several numbers, the remaining ones are still inserted
                # end for phones
            # end for pois
            return page_added

        # Handle the first page
        first_page_added = process_page(1, first_resp)
        logger.info(" 城市 %s 第 1 页新增 %d 条", city_code, first_page_added)

        # Track processed pages to avoid double handling
        processed_pages = {1}

        # Fetch the remaining pages until exhausted or empty
        for page_num in range(2, total_pages + 1):
            if page_num in processed_pages:
                continue

            time.sleep(self.sleep_between_pages)
            page_data = self._search_gaode_api(keywords, city_code, page_num)

            if not page_data:
                logger.info(" 第 %s 页无响应数据,停止翻页", page_num)
                break

            if str(page_data.get("status")) != "1":
                logger.info(" 第 %s 页状态异常(%s),停止翻页", page_num, page_data.get("info"))
                break

            pois = page_data.get("pois") or []
            if not pois:
                logger.info(" 第 %s 页返回空pois,提前结束", page_num)
                break

            page_added = process_page(page_num, page_data)
            logger.info(" 城市 %s 第 %s 页新增 %d 条", city_code, page_num, page_added)
            processed_pages.add(page_num)

            # Fewer results than a full page means we are near the end
            if len(pois) < self.offset:
                logger.info(" 第 %s 页结果不足一页,推测已到尾页,提前结束", page_num)
                break

        return total_added
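A worked example of the paging arithmetic in `_search_city`: 53 results with an offset of 20 plan out to 3 pages, after which the `max_pages_per_city` cap is applied.

```python
import math

count, offset, max_pages_per_city = 53, 20, 10
total_pages = math.ceil(count / offset) if count else 1  # ceil(53 / 20) == 3
total_pages = min(total_pages, max_pages_per_city)       # cap leaves it at 3
print(total_pages)  # 3
```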
    def run(self):
        logger.info("启动高德地图律师信息采集...")
        if not self.cities:
            logger.error("未加载城市列表,退出")
            return

        total_stored = 0
        keywords_suffix = "律师"

        for city_id, city_info in self.cities.items():
            try:
                province_info = {"name": city_info.get("province", "")}
                city_name = city_info.get("name", "")
                if not city_name:
                    continue
                search_keywords = f"{keywords_suffix}"
                added = self._search_city(search_keywords, city_info, province_info)
                total_stored += added
                logger.info("城市 %s 完成,新增 %d 条,总计 %d", city_name, added, total_stored)
                time.sleep(self.sleep_between_cities)
            except Exception as e:
                logger.exception("处理城市 %s 时出错: %s", city_info.get("name", ""), e)

        logger.info("采集完成,共新增 %d 条商户信息。", total_stored)


if __name__ == "__main__":
    # Usage example
    with Db() as db:
        spider = GaodeSpider(db)
        spider.run()
+832
@@ -0,0 +1,832 @@
|
||||
// ==UserScript==
|
||||
// @name Douyin Batch City Search + AutoScroll + Capture
|
||||
// @namespace http://tampermonkey.net/
|
||||
// @version 1.1
|
||||
// @description 从 Python 服务获取地区列表,按 city + "律师" 搜索并自动下滑,拦截 /aweme/v1/web/discover/search/ 返回并转发到入库接口。
|
||||
// @author You
|
||||
// @match https://www.douyin.com/*
|
||||
// @grant GM_xmlhttpRequest
|
||||
// @connect *
|
||||
// @run-at document-idle
|
||||
// ==/UserScript==
|
||||
|
||||
(function () {
|
||||
'use strict';
|
||||
|
||||
/********************* 配置区(按需修改) *********************/
|
||||
const API_BASE = 'http://127.0.0.1:9002'; // 改成你部署 Python 服务的地址,例如 http://nas.nepiedg.site:9002
|
||||
const AREA_API = `${API_BASE}/api/layer/get_area?server=1`; // 获取城市列表的接口
|
||||
const SEND_TARGETS = [
|
||||
`${API_BASE}/api/layer/index?server=1&save_only=0`
|
||||
];
|
||||
|
||||
// 搜索框与按钮选择器(根据页面更新)
|
||||
const SEARCH_INPUT_SELECTORS = [
|
||||
'input[data-e2e="search-input"]',
|
||||
'input[data-e2e="searchbar-input"]',
|
||||
'form[data-e2e="searchbar"] input',
|
||||
'input[placeholder*="搜索"]'
|
||||
];
|
||||
const SEARCH_BTN_SELECTORS = [
|
||||
'[data-e2e="search-button"]',
|
||||
'button[data-e2e="search-button"]',
|
||||
'span[data-e2e="search-button"]',
|
||||
'button[data-e2e="searchbar-button"]',
|
||||
'span.btn-title'
|
||||
];
|
||||
|
||||
// 每个城市搜索时的自动下滑配置
|
||||
const SCROLL_INTERVAL_MS = 2000;
|
||||
const MAX_STABLE_COUNT = 6;
|
||||
const MAX_SCROLLS_PER_CITY = 120;
|
||||
const SCROLL_BY = 2200;
|
||||
const WAIT_AFTER_SEARCH_MS = 1000;
|
||||
const DELAY_BETWEEN_CITIES_MS = 1500;
|
||||
|
||||
// 断点续跑配置
|
||||
const PROGRESS_STORAGE_KEY = 'dm_batch_progress_v1';
|
||||
const DEVICE_ID_STORAGE_KEY = 'dm_batch_device_id_v1';
|
||||
const PROGRESS_SYNC_ENABLED = true;
|
||||
const PROGRESS_KEY = 'douyin_batch_default';
|
||||
const PROGRESS_API = `${API_BASE}/api/layer/progress?server=1`;
|
||||
|
||||
// 可选:如果希望只发送包含手机号的条目,可在此启用并调整正则
|
||||
const ONLY_SEND_IF_HAS_PHONE = false;
|
||||
const PHONE_REGEX = /(?:\+?86)?1[3-9]\d{9}/g;
|
||||
|
||||
/********************* 运行时状态 *********************/
|
||||
let areaList = [];
|
||||
let stopFlag = false; // 由 UI 控制,true 表示停止整个任务
|
||||
let skipCurrentCityFlag = false; // 由 UI 控制,true 表示跳过当前城市
|
||||
let currentCityIndex = -1;
|
||||
let currentAreaSignature = '';
|
||||
let isLoopRunning = false;
|
||||
let inputEl = null;
|
||||
let btnEl = null;
|
||||
const DEVICE_ID = getOrCreateDeviceId();
|
||||
|
||||
// 节流/去重发送
|
||||
let lastSentHash = null;
|
||||
let lastSentAt = 0;
|
||||
const SEND_MIN_INTERVAL_MS = 800;
|
||||
let progressSyncInFlight = false;
|
||||
let progressSyncPendingPayload = null;
|
||||
|
||||
/********************* 工具函数 *********************/
|
||||
function log(...args) { console.log('[DouyinBatch] ', ...args); }
|
||||
function err(...args) { console.error('[DouyinBatch] ', ...args); }
|
||||
|
||||
function hashString(str) {
|
||||
let h = 2166136261 >>> 0;
|
||||
for (let i = 0; i < str.length; i++) {
|
||||
h ^= str.charCodeAt(i);
|
||||
h = Math.imul(h, 16777619) >>> 0;
|
||||
}
|
||||
return h.toString(16);
|
||||
}

function sleep(ms) {
  return new Promise(r => setTimeout(r, ms));
}

function getOrCreateDeviceId() {
  try {
    const old = localStorage.getItem(DEVICE_ID_STORAGE_KEY);
    if (old) return old;
    const generated = (window.crypto && typeof window.crypto.randomUUID === 'function')
      ? window.crypto.randomUUID()
      : `dm-${Date.now()}-${Math.random().toString(16).slice(2, 10)}`;
    localStorage.setItem(DEVICE_ID_STORAGE_KEY, generated);
    return generated;
  } catch (_) {
    return `dm-${Date.now()}-${Math.random().toString(16).slice(2, 10)}`;
  }
}

function getAreaRowName(row) {
  if (!row || typeof row !== 'object') return '';
  return String(row.city || row.province || row.name || '').trim();
}

function buildAreaSignature(list) {
  try {
    if (!Array.isArray(list) || list.length === 0) return 'empty';
    const names = list.map(getAreaRowName).filter(Boolean);
    return hashString(`${list.length}|${names.join('|')}`);
  } catch (e) {
    return 'unknown';
  }
}

function readProgress() {
  try {
    const raw = localStorage.getItem(PROGRESS_STORAGE_KEY);
    if (!raw) return null;
    const parsed = JSON.parse(raw);
    if (!parsed || typeof parsed !== 'object') return null;
    return parsed;
  } catch (_) {
    return null;
  }
}

function buildProgressPayload(nextCityIndex, reason = '') {
  const safeIndex = Number.isFinite(nextCityIndex) ? Math.max(0, Math.floor(nextCityIndex)) : 0;
  const currentArea = areaList[safeIndex] || areaList[Math.max(0, currentCityIndex)] || {};
  return {
    progress_key: PROGRESS_KEY,
    device_id: DEVICE_ID,
    next_city_index: safeIndex,
    area_signature: currentAreaSignature || '',
    area_total: Array.isArray(areaList) ? areaList.length : 0,
    current_city: getAreaRowName(currentArea),
    reason,
    status: stopFlag ? 'paused' : 'running',
    extra: {
      path: location.pathname || '',
      href: location.href || '',
    },
  };
}

function persistProgress(nextCityIndex, reason = '') {
  try {
    const payload = buildProgressPayload(nextCityIndex, reason);
    localStorage.setItem(PROGRESS_STORAGE_KEY, JSON.stringify({
      nextCityIndex: payload.next_city_index,
      areaSignature: payload.area_signature,
      reason: payload.reason,
      updatedAt: Date.now(),
      progressKey: payload.progress_key,
      deviceId: payload.device_id,
    }));

    enqueueRemoteProgressSync(payload);
  } catch (e) {
    err('failed to save progress', e);
  }
}

function restoreProgress(areaSignature, listLength) {
  const progress = readProgress();
  if (!progress) return 0;
  if (!progress.areaSignature || progress.areaSignature !== areaSignature) return 0;
  const idx = Number.isFinite(progress.nextCityIndex) ? Math.floor(progress.nextCityIndex) : 0;
  if (idx < 0 || idx >= listLength) return 0;
  return idx;
}

function clearProgress() {
  try { localStorage.removeItem(PROGRESS_STORAGE_KEY); } catch (_) {}
  enqueueRemoteProgressSync({
    action: 'clear',
    progress_key: PROGRESS_KEY,
    device_id: DEVICE_ID,
  });
}

function gmGetJson(url) {
  return new Promise((resolve, reject) => {
    GM_xmlhttpRequest({
      method: 'GET',
      url,
      onload(res) {
        try {
          const json = JSON.parse(res.responseText);
          resolve(json);
        } catch (e) {
          reject(e);
        }
      },
      onerror(error) { reject(error); }
    });
  });
}

function gmPostJson(url, data) {
  return new Promise((resolve, reject) => {
    GM_xmlhttpRequest({
      method: 'POST',
      url,
      headers: { 'Content-Type': 'application/json' },
      data: JSON.stringify(data || {}),
      onload(res) {
        try {
          const json = JSON.parse(res.responseText || '{}');
          resolve(json);
        } catch (e) {
          reject(e);
        }
      },
      onerror(error) { reject(error); }
    });
  });
}

function enqueueRemoteProgressSync(payload) {
  if (!PROGRESS_SYNC_ENABLED) return;
  if (!payload || typeof payload !== 'object') return;
  progressSyncPendingPayload = payload;
  if (progressSyncInFlight) return;
  flushRemoteProgressSync();
}

async function flushRemoteProgressSync() {
  if (!PROGRESS_SYNC_ENABLED) return;
  if (progressSyncInFlight) return;

  progressSyncInFlight = true;
  try {
    while (progressSyncPendingPayload) {
      const payload = progressSyncPendingPayload;
      progressSyncPendingPayload = null;
      try {
        await gmPostJson(PROGRESS_API, payload);
      } catch (e) {
        err('failed to sync remote progress', e);
        break;
      }
    }
  } finally {
    progressSyncInFlight = false;
  }
}
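// The enqueue/flush pair above is a "latest wins" single-flight queue: only the
// most recent payload is kept in progressSyncPendingPayload, and at most one
// POST is in flight at a time, so intermediate progress states are deliberately
// dropped rather than queued up.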

async function restoreRemoteProgress(areaSignature, listLength) {
  if (!PROGRESS_SYNC_ENABLED) return 0;
  try {
    const url = `${PROGRESS_API}&progress_key=${encodeURIComponent(PROGRESS_KEY)}`;
    const response = await gmGetJson(url);
    const data = response && response.data ? response.data : null;
    if (!data || typeof data !== 'object') return 0;

    const remoteSignature = String(data.area_signature || '');
    if (!remoteSignature || remoteSignature !== areaSignature) return 0;

    const idxRaw = data.next_city_index;
    const idx = Number.isFinite(idxRaw) ? Math.floor(idxRaw) : Math.floor(Number(idxRaw || 0));
    if (!Number.isFinite(idx) || idx < 0 || idx >= listLength) return 0;
    return idx;
  } catch (e) {
    err('failed to read remote progress', e);
    return 0;
  }
}

function setNativeValue(el, value) {
  if (!el) return;
  const prototype = el.constructor && el.constructor.prototype ? el.constructor.prototype : window.HTMLInputElement && window.HTMLInputElement.prototype;
  const descriptor = prototype ? Object.getOwnPropertyDescriptor(prototype, 'value') : null;
  if (descriptor && descriptor.set) {
    descriptor.set.call(el, value);
  } else {
    el.value = value;
  }
}
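// setNativeValue() writes through the prototype's value setter instead of the
// element's own property. This is the usual workaround for frameworks (likely
// React here, though the page's framework is an assumption) that patch the
// instance setter for change tracking: assigning el.value directly can leave
// the framework's internal state out of sync, so the 'input' events dispatched
// afterwards would be ignored.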

async function simulateSearchInput(keyword) {
  if (!inputEl) return;
  try {
    inputEl.focus();
    inputEl.dispatchEvent(new Event('focus', { bubbles: false }));

    // clear the old value and fire the corresponding events
    if (inputEl.value) {
      setNativeValue(inputEl, '');
      if (typeof InputEvent === 'function') {
        inputEl.dispatchEvent(new InputEvent('input', { bubbles: true, inputType: 'deleteContentBackward', data: '' }));
      } else {
        inputEl.dispatchEvent(new Event('input', { bubbles: true }));
      }
    }

    setNativeValue(inputEl, keyword);
    if (typeof InputEvent === 'function') {
      inputEl.dispatchEvent(new InputEvent('beforeinput', { bubbles: true, inputType: 'insertText', data: keyword }));
      inputEl.dispatchEvent(new InputEvent('input', { bubbles: true, inputType: 'insertText', data: keyword }));
    } else {
      inputEl.dispatchEvent(new Event('input', { bubbles: true }));
    }
    inputEl.dispatchEvent(new Event('change', { bubbles: true }));
    inputEl.dispatchEvent(new Event('blur', { bubbles: false }));
  } catch (e) {
    err('simulateSearchInput error', e);
  }
  await new Promise(r => setTimeout(r, 80));
}

function simulateSearchTrigger() {
  let triggered = false;
  if (btnEl && btnEl.isConnected) {
    try {
      btnEl.focus();
      btnEl.dispatchEvent(new MouseEvent('mousedown', { bubbles: true, cancelable: true, view: window }));
      btnEl.dispatchEvent(new MouseEvent('mouseup', { bubbles: true, cancelable: true, view: window }));
      btnEl.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true, view: window }));
      triggered = true;
    } catch (e) {
      err('simulateSearchTrigger click error', e);
    }
  }

  if (!triggered && inputEl) {
    try {
      const opts = { bubbles: true, cancelable: true, key: 'Enter', code: 'Enter', keyCode: 13, which: 13 };
      inputEl.dispatchEvent(new KeyboardEvent('keydown', opts));
      inputEl.dispatchEvent(new KeyboardEvent('keypress', opts));
      inputEl.dispatchEvent(new KeyboardEvent('keyup', opts));
      triggered = true;
    } catch (e) {
      err('failed to trigger search via Enter', e);
    }
  }

  return triggered;
}

function sendToTargets(data) {
  try {
    const body = typeof data === 'string' ? data : JSON.stringify(data);
    if (ONLY_SEND_IF_HAS_PHONE) {
      if (!PHONE_REGEX.test(body)) {
        // no phone number matched; skip sending
        return;
      }
    }
    const hash = hashString(body);
    const now = Date.now();
    if (hash === lastSentHash && now - lastSentAt < SEND_MIN_INTERVAL_MS) {
      return;
    }
    lastSentHash = hash;
    lastSentAt = now;

    for (const target of SEND_TARGETS) {
      GM_xmlhttpRequest({
        method: 'POST',
        url: target,
        headers: { 'Content-Type': 'application/json' },
        data: body,
        onload(res) { log(`sent -> ${target}, status: ${res.status}`); },
        onerror(e) { err(`send error to ${target}`, e); }
      });
    }
  } catch (e) {
    err('sendToTargets error', e);
  }
}

/********************* Intercept fetch and XHR (capture target API responses) *********************/
const TARGET_PATH = '/aweme/v1/web/discover/search/';

(function interceptFetch() {
  if (!window.fetch) return;
  const orig = window.fetch.bind(window);
  window.fetch = function (...args) {
    try {
      const resource = args[0];
      const url = (typeof resource === 'string') ? resource : (resource && resource.url) ? resource.url : '';
      if (url && url.includes(TARGET_PATH)) {
        return orig(...args).then((response) => {
          try {
            const cloned = response.clone();
            cloned.json().then((json) => {
              if (json && typeof json === 'object') {
                sendToTargets({ source: 'fetch', url, data: json, ts: Date.now(), cityIndex: currentCityIndex });
              }
            }).catch(() => {});
          } catch (e) { /* ignore */ }
          return response;
        });
      }
    } catch (e) { err('fetch wrapper error', e); }
    return orig(...args);
  };
})();
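// response.clone() is required above because a fetch Response body is a stream
// that can be consumed only once: the clone is parsed for capture while the
// original response is returned untouched to the page's own code.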

(function interceptXHR() {
  const XHR = window.XMLHttpRequest;
  if (!XHR) return;
  const origOpen = XHR.prototype.open;
  const origSend = XHR.prototype.send;

  XHR.prototype.open = function (method, url, ...rest) {
    try { this.__dm_url = (typeof url === 'string') ? url : ''; } catch (e) {}
    return origOpen.apply(this, [method, url, ...rest]);
  };

  XHR.prototype.send = function (body) {
    try {
      const targetUrl = this.__dm_url || '';
      if (targetUrl && targetUrl.includes(TARGET_PATH)) {
        this.addEventListener('readystatechange', function () {
          if (this.readyState === 4) {
            try {
              const text = this.responseText;
              if (!text) return;
              try {
                const json = JSON.parse(text);
                sendToTargets({ source: 'xhr', url: targetUrl, data: json, ts: Date.now(), cityIndex: currentCityIndex });
              } catch (err) {
                // not JSON; ignore
              }
            } catch (e) { /* ignore */ }
          }
        });
      }
    } catch (e) { err('XHR wrapper error', e); }
    return origSend.apply(this, [body]);
  };
})();

/********************* Auto-scroll (single search) *********************/
async function autoScrollUntilStable(statusNode, maxScrolls = MAX_SCROLLS_PER_CITY) {
  let lastHeight = -1;
  let stableCount = 0;
  let scrolls = 0;

  while (!stopFlag) {
    if (skipCurrentCityFlag) {
      statusNode.textContent = 'Skip requested; ending scrolling for the current area.';
      break;
    }

    scrolls++;
    if (scrolls > maxScrolls) {
      statusNode.textContent = `Reached the per-search scroll limit of ${maxScrolls}; stopping auto-scroll.`;
      break;
    }

    // perform the scroll
    try {
      window.scrollBy({ top: SCROLL_BY, left: 0, behavior: 'smooth' });
    } catch (e) {
      window.scrollTo(0, (document.body.scrollHeight || document.documentElement.scrollHeight));
    }

    await sleep(SCROLL_INTERVAL_MS);

    if (skipCurrentCityFlag) {
      statusNode.textContent = 'Skip requested; ending scrolling for the current area.';
      break;
    }

    const curHeight = document.body.scrollHeight || document.documentElement.scrollHeight || 0;
    if (curHeight === lastHeight) {
      stableCount++;
    } else {
      stableCount = 0;
      lastHeight = curHeight;
    }

    statusNode.textContent = `Scrolls: ${scrolls}, stable count: ${stableCount}/${MAX_STABLE_COUNT}`;

    if (stableCount >= MAX_STABLE_COUNT) {
      statusNode.textContent = `Page height stable (${stableCount}); loading for this search is done.`;
      break;
    }
  }
}

/********************* DOM helper: wait for an element to appear *********************/
function waitForSelector(selector, timeout = 10000) {
  const selectors = Array.isArray(selector) ? selector.filter(Boolean) : [selector];
  return new Promise((resolve, reject) => {
    let timer;
    const root = document.documentElement || document.body;

    const cleanup = (observer) => {
      try { observer && observer.disconnect(); } catch (_) {}
      if (timer) clearTimeout(timer);
    };

    const pick = () => {
      for (const sel of selectors) {
        if (!sel) continue;
        try {
          const found = document.querySelector(sel);
          if (found) {
            return found;
          }
        } catch (e) {
          err('query selector error', sel, e);
        }
      }
      return null;
    };

    const immediate = pick();
    if (immediate) {
      return resolve(immediate);
    }

    const observer = new MutationObserver(() => {
      const node = pick();
      if (node) {
        cleanup(observer);
        resolve(node);
      }
    });

    if (root) {
      observer.observe(root, { childList: true, subtree: true });
    }

    timer = setTimeout(() => {
      cleanup(observer);
      reject(new Error('timeout waiting for ' + selectors.join(', ')));
    }, timeout);
  });
}
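// waitForSelector() re-checks only on DOM mutations instead of a polling timer:
// one immediate querySelector pass, then a MutationObserver on the document
// root, with a timeout that disconnects the observer to avoid leaks. A minimal
// usage sketch (the selector values are illustrative, not from the source):
//   const input = await waitForSelector(['input[type="search"]', 'input'], 5000);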

async function ensureSearchControls(statusNode) {
  const isConnected = (node) => {
    if (!node) return false;
    try {
      if (node.isConnected !== undefined) return node.isConnected;
      return document.contains(node);
    } catch (_) {
      return false;
    }
  };

  if (!isConnected(inputEl)) inputEl = null;
  if (!isConnected(btnEl)) btnEl = null;

  if (!inputEl) {
    statusNode && (statusNode.textContent = 'Waiting for the search input to become available...');
    inputEl = await waitForSelector(SEARCH_INPUT_SELECTORS, 10000);
  }

  if (!btnEl) {
    try {
      statusNode && (statusNode.textContent = 'Waiting for the search button to become available...');
      btnEl = await waitForSelector(SEARCH_BTN_SELECTORS, 8000);
      if (btnEl && btnEl.tagName !== 'BUTTON') {
        const maybeButton = btnEl.closest('button');
        if (maybeButton) btnEl = maybeButton;
      }
    } catch (e) {
      btnEl = null;
      err('Search button not found; will trigger searches with the Enter key.');
    }
  }

  if (!inputEl) {
    throw new Error('Could not locate the search input');
  }

  return { inputEl, btnEl };
}

/********************* UI controls (bottom-right corner) *********************/
function createUI() {
  const css = `
    #dm-batch-btn { position: fixed; right: 12px; bottom: 12px; z-index:999999; background: rgba(0,0,0,0.65); color:#fff;
      padding:8px 10px; border-radius:8px; font-size:13px; cursor:pointer; user-select:none;}
    #dm-batch-skip { position: fixed; right:12px; bottom:50px; z-index:999999; background: rgba(30,30,30,0.72); color:#fff;
      padding:7px 10px; border-radius:8px; font-size:12px; cursor:pointer; user-select:none;}
    #dm-batch-status { position: fixed; right:12px; bottom:88px; z-index:999999; background: rgba(0,0,0,0.45); color:#fff;
      padding:6px 8px; border-radius:6px; font-size:12px; max-width:320px; word-break:break-word;}
  `;
  const s = document.createElement('style'); s.textContent = css; document.head && document.head.appendChild(s);

  const btn = document.createElement('div');
  btn.id = 'dm-batch-btn';
  btn.textContent = 'BatchSearch: Stop';
  btn.dataset.running = '1';
  document.body.appendChild(btn);

  const skipBtn = document.createElement('div');
  skipBtn.id = 'dm-batch-skip';
  skipBtn.textContent = 'BatchSearch: Skip current';
  document.body.appendChild(skipBtn);

  const status = document.createElement('div');
  status.id = 'dm-batch-status';
  status.textContent = 'Preparing...';
  document.body.appendChild(status);

  btn.addEventListener('click', () => {
    const running = btn.dataset.running === '1';
    btn.dataset.running = running ? '0' : '1';
    btn.textContent = running ? 'BatchSearch: Stopped' : 'BatchSearch: Stop';
    status.textContent = running ? 'Stopped manually (checkpoint saved)' : 'Started';
    stopFlag = running; // if it was running and got clicked -> set stopFlag true; if restarting, set false
    if (running) {
      skipCurrentCityFlag = false;
      persistProgress(Math.max(currentCityIndex, 0), 'manual_pause');
    }
    if (!stopFlag) {
      // restart loop if needed
      runBatchSearchLoop(status).catch(e => err(e));
    }
  });

  skipBtn.addEventListener('click', () => {
    if (currentCityIndex < 0) {
      status.textContent = 'No city is being processed yet; try skipping later.';
      return;
    }
    skipCurrentCityFlag = true;
    const areaName = getAreaRowName(areaList[currentCityIndex] || {});
    status.textContent = `Skip requested: ${areaName || `index ${currentCityIndex}`}`;
  });

  skipBtn.addEventListener('contextmenu', (event) => {
    event.preventDefault();
    clearProgress();
    currentCityIndex = 0;
    status.textContent = 'Checkpoint cleared. The next run will start from the first area.';
  });

  return { btn, skipBtn, status };
}

/********************* Main flow: fetch cities and loop the searches *********************/
async function runBatchSearchLoop(statusNode) {
  if (isLoopRunning) {
    statusNode.textContent = 'The batch task is already running; do not start it again.';
    return;
  }

  isLoopRunning = true;
  try {
    stopFlag = (document.getElementById('dm-batch-btn') && document.getElementById('dm-batch-btn').dataset.running === '0');
    skipCurrentCityFlag = false;

    if (stopFlag) {
      statusNode.textContent = 'Currently paused; click "BatchSearch: Stop" to resume.';
      return;
    }

    // fetch the area list (only when it is not already in memory)
    if (!areaList || !Array.isArray(areaList) || areaList.length === 0) {
      statusNode.textContent = 'Fetching the city list...';
      try {
        const data = await gmGetJson(AREA_API);
        const normalizedAreaList = Array.isArray(data)
          ? data
          : (data && Array.isArray(data.data) ? data.data : []);

        if (normalizedAreaList.length > 0) {
          areaList = normalizedAreaList;
          log('fetched city list, size:', areaList.length);
          statusNode.textContent = `Fetched ${areaList.length} cities; starting the loop.`;
        } else {
          err('area API returned not array', data);
          statusNode.textContent = 'Failed to fetch the city list (unexpected response format)';
          return;
        }
      } catch (e) {
        err('failed to fetch the city list', e);
        statusNode.textContent = 'Failed to fetch the city list: ' + e.message;
        return;
      }
    }

    currentAreaSignature = buildAreaSignature(areaList);
    const restoredIndexLocal = restoreProgress(currentAreaSignature, areaList.length);
    const restoredIndexRemote = await restoreRemoteProgress(currentAreaSignature, areaList.length);
    const restoredIndex = Math.max(restoredIndexLocal, restoredIndexRemote);
    const startIndex = (currentCityIndex >= 0 && currentCityIndex < areaList.length)
      ? currentCityIndex
      : restoredIndex;
    currentCityIndex = startIndex;

    if (startIndex > 0) {
      statusNode.textContent = `Checkpoint found (local: ${restoredIndexLocal + 1}, remote: ${restoredIndexRemote + 1}); resuming from area ${startIndex + 1}/${areaList.length}.`;
      await sleep(500);
    }

    // wait for the search input and button to become available
    try {
      await ensureSearchControls(statusNode);
    } catch (e) {
      err('search input or button not found', e);
      statusNode.textContent = 'Search input or button not found; the script will keep listening to the API but cannot search automatically.';
      return;
    }

    let completedAll = true;

    // main loop: for each city, search -> scroll -> send results -> next city
    for (let i = startIndex; i < areaList.length; i++) {
      if (stopFlag) {
        completedAll = false;
        persistProgress(i, 'manual_stop');
        statusNode.textContent = 'Stopped (checkpoint saved).';
        break;
      }

      currentCityIndex = i;
      skipCurrentCityFlag = false;
      persistProgress(i, 'start_city');

      const city = (areaList[i].city || areaList[i].province || '').trim();
      if (!city) {
        persistProgress(i + 1, 'empty_city');
        continue;
      }

      const keyword = `${city}律师`;
      statusNode.textContent = `Searching: ${keyword} (${i + 1}/${areaList.length})`;
      log(`starting city [${i + 1}/${areaList.length}] search:`, keyword);

      // put the keyword into the input (firing input events)
      try {
        await ensureSearchControls(statusNode);
      } catch (e) {
        err('failed to refresh the search controls', e);
        statusNode.textContent = 'Failed to refresh the search controls; aborting the batch search.';
        completedAll = false;
        persistProgress(i, 'search_control_error');
        break;
      }

      await simulateSearchInput(keyword);

      const triggered = simulateSearchTrigger();
      if (!triggered) {
        statusNode.textContent = 'Search trigger failed; trying to refresh the controls...';
        btnEl = null;
        await ensureSearchControls(statusNode);
        if (!simulateSearchTrigger()) {
          statusNode.textContent = 'Search trigger failed; aborting the batch search.';
          completedAll = false;
          persistProgress(i, 'search_trigger_error');
          break;
        }
      }

      // wait for the search results to start loading
      await new Promise(r => setTimeout(r, WAIT_AFTER_SEARCH_MS));

      // auto-scroll until the page is stable or the limit is reached
      await autoScrollUntilStable(statusNode, MAX_SCROLLS_PER_CITY);

      if (skipCurrentCityFlag) {
        skipCurrentCityFlag = false;
        persistProgress(i + 1, 'skip_city');
        statusNode.textContent = `Skipped ${keyword}; moving on to the next area...`;
        await sleep(Math.min(DELAY_BETWEEN_CITIES_MS, 800));
        continue;
      }

      if (stopFlag) {
        completedAll = false;
        persistProgress(i, 'manual_stop_after_scroll');
        statusNode.textContent = 'Stopped (checkpoint saved).';
        break;
      }

      persistProgress(i + 1, 'city_done');

      // brief pause before moving on to the next city
      statusNode.textContent = `Finished loading ${keyword}; waiting ${DELAY_BETWEEN_CITIES_MS} ms before continuing...`;
      await sleep(DELAY_BETWEEN_CITIES_MS);
    }

    if (completedAll && !stopFlag) {
      clearProgress();
      currentCityIndex = -1;
      statusNode.textContent = 'Batch search finished; checkpoint progress cleared.';
      log('batch search loop ended: completed');
    } else {
      log('batch search loop ended: paused/broken');
    }
  } catch (e) {
    err('runBatchSearchLoop error', e);
    persistProgress(Math.max(currentCityIndex, 0), 'loop_exception');
  } finally {
    isLoopRunning = false;
  }
}

/********************* Script bootstrap *********************/
(function init() {
  window.addEventListener('beforeunload', () => {
    if (currentCityIndex >= 0) {
      persistProgress(Math.max(currentCityIndex, 0), 'page_unload');
    }
  });

  const ui = createUI();
  ui.status.textContent = 'Ready - pause/skip supported, checkpoints saved automatically (right-click the skip button to clear the checkpoint)';
  console.log(location.pathname);
  // If this is the target page (/jingxuan/search/), start automatically; otherwise the script can still be opened on any page and started manually.
  const isAutoPage = location.pathname && location.pathname.indexOf('/search/') !== -1;
  if (isAutoPage) {
    ui.status.textContent = 'Detected a /jingxuan/search/ page; starting the batch search...';
    // give the page a moment to load the necessary scripts and DOM
    setTimeout(() => {
      runBatchSearchLoop(ui.status).catch(e => err(e));
    }, 800);
  } else {
    // not the target page; the button can still be clicked manually (it starts in the running state and toggles to stopped on click)
    ui.status.textContent = 'Not a /jingxuan/search/ page. Navigate there or start manually.';
  }
})();

})();

Binary file not shown.
@@ -0,0 +1,411 @@
#!/usr/bin/env python3
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib
import json
import os
import re
import subprocess
import sys
import time
import urllib.request
from typing import Dict, List, Optional, Set, Tuple

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db

SITE_NAME = "zhongfali_group80"
LEGACY_DOMAIN = "众法利单页"
START_URL = "http://m.zhongfali.com/pg.jsp?groupId=80&pgt=0&pgs=1"
DEFAULT_OUTPUT = "/www/wwwroot/lawyers/data/one_off_sites/zhongfali_records_all.jsonl"

SOCKS_PROXY = "127.0.0.1:7891"
CLASH_CONTROLLER = os.environ.get("CLASH_CONTROLLER", "http://127.0.0.1:9090")
CLASH_SECRET = os.environ.get("CLASH_SECRET", "")
PHONE_RE = re.compile(r"1[3-9]\d{9}")
INITIAL_STATE_RE = re.compile(r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})</script>", re.S)


class ProxyRotator:
    def __init__(self, controller: str, secret: str):
        self.controller = controller.rstrip("/")
        self.secret = secret.strip()
        self.nodes: List[str] = []
        self.index = 0

    def _api(self, path: str, method: str = "GET", payload: Optional[Dict] = None) -> Dict:
        headers = {}
        if self.secret:
            headers["Authorization"] = f"Bearer {self.secret}"
        body = None
        if payload is not None:
            headers["Content-Type"] = "application/json"
            body = json.dumps(payload).encode("utf-8")
        req = urllib.request.Request(
            f"{self.controller}{path}",
            data=body,
            headers=headers,
            method=method,
        )
        with urllib.request.urlopen(req, timeout=10) as resp:
            raw = resp.read().decode("utf-8", errors="ignore")
        return json.loads(raw) if raw else {}

    def initialize(self) -> None:
        if not self.secret:
            return
        try:
            self._api("/configs", method="PATCH", payload={"mode": "global"})
            proxy_data = self._api("/proxies")
            proxies = proxy_data.get("proxies", {}) or {}
            skip = {
                "GLOBAL",
                "DIRECT",
                "REJECT",
                "REJECT-DROP",
                "PASS",
                "COMPATIBLE",
                "🔰 选择节点",
                "☁️ OneDrive",
                "🐟 漏网之鱼",
                "🎯 全球直连",
                "🛑 拦截广告",
                "🌍 爱奇艺&哔哩哔哩",
                "🎮 Steam 登录/下载",
                "🎮 Steam 商店/社区",
                "🌩️ Cloudflare",
                "🎬 动画疯",
                "🎓学术网站",
                "🇨🇳 国内网站",
            }
            self.nodes = [
                name
                for name, info in proxies.items()
                if name not in skip and isinstance(info, dict)
                and info.get("type") not in {"Selector", "URLTest", "Fallback", "LoadBalance"}
            ]
            if self.nodes:
                self.switch_to(self.nodes[0])
        except Exception as exc:
            print(f"[proxy] rotator init failed: {exc}")
            self.nodes = []

    def switch_to(self, node_name: str) -> None:
        self._api("/proxies/GLOBAL", method="PUT", payload={"name": node_name})

    def rotate(self) -> None:
        if not self.nodes:
            return
        self.index = (self.index + 1) % len(self.nodes)
        node = self.nodes[self.index]
        self.switch_to(node)
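# ProxyRotator drives Clash's external-controller REST API ("/configs",
# "/proxies", "/proxies/GLOBAL"): initialize() switches Clash to global mode,
# lists the proxies, filters out selector/placeholder entries, and rotate()
# round-robins the GLOBAL selector across the remaining nodes. This assumes a
# local Clash instance with the external controller exposed at CLASH_CONTROLLER.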


def normalize_phone(value: str) -> str:
    compact = "".join(ch for ch in str(value or "") if ch.isdigit())
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def fetch_html(
    url: str,
    rotator: Optional[ProxyRotator] = None,
    max_retries: int = 6,
    timeout_seconds: int = 18,
) -> str:
    last_error = ""
    for attempt in range(max_retries):
        cmd = [
            "curl",
            "-sS",
            "--socks5-hostname",
            SOCKS_PROXY,
            "-L",
            "--compressed",
            "--max-time",
            str(timeout_seconds),
            "-w",
            "\n__CODE__:%{http_code}",
            url,
        ]
        proc = subprocess.run(cmd, capture_output=True)
        if proc.returncode == 0:
            raw = proc.stdout.decode("utf-8", errors="ignore")
            marker = "\n__CODE__:"
            split_at = raw.rfind(marker)
            if split_at != -1:
                text = raw[:split_at]
                code_text = raw[split_at + len(marker):].strip()
            else:
                text = raw
                code_text = ""
            code_ok = code_text == "200" if code_text else bool(text)
            if text and code_ok:
                return text
            last_error = "empty body"
        else:
            last_error = proc.stderr.decode("utf-8", errors="ignore").strip() or f"exit={proc.returncode}"
        if rotator and rotator.nodes:
            try:
                rotator.rotate()
            except Exception as exc:
                last_error = f"{last_error}; rotate failed: {exc}"
        if attempt < max_retries - 1:
            time.sleep(0.6 * (attempt + 1))
    raise RuntimeError(f"fetch failed: {url}, reason={last_error}")
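# fetch_html() shells out to curl through the local SOCKS5 proxy and uses the
# `-w "\n__CODE__:%{http_code}"` write-out marker to append the HTTP status to
# stdout; splitting on the *last* occurrence of the marker separates the body
# from the status even if the body itself happens to contain that string.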


def parse_initial_state(html: str) -> Dict:
    match = INITIAL_STATE_RE.search(html)
    if not match:
        raise ValueError("window.__INITIAL_STATE__ not found")
    return json.loads(match.group(1))


def extract_group_urls_from_group80(state: Dict) -> List[str]:
    module = (state.get("currentPageModuleIdMap") or {}).get("21") or {}
    ext_info = module.get("extInfo", {}) or {}
    second_group_map = ext_info.get("secondGroupMap", {}) or {}
    rows = second_group_map.get("80") or []

    urls: Set[str] = set()
    for row in rows:
        url = str(row.get("url") or "").strip()
        if url:
            urls.add(url)
        for city in row.get("thirdGroupList") or []:
            city_url = str(city.get("url") or "").strip()
            if city_url:
                urls.add(city_url)
    return sorted(urls)


def extract_detail_urls_from_group_html(html: str) -> Set[str]:
    detail_ids = set(re.findall(r"h-pd-(\d+)\.html", html))
    return {f"http://m.zhongfali.com/h-pd-{pid}.html" for pid in detail_ids}


def parse_location_and_name(product_name: str) -> Tuple[str, str, str]:
    text = re.sub(r"\s+", " ", str(product_name or "")).strip()
    province = ""
    city = ""
    name = ""

    province_match = re.search(r"([\u4e00-\u9fa5]{2,}省)", text)
    if province_match:
        province = province_match.group(1)

    city_match = re.search(r"(?:[\u4e00-\u9fa5]+省)?([\u4e00-\u9fa5]+(?:市|区|县|州|盟))", text)
    if city_match:
        city = city_match.group(1)

    name_match = re.search(r"([\u4e00-\u9fa5]{2,4})\s*律师", text)
    if name_match:
        name = name_match.group(1)

    return province, city, name


def parse_detail_record(detail_url: str, html: str, source_list_url: str) -> Optional[Dict]:
    state = parse_initial_state(html)

    module = None
    for mod in (state.get("currentPageModuleIdMap") or {}).values():
        if isinstance(mod, dict) and (mod.get("extInfo") or {}).get("productInfo"):
            module = mod
            break
    if not module:
        return None

    ext_info = module.get("extInfo", {}) or {}
    product_info = ext_info.get("productInfo", {}) or {}

    phone = normalize_phone(product_info.get("material", ""))
    if not phone:
        return None

    product_name = str(product_info.get("name") or "").strip()
    province, city, lawyer_name = parse_location_and_name(product_name)
    law_firm = str(product_info.get("prop0") or "").strip()

    if not lawyer_name:
        lawyer_name = product_name

    now = int(time.time())
    record_id = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
    return {
        "record_id": record_id,
        "collected_at": now,
        "source": {
            "site": SITE_NAME,
            "list_url": source_list_url,
            "detail_url": detail_url,
            "province": province,
            "province_py": "",
            "city": city,
            "city_py": "",
            "page": 1,
        },
        "list_snapshot": {
            "name": lawyer_name,
            "law_firm": law_firm,
            "specialties": [],
            "answer_count": None,
        },
        "profile": {
            "name": lawyer_name,
            "law_firm": law_firm,
            "phone": phone,
            "license_no": str(product_info.get("prop1") or "").strip(),
            "practice_years": None,
            "email": "",
            "address": str(product_info.get("prop3") or "").strip(),
            "specialties": [],
        },
        "raw": {
            "product_name": product_name,
            "group_ids": product_info.get("groupIdList") or [],
        },
    }


def to_legacy_row(record: Dict) -> Optional[Dict[str, str]]:
    profile = record.get("profile", {}) or {}
    source = record.get("source", {}) or {}
    phone = normalize_phone(profile.get("phone", ""))
    if not phone:
        return None

    province = str(source.get("province") or "").strip()
    city = str(source.get("city") or province).strip()
    return {
        "name": str(profile.get("name") or "").strip(),
        "law_firm": str(profile.get("law_firm") or "").strip(),
        "province": province,
        "city": city,
        "phone": phone,
        "url": str(source.get("detail_url") or "").strip(),
        "domain": LEGACY_DOMAIN,
        "create_time": int(record.get("collected_at") or time.time()),
        "params": json.dumps(record, ensure_ascii=False),
    }


def delete_old_domain_data(db: Db, domain: str) -> int:
    cur = db.db.cursor()
    try:
        cur.execute("DELETE FROM lawyer WHERE domain=%s", (domain,))
        affected = cur.rowcount
        db.db.commit()
        return affected
    finally:
        cur.close()


def write_records_to_db(db: Db, records: List[Dict]) -> int:
    inserted = 0
    for record in records:
        row = to_legacy_row(record)
        if not row:
            continue
        try:
            db.insert_data("lawyer", row)
            inserted += 1
        except Exception as exc:
            print(f"[db] insert failed phone={row.get('phone', '')}: {exc}")
    return inserted


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="众法利 groupId=80 basic-field scraper (name/phone/region)")
    parser.add_argument("--start-url", default=START_URL, help="entry group page URL")
    parser.add_argument("--output", default=DEFAULT_OUTPUT, help="JSONL output path")
    parser.add_argument("--no-db", action="store_true", help="write JSON only, skip the DB")
    parser.add_argument("--no-reset", action="store_true", help="do not purge old rows for this domain")
    parser.add_argument("--workers", type=int, default=16, help="number of concurrent detail-page workers")
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    rotator = ProxyRotator(CLASH_CONTROLLER, CLASH_SECRET)
    rotator.initialize()
    if rotator.nodes:
        print(f"[proxy] rotator enabled, nodes={len(rotator.nodes)}")
    else:
        print("[proxy] rotator disabled, using current proxy route")

    start_retries = max(8, len(rotator.nodes) + 2) if rotator.nodes else 8
    group_html = fetch_html(args.start_url, rotator=rotator, max_retries=start_retries)
    group_state = parse_initial_state(group_html)
    group_urls = extract_group_urls_from_group80(group_state)
    print(f"[group] found group urls: {len(group_urls)}")

    detail_url_to_source: Dict[str, str] = {}
    for idx, rel_url in enumerate(group_urls, start=1):
        list_url = f"http://m.zhongfali.com/{rel_url.lstrip('/')}"
        try:
            html = fetch_html(list_url, rotator=rotator, max_retries=4, timeout_seconds=12)
            detail_urls = extract_detail_urls_from_group_html(html)
        except Exception as exc:
            print(f"[group] failed {list_url}: {exc}")
            continue

        for detail_url in detail_urls:
            detail_url_to_source.setdefault(detail_url, list_url)
        if idx % 10 == 0:
            print(f"[group] {idx}/{len(group_urls)} detail_urls={len(detail_url_to_source)}")

    records: List[Dict] = []
    seen_phones: Set[str] = set()
    detail_urls = sorted(detail_url_to_source.keys())
    print(f"[detail] total detail urls: {len(detail_urls)}")

    def process_detail(detail_url: str) -> Optional[Dict]:
        try:
            html = fetch_html(detail_url, rotator=rotator, max_retries=2, timeout_seconds=8)
            record = parse_detail_record(detail_url, html, detail_url_to_source[detail_url])
            return record
        except Exception as exc:
            print(f"[detail] failed {detail_url}: {exc}")
            return None

    done = 0
    with ThreadPoolExecutor(max_workers=max(1, int(args.workers))) as executor:
        futures = [executor.submit(process_detail, detail_url) for detail_url in detail_urls]
        for future in as_completed(futures):
            done += 1
            record = future.result()
            if record:
                phone = normalize_phone((record.get("profile", {}) or {}).get("phone", ""))
                if phone and phone not in seen_phones:
                    seen_phones.add(phone)
                    records.append(record)
            if done % 50 == 0:
                print(f"[detail] {done}/{len(detail_urls)} valid_records={len(records)}")

    with open(args.output, "w", encoding="utf-8") as out:
        for record in records:
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    deleted = 0
    inserted = 0
    if not args.no_db:
        with Db() as db:
            if not args.no_reset:
                deleted = delete_old_domain_data(db, LEGACY_DOMAIN)
            inserted = write_records_to_db(db, records)

    print(
        f"[done] records={len(records)}, db_deleted={deleted}, db_inserted={inserted}, output={args.output}"
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,501 @@
#!/usr/bin/env python3
import argparse
import hashlib
import json
import os
import re
import sys
import time
from typing import Dict, List, Optional, Set, Tuple

import urllib3

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db
from request.requests_client import RequestClientError, RequestsClient

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SITE_NAME = "zhongfali_single"
LEGACY_DOMAIN = "众法利单页"
DEFAULT_URL = "http://m.zhongfali.com/h-pd-552.html#mid=3&groupId=196&desc=false"
DEFAULT_OUTPUT = "/www/wwwroot/lawyers/data/one_off_sites/zhongfali_records_all.jsonl"

PHONE_RE = re.compile(r"1[3-9]\d{9}")
INITIAL_STATE_RE = re.compile(r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})</script>", re.S)


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def split_specialties(text: str) -> List[str]:
    source = (text or "").strip()
    if not source:
        return []
    parts = [item.strip() for item in re.split(r"[、,,;;\s]+", source) if item.strip()]
    seen: Set[str] = set()
    result: List[str] = []
    for item in parts:
        if item in seen:
            continue
        seen.add(item)
        result.append(item)
    return result


def strip_html(text: str) -> str:
    cleaned = re.sub(r"<[^>]+>", " ", text or "")
    cleaned = cleaned.replace("&nbsp;", " ")
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip()


def extract_specialties_from_remark(remark: str) -> List[str]:
    plain = strip_html(remark)
    if not plain:
        return []

    match = re.search(r"专业领域[::]\s*([^。;]+)", plain)
    if match:
        return split_specialties(match.group(1))
    return []


def value_at(values: List[str], index: int) -> str:
    if index < 0 or index >= len(values):
        return ""
    return str(values[index] or "").strip()


def parse_initial_state(html: str) -> Dict:
    match = INITIAL_STATE_RE.search(html)
    if not match:
        raise ValueError("window.__INITIAL_STATE__ not found")
    return json.loads(match.group(1))


def extract_location_and_name(product_name: str) -> Tuple[str, str, str]:
    text = re.sub(r"\s+", " ", product_name or "").strip()
    province = ""
    city = ""
    lawyer_name = ""

    province_match = re.search(r"([\u4e00-\u9fa5]{2,}省)", text)
    city_match = re.search(r"(?:[\u4e00-\u9fa5]+省)?([\u4e00-\u9fa5]+市)", text)
    name_match = re.search(r"([\u4e00-\u9fa5]{2,4})\s*律师", text)

    if province_match:
        province = province_match.group(1)
    if city_match:
        city = city_match.group(1)
    if name_match:
        lawyer_name = name_match.group(1)

    return province, city, lawyer_name


def pick_product_module(state: Dict) -> Optional[Dict]:
    module_map = state.get("currentPageModuleIdMap", {}) or {}
    page_ids = state.get("currentPageModuleIds", []) or []

    for module_id in page_ids:
        module = module_map.get(str(module_id)) or module_map.get(module_id)
        if not isinstance(module, dict):
            continue
        ext_info = module.get("extInfo", {}) or {}
        if ext_info.get("productInfo"):
            return module

    for module in module_map.values():
        if not isinstance(module, dict):
            continue
        ext_info = module.get("extInfo", {}) or {}
        if ext_info.get("productInfo"):
            return module

    return None


def parse_group_id_from_url(url: str) -> int:
    match = re.search(r"(?:[?&#]|^)groupId=(\d+)", url)
    if not match:
        return 0
    try:
        return int(match.group(1))
    except ValueError:
        return 0


def extract_records(url: str, state: Dict) -> List[Dict]:
    module = pick_product_module(state)
    if not module:
        return []

    ext_info = module.get("extInfo", {}) or {}
    product_info = ext_info.get("productInfo", {}) or {}
    product_name = str(product_info.get("name") or "").strip()

    province, city, current_name = extract_location_and_name(product_name)
    group_id = product_info.get("groupId")
    if not group_id:
        group_id = parse_group_id_from_url(url)
    module_id = module.get("id")

    prop_map: Dict[str, List[str]] = {}
    for prop in ext_info.get("propList", []) or []:
        name = str(prop.get("name") or "").strip()
        values = [str(item or "").strip() for item in (prop.get("valueList") or [])]
        if name:
            prop_map[name] = values

    result: List[Dict] = []
    seen_phones: Set[str] = set()
    now = int(time.time())

    phone_values = prop_map.get("电话", [])
    for idx, raw_phone in enumerate(phone_values):
        phone = normalize_phone(raw_phone)
        if not phone or phone in seen_phones:
            continue
        seen_phones.add(phone)

        law_firm = value_at(prop_map.get("律师所", []), idx)
        area = value_at(prop_map.get("所在地区", []), idx)
        direction = value_at(prop_map.get("主攻方向", []), idx)
        specialty_text = value_at(prop_map.get("专业特长", []), idx)
        license_no = value_at(prop_map.get("执业证号", []), idx)
        address = value_at(prop_map.get("地址", []), idx)
        email = value_at(prop_map.get("电子邮箱", []), idx)
        seat_phone = value_at(prop_map.get("座机", []), idx)
        wechat = value_at(prop_map.get("微信", []), idx)
        qq = value_at(prop_map.get("QQ", []), idx)
        first_practice_date = value_at(prop_map.get("首次执业日期", []), idx)

        specialties = split_specialties(direction)
        if not specialties:
            specialties = split_specialties(specialty_text)

        record = {
            "record_id": hashlib.md5(f"{url}|{phone}".encode("utf-8")).hexdigest(),
            "collected_at": now,
            "source": {
                "site": SITE_NAME,
                "list_url": url,
                "detail_url": "",
                "province": province,
                "province_py": "",
                "city": area or city,
                "city_py": "",
                "page": 1,
                "group_id": group_id,
                "module_id": module_id,
                "detail_url_status": "unresolved_from_pool",
            },
            "list_snapshot": {
                "name": "",
                "law_firm": law_firm,
                "specialties": specialties,
                "answer_count": None,
            },
            "profile": {
                "name": "",
                "law_firm": law_firm,
                "phone": phone,
                "license_no": license_no,
                "practice_years": None,
                "email": email,
                "address": address,
                "specialties": specialties,
            },
            "raw": {
                "source_index": idx,
                "direction": direction,
                "specialty_text": specialty_text,
                "seat_phone": seat_phone,
                "wechat": wechat,
                "qq": qq,
                "first_practice_date": first_practice_date,
            },
        }
        result.append(record)

    current_phone = normalize_phone(str(product_info.get("material") or ""))
    if current_phone and current_phone not in seen_phones:
        seen_phones.add(current_phone)
        remark = str(product_info.get("remark") or "")
        specialties = extract_specialties_from_remark(remark)
        result.append(
            {
                "record_id": hashlib.md5(f"{url}|{current_phone}".encode("utf-8")).hexdigest(),
                "collected_at": now,
                "source": {
                    "site": SITE_NAME,
                    "list_url": url,
                    "detail_url": url,
                    "province": province,
                    "province_py": "",
                    "city": city,
                    "city_py": "",
                    "page": 1,
                    "group_id": group_id,
                    "module_id": module_id,
                },
                "list_snapshot": {
                    "name": current_name,
                    "law_firm": str(product_info.get("prop0") or "").strip(),
                    "specialties": specialties,
                    "answer_count": None,
                },
                "profile": {
                    "name": current_name,
                    "law_firm": str(product_info.get("prop0") or "").strip(),
                    "phone": current_phone,
                    "license_no": str(product_info.get("prop1") or "").strip(),
                    "practice_years": None,
                    "email": "",
                    "address": str(product_info.get("prop3") or "").strip(),
                    "specialties": specialties,
                },
                "raw": {
                    "from_product_info": True,
                    "product_name": product_name,
                    "remark": remark,
                },
            }
        )

    return result


def to_legacy_row(record: Dict) -> Optional[Dict[str, str]]:
    source = record.get("source", {}) or {}
    profile = record.get("profile", {}) or {}
    phone = normalize_phone(profile.get("phone", ""))
    if not phone:
        return None

    province = str(source.get("province") or "").strip()
    city = str(source.get("city") or province).strip()
    return {
        "name": str(profile.get("name") or "").strip(),
        "law_firm": str(profile.get("law_firm") or "").strip(),
        "province": province,
        "city": city,
        "phone": phone,
        "url": str(source.get("detail_url") or "").strip(),
        "domain": LEGACY_DOMAIN,
        "create_time": int(record.get("collected_at") or time.time()),
        "params": json.dumps(record, ensure_ascii=False),
    }


def existing_phones_in_db(db: Db, phones: List[str]) -> Set[str]:
    deduped = sorted({phone for phone in phones if phone})
    if not deduped:
        return set()

    existing: Set[str] = set()
    cur = db.db.cursor()
    try:
        chunk_size = 500
        for i in range(0, len(deduped), chunk_size):
            chunk = deduped[i:i + chunk_size]
            placeholders = ",".join(["%s"] * len(chunk))
            sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
            cur.execute(sql, [LEGACY_DOMAIN, *chunk])
            for row in cur.fetchall():
                existing.add(row[0])
    finally:
        cur.close()
    return existing
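# existing_phones_in_db() batches the IN (...) lookups 500 phones at a time so a
# large dedup set never produces an oversized SQL statement, while keeping every
# value parameterized (placeholders only; no string interpolation of data).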


def write_records_to_db(db: Db, records: List[Dict]) -> Tuple[int, int]:
    rows: List[Dict[str, str]] = []
    for record in records:
        row = to_legacy_row(record)
        if row:
            rows.append(row)
    if not rows:
        return 0, 0

    existing = existing_phones_in_db(db, [row["phone"] for row in rows])
    inserted = 0
    skipped = 0

    for row in rows:
        phone = row.get("phone", "")
        if not phone or phone in existing:
            skipped += 1
            continue
        try:
            db.insert_data("lawyer", row)
            existing.add(phone)
            inserted += 1
        except Exception as exc:
            skipped += 1
            print(f"[db] insert failed phone={phone}: {exc}")
    return inserted, skipped


def lookup_name_map_from_db(db: Db, phones: List[str]) -> Dict[str, str]:
    deduped = sorted({phone for phone in phones if phone})
    if not deduped:
        return {}

    name_map: Dict[str, str] = {}
    cur = db.db.cursor()
    try:
        chunk_size = 500
        for i in range(0, len(deduped), chunk_size):
            chunk = deduped[i:i + chunk_size]
            placeholders = ",".join(["%s"] * len(chunk))
            sql = (
                "SELECT phone, name, create_time FROM lawyer "
                f"WHERE phone IN ({placeholders}) AND name<>'' "
                "ORDER BY create_time DESC"
            )
            cur.execute(sql, chunk)
            for phone, name, _ in cur.fetchall():
                if phone not in name_map and name:
                    name_map[phone] = str(name).strip()
    finally:
        cur.close()
    return name_map


def apply_name_backfill(records: List[Dict], name_map: Dict[str, str]) -> int:
    updated = 0
    if not name_map:
        return updated

    for record in records:
        profile = record.get("profile", {}) or {}
        list_snapshot = record.get("list_snapshot", {}) or {}
        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            continue

        backfill_name = name_map.get(phone, "")
        if not backfill_name:
            continue

        current_name = str(profile.get("name") or "").strip()
        if current_name:
            continue

        profile["name"] = backfill_name
        list_snapshot["name"] = backfill_name
        record["profile"] = profile
        record["list_snapshot"] = list_snapshot
        updated += 1

    return updated


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="众法利 single-page lawyer phone scraper")
    parser.add_argument("--url", default=DEFAULT_URL, help="detail page URL")
    parser.add_argument("--output", default=DEFAULT_OUTPUT, help="output JSONL file path")
    parser.add_argument("--direct", action="store_true", help="direct mode, do not use a proxy")
    parser.add_argument("--no-db", action="store_true", help="write JSON only, do not write to the database")
    parser.add_argument("--skip-name-backfill", action="store_true", help="skip backfilling names by phone number")
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    client = RequestsClient(
        headers={
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
        },
        use_proxy=not args.direct,
        retry_total=2,
        retry_backoff_factor=1,
        retry_status_forcelist=(429, 500, 502, 503, 504),
        retry_allowed_methods=("GET",),
    )

    try:
        resp = client.get_text(args.url, timeout=30, verify=False)
        if resp.status_code >= 400:
            raise RequestClientError(f"{resp.status_code} Error: {args.url}")
        state = parse_initial_state(resp.text)
        records = extract_records(args.url, state)
    finally:
        client.close()

    if not records:
        print("[done] no valid phone numbers collected")
        return

    seen_ids: Set[str] = set()
    if os.path.exists(args.output):
        with open(args.output, "r", encoding="utf-8") as old_file:
            for line in old_file:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                except Exception:
                    continue
                record_id = item.get("record_id")
                if record_id:
                    seen_ids.add(record_id)

    json_new = 0
    with open(args.output, "a", encoding="utf-8") as out:
        for record in records:
            record_id = record["record_id"]
            if record_id in seen_ids:
                continue
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
            seen_ids.add(record_id)
            json_new += 1

    db_new = 0
    db_skip = 0
    name_backfill_count = 0
    if not args.skip_name_backfill:
        try:
            with Db() as db:
                name_map = lookup_name_map_from_db(
                    db,
                    [normalize_phone((record.get("profile", {}) or {}).get("phone", "")) for record in records],
                )
                name_backfill_count = apply_name_backfill(records, name_map)
        except Exception as exc:
            print(f"[name-backfill] skipped, lookup failed: {exc}")

    if not args.no_db:
        with Db() as db:
            db_new, db_skip = write_records_to_db(db, records)

    print(
        f"[done] collected={len(records)}, names_backfilled={name_backfill_count}, json_new={json_new}, "
        f"db_new={db_new}, db_skipped={db_skip}, output: {args.output}"
    )


if __name__ == "__main__":
    main()
+1
-19
@@ -1,19 +1 @@
from request.requests_client import (
    RequestClientError,
    RequestConnectTimeout,
    RequestConnectionError,
    RequestSSLError,
    RequestTimeout,
    RequestsClient,
    ResponseData,
)

__all__ = [
    "RequestsClient",
    "ResponseData",
    "RequestClientError",
    "RequestConnectTimeout",
    "RequestTimeout",
    "RequestConnectionError",
    "RequestSSLError",
]
# Package marker for request utilities.

+42
-2
@@ -24,6 +24,19 @@ def _normalize_bool(value, default: bool = True) -> bool:
    return text not in ("0", "false", "no", "off", "")


def _env_proxy_override() -> Optional[bool]:
    """
    Environment-variable override for the proxy switch:
    - PROXY_ENABLED unset: return None (no override; proxy_settings.json is still read)
    - PROXY_ENABLED=0/false/off: force the proxy off
    - PROXY_ENABLED=1/true/on: force the proxy on (provided the config fields are complete)
    """
    raw = os.getenv("PROXY_ENABLED")
    if raw is None:
        return None
    return _normalize_bool(raw, True)
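# Example invocations (assumed shell usage; the script path is illustrative):
#   PROXY_ENABLED=0 python common_sites/some_site.py   # force direct connection
#   PROXY_ENABLED=1 python common_sites/some_site.py   # force the configured proxy
#   python common_sites/some_site.py                   # follow proxy_settings.json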


def _load_config() -> Dict[str, str]:
    if not os.path.exists(CONFIG_PATH):
        return dict(DEFAULT_CONFIG)
@@ -48,7 +61,12 @@ def report_proxy_status() -> None:
    _PROXY_STATUS_REPORTED = True

    config = _load_config()
    enabled = _normalize_bool(config.get("enabled"), True)
    override = _env_proxy_override()
    if override is False:
        print("[proxy] disabled by env (PROXY_ENABLED=0)")
        return

    enabled = _normalize_bool(config.get("enabled"), True) if override is None else True
    if not enabled:
        print("[proxy] disabled by config")
        return
@@ -66,7 +84,10 @@ def get_proxies() -> Optional[Dict[str, str]]:
    The proxy configuration is read from proxy_settings.json and does not depend on environment variables.
    """
    config = _load_config()
    if not _normalize_bool(config.get("enabled"), True):
    override = _env_proxy_override()
    if override is False:
        return None
    if override is None and not _normalize_bool(config.get("enabled"), True):
        return None

    tunnel = str(config.get("tunnel") or "").strip()
@@ -95,3 +116,22 @@ def apply_proxy(session) -> Optional[Dict[str, str]]:


__all__ = ["get_proxies", "apply_proxy", "report_proxy_status"]


def is_proxy_enabled() -> bool:
    """
    Report whether the current process has the proxy enabled.

    The PROXY_ENABLED environment variable takes precedence;
    when it is unset, fall back to the `enabled` field in proxy_settings.json.
    """
    config = _load_config()
    override = _env_proxy_override()
    if override is False:
        return False
    if override is True:
        return True
    return _normalize_bool(config.get("enabled"), True)


__all__ = ["get_proxies", "apply_proxy", "report_proxy_status", "is_proxy_enabled"]

+14
-2
@@ -1,6 +1,18 @@
# Database drivers
pymysql>=1.0.2
pymongo>=4.0.0

# Scheduler
schedule>=1.2.0

# Other dependencies that may be needed
requests>=2.28.0
beautifulsoup4>=4.11.0
urllib3>=1.26.0
lxml>=4.9.0
openpyxl>=3.1.0
redis>=4.0.0
pyppeteer>=1.0.2
# Optional: improves anti-detection
pyppeteer-stealth>=2.7.4

# Logging
python-dateutil>=2.8.2

@@ -0,0 +1,849 @@
import json
import os
import re
import sys
import threading
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import parse_qs, urlparse

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db


AREA_TABLE = os.getenv("AREA_TARGET_TABLE", "area_new")
AREA_DOMAIN = os.getenv("AREA_DOMAIN", "maxlaw")
DOUYIN_DOMAIN = os.getenv("DOUYIN_DOMAIN", "抖音")
DOUYIN_RAW_DIR = os.getenv("DOUYIN_RAW_DIR", os.path.join(project_root, "data", "douyin_raw"))
DOUYIN_SAVE_ONLY_ENV = os.getenv("DOUYIN_SAVE_ONLY", "1")
LAWYER_KEYWORDS_ENV = os.getenv("DOUYIN_LAWYER_KEYWORDS", "律师,律所")
PROGRESS_TABLE = os.getenv("LAYER_PROGRESS_TABLE", "layer_progress")
PROGRESS_DEFAULT_KEY = os.getenv("LAYER_PROGRESS_DEFAULT_KEY", "douyin_batch_default")
SERVICE_HOST = os.getenv("AREA_SERVICE_HOST", "0.0.0.0")
SERVICE_PORT = int(os.getenv("AREA_SERVICE_PORT", "9002"))

PHONE_REGEX = re.compile(r"(?:\+?86[-\s]?)?(1[3-9]\d{9})")
WX_CONTEXT_REGEX = re.compile(r"(?i)(?:微信|微.?信|wx|vx|weixin|v信|v号|v)\s*[::/\-\s]\s*([a-zA-Z0-9._-]{3,40})")
LAW_FIRM_REGEX = re.compile(r"([\u4e00-\u9fa5A-Za-z·]{2,40}律师事务所)")
RAW_WRITE_LOCK = threading.Lock()

LAWYER_KEYWORDS: Tuple[str, ...] = tuple(
    keyword.strip() for keyword in LAWYER_KEYWORDS_ENV.split(",") if keyword.strip()
)


def _is_safe_table_name(table_name: str) -> bool:
    return bool(re.fullmatch(r"[A-Za-z0-9_]+", table_name or ""))


def _parse_int(value: Any, default: int = 0) -> int:
    try:
        return int(str(value).strip())
    except Exception:
        return default


def _parse_bool(value: Any, default: bool = False) -> bool:
    if value is None:
        return default
    text = str(value).strip().lower()
    if text in {"1", "true", "yes", "y", "on"}:
        return True
    if text in {"0", "false", "no", "n", "off"}:
        return False
    return default


def _first_param(params: Dict[str, List[str]], key: str, default: str = "") -> str:
    values = params.get(key) or []
    if not values:
        return default
    return values[0]


def _append_jsonl(file_path: str, payload: Dict[str, Any]) -> None:
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
    line = json.dumps(payload, ensure_ascii=False)
    with RAW_WRITE_LOCK:
        with open(file_path, "a", encoding="utf-8") as out:
            out.write(line)
            out.write("\n")
|
||||
|
||||
|
||||
def _save_raw_index_payload(payload: Dict[str, Any], query: Dict[str, List[str]], client_ip: str) -> str:
|
||||
now_ts = int(time.time())
|
||||
day = time.strftime("%Y%m%d", time.localtime(now_ts))
|
||||
file_path = os.path.join(DOUYIN_RAW_DIR, f"douyin_index_{day}.jsonl")
|
||||
|
||||
wrapped = {
|
||||
"received_at": now_ts,
|
||||
"client_ip": client_ip,
|
||||
"query": query,
|
||||
"payload": payload,
|
||||
}
|
||||
_append_jsonl(file_path, wrapped)
|
||||
return file_path
|
||||
|
||||
|
||||
def _ensure_progress_table() -> None:
    if not _is_safe_table_name(PROGRESS_TABLE):
        raise ValueError("非法进度表名")

    with Db() as db:
        cursor = db.db.cursor()
        sql = f"""
        CREATE TABLE IF NOT EXISTS `{PROGRESS_TABLE}` (
            `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
            `progress_key` varchar(128) NOT NULL,
            `next_city_index` int(11) DEFAULT 0,
            `area_signature` varchar(128) DEFAULT NULL,
            `area_total` int(11) DEFAULT 0,
            `current_city` varchar(128) DEFAULT NULL,
            `reason` varchar(64) DEFAULT NULL,
            `status` varchar(32) DEFAULT NULL,
            `device_id` varchar(128) DEFAULT NULL,
            `extra_json` longtext,
            `updated_at` bigint(20) DEFAULT NULL,
            `create_time` bigint(20) DEFAULT NULL,
            PRIMARY KEY (`id`),
            UNIQUE KEY `uk_progress_key` (`progress_key`),
            KEY `idx_updated_at` (`updated_at`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """
        cursor.execute(sql)
        db.db.commit()
        cursor.close()


def _get_progress(progress_key: str) -> Optional[Dict[str, Any]]:
    key = str(progress_key or "").strip()
    if not key:
        return None

    _ensure_progress_table()
    with Db() as db:
        cursor = db.db.cursor()
        sql = (
            f"SELECT progress_key, next_city_index, area_signature, area_total, current_city, "
            f"reason, status, device_id, extra_json, updated_at, create_time "
            f"FROM `{PROGRESS_TABLE}` WHERE progress_key=%s LIMIT 1"
        )
        cursor.execute(sql, (key,))
        row = cursor.fetchone()
        cursor.close()

    if not row:
        return None

    extra_json = row[8] or ""
    extra_obj: Any = {}
    if extra_json:
        try:
            extra_obj = json.loads(extra_json)
        except Exception:
            extra_obj = extra_json

    return {
        "progress_key": row[0] or "",
        "next_city_index": _parse_int(row[1], 0),
        "area_signature": row[2] or "",
        "area_total": _parse_int(row[3], 0),
        "current_city": row[4] or "",
        "reason": row[5] or "",
        "status": row[6] or "",
        "device_id": row[7] or "",
        "extra": extra_obj,
        "updated_at": _parse_int(row[9], 0),
        "create_time": _parse_int(row[10], 0),
    }


def _upsert_progress(progress_key: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    key = str(progress_key or "").strip()
    if not key:
        raise ValueError("progress_key 不能为空")

    _ensure_progress_table()
    now_ts = int(time.time())
    next_city_index = _parse_int(payload.get("next_city_index"), 0)
    area_signature = str(payload.get("area_signature") or "").strip()
    area_total = _parse_int(payload.get("area_total"), 0)
    current_city = str(payload.get("current_city") or "").strip()
    reason = str(payload.get("reason") or "").strip()
    status = str(payload.get("status") or "").strip()
    device_id = str(payload.get("device_id") or "").strip()
    extra = payload.get("extra")
    if extra is None:
        extra = payload.get("extra_json")

    if isinstance(extra, str):
        extra_json = extra
    else:
        try:
            extra_json = json.dumps(extra or {}, ensure_ascii=False)
        except Exception:
            extra_json = "{}"

    with Db() as db:
        cursor = db.db.cursor()
        sql = (
            f"INSERT INTO `{PROGRESS_TABLE}` "
            "(progress_key, next_city_index, area_signature, area_total, current_city, reason, status, "
            "device_id, extra_json, updated_at, create_time) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
            "ON DUPLICATE KEY UPDATE "
            "next_city_index=VALUES(next_city_index), "
            "area_signature=VALUES(area_signature), "
            "area_total=VALUES(area_total), "
            "current_city=VALUES(current_city), "
            "reason=VALUES(reason), "
            "status=VALUES(status), "
            "device_id=VALUES(device_id), "
            "extra_json=VALUES(extra_json), "
            "updated_at=VALUES(updated_at)"
        )
        cursor.execute(
            sql,
            (
                key,
                next_city_index,
                area_signature,
                area_total,
                current_city,
                reason,
                status,
                device_id,
                extra_json,
                now_ts,
                now_ts,
            ),
        )
        db.db.commit()
        cursor.close()

    return _get_progress(key) or {}


def _clear_progress(progress_key: str) -> int:
    key = str(progress_key or "").strip()
    if not key:
        return 0

    _ensure_progress_table()
    with Db() as db:
        cursor = db.db.cursor()
        sql = f"DELETE FROM `{PROGRESS_TABLE}` WHERE progress_key=%s"
        cursor.execute(sql, (key,))
        affected = cursor.rowcount
        db.db.commit()
        cursor.close()
    return affected
def _query_area_data(table_name: str, domain: str) -> List[Dict[str, Any]]:
    if not _is_safe_table_name(table_name):
        raise ValueError("非法表名")

    with Db() as db:
        cursor = db.db.cursor()
        sql = (
            f"SELECT province, city, name, pid, pinyin, code, domain, level, create_time "
            f"FROM `{table_name}` WHERE domain=%s ORDER BY id ASC"
        )
        cursor.execute(sql, (domain,))
        rows = cursor.fetchall()
        cursor.close()

    result: List[Dict[str, Any]] = []
    for row in rows:
        result.append(
            {
                "province": row[0] or "",
                "city": row[1] or "",
                "name": row[2] or "",
                "pid": row[3] if row[3] is not None else 0,
                "pinyin": row[4] or "",
                "code": row[5] or "",
                "domain": row[6] or "",
                "level": row[7] if row[7] is not None else 0,
                "create_time": row[8] if row[8] is not None else 0,
            }
        )
    return result


def _iter_dict_nodes(value: Any) -> Iterable[Dict[str, Any]]:
    stack: List[Any] = [value]
    while stack:
        current = stack.pop()
        if isinstance(current, dict):
            yield current
            stack.extend(current.values())
        elif isinstance(current, list):
            stack.extend(current)


def _extract_phones_from_text(text: str) -> List[str]:
    phones: List[str] = []
    seen: Set[str] = set()
    for match in PHONE_REGEX.finditer(text or ""):
        phone = match.group(1)
        if not phone or phone in seen:
            continue
        seen.add(phone)
        phones.append(phone)
    return phones


def _extract_phones_from_user_info(user_info: Dict[str, Any]) -> List[str]:
    signature = str(user_info.get("signature") or "")
    unique_id = str(user_info.get("unique_id") or "")
    versatile = str(user_info.get("versatile_display") or "")

    # 1) Prefer phone numbers matched directly from the bio
    phones = set(_extract_phones_from_text(signature))
    if phones:
        return sorted(phones)

    # 2) Extract from WeChat-style markers, then fall back to the Douyin ID fields
    for text in (signature, unique_id, versatile):
        for match in WX_CONTEXT_REGEX.finditer(text):
            wx_value = match.group(1) or ""
            for phone in _extract_phones_from_text(wx_value):
                phones.add(phone)

    for text in (unique_id, versatile):
        for phone in _extract_phones_from_text(text):
            phones.add(phone)

    return sorted(phones)


def _extract_law_firm_from_user_info(user_info: Dict[str, Any]) -> str:
    candidates: List[str] = []

    signature = str(user_info.get("signature") or "")
    if signature:
        candidates.append(signature)

    verify_reason = str(user_info.get("enterprise_verify_reason") or "")
    if verify_reason:
        candidates.append(verify_reason)

    cert_text = ""
    account_cert_info = user_info.get("account_cert_info")
    if isinstance(account_cert_info, str) and account_cert_info.strip():
        try:
            cert_obj = json.loads(account_cert_info)
            if isinstance(cert_obj, dict):
                cert_text = str(cert_obj.get("label_text") or "").strip()
        except Exception:
            cert_text = account_cert_info.strip()
    if cert_text:
        candidates.append(cert_text)

    for text in candidates:
        match = LAW_FIRM_REGEX.search(text)
        if match:
            return match.group(1)

    return ""


def _extract_account_cert_text(user_info: Dict[str, Any]) -> str:
    account_cert_info = user_info.get("account_cert_info")
    if isinstance(account_cert_info, str) and account_cert_info.strip():
        try:
            cert_obj = json.loads(account_cert_info)
            if isinstance(cert_obj, dict):
                return str(cert_obj.get("label_text") or "").strip()
        except Exception:
            return account_cert_info.strip()
    return ""


def _is_lawyer_related_user(user_info: Dict[str, Any], name: str, law_firm: str) -> bool:
    texts = [
        name,
        str(user_info.get("nickname") or ""),
        str(user_info.get("signature") or ""),
        str(user_info.get("custom_verify") or ""),
        str(user_info.get("enterprise_verify_reason") or ""),
        str(user_info.get("versatile_display") or ""),
        str(user_info.get("unique_id") or ""),
        _extract_account_cert_text(user_info),
        law_firm,
    ]
    merged = "\n".join(text for text in texts if text).strip()
    if not merged:
        return False
    return any(keyword in merged for keyword in LAWYER_KEYWORDS)


def _pick_first_str(node: Dict[str, Any], keys: Tuple[str, ...]) -> str:
    for key in keys:
        value = node.get(key)
        if isinstance(value, str):
            text = value.strip()
            if text:
                return text
    return ""
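The behaviour of the extraction helpers above is easiest to see with literal inputs; a minimal sketch with invented values (assumed to run in the module's own namespace, since the filename is not shown in this compare view):

```python
# Sketch only: expected behaviour of the extraction helpers above.
assert _extract_phones_from_text("咨询:13812345678 或 +86 13912345678") == [
    "13812345678",   # order of first appearance is preserved, duplicates dropped
    "13912345678",   # the optional +86 prefix is stripped by the capture group
]
assert _extract_phones_from_text("微信:abc123") == []  # no 11-digit mobile number

# LAW_FIRM_REGEX captures the firm name up to the 律师事务所 suffix;
# the space stops the greedy prefix from swallowing the following text.
assert LAW_FIRM_REGEX.search("北京某某律师事务所 主任").group(1) == "北京某某律师事务所"
```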
def _extract_name(node: Dict[str, Any]) -> str:
    direct = _pick_first_str(node, ("name", "nickname", "nick_name", "author_name", "title", "account_name"))
    if direct:
        return direct

    for nested_key in ("author", "user", "user_info", "profile", "account"):
        nested = node.get(nested_key)
        if isinstance(nested, dict):
            nested_name = _pick_first_str(nested, ("name", "nickname", "nick_name", "author_name", "title"))
            if nested_name:
                return nested_name

    return ""


def _extract_law_firm(node: Dict[str, Any]) -> str:
    direct = _pick_first_str(
        node,
        (
            "law_firm",
            "firm",
            "lawFirm",
            "office",
            "org_name",
            "organization",
            "company",
            "enterprise",
        ),
    )
    if direct:
        return direct

    enterprise = node.get("enterprise")
    if isinstance(enterprise, dict):
        company_name = _pick_first_str(enterprise, ("name", "company_name", "enterprise_name"))
        if company_name:
            return company_name

    return ""


def _extract_detail_url(node: Dict[str, Any], fallback_api_url: str) -> str:
    url = _pick_first_str(node, ("share_url", "url", "web_url", "detail_url", "jump_url"))
    if url:
        return url

    aweme_id = node.get("aweme_id") or node.get("item_id")
    if aweme_id:
        aid = str(aweme_id).strip()
        if aid:
            return f"https://www.douyin.com/video/{aid}"

    sec_uid = node.get("sec_uid")
    if sec_uid:
        sec_uid_text = str(sec_uid).strip()
        if sec_uid_text:
            return f"https://www.douyin.com/user/{sec_uid_text}"

    return fallback_api_url


def _city_from_index(city_index: int, table_name: str, domain: str) -> Tuple[str, str]:
    if city_index < 0:
        return "", ""
    try:
        areas = _query_area_data(table_name, domain)
    except Exception:
        return "", ""
    if city_index >= len(areas):
        return "", ""
    area = areas[city_index]
    province = str(area.get("province") or "").strip()
    city = str(area.get("city") or province).strip()
    return province, city


def _existing_phones(domain: str, phones: List[str]) -> Set[str]:
    if not phones:
        return set()

    deduped = sorted({p for p in phones if p})
    if not deduped:
        return set()

    existing: Set[str] = set()
    with Db() as db:
        cursor = db.db.cursor()
        chunk_size = 500
        for i in range(0, len(deduped), chunk_size):
            chunk = deduped[i:i + chunk_size]
            placeholders = ",".join(["%s"] * len(chunk))
            sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
            cursor.execute(sql, [domain, *chunk])
            for row in cursor.fetchall():
                existing.add(str(row[0]))
        cursor.close()
    return existing


def _insert_lawyer_rows(rows: List[Dict[str, Any]], domain: str) -> Tuple[int, int]:
    if not rows:
        return 0, 0

    def row_score(item: Dict[str, Any]) -> int:
        score = 0
        if str(item.get("name") or "").strip():
            score += 5
        if str(item.get("law_firm") or "").strip():
            score += 3
        if str(item.get("url") or "").strip():
            score += 1
        if str(item.get("province") or "").strip() or str(item.get("city") or "").strip():
            score += 1
        phone_count_in_node = _parse_int(item.get("phone_count_in_node"), 1)
        if phone_count_in_node > 1:
            score -= (phone_count_in_node - 1)
        return score

    deduped_by_phone: Dict[str, Dict[str, Any]] = {}
    skipped = 0
    for row in rows:
        phone = str(row.get("phone") or "").strip()
        if not phone:
            skipped += 1
            continue
        old_row = deduped_by_phone.get(phone)
        if old_row is not None:
            if row_score(row) > row_score(old_row):
                deduped_by_phone[phone] = row
            skipped += 1
            continue
        deduped_by_phone[phone] = row

    existing = _existing_phones(domain, list(deduped_by_phone.keys()))

    inserted = 0
    with Db() as db:
        cursor = db.db.cursor()
        sql = (
            "INSERT INTO lawyer "
            "(name, phone, law_firm, province, city, url, domain, create_time, site_time, params) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        )
        for phone, row in deduped_by_phone.items():
            if phone in existing:
                skipped += 1
                continue

            cursor.execute(
                sql,
                (
                    row.get("name") or "",
                    phone,
                    row.get("law_firm") or "",
                    row.get("province") or "",
                    row.get("city") or "",
                    row.get("url") or "",
                    domain,
                    _parse_int(row.get("create_time"), int(time.time())),
                    _parse_int(row.get("site_time"), int(time.time())),
                    row.get("params") or "{}",
                ),
            )
            inserted += 1
            existing.add(phone)

        db.db.commit()
        cursor.close()

    return inserted, skipped
def _extract_lawyer_rows_from_payload(
    payload: Dict[str, Any],
    area_table: str,
    area_domain: str,
    save_domain: str,
) -> List[Dict[str, Any]]:
    now_ts = int(time.time())
    api_url = str(payload.get("url") or "").strip()
    city_index = _parse_int(payload.get("cityIndex"), -1)
    city_province, city_name = _city_from_index(city_index, area_table, area_domain)

    rows: List[Dict[str, Any]] = []
    data = payload.get("data") if isinstance(payload, dict) else None
    user_list = data.get("user_list") if isinstance(data, dict) else None
    if not isinstance(user_list, list):
        return rows

    for user_item in user_list:
        if not isinstance(user_item, dict):
            continue
        user_info = user_item.get("user_info")
        if not isinstance(user_info, dict):
            continue

        name = str(user_info.get("nickname") or "").strip()
        law_firm = _extract_law_firm_from_user_info(user_info)

        # Hard constraint: keywords such as "律师/律所" must be present,
        # so accounts unrelated to legal practice stay out of the database
        if not _is_lawyer_related_user(user_info, name, law_firm):
            continue

        phones = _extract_phones_from_user_info(user_info)
        if not phones:
            continue

        sec_uid = str(user_info.get("sec_uid") or "").strip()
        if not sec_uid:
            continue
        url = f"https://www.douyin.com/user/{sec_uid}"

        province = city_province
        city = city_name or city_province

        source_record = {
            "source": "douyin",
            "api_source": payload.get("source") or "",
            "api_url": api_url,
            "city_index": city_index,
            "captured_at": now_ts,
            "sec_uid": sec_uid,
            "user_info": {
                "uid": user_info.get("uid"),
                "nickname": user_info.get("nickname"),
                "signature": user_info.get("signature"),
                "unique_id": user_info.get("unique_id"),
                "versatile_display": user_info.get("versatile_display"),
            },
        }

        for phone in phones:
            rows.append(
                {
                    "name": name,
                    "phone": phone,
                    "law_firm": law_firm,
                    "province": province,
                    "city": city,
                    "url": url,
                    "domain": save_domain,
                    "create_time": now_ts,
                    "site_time": _parse_int(payload.get("ts"), now_ts),
                    "phone_count_in_node": len(phones),
                    "params": json.dumps(source_record, ensure_ascii=False),
                }
            )

    return rows
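For reviewers, a sketch of the index payload shape this extractor consumes; only the keys are taken from the code above, every value is an invented placeholder:

```python
# Sketch only: the minimal payload _extract_lawyer_rows_from_payload understands.
example_payload = {
    "source": "plugin",                 # stored as api_source inside params
    "url": "https://example.com/api",   # fallback api_url recorded in params
    "cityIndex": 3,                     # resolved to province/city via the area table
    "ts": 1770000000,                   # becomes site_time
    "data": {
        "user_list": [
            {
                "user_info": {
                    "nickname": "某某律师",
                    "signature": "法律咨询 13812345678",
                    "sec_uid": "MS4wLjABAAAA_example",
                    "unique_id": "lawyer_demo",
                    "versatile_display": "",
                }
            }
        ]
    },
}
```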
class AreaSyncHandler(BaseHTTPRequestHandler):
    server_version = "AreaSyncService/2.0"

    def _write_json(self, status: int, payload: Any) -> None:
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()
        self.wfile.write(body)

    def _read_json_body(self) -> Any:
        length = _parse_int(self.headers.get("Content-Length"), 0)
        if length <= 0:
            return {}

        raw = self.rfile.read(length)
        if not raw:
            return {}

        try:
            return json.loads(raw.decode("utf-8"))
        except Exception:
            return {}

    def do_OPTIONS(self) -> None:
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self) -> None:
        parsed = urlparse(self.path)
        params = parse_qs(parsed.query)

        if parsed.path == "/health":
            self._write_json(200, {"ok": True, "service": "layer-service"})
            return

        if parsed.path == "/api/layer/get_area":
            table_name = _first_param(params, "table", AREA_TABLE).strip() or AREA_TABLE
            domain = _first_param(params, "domain", AREA_DOMAIN).strip() or AREA_DOMAIN
            with_meta = _parse_bool(_first_param(params, "meta", "0"), False)

            try:
                rows = _query_area_data(table_name, domain)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return

            if with_meta:
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "count": len(rows),
                        "table": table_name,
                        "domain": domain,
                        "data": rows,
                    },
                )
            else:
                self._write_json(200, rows)
            return

        if parsed.path == "/api/layer/progress":
            progress_key = _first_param(params, "progress_key", PROGRESS_DEFAULT_KEY).strip() or PROGRESS_DEFAULT_KEY
            try:
                row = _get_progress(progress_key)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return

            self._write_json(
                200,
                {
                    "ok": True,
                    "progress_key": progress_key,
                    "data": row,
                },
            )
            return

        self._write_json(404, {"ok": False, "error": "not found"})

    def do_POST(self) -> None:
        parsed = urlparse(self.path)
        params = parse_qs(parsed.query)

        if parsed.path == "/api/layer/progress":
            body = self._read_json_body()
            if not isinstance(body, dict):
                body = {}

            progress_key = str(body.get("progress_key") or _first_param(params, "progress_key", PROGRESS_DEFAULT_KEY)).strip() or PROGRESS_DEFAULT_KEY
            action = str(body.get("action") or _first_param(params, "action", "upsert")).strip().lower() or "upsert"

            try:
                if action == "clear":
                    deleted = _clear_progress(progress_key)
                    self._write_json(
                        200,
                        {
                            "ok": True,
                            "action": "clear",
                            "progress_key": progress_key,
                            "deleted": deleted,
                        },
                    )
                    return

                saved = _upsert_progress(progress_key, body)
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "action": "upsert",
                        "progress_key": progress_key,
                        "data": saved,
                    },
                )
                return
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return

        if parsed.path == "/api/layer/index":
            body = self._read_json_body()
            if not isinstance(body, dict) or not body:
                self._write_json(400, {"ok": False, "error": "invalid json body"})
                return

            area_table = _first_param(params, "table", AREA_TABLE).strip() or AREA_TABLE
            area_domain = _first_param(params, "area_domain", AREA_DOMAIN).strip() or AREA_DOMAIN
            save_domain = _first_param(params, "save_domain", DOUYIN_DOMAIN).strip() or DOUYIN_DOMAIN
            save_only_default = _parse_bool(DOUYIN_SAVE_ONLY_ENV, True)
            save_only = _parse_bool(_first_param(params, "save_only", DOUYIN_SAVE_ONLY_ENV), save_only_default)

            try:
                saved_file = _save_raw_index_payload(body, params, self.client_address[0] if self.client_address else "")
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": f"save raw payload failed: {exc}"})
                return

            if save_only:
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "message": "saved_only",
                        "save_only": True,
                        "saved_file": saved_file,
                    },
                )
                return

            try:
                extracted = _extract_lawyer_rows_from_payload(body, area_table, area_domain, save_domain)
                inserted, skipped = _insert_lawyer_rows(extracted, save_domain)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return

            self._write_json(
                200,
                {
                    "ok": True,
                    "message": "received",
                    "save_domain": save_domain,
                    "save_only": False,
                    "saved_file": saved_file,
                    "extracted": len(extracted),
                    "inserted": inserted,
                    "skipped": skipped,
                },
            )
            return

        self._write_json(404, {"ok": False, "error": "not found"})
def run() -> None:
    try:
        _ensure_progress_table()
    except Exception as exc:
        print(f"[layer-service] init progress table failed: {exc}")

    server = ThreadingHTTPServer((SERVICE_HOST, SERVICE_PORT), AreaSyncHandler)
    print(f"[layer-service] running on http://{SERVICE_HOST}:{SERVICE_PORT}")
    print(f"[layer-service] get_area -> table/domain: {AREA_TABLE}/{AREA_DOMAIN}")
    print(f"[layer-service] index -> save domain: {DOUYIN_DOMAIN}")
    print(f"[layer-service] progress table/default key: {PROGRESS_TABLE}/{PROGRESS_DEFAULT_KEY}")
    server.serve_forever()


if __name__ == "__main__":
    run()
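A minimal client sketch for the endpoints this service exposes, assuming the default host/port above and the `requests` package; the response fields mirror the handler code:

```python
# Sketch only: talking to the layer service from another process.
import requests

BASE = "http://127.0.0.1:9002"

print(requests.get(f"{BASE}/health").json())            # {"ok": True, "service": "layer-service"}
areas = requests.get(f"{BASE}/api/layer/get_area", params={"meta": "1"}).json()
print(areas["count"])

# Upsert, read back, then clear a progress record.
requests.post(f"{BASE}/api/layer/progress",
              json={"progress_key": "demo", "next_city_index": 5, "status": "running"})
print(requests.get(f"{BASE}/api/layer/progress", params={"progress_key": "demo"}).json())
requests.post(f"{BASE}/api/layer/progress", json={"progress_key": "demo", "action": "clear"})

# Push a raw index payload; with DOUYIN_SAVE_ONLY=1 (the default) it is only
# appended to data/douyin_raw/ and not parsed into the lawyer table.
requests.post(f"{BASE}/api/layer/index", json={"cityIndex": 0, "data": {"user_list": []}})
```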
+189
-55
@@ -1,76 +1,210 @@
"""
Global request rate limiter
Keeps the proxy below 5 requests per second

By default all crawler processes share a single bucket, so that when
`bash start.sh` starts several processes at once they do not each get
their own 5 req/s and overwhelm the proxy in aggregate.
"""
from contextlib import contextmanager
import json
import os
import tempfile
import time
import threading
from collections import deque
from pathlib import Path
from uuid import uuid4

import fcntl
from request.proxy_config import is_proxy_enabled


class RateLimiter:
    """
    Rate limiter implemented as a token bucket
    Cross-process sliding-window rate limiter built on a file lock.

    - Multiple Python processes on the same machine share a single state file
    - Multiple threads within one process also go through this limiter
    """
    def __init__(self, max_requests_per_second: int = 5):
        """
        Initialise the rate limiter

        Args:
            max_requests_per_second: maximum number of requests per second
        """
        self.max_requests = max_requests_per_second
        self.requests = deque()
        self.lock = threading.RLock()

    def acquire(self):
        """
        Acquire permission for a request, waiting if necessary
        """
        with self.lock:
            now = time.time()

            # Drop request records older than one second
            while self.requests and now - self.requests[0] >= 1.0:
                self.requests.popleft()

            # If the request count is already at the limit, wait
            if len(self.requests) >= self.max_requests:
                # Work out how long we need to wait
                wait_time = 1.0 - (now - self.requests[0])
                if wait_time > 0:
                    time.sleep(wait_time)
                return self.acquire()  # recurse to re-check

            # Record this request
            self.requests.append(now)
    def __init__(
        self,
        max_requests_per_second: int = 5,
        window_seconds: float = 1.0,
        state_file: str | None = None,
    ):
        self.max_requests = max(1, int(max_requests_per_second))
        self.max_concurrent = max(
            1,
            int(os.getenv("PROXY_MAX_CONCURRENT_REQUESTS", str(self.max_requests))),
        )
        self.window_seconds = max(0.1, float(window_seconds))
        self.lease_seconds = max(
            5.0,
            float(os.getenv("PROXY_REQUEST_LEASE_SECONDS", "120")),
        )
        default_state = os.path.join(
            tempfile.gettempdir(),
            "lawyers_proxy_rate_limiter.json",
        )
        self.state_file = Path(
            state_file or os.getenv("PROXY_RATE_LIMIT_FILE", default_state)
        )
        self.lock_file = self.state_file.with_suffix(self.state_file.suffix + ".lock")
        self._thread_lock = threading.RLock()
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        self.lock_file.parent.mkdir(parents=True, exist_ok=True)

    def _load_state(self) -> dict:
        if not self.state_file.exists():
            return {"timestamps": [], "leases": {}}
        try:
            raw = self.state_file.read_text(encoding="utf-8").strip()
            if not raw:
                return {"timestamps": [], "leases": {}}
            data = json.loads(raw)
            if isinstance(data, list):
                return {
                    "timestamps": [float(item) for item in data],
                    "leases": {},
                }
            if not isinstance(data, dict):
                return {"timestamps": [], "leases": {}}
            timestamps = data.get("timestamps", []) or []
            leases = data.get("leases", {}) or {}
            return {
                "timestamps": [float(item) for item in timestamps],
                "leases": {str(key): float(value) for key, value in leases.items()},
            }
        except Exception:
            return {"timestamps": [], "leases": {}}

    def _save_state(self, state: dict) -> None:
        payload = json.dumps(state, ensure_ascii=False)
        self.state_file.write_text(payload, encoding="utf-8")

    def _normalize_state(self, state: dict, now: float) -> dict:
        timestamps = [
            float(ts)
            for ts in (state.get("timestamps", []) or [])
            if now - float(ts) < self.window_seconds
        ]
        leases = {
            str(key): float(value)
            for key, value in (state.get("leases", {}) or {}).items()
            if now - float(value) < self.lease_seconds
        }
        return {"timestamps": timestamps, "leases": leases}

    def acquire(self) -> None:
        token = None
        while True:
            token = self.try_acquire_slot()
            if token:
                self.release(token)
                return
            time.sleep(0.05)

    def try_acquire_slot(self) -> str | None:
        while True:
            wait_time = 0.0
            with self._thread_lock:
                with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
                    fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
                    now = time.time()
                    state = self._normalize_state(self._load_state(), now)
                    timestamps = state["timestamps"]
                    leases = state["leases"]

                    if len(timestamps) < self.max_requests and len(leases) < self.max_concurrent:
                        token = uuid4().hex
                        timestamps.append(now)
                        leases[token] = now
                        self._save_state(state)
                        fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
                        return token

                    wait_candidates = []
                    if len(timestamps) >= self.max_requests and timestamps:
                        wait_candidates.append(self.window_seconds - (now - timestamps[0]))
                    if len(leases) >= self.max_concurrent:
                        wait_candidates.append(0.05)
                    wait_time = max(0.05, min([item for item in wait_candidates if item > 0] or [0.05]))
                    fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)

            time.sleep(wait_time)

    def release(self, token: str | None) -> None:
        if not token:
            return
        with self._thread_lock:
            with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
                fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
                now = time.time()
                state = self._normalize_state(self._load_state(), now)
                leases = state["leases"]
                if token in leases:
                    leases.pop(token, None)
                    self._save_state(state)
                else:
                    self._save_state(state)
                fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)

    def can_make_request(self) -> bool:
        """
        Check whether a request can be made right now (non-blocking)
        """
        with self.lock:
            now = time.time()

            # Drop request records older than one second
            while self.requests and now - self.requests[0] >= 1.0:
                self.requests.popleft()

            return len(self.requests) < self.max_requests
        with self._thread_lock:
            with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
                fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
                now = time.time()
                state = self._normalize_state(self._load_state(), now)
                self._save_state(state)
                allowed = (
                    len(state["timestamps"]) < self.max_requests
                    and len(state["leases"]) < self.max_concurrent
                )
                fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
                return allowed


# Global rate limiter instance
global_rate_limiter = RateLimiter(max_requests_per_second=5)
global_rate_limiter = RateLimiter(
    max_requests_per_second=int(os.getenv("PROXY_MAX_REQUESTS_PER_SECOND", "5"))
)


def _should_limit_proxy_requests() -> bool:
    """
    Only apply the global proxy rate limit when the current process actually has the proxy enabled.
    """
    try:
        return is_proxy_enabled()
    except Exception:
        return True


def wait_for_request():
    """
    Wait until a request may be made
    """
    """Wait until a request may be made."""
    if not _should_limit_proxy_requests():
        return
    global_rate_limiter.acquire()


def can_request_now() -> bool:
    """
    Check whether a request can be made right now
    """
    """Check whether a request can be made right now."""
    if not _should_limit_proxy_requests():
        return True
    return global_rate_limiter.can_make_request()


@contextmanager
def request_slot():
    """
    Acquire a request slot shared across processes; it is released automatically when the request finishes.

    This limits both how many requests start per second and how many are in flight at once.
    """
    if not _should_limit_proxy_requests():
        yield
        return

    token = global_rate_limiter.try_acquire_slot()
    try:
        yield
    finally:
        global_rate_limiter.release(token)
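A sketch of how a call site is expected to use the new limiter; `request_slot` and `wait_for_request` are the functions defined above, the target URL is a placeholder:

```python
# Sketch only: pacing proxied requests through the shared limiter.
import requests

from utils.rate_limiter import request_slot, wait_for_request

with request_slot():
    # Holds one of the PROXY_MAX_CONCURRENT_REQUESTS leases for the whole call,
    # and counts one start against the per-second sliding window.
    resp = requests.get("https://example.com", timeout=15)

wait_for_request()  # rate-only variant: acquires a slot and releases it immediately
requests.get("https://example.com", timeout=15)
```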
@@ -0,0 +1,377 @@
import copy
import json
import os
import re
import sys
import time
from html import unescape
from http.cookies import SimpleCookie
from typing import Dict, Optional
from urllib.parse import urlencode

import requests
import urllib3

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
for path in (current_dir, project_root):
    if path not in sys.path:
        sys.path.append(path)

import config as project_config
from utils.rate_limiter import wait_for_request, global_rate_limiter

API_ENDPOINT = "https://mp.weixin.qq.com/cgi-bin/videosnap"
DOMAIN = "mp.weixin.qq.com"
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/146.0.0.0 Safari/537.36"
    ),
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
    "DNT": "1",
    "Priority": "u=1, i",
    "Sec-CH-UA": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"',
    "Sec-CH-UA-Mobile": "?0",
    "Sec-CH-UA-Platform": '"Windows"',
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "X-Requested-With": "XMLHttpRequest",
}
DEFAULT_WEIXIN_CONFIG = {
    "TOKEN": "609153506",
    "FINGERPRINT": "46a7e6ac6ccf205986adc0aa99127860",
    "COOKIE": {
        "appmsglist_action_3258147150": "card",
        "_qimei_uuid42": "1a302160d051008226aec905b63f99ff3989f30009",
        "_qimei_i_3": "63b22b84c15204dfc595ac6452d722b1f0bdf0f6145b568ae68a7c0e70947438686637943989e2a1d792",
        "_qimei_h38": "215986ce26aec905b63f99ff0200000e81a302",
        "ua_id": "S7gglu0eZh9NkAzLAAAAADH8dynpnFZVN29lxm7BQo0=",
        "wxuin": "73074968761097",
        "mm_lang": "zh_CN",
        "eas_sid": "91X7I7K4K5k364U2z3k2I980F5",
        "_qimei_q36": "",
        "_qimei_fingerprint": "d895c46d5fda98cab67d9daec00068ed",
        "_clck": "501quy|1|g4t|0",
        "uuid": "210d1c199a63afd4c774eccd9a06a27f",
        "rand_info": "CAESIE4WqrFFVVjqrrNflbCUM7wPD5NXjuGbjfHolAEsMmEm",
        "slave_bizuin": "3258147150",
        "data_bizuin": "3258147150",
        "bizuin": "3258147150",
        "data_ticket": "tpcLjRB7B7AlUY3rFe/ILEjtCKs7dEEGsn8kXnHVzdTb9dgIpSPN1aP8FlE6FDhj",
        "slave_sid": "U3hfU1Z0UV91N0U5d0lkRDhyTzh3d3hmbnBHMjBnbmFNdzVJeGlJeTJ6OTVxRjJQVVE2VkNhejYzTkxETVVSZkF3eWRORmtRS01XWFBjdnFZZWFLNjR2ZGtwdUJ2MzByclg0NjF4SHlDeVJneEhsczdSYUJVNE45VEhNRWVTQXg1dlpGdWQ0bU5VM3pnRzJN",
        "slave_user": "gh_fe76760560d0",
        "xid": "ef503a6864cceaef225c615a45606e4a",
        "_clsk": "12arnf1|1774975723874|4|1|mp.weixin.qq.com/weheat-agent/payload/record",
        "_qimei_i_1": "2ddc6a80945f59d3c7c4ab325dd526b3feeea1a31458558bbdd97e582493206c6163629d39d8e1dcd49fddc7"
    },
    "COUNT": 21,
    "REFERER": "https://mp.weixin.qq.com/",
    "HEADERS": {},
    "REQUEST_PARAMS": {
        "action": "search",
        "scene": "1",
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1",
    },
    "REQUESTS_PER_SECOND": 5,
    "PAGE_DELAY": 5,
    "CITY_DELAY": 2,
}
def _deep_merge_dict(base: Dict, incoming: Dict) -> Dict:
    merged = copy.deepcopy(base)
    for key, value in incoming.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = _deep_merge_dict(merged[key], value)
        else:
            merged[key] = value
    return merged


def _parse_cookie_value(cookie_value) -> Dict[str, str]:
    if isinstance(cookie_value, dict):
        return {str(key): str(value) for key, value in cookie_value.items()}

    if not cookie_value:
        return {}

    if isinstance(cookie_value, str):
        text = cookie_value.strip()
        if not text:
            return {}
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return {str(key): str(value) for key, value in parsed.items()}

        cookie = SimpleCookie()
        cookie.load(text)
        return {key: morsel.value for key, morsel in cookie.items()}

    return {}


def _load_weixin_config() -> Dict:
    config = copy.deepcopy(DEFAULT_WEIXIN_CONFIG)
    module_config = getattr(project_config, "WEIXIN_CONFIG", None)
    if isinstance(module_config, dict):
        config = _deep_merge_dict(config, module_config)

    env_mapping = {
        "TOKEN": os.getenv("WEIXIN_TOKEN"),
        "FINGERPRINT": os.getenv("WEIXIN_FINGERPRINT"),
        "COOKIE": os.getenv("WEIXIN_COOKIE"),
        "REFERER": os.getenv("WEIXIN_REFERER"),
        "COUNT": os.getenv("WEIXIN_COUNT"),
        "REQUESTS_PER_SECOND": os.getenv("WEIXIN_REQUESTS_PER_SECOND"),
        "PAGE_DELAY": os.getenv("WEIXIN_PAGE_DELAY"),
        "CITY_DELAY": os.getenv("WEIXIN_CITY_DELAY"),
    }
    for key, value in env_mapping.items():
        if value not in (None, ""):
            config[key] = value

    config["COOKIE"] = _parse_cookie_value(config.get("COOKIE"))

    for key in ("COUNT", "REQUESTS_PER_SECOND"):
        try:
            config[key] = int(config[key])
        except (TypeError, ValueError):
            config[key] = DEFAULT_WEIXIN_CONFIG[key]

    for key in ("PAGE_DELAY", "CITY_DELAY"):
        try:
            config[key] = float(config[key])
        except (TypeError, ValueError):
            config[key] = DEFAULT_WEIXIN_CONFIG[key]

    return config


def _strip_html(text: str) -> str:
    if not text:
        return ""
    return re.sub(r"<[^>]+>", "", unescape(text)).strip()
class WeixinSpider:
    """A requests-based WeChat Channels (微信视频号) collector"""

    def __init__(self, db_connection):
        self.db = db_connection
        self.config = _load_weixin_config()
        self.token = str(self.config.get("TOKEN", "")).strip()
        self.fingerprint = str(self.config.get("FINGERPRINT", "")).strip()
        self.cookies = self.config.get("COOKIE", {})
        self.count = str(self.config.get("COUNT", DEFAULT_WEIXIN_CONFIG["COUNT"]))
        self.referer = str(self.config.get("REFERER", DEFAULT_WEIXIN_CONFIG["REFERER"])).strip()
        self.request_params = {
            str(key): str(value)
            for key, value in (self.config.get("REQUEST_PARAMS", {}) or {}).items()
            if value is not None
        }
        self.page_delay = max(0.0, float(self.config.get("PAGE_DELAY", DEFAULT_WEIXIN_CONFIG["PAGE_DELAY"])))
        self.city_delay = max(0.0, float(self.config.get("CITY_DELAY", DEFAULT_WEIXIN_CONFIG["CITY_DELAY"])))
        max_rps = self.config.get("REQUESTS_PER_SECOND")
        if max_rps:
            global_rate_limiter.max_requests = int(max_rps)

        headers = DEFAULT_HEADERS.copy()
        project_headers = getattr(project_config, "HEADERS", None)
        if isinstance(project_headers, dict):
            headers.update(project_headers)
        config_headers = self.config.get("HEADERS", {})
        if isinstance(config_headers, dict):
            headers.update({str(key): str(value) for key, value in config_headers.items()})
        if self.referer:
            headers["Referer"] = self.referer
        self.session = requests.Session()
        self.session.trust_env = False
        self.session.headers.update(headers)
        if self.cookies:
            self.session.cookies.update(self.cookies)
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def _validate_runtime_config(self) -> bool:
        missing = []
        if not self.token:
            missing.append("TOKEN")
        if not self.fingerprint:
            missing.append("FINGERPRINT")
        if not self.cookies:
            missing.append("COOKIE")

        if not missing:
            return True

        print(
            "[微信] 配置不完整,缺少: "
            + ", ".join(missing)
            + "。请在 config.py 的 WEIXIN_CONFIG 中补齐,"
            + "或通过环境变量 WEIXIN_TOKEN / WEIXIN_FINGERPRINT / WEIXIN_COOKIE 提供。"
        )
        return False

    def _load_areas(self):
        condition = "domain='maxlaw' AND level=2"
        tables = ("area_new", "area", "area2")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "province, city", condition) or []
            except Exception as exc:
                last_error = exc
                continue
            if rows:
                print(f"[微信] 城市来源表: {table}, 城市数: {len(rows)}")
                return rows

        if last_error:
            print(f"[微信] 加载地区数据失败: {last_error}")
        print("[微信] 无城市数据(已尝试 area_new/area/area2)")
        return []

    def _build_query_url(self, query: str, buffer: str) -> str:
        params = self.request_params.copy()
        params.update({
            "query": query,
            "count": self.count,
            "buffer": buffer,
            "fingerprint": self.fingerprint,
            "token": self.token,
        })
        return f"{API_ENDPOINT}?{urlencode(params)}"

    def _extract_phone(self, text: str) -> Optional[str]:
        if not text:
            return None
        match = re.search(r"1[3-9]\d{9}", text)
        return match.group(0) if match else None

    def _parse_name(self, acct: Dict) -> str:
        highlight = _strip_html(acct.get("highlight_nickname", ""))
        if highlight:
            return highlight
        return _strip_html(acct.get("nickname", ""))

    def _store_account(self, acct: Dict, province: str, city: str) -> None:
        signature = acct.get("signature", "")
        phone = self._extract_phone(signature)
        if not phone:
            return

        if self.db.is_data_exist("lawyer", f"phone='{phone}' and domain='{DOMAIN}'"):
            name = self._parse_name(acct)
            print(f" -- 已存在律师: {name} ({phone})")
            return

        params = json.dumps(acct, ensure_ascii=False)
        lawyer_data = {
            "phone": phone,
            "province": province,
            "city": city,
            # guard against auth_info being present but None
            "law_firm": (acct.get("auth_info") or {}).get("auth_profession"),
            "url": f"https://channels.weixin.qq.com/finder/creator/feeds?finder_username={acct.get('username', '')}",
            "create_time": int(time.time()),
            "domain": DOMAIN,
            "name": self._parse_name(acct),
            "params": params,
        }

        try:
            inserted_id = self.db.insert_data("lawyer", lawyer_data)
            print(f" -> 新增律师: {lawyer_data['name']} ({phone}), ID: {inserted_id}")
        except Exception as exc:
            print(f" 插入失败 {lawyer_data['name']} ({phone}): {exc}")

    def _search_city(self, province: str, city: str) -> None:
        city_name = city.replace('市', '')
        query = f"{city_name}律所"
        print(f"--- [微信] 开始采集城市: {province} - {city_name} ---")

        buffer = ""
        has_more = True
        page_no = 0

        while has_more:
            page_no += 1
            url = self._build_query_url(query, buffer)
            print(f"正在采集 '{query}' 第 {page_no} 页: {url}")

            wait_for_request()
            try:
                response = self.session.get(
                    url,
                    timeout=15,
                    cookies=self.cookies,
                    proxies={},  # explicitly disable any proxy
                    verify=False,
                )
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.RequestException as exc:
                print(f"网络请求失败: {exc}")
                break
            except json.JSONDecodeError:
                print("解析返回的JSON失败。返回内容:", response.text[:200])
                break

            base_resp = data.get("base_resp", {})
            if base_resp.get("ret") != 0:
                print(f"API返回错误: {base_resp.get('err_msg')}")
                if "invalid ticket" in (base_resp.get('err_msg') or ""):
                    print("Token 或 Cookie 可能失效,请更新配置。")
                break

            accounts = data.get("acct_list", [])
            if not accounts:
                print("本页未找到更多律师信息。")
                break

            for acct in accounts:
                self._store_account(acct, province, city_name)

            has_more = bool(data.get("acct_continue_flag"))
            buffer = data.get("last_buff", "")
            time.sleep(self.page_delay)

        print(f"--- [微信] 城市: {city_name} 采集完成 ---\n")

    def run(self) -> None:
        print("启动微信视频号律师信息采集...")
        if not self._validate_runtime_config():
            return

        areas = self._load_areas()
        if not areas:
            print("[微信] 未能从 `area_new` 表获取到地区信息。")
            return

        for area in areas:
            province = area.get("province", "")
            city = area.get("city", "")
            if not city:
                continue
            try:
                self._search_city(province, city)
            except Exception as exc:
                print(f"采集 {province}-{city} 时发生错误: {exc}")
            time.sleep(self.city_delay)

        print("微信视频号律师信息采集完成。")


if __name__ == "__main__":
    from Db import Db

    with Db() as db:
        spider = WeixinSpider(db)
        spider.run()
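Worth noting for reviewers: `_parse_cookie_value` accepts three input shapes, so `WEIXIN_COOKIE` can be exported either as JSON or as a raw `Cookie:` header value. A minimal sketch with invented values (assumed to run in the module's own namespace, since the filename is not shown in this compare view):

```python
# Sketch only: the three cookie formats _parse_cookie_value understands.
assert _parse_cookie_value({"wxuin": "1"}) == {"wxuin": "1"}                        # dict passthrough
assert _parse_cookie_value('{"wxuin": "1", "xid": "abc"}') == {"wxuin": "1", "xid": "abc"}  # JSON string
assert _parse_cookie_value("wxuin=1; xid=abc") == {"wxuin": "1", "xid": "abc"}      # raw header value
```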