Compare commits
10 Commits
19cf9ce901
...
f67cb30f0d
| Author | SHA1 | Date | |
|---|---|---|---|
| f67cb30f0d | |||
| ba04fe42fc | |||
| ff5e04d986 | |||
| 7d5f5b1054 | |||
| 38e7c284e8 | |||
| c2b77975c1 | |||
| e10437cd90 | |||
| 86cf933913 | |||
| a96b9a50e4 | |||
| bc4a2aa4d5 |
+5
-35
@@ -1,36 +1,6 @@
|
|||||||
# Python
|
|
||||||
__pycache__/
|
|
||||||
*.py[cod]
|
|
||||||
*$py.class
|
|
||||||
|
|
||||||
# Build / packaging
|
|
||||||
build/
|
|
||||||
dist/
|
|
||||||
*.egg-info/
|
|
||||||
.eggs/
|
|
||||||
|
|
||||||
# Virtual environments
|
|
||||||
.venv/
|
.venv/
|
||||||
venv/
|
__pycache__/
|
||||||
env/
|
*.pyc
|
||||||
|
common_sites/*.log
|
||||||
# Test / type caches
|
logs/*
|
||||||
.pytest_cache/
|
data/*
|
||||||
.mypy_cache/
|
|
||||||
.ruff_cache/
|
|
||||||
|
|
||||||
# IDE
|
|
||||||
.vscode/
|
|
||||||
.idea/
|
|
||||||
|
|
||||||
# OS
|
|
||||||
.DS_Store
|
|
||||||
Thumbs.db
|
|
||||||
|
|
||||||
# Local runtime files
|
|
||||||
*.log
|
|
||||||
logs/
|
|
||||||
data/
|
|
||||||
|
|
||||||
# accidental local files
|
|
||||||
=*
|
|
||||||
|
|||||||
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"launchOptions": {
|
||||||
|
"chromiumSandbox": false
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,62 +1,70 @@
|
|||||||
# lawyers
|
# lawyers-common-sites
|
||||||
|
|
||||||
`common_sites` 独立采集项目。
|
从 `/www/wwwroot/lawyer` 中抽离出的 `common_sites` 独立项目。
|
||||||
|
|
||||||
## 目录
|
## 目录
|
||||||
|
|
||||||
- `common_sites/`:大律师、找法网、法律快车、律图、华律 5 个采集脚本
|
- `common_sites/`: 站点采集脚本
|
||||||
- `request/proxy_config.py`:代理配置加载逻辑
|
- `request/`: 代理配置
|
||||||
- `request/proxy_settings.json`:代理配置文件
|
- `utils/`: 公共工具
|
||||||
- `Db.py`:数据库连接与基础操作
|
- `Db.py`: 数据库封装
|
||||||
- `config.py`:数据库与请求头配置
|
- `config.py`: 项目配置
|
||||||
|
|
||||||
## 运行
|
## 快速启动
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd /www/wwwroot/lawyers
|
cd /www/wwwroot/lawyers
|
||||||
python3 -m venv .venv
|
python -m venv .venv
|
||||||
.venv/bin/pip install -r requirements.txt
|
source .venv/bin/activate
|
||||||
./common_sites/start.sh
|
pip install -r requirements.txt
|
||||||
|
bash common_sites/start.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
## 启动参数
|
## 拆分运行(直连/代理)
|
||||||
|
|
||||||
`start.sh` 默认并行启动 5 个站点采集(大律师使用 `dls_fresh.py`)。
|
本仓库支持用环境变量 `PROXY_ENABLED` 在一次运行内强制开/关代理:
|
||||||
|
|
||||||
- 日志目录:`/www/wwwroot/lawyers/logs`
|
- **直连**:`PROXY_ENABLED=0`(不使用代理 IP)
|
||||||
- 大律师 JSON 输出:`/www/wwwroot/lawyers/data/dls_records.jsonl`
|
- **代理**:`PROXY_ENABLED=1`(强制使用 `request/proxy_settings.json` 的代理配置)
|
||||||
|
- **默认**:不设置(跟随 `request/proxy_settings.json` 的 `enabled` 字段)
|
||||||
|
|
||||||
常用环境变量:
|
对应提供两套入口脚本:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 顺序执行(默认 parallel)
|
# 直连(默认包含:大律师/大律师PC/找法网/法律快车)
|
||||||
RUN_MODE=sequential ./common_sites/start.sh
|
bash common_sites/start_direct_twice_weekly.sh
|
||||||
|
|
||||||
# 大律师限制采集范围
|
# 代理(默认包含:华律/律图)
|
||||||
DLS_CITY_FILTER=beijing DLS_MAX_CITIES=1 DLS_MAX_PAGES=1 ./common_sites/start.sh
|
bash common_sites/start_proxy_weekly.sh
|
||||||
|
|
||||||
# 大律师直连(不走代理)/ 仅导出JSON不写库
|
|
||||||
DLS_DIRECT=1 DLS_NO_DB=1 ./common_sites/start.sh
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## 导出 Excel
|
## cron 示例(每周两次直连 + 每周一次代理)
|
||||||
|
|
||||||
新增导出脚本:`common_sites/export_lawyers_excel.py`
|
> 下面仅给示例,你可以按机器负载调整时间;日志会输出到 `common_sites/*.log`。
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 无参数:默认导出最近7天数据(含手机号/姓名/律所/省份/市区/站点名称)
|
# 编辑定时任务
|
||||||
# 并默认解析 params 扩展信息(邮箱/地址/执业证号/执业年限/擅长领域等)
|
crontab -e
|
||||||
./.venv/bin/python ./common_sites/export_lawyers_excel.py
|
|
||||||
|
|
||||||
# 按 create_time 时间戳范围导出
|
# 每周二、周五 02:10 直连跑一次
|
||||||
./.venv/bin/python ./common_sites/export_lawyers_excel.py \
|
10 2 * * 2,5 cd /www/wwwroot/lawyers && bash common_sites/start_direct_twice_weekly.sh
|
||||||
--start-ts 1772380000 --end-ts 1772429999 \
|
|
||||||
--output ./data/lawyers_20260302.xlsx
|
|
||||||
|
|
||||||
# 只导出某站点,并带技术字段(url/域名/时间等)
|
# 每周日 03:20 走代理跑一次(你手动续费代理 IP)
|
||||||
./.venv/bin/python ./common_sites/export_lawyers_excel.py \
|
20 3 * * 0 cd /www/wwwroot/lawyers && bash common_sites/start_proxy_weekly.sh
|
||||||
--domain 大律师 --include-extra
|
|
||||||
|
|
||||||
# 如果不需要解析 params 扩展信息
|
|
||||||
./.venv/bin/python ./common_sites/export_lawyers_excel.py --no-parse-params
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### 常用参数(可选)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 限流(跨进程共享),直连可适当调高,代理建议保守
|
||||||
|
export PROXY_MAX_REQUESTS_PER_SECOND=8
|
||||||
|
|
||||||
|
# 代理连通性输出(部分脚本会打印测试信息)
|
||||||
|
export PROXY_TEST=1
|
||||||
|
```
|
||||||
|
|
||||||
|
## 说明
|
||||||
|
|
||||||
|
- 当前项目直接复用原项目数据库配置和代理配置。
|
||||||
|
- 采集依赖原库中的 `lawyer`、`area_new`、`area`、`area2` 等表。
|
||||||
|
- 日志默认输出到 `common_sites/*.log`。
|
||||||
|
|||||||
@@ -0,0 +1,473 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from typing import Dict, List, Optional, Set, Tuple
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
request_dir = os.path.join(project_root, "request")
|
||||||
|
if request_dir not in sys.path:
|
||||||
|
sys.path.insert(0, request_dir)
|
||||||
|
if project_root not in sys.path:
|
||||||
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
from urllib3.util.retry import Retry
|
||||||
|
|
||||||
|
from Db import Db
|
||||||
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
|
|
||||||
|
|
||||||
|
DOMAIN = "百度法行宝"
|
||||||
|
BASE_URL = "https://lvlin.baidu.com"
|
||||||
|
CITY_API = f"{BASE_URL}/pc/api/law/sync/city"
|
||||||
|
LIST_API = f"{BASE_URL}/pc/api/law/api/lawyerlist"
|
||||||
|
DETAIL_API = f"{BASE_URL}/pc/api/law/api/lawyerhome"
|
||||||
|
DEFAULT_PAGE_SIZE = 16
|
||||||
|
DEFAULT_MAX_PAGES = 30
|
||||||
|
DEFAULT_STOP_ZERO_NEW_PAGES = 3
|
||||||
|
DEFAULT_SLEEP_SECONDS = 0.1
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description="采集百度法行宝律师信息并落库")
|
||||||
|
parser.add_argument("--province", default="", help="仅采集指定省份,例如:山东")
|
||||||
|
parser.add_argument("--city", default="", help="仅采集指定城市,例如:聊城 / 聊城市")
|
||||||
|
parser.add_argument(
|
||||||
|
"--areas",
|
||||||
|
default="",
|
||||||
|
help="指定案件类型,逗号分隔;不传时自动发现顶级类型并追加不限",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--limit-cities",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="仅处理前 N 个城市,0 表示不限",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--page-size",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_PAGE_SIZE,
|
||||||
|
help=f"每次列表请求条数,默认 {DEFAULT_PAGE_SIZE}",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-pages-per-query",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_MAX_PAGES,
|
||||||
|
help=f"单城市单类型最大翻页数,默认 {DEFAULT_MAX_PAGES}",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--stop-zero-new-pages",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_STOP_ZERO_NEW_PAGES,
|
||||||
|
help=f"连续多少页无新增就停止当前查询,默认 {DEFAULT_STOP_ZERO_NEW_PAGES}",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--sleep-seconds",
|
||||||
|
type=float,
|
||||||
|
default=DEFAULT_SLEEP_SECONDS,
|
||||||
|
help=f"请求间隔秒数,默认 {DEFAULT_SLEEP_SECONDS}",
|
||||||
|
)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
class BaiduLvlinSpider:
|
||||||
|
def __init__(self, db_connection: Db, args: argparse.Namespace):
|
||||||
|
self.db = db_connection
|
||||||
|
self.args = args
|
||||||
|
self.page_size = max(1, int(args.page_size or DEFAULT_PAGE_SIZE))
|
||||||
|
self.max_pages_per_query = max(1, int(args.max_pages_per_query or DEFAULT_MAX_PAGES))
|
||||||
|
self.stop_zero_new_pages = max(1, int(args.stop_zero_new_pages or DEFAULT_STOP_ZERO_NEW_PAGES))
|
||||||
|
self.sleep_seconds = max(0.0, float(args.sleep_seconds or 0.0))
|
||||||
|
self.proxy_enabled = False
|
||||||
|
self.session = self._build_session()
|
||||||
|
self.existing_urls = self._load_existing_urls()
|
||||||
|
self.cities = self._load_cities()
|
||||||
|
self.areas = self._load_areas()
|
||||||
|
self.inserted_count = 0
|
||||||
|
|
||||||
|
def _build_session(self) -> requests.Session:
|
||||||
|
report_proxy_status()
|
||||||
|
session = requests.Session()
|
||||||
|
session.trust_env = False
|
||||||
|
proxies = get_proxies()
|
||||||
|
if proxies:
|
||||||
|
session.proxies.update(proxies)
|
||||||
|
self.proxy_enabled = True
|
||||||
|
else:
|
||||||
|
session.proxies.clear()
|
||||||
|
self.proxy_enabled = False
|
||||||
|
|
||||||
|
retries = Retry(
|
||||||
|
total=3,
|
||||||
|
backoff_factor=1,
|
||||||
|
status_forcelist=(429, 500, 502, 503, 504),
|
||||||
|
allowed_methods=frozenset(["GET"]),
|
||||||
|
raise_on_status=False,
|
||||||
|
)
|
||||||
|
adapter = HTTPAdapter(max_retries=retries)
|
||||||
|
session.mount("https://", adapter)
|
||||||
|
session.mount("http://", adapter)
|
||||||
|
session.headers.update(
|
||||||
|
{
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/123.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
"Accept": "application/json, text/plain, */*",
|
||||||
|
"Referer": f"{BASE_URL}/pc/r?vn=law",
|
||||||
|
"Connection": "close",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return session
|
||||||
|
|
||||||
|
def _disable_proxy(self) -> None:
|
||||||
|
if not self.proxy_enabled:
|
||||||
|
return
|
||||||
|
self.session.proxies.clear()
|
||||||
|
self.proxy_enabled = False
|
||||||
|
print(f"[{DOMAIN}] 代理不可用,已切换直连")
|
||||||
|
|
||||||
|
def _sleep(self) -> None:
|
||||||
|
if self.sleep_seconds > 0:
|
||||||
|
time.sleep(self.sleep_seconds)
|
||||||
|
|
||||||
|
def _get_json(self, url: str, params: Optional[Dict[str, object]] = None, referer: str = "") -> Dict:
|
||||||
|
headers = {}
|
||||||
|
if referer:
|
||||||
|
headers["Referer"] = referer
|
||||||
|
try:
|
||||||
|
resp = self.session.get(url, params=params or {}, timeout=20, headers=headers)
|
||||||
|
except requests.exceptions.ProxyError:
|
||||||
|
self._disable_proxy()
|
||||||
|
resp = self.session.get(url, params=params or {}, timeout=20, headers=headers)
|
||||||
|
try:
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()
|
||||||
|
finally:
|
||||||
|
resp.close()
|
||||||
|
|
||||||
|
def _load_existing_urls(self) -> Set[str]:
|
||||||
|
urls: Set[str] = set()
|
||||||
|
cursor = self.db.db.cursor()
|
||||||
|
try:
|
||||||
|
cursor.execute("SELECT url FROM lawyer WHERE domain=%s AND url IS NOT NULL", (DOMAIN,))
|
||||||
|
for row in cursor.fetchall():
|
||||||
|
url = (row[0] or "").strip()
|
||||||
|
if url:
|
||||||
|
urls.add(url)
|
||||||
|
finally:
|
||||||
|
cursor.close()
|
||||||
|
print(f"[{DOMAIN}] 已存在 URL 数: {len(urls)}")
|
||||||
|
return urls
|
||||||
|
|
||||||
|
def _normalize_city_name(self, city_name: str) -> str:
|
||||||
|
text = str(city_name or "").strip()
|
||||||
|
if text.endswith("市"):
|
||||||
|
return text[:-1]
|
||||||
|
return text
|
||||||
|
|
||||||
|
def _city_matches(self, expected_city: str, actual_city: str) -> bool:
|
||||||
|
left = self._normalize_city_name(expected_city)
|
||||||
|
right = self._normalize_city_name(actual_city)
|
||||||
|
if not left or not right:
|
||||||
|
return False
|
||||||
|
return left == right
|
||||||
|
|
||||||
|
def _load_cities(self) -> List[Dict[str, str]]:
|
||||||
|
payload = self._get_json(CITY_API, params={"vn": "law"}, referer=f"{BASE_URL}/pc/r?vn=law")
|
||||||
|
all_city_list = payload.get("data", {}).get("AllCityList", []) or []
|
||||||
|
cities: List[Dict[str, str]] = []
|
||||||
|
province_filter = self.args.province.strip()
|
||||||
|
city_filter = self._normalize_city_name(self.args.city)
|
||||||
|
|
||||||
|
for block in all_city_list:
|
||||||
|
for item in block.get("cityList", []) or []:
|
||||||
|
city_name = str(item.get("name") or "").strip()
|
||||||
|
province = str(item.get("province") or "").strip()
|
||||||
|
city_code = str(item.get("code") or "").strip()
|
||||||
|
if not city_name or not province or not city_code:
|
||||||
|
continue
|
||||||
|
if province_filter and province != province_filter:
|
||||||
|
continue
|
||||||
|
if city_filter and self._normalize_city_name(city_name) != city_filter:
|
||||||
|
continue
|
||||||
|
cities.append(
|
||||||
|
{
|
||||||
|
"province": province,
|
||||||
|
"city": city_name,
|
||||||
|
"city_code": city_code,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
cities.sort(key=lambda item: (item["province"], item["city"]))
|
||||||
|
if self.args.limit_cities and self.args.limit_cities > 0:
|
||||||
|
cities = cities[: self.args.limit_cities]
|
||||||
|
print(f"[{DOMAIN}] 本次待采城市数: {len(cities)}")
|
||||||
|
return cities
|
||||||
|
|
||||||
|
def _discover_top_level_areas(self) -> List[str]:
|
||||||
|
sample_city = self.cities[0]["city"] if self.cities else "北京"
|
||||||
|
payload = self._get_json(
|
||||||
|
LIST_API,
|
||||||
|
params={
|
||||||
|
"city_name": sample_city,
|
||||||
|
"page_num": 1,
|
||||||
|
"page_size": self.page_size,
|
||||||
|
"ts": int(time.time()),
|
||||||
|
"clientType": "pc",
|
||||||
|
"list_type": 1,
|
||||||
|
},
|
||||||
|
referer=f"{BASE_URL}/pc/r?vn=law",
|
||||||
|
)
|
||||||
|
filters = payload.get("data", {}).get("filters", []) or []
|
||||||
|
areas: List[str] = ["不限"]
|
||||||
|
seen = {"不限"}
|
||||||
|
for item in filters:
|
||||||
|
if item.get("key") != "type":
|
||||||
|
continue
|
||||||
|
for option in item.get("options", []) or []:
|
||||||
|
value = str(option.get("value") or "").strip()
|
||||||
|
if not value or value in seen:
|
||||||
|
continue
|
||||||
|
seen.add(value)
|
||||||
|
areas.append(value)
|
||||||
|
return areas
|
||||||
|
|
||||||
|
def _load_areas(self) -> List[str]:
|
||||||
|
if self.args.areas.strip():
|
||||||
|
areas = [part.strip() for part in self.args.areas.split(",") if part.strip()]
|
||||||
|
unique: List[str] = []
|
||||||
|
seen = set()
|
||||||
|
for area in areas:
|
||||||
|
if area not in seen:
|
||||||
|
seen.add(area)
|
||||||
|
unique.append(area)
|
||||||
|
print(f"[{DOMAIN}] 使用指定案件类型: {unique}")
|
||||||
|
return unique
|
||||||
|
|
||||||
|
areas = self._discover_top_level_areas()
|
||||||
|
print(f"[{DOMAIN}] 自动发现案件类型: {areas}")
|
||||||
|
return areas
|
||||||
|
|
||||||
|
def _build_pc_detail_url(self, qc_no: str, rs_id: str) -> str:
|
||||||
|
return f"{BASE_URL}/pc/lawyer?vn=law&qc_no={qc_no}&rs_id={rs_id}"
|
||||||
|
|
||||||
|
def _build_list_page_url(self, city_name: str, area_name: str) -> str:
|
||||||
|
params = {"city": city_name, "vn": "law"}
|
||||||
|
if area_name and area_name != "不限":
|
||||||
|
params["expertiseArea"] = area_name
|
||||||
|
return f"{BASE_URL}/pc/r?{urlencode(params)}"
|
||||||
|
|
||||||
|
def _fetch_list(self, city_name: str, area_name: str, page_num: int) -> List[Dict]:
|
||||||
|
params: Dict[str, object] = {
|
||||||
|
"city_name": city_name,
|
||||||
|
"page_num": page_num,
|
||||||
|
"page_size": self.page_size,
|
||||||
|
"ts": int(time.time()),
|
||||||
|
"clientType": "pc",
|
||||||
|
"list_type": 1,
|
||||||
|
}
|
||||||
|
if area_name and area_name != "不限":
|
||||||
|
params["expertiseArea"] = area_name
|
||||||
|
payload = self._get_json(
|
||||||
|
LIST_API,
|
||||||
|
params=params,
|
||||||
|
referer=self._build_list_page_url(city_name, area_name),
|
||||||
|
)
|
||||||
|
return payload.get("data", {}).get("lawyer_list", []) or []
|
||||||
|
|
||||||
|
def _fetch_detail(self, qc_no: str, rs_id: str) -> Dict:
|
||||||
|
payload = self._get_json(
|
||||||
|
DETAIL_API,
|
||||||
|
params={"vn": "law", "qc_no": qc_no, "rs_id": rs_id},
|
||||||
|
referer=self._build_pc_detail_url(qc_no, rs_id),
|
||||||
|
)
|
||||||
|
return payload.get("data", {}).get("lawyer", {}) or {}
|
||||||
|
|
||||||
|
def _extract_phone(self, detail: Dict) -> Optional[str]:
|
||||||
|
for service in detail.get("lawyer_service", []) or []:
|
||||||
|
phone = str(service.get("phone_num") or "").strip()
|
||||||
|
if phone:
|
||||||
|
return phone
|
||||||
|
for service in detail.get("lawyer_service_new", []) or []:
|
||||||
|
phone = str(service.get("phone_num") or "").strip()
|
||||||
|
if phone:
|
||||||
|
return phone
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _safe_json(self, payload: Dict) -> str:
|
||||||
|
return json.dumps(payload, ensure_ascii=False)
|
||||||
|
|
||||||
|
def _build_record(
|
||||||
|
self,
|
||||||
|
city_info: Dict[str, str],
|
||||||
|
area_name: str,
|
||||||
|
page_num: int,
|
||||||
|
list_item: Dict,
|
||||||
|
detail: Dict,
|
||||||
|
) -> Dict[str, object]:
|
||||||
|
qc_no = str(list_item.get("qc_no") or detail.get("qc_no") or "").strip()
|
||||||
|
rs_id = str(list_item.get("rs_id") or detail.get("rs_id") or "").strip()
|
||||||
|
detail_url = self._build_pc_detail_url(qc_no, rs_id)
|
||||||
|
name = str(detail.get("lawyer_name") or list_item.get("lawyer_name") or "").strip()
|
||||||
|
law_firm = str(detail.get("practice_company") or list_item.get("practice_company") or "").strip()
|
||||||
|
city_name = str(list_item.get("city") or city_info.get("city") or "").strip()
|
||||||
|
avatar_url = str(detail.get("lawyer_avatar_big") or detail.get("lawyer_avatar") or list_item.get("lawyer_avatar_big") or list_item.get("lawyer_avatar") or "").strip()
|
||||||
|
phone = self._extract_phone(detail)
|
||||||
|
|
||||||
|
params = {
|
||||||
|
"source": {
|
||||||
|
"site": "baidu_lvlin",
|
||||||
|
"city_name": city_info.get("city"),
|
||||||
|
"city_code": city_info.get("city_code"),
|
||||||
|
"province": city_info.get("province"),
|
||||||
|
"expertise_area": area_name,
|
||||||
|
"page_num": page_num,
|
||||||
|
"list_url": self._build_list_page_url(city_info.get("city", ""), area_name),
|
||||||
|
"detail_url": detail_url,
|
||||||
|
"list_api": LIST_API,
|
||||||
|
"detail_api": DETAIL_API,
|
||||||
|
},
|
||||||
|
"list_item": list_item,
|
||||||
|
"detail": detail,
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"name": name or None,
|
||||||
|
"phone": phone or None,
|
||||||
|
"law_firm": law_firm or None,
|
||||||
|
"province": city_info.get("province") or None,
|
||||||
|
"city": city_name or city_info.get("city") or None,
|
||||||
|
"url": detail_url,
|
||||||
|
"avatar_url": avatar_url or None,
|
||||||
|
"domain": DOMAIN,
|
||||||
|
"create_time": int(time.time()),
|
||||||
|
"site_time": None,
|
||||||
|
"params": self._safe_json(params),
|
||||||
|
}
|
||||||
|
|
||||||
|
def _insert_record(self, record: Dict[str, object]) -> bool:
|
||||||
|
url = str(record.get("url") or "").strip()
|
||||||
|
if not url or url in self.existing_urls:
|
||||||
|
return False
|
||||||
|
self.db.insert_data("lawyer", record)
|
||||||
|
self.existing_urls.add(url)
|
||||||
|
self.inserted_count += 1
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _iter_city_area(self, city_info: Dict[str, str], area_name: str) -> Tuple[int, int]:
|
||||||
|
inserted = 0
|
||||||
|
pages = 0
|
||||||
|
zero_new_pages = 0
|
||||||
|
city_name = city_info["city"]
|
||||||
|
|
||||||
|
for page_num in range(1, self.max_pages_per_query + 1):
|
||||||
|
pages = page_num
|
||||||
|
try:
|
||||||
|
items = self._fetch_list(city_name, area_name, page_num)
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"[{DOMAIN}] 列表请求失败 {city_name}-{area_name}-p{page_num}: {exc}")
|
||||||
|
break
|
||||||
|
|
||||||
|
if not items:
|
||||||
|
print(f"[{DOMAIN}] {city_name}-{area_name} 第 {page_num} 页无数据,停止")
|
||||||
|
break
|
||||||
|
|
||||||
|
page_new = 0
|
||||||
|
for item in items:
|
||||||
|
qc_no = str(item.get("qc_no") or "").strip()
|
||||||
|
rs_id = str(item.get("rs_id") or "").strip()
|
||||||
|
actual_city = str(item.get("city") or "").strip()
|
||||||
|
if not qc_no or not rs_id:
|
||||||
|
continue
|
||||||
|
if actual_city and not self._city_matches(city_name, actual_city):
|
||||||
|
continue
|
||||||
|
|
||||||
|
detail_url = self._build_pc_detail_url(qc_no, rs_id)
|
||||||
|
if detail_url in self.existing_urls:
|
||||||
|
continue
|
||||||
|
|
||||||
|
detail: Dict = {}
|
||||||
|
try:
|
||||||
|
detail = self._fetch_detail(qc_no, rs_id)
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"[{DOMAIN}] 详情请求失败 {qc_no}-{rs_id}: {exc}")
|
||||||
|
|
||||||
|
record = self._build_record(city_info, area_name, page_num, item, detail)
|
||||||
|
try:
|
||||||
|
if self._insert_record(record):
|
||||||
|
page_new += 1
|
||||||
|
inserted += 1
|
||||||
|
print(
|
||||||
|
f"[{DOMAIN}] -> 新增 {record.get('name') or qc_no} "
|
||||||
|
f"| {city_name} | {area_name} | p{page_num}"
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"[{DOMAIN}] 插入失败 {record.get('url')}: {exc}")
|
||||||
|
self._sleep()
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"[{DOMAIN}] {city_name} | {area_name} | p{page_num} "
|
||||||
|
f"| 列表 {len(items)} | 新增 {page_new}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(items) < self.page_size:
|
||||||
|
break
|
||||||
|
if page_new == 0:
|
||||||
|
zero_new_pages += 1
|
||||||
|
if zero_new_pages >= self.stop_zero_new_pages:
|
||||||
|
print(
|
||||||
|
f"[{DOMAIN}] {city_name}-{area_name} 连续 {zero_new_pages} 页无新增,停止"
|
||||||
|
)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
zero_new_pages = 0
|
||||||
|
|
||||||
|
self._sleep()
|
||||||
|
|
||||||
|
return inserted, pages
|
||||||
|
|
||||||
|
def run(self) -> None:
|
||||||
|
print(f"[{DOMAIN}] 启动采集")
|
||||||
|
if not self.cities:
|
||||||
|
print(f"[{DOMAIN}] 无可采城市")
|
||||||
|
return
|
||||||
|
if not self.areas:
|
||||||
|
print(f"[{DOMAIN}] 无可采案件类型")
|
||||||
|
return
|
||||||
|
|
||||||
|
total_queries = len(self.cities) * len(self.areas)
|
||||||
|
query_index = 0
|
||||||
|
for city_info in self.cities:
|
||||||
|
city_inserted = 0
|
||||||
|
for area_name in self.areas:
|
||||||
|
query_index += 1
|
||||||
|
print(
|
||||||
|
f"[{DOMAIN}] 进度 {query_index}/{total_queries} | "
|
||||||
|
f"{city_info['province']}-{city_info['city']} | {area_name}"
|
||||||
|
)
|
||||||
|
inserted, pages = self._iter_city_area(city_info, area_name)
|
||||||
|
city_inserted += inserted
|
||||||
|
print(
|
||||||
|
f"[{DOMAIN}] 完成 {city_info['city']} | {area_name} "
|
||||||
|
f"| 翻页 {pages} | 新增 {inserted}"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"[{DOMAIN}] 城市完成 {city_info['province']}-{city_info['city']} "
|
||||||
|
f"| 本城新增 {city_inserted} | 总新增 {self.inserted_count}"
|
||||||
|
)
|
||||||
|
print(f"[{DOMAIN}] 采集完成,总新增 {self.inserted_count}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
cli_args = parse_args()
|
||||||
|
with Db() as db:
|
||||||
|
spider = BaiduLvlinSpider(db, cli_args)
|
||||||
|
spider.run()
|
||||||
+186
-287
@@ -1,14 +1,9 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from typing import Dict, List, Optional, Set, Tuple
|
import random
|
||||||
from urllib.parse import urljoin
|
from typing import Dict, Optional
|
||||||
|
|
||||||
import urllib3
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
project_root = os.path.dirname(current_dir)
|
project_root = os.path.dirname(current_dir)
|
||||||
@@ -18,144 +13,191 @@ if request_dir not in sys.path:
|
|||||||
if project_root not in sys.path:
|
if project_root not in sys.path:
|
||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
from Db import Db
|
import requests
|
||||||
from request.requests_client import (
|
from requests.adapters import HTTPAdapter
|
||||||
RequestClientError,
|
from urllib3.util.retry import Retry
|
||||||
RequestConnectTimeout,
|
import urllib3
|
||||||
RequestConnectionError,
|
from bs4 import BeautifulSoup
|
||||||
RequestTimeout,
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
RequestsClient,
|
|
||||||
)
|
|
||||||
from utils.rate_limiter import wait_for_request
|
|
||||||
|
|
||||||
|
# 禁用 SSL 警告
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
|
from Db import Db
|
||||||
|
from utils.rate_limiter import request_slot
|
||||||
|
|
||||||
DOMAIN = "大律师"
|
DOMAIN = "大律师"
|
||||||
SITE_BASE = "https://m.maxlaw.cn"
|
LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
|
||||||
LIST_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}"
|
_PROXY_TESTED = False
|
||||||
PHONE_PATTERN = re.compile(r"1[3-9]\d{9}")
|
|
||||||
MAX_PAGES_PER_CITY = int(os.getenv("MAXLAW_MAX_PAGES", "0"))
|
|
||||||
PROXY_TESTED = False
|
|
||||||
|
|
||||||
|
|
||||||
class DlsSpider:
|
class DlsSpider:
|
||||||
def __init__(self, db_connection):
|
def __init__(self, db_connection):
|
||||||
self.db = db_connection
|
self.db = db_connection
|
||||||
self.client = self._build_client()
|
self.session = self._build_session()
|
||||||
self.areas = self._load_areas()
|
self.areas = self._load_areas()
|
||||||
|
|
||||||
def _build_client(self) -> RequestsClient:
|
def _build_session(self) -> requests.Session:
|
||||||
client = RequestsClient(
|
"""构建带重试机制的 session"""
|
||||||
headers={
|
report_proxy_status()
|
||||||
"User-Agent": (
|
s = requests.Session()
|
||||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
s.trust_env = False
|
||||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
proxies = get_proxies()
|
||||||
"Mobile/15E148 Safari/604.1"
|
if proxies:
|
||||||
),
|
s.proxies.update(proxies)
|
||||||
"Host": "m.maxlaw.cn",
|
else:
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
s.proxies.clear()
|
||||||
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
self._proxy_test(s, proxies)
|
||||||
"Connection": "close",
|
# 配置重试策略
|
||||||
},
|
retries = Retry(
|
||||||
retry_total=3,
|
total=3, # 总共重试3次
|
||||||
retry_backoff_factor=1,
|
backoff_factor=1, # 重试间隔:1s, 2s, 4s
|
||||||
retry_status_forcelist=(429, 500, 502, 503, 504),
|
status_forcelist=(429, 500, 502, 503, 504), # 对这些状态码进行重试
|
||||||
retry_allowed_methods=("GET", "POST"),
|
allowed_methods=frozenset(["GET", "POST"]),
|
||||||
|
raise_on_status=False # 不立即抛出异常,让代码处理
|
||||||
)
|
)
|
||||||
self._proxy_test(client, client.proxies or None)
|
adapter = HTTPAdapter(max_retries=retries)
|
||||||
return client
|
s.mount("https://", adapter)
|
||||||
|
s.mount("http://", adapter)
|
||||||
|
s.headers.update({
|
||||||
|
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
|
||||||
|
"Host": "m.maxlaw.cn",
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
||||||
|
"Connection": "close",
|
||||||
|
})
|
||||||
|
return s
|
||||||
|
|
||||||
def _refresh_client(self) -> None:
|
def _refresh_session(self) -> None:
|
||||||
self.client.refresh()
|
try:
|
||||||
self._proxy_test(self.client, self.client.proxies or None)
|
self.session.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self.session = self._build_session()
|
||||||
|
|
||||||
def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None:
|
def _proxy_test(self, session: requests.Session, proxies: Optional[Dict[str, str]]) -> None:
|
||||||
global PROXY_TESTED
|
global _PROXY_TESTED
|
||||||
if PROXY_TESTED or not os.getenv("PROXY_TEST"):
|
if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
|
||||||
return
|
return
|
||||||
PROXY_TESTED = True
|
_PROXY_TESTED = True
|
||||||
if not proxies:
|
if not proxies:
|
||||||
print("[proxy] test skipped: no proxy configured")
|
print("[proxy] test skipped: no proxy configured")
|
||||||
return
|
return
|
||||||
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
|
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
|
||||||
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
|
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
|
||||||
try:
|
try:
|
||||||
resp = client.get_text(test_url, timeout=timeout, headers={"Connection": "close"})
|
with request_slot():
|
||||||
|
resp = session.get(
|
||||||
|
test_url,
|
||||||
|
timeout=timeout,
|
||||||
|
headers={"Connection": "close"},
|
||||||
|
)
|
||||||
print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
|
print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"[proxy] test failed: {exc}")
|
print(f"[proxy] test failed: {exc}")
|
||||||
|
|
||||||
def _load_areas(self) -> List[Dict[str, str]]:
|
def _load_areas(self):
|
||||||
tables = ("area_new", "area2", "area")
|
try:
|
||||||
last_error = None
|
return self.db.select_data(
|
||||||
for table in tables:
|
"area_new",
|
||||||
try:
|
"province, city, pinyin",
|
||||||
rows = self.db.select_data(table, "province, city, pinyin", "domain='maxlaw'") or []
|
"domain='maxlaw'"
|
||||||
except Exception as exc:
|
) or []
|
||||||
last_error = exc
|
except Exception as exc:
|
||||||
continue
|
print(f"加载地区失败: {exc}")
|
||||||
if rows:
|
return []
|
||||||
missing_pinyin = sum(1 for row in rows if not (row.get("pinyin") or "").strip())
|
|
||||||
print(f"[大律师] 地区来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
|
|
||||||
return rows
|
|
||||||
if last_error:
|
|
||||||
print(f"[大律师] 加载地区失败: {last_error}")
|
|
||||||
print("[大律师] 无地区数据(已尝试 area_new/area2/area)")
|
|
||||||
return []
|
|
||||||
|
|
||||||
def _get(
|
def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
|
||||||
self,
|
"""发送 GET 请求,带重试机制"""
|
||||||
url: str,
|
|
||||||
*,
|
|
||||||
headers: Optional[Dict[str, str]] = None,
|
|
||||||
max_retries: int = 3,
|
|
||||||
timeout: Tuple[int, int] = (10, 30),
|
|
||||||
) -> Optional[str]:
|
|
||||||
wait_for_request()
|
|
||||||
for attempt in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
|
# 使用更长的超时时间,分别设置连接和读取超时
|
||||||
if resp.status_code == 403:
|
with request_slot():
|
||||||
|
resp = self.session.get(
|
||||||
|
url,
|
||||||
|
timeout=(10, 30), # (connect_timeout, read_timeout)
|
||||||
|
verify=False,
|
||||||
|
headers=headers,
|
||||||
|
)
|
||||||
|
status_code = resp.status_code
|
||||||
|
content = resp.text
|
||||||
|
resp.close()
|
||||||
|
if status_code == 403:
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
wait_time = (2 ** attempt) + random.uniform(0.3, 1.0)
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
print(f"请求403,{wait_time:.2f}s后重试 ({attempt + 1}/{max_retries}): {url}")
|
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
||||||
self._refresh_client()
|
self._refresh_session()
|
||||||
time.sleep(wait_time)
|
time.sleep(wait_time)
|
||||||
continue
|
continue
|
||||||
print(f"请求失败 {url}: 403 Forbidden")
|
print(f"请求失败 {url}: 403 Forbidden")
|
||||||
return None
|
return None
|
||||||
if resp.status_code >= 400:
|
if status_code >= 400:
|
||||||
raise RequestClientError(f"{resp.status_code} Error: {url}")
|
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
|
||||||
return resp.text
|
return content
|
||||||
except RequestConnectTimeout as exc:
|
except requests.exceptions.ConnectTimeout as exc:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2 ** attempt # 指数退避:2s, 4s, 8s
|
||||||
|
print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
||||||
|
time.sleep(wait_time)
|
||||||
|
else:
|
||||||
|
print(f"连接超时,已达到最大重试次数 {url}: {exc}")
|
||||||
|
return None
|
||||||
|
except requests.exceptions.Timeout as exc:
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
wait_time = 2 ** attempt
|
wait_time = 2 ** attempt
|
||||||
print(f"连接超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
|
print(f"请求超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
||||||
time.sleep(wait_time)
|
time.sleep(wait_time)
|
||||||
continue
|
else:
|
||||||
print(f"连接超时,已达到最大重试次数 {url}: {exc}")
|
print(f"请求超时,已达到最大重试次数 {url}: {exc}")
|
||||||
return None
|
return None
|
||||||
except RequestTimeout as exc:
|
except requests.exceptions.ConnectionError as exc:
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
wait_time = 2 ** attempt
|
wait_time = 2 ** attempt
|
||||||
print(f"请求超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
|
print(f"连接错误,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
||||||
time.sleep(wait_time)
|
time.sleep(wait_time)
|
||||||
continue
|
else:
|
||||||
print(f"请求超时,已达到最大重试次数 {url}: {exc}")
|
print(f"连接错误,已达到最大重试次数 {url}: {exc}")
|
||||||
return None
|
return None
|
||||||
except RequestConnectionError as exc:
|
except requests.exceptions.RequestException as exc:
|
||||||
if attempt < max_retries - 1:
|
|
||||||
wait_time = 2 ** attempt
|
|
||||||
print(f"连接错误,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
||||||
time.sleep(wait_time)
|
|
||||||
continue
|
|
||||||
print(f"连接错误,已达到最大重试次数 {url}: {exc}")
|
|
||||||
return None
|
|
||||||
except RequestClientError as exc:
|
|
||||||
print(f"请求失败 {url}: {exc}")
|
print(f"请求失败 {url}: {exc}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
cards = soup.find_all("div", class_="lstx")
|
||||||
|
if not cards:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
inserted = 0
|
||||||
|
for card in cards:
|
||||||
|
link = card.find("a")
|
||||||
|
if not link or not link.get("href"):
|
||||||
|
continue
|
||||||
|
detail = self._parse_detail(link['href'], province, city, list_url)
|
||||||
|
if not detail:
|
||||||
|
continue
|
||||||
|
phone = detail.get("phone")
|
||||||
|
if not phone:
|
||||||
|
continue
|
||||||
|
condition = f"phone='{phone}' and domain='{DOMAIN}'"
|
||||||
|
if self.db.is_data_exist("lawyer", condition):
|
||||||
|
print(f" -- 已存在: {detail['name']} ({phone})")
|
||||||
|
time.sleep(0.3)
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
self.db.insert_data("lawyer", detail)
|
||||||
|
inserted += 1
|
||||||
|
print(f" -> 新增: {detail['name']} ({phone})")
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" 插入失败: {exc}")
|
||||||
|
time.sleep(1)
|
||||||
|
time.sleep(0.3)
|
||||||
|
# 列表页结束后再缓一缓,降低风控
|
||||||
|
time.sleep(0.6)
|
||||||
|
return inserted
|
||||||
|
|
||||||
def _detail_headers(self, referer: str) -> Dict[str, str]:
|
def _detail_headers(self, referer: str) -> Dict[str, str]:
|
||||||
return {
|
return {
|
||||||
"Referer": referer,
|
"Referer": referer,
|
||||||
@@ -166,215 +208,72 @@ class DlsSpider:
|
|||||||
"Upgrade-Insecure-Requests": "1",
|
"Upgrade-Insecure-Requests": "1",
|
||||||
}
|
}
|
||||||
|
|
||||||
def _extract_detail_urls(self, html: str) -> List[str]:
|
def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
url = f"https://m.maxlaw.cn{path}"
|
||||||
urls: List[str] = []
|
print(f" 详情: {url}")
|
||||||
seen: Set[str] = set()
|
html = self._get(url, headers=self._detail_headers(list_url))
|
||||||
|
|
||||||
# 主选择器:当前站点列表卡片
|
|
||||||
for a_tag in soup.select("div.lstx a[href]"):
|
|
||||||
href = (a_tag.get("href") or "").strip()
|
|
||||||
if not href:
|
|
||||||
continue
|
|
||||||
url = urljoin(SITE_BASE, href)
|
|
||||||
if url in seen:
|
|
||||||
continue
|
|
||||||
seen.add(url)
|
|
||||||
urls.append(url)
|
|
||||||
|
|
||||||
# 回退选择器:页面结构轻微变化时尽量保活
|
|
||||||
if not urls:
|
|
||||||
for a_tag in soup.select("a[href]"):
|
|
||||||
href = (a_tag.get("href") or "").strip()
|
|
||||||
if "/lawyer/" not in href:
|
|
||||||
continue
|
|
||||||
url = urljoin(SITE_BASE, href)
|
|
||||||
if url in seen:
|
|
||||||
continue
|
|
||||||
seen.add(url)
|
|
||||||
urls.append(url)
|
|
||||||
return urls
|
|
||||||
|
|
||||||
def _extract_name(self, soup: BeautifulSoup) -> str:
|
|
||||||
for selector in ("h2.lawyerName", "h1.lawyerName", "h1", "h2"):
|
|
||||||
tag = soup.select_one(selector)
|
|
||||||
if tag:
|
|
||||||
name = tag.get_text(strip=True)
|
|
||||||
if name:
|
|
||||||
return name
|
|
||||||
title = soup.title.get_text(strip=True) if soup.title else ""
|
|
||||||
match = re.search(r"(\S+律师)", title)
|
|
||||||
return match.group(1) if match else ""
|
|
||||||
|
|
||||||
def _extract_law_firm(self, soup: BeautifulSoup) -> str:
|
|
||||||
for selector in ("p.law-firm", "div.law-firm", "p[class*=firm]"):
|
|
||||||
tag = soup.select_one(selector)
|
|
||||||
if tag:
|
|
||||||
text = tag.get_text(strip=True)
|
|
||||||
if text:
|
|
||||||
return text
|
|
||||||
page_text = soup.get_text(" ", strip=True)
|
|
||||||
match = re.search(r"(执业机构|律所)\s*[::]?\s*([^\s,。,;;]{2,40})", page_text)
|
|
||||||
if match:
|
|
||||||
return match.group(2).strip()
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def _normalize_phone(self, text: str) -> str:
|
|
||||||
compact = re.sub(r"\D", "", text or "")
|
|
||||||
match = PHONE_PATTERN.search(compact)
|
|
||||||
return match.group(0) if match else ""
|
|
||||||
|
|
||||||
def _extract_phone(self, soup: BeautifulSoup) -> str:
|
|
||||||
contact = soup.select_one("ul.contact-content")
|
|
||||||
if contact:
|
|
||||||
phone = self._normalize_phone(contact.get_text(" ", strip=True))
|
|
||||||
if phone:
|
|
||||||
return phone
|
|
||||||
for selector in ("a[href^='tel:']", "span.phone", "p.phone"):
|
|
||||||
tag = soup.select_one(selector)
|
|
||||||
if tag:
|
|
||||||
phone = self._normalize_phone(tag.get_text(" ", strip=True))
|
|
||||||
if phone:
|
|
||||||
return phone
|
|
||||||
return self._normalize_phone(soup.get_text(" ", strip=True))
|
|
||||||
|
|
||||||
def _parse_detail(self, detail_url: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
|
|
||||||
print(f" 详情: {detail_url}")
|
|
||||||
html = self._get(detail_url, headers=self._detail_headers(list_url))
|
|
||||||
if not html:
|
if not html:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
name = self._extract_name(soup)
|
name_tag = soup.find("h2", class_="lawyerName")
|
||||||
phone = self._extract_phone(soup)
|
law_firm_tag = soup.find("p", class_="law-firm")
|
||||||
|
contact_list = soup.find("ul", class_="contact-content")
|
||||||
|
|
||||||
|
name = name_tag.get_text(strip=True) if name_tag else ""
|
||||||
|
law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
|
||||||
|
phone = ""
|
||||||
|
|
||||||
|
if contact_list:
|
||||||
|
items = contact_list.find_all("li")
|
||||||
|
if len(items) > 2:
|
||||||
|
phone_tag = items[2].find("p")
|
||||||
|
if phone_tag:
|
||||||
|
phone = phone_tag.get_text(strip=True)
|
||||||
|
phone = phone.split("咨询请说明来自大律师网")[0].strip()
|
||||||
|
|
||||||
|
phone = phone.replace('-', '').strip()
|
||||||
if not name or not phone:
|
if not name or not phone:
|
||||||
print(" 信息不完整,跳过")
|
print(" 信息不完整,跳过")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
safe_city = city or province
|
safe_city = city if city else province
|
||||||
return {
|
return {
|
||||||
"name": name,
|
"name": name,
|
||||||
"law_firm": self._extract_law_firm(soup),
|
"law_firm": law_firm,
|
||||||
"province": province,
|
"province": province,
|
||||||
"city": safe_city,
|
"city": safe_city,
|
||||||
"phone": phone,
|
"phone": phone,
|
||||||
"url": detail_url,
|
"url": url,
|
||||||
"domain": DOMAIN,
|
"domain": DOMAIN,
|
||||||
"create_time": int(time.time()),
|
"create_time": int(time.time()),
|
||||||
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False),
|
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
|
||||||
}
|
}
|
||||||
|
|
||||||
def _existing_phones(self, phones: List[str]) -> Set[str]:
|
|
||||||
if not phones:
|
|
||||||
return set()
|
|
||||||
existing: Set[str] = set()
|
|
||||||
cur = self.db.db.cursor()
|
|
||||||
try:
|
|
||||||
chunk_size = 500
|
|
||||||
for idx in range(0, len(phones), chunk_size):
|
|
||||||
chunk = phones[idx:idx + chunk_size]
|
|
||||||
placeholders = ",".join(["%s"] * len(chunk))
|
|
||||||
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
|
||||||
cur.execute(sql, [DOMAIN, *chunk])
|
|
||||||
for row in cur.fetchall():
|
|
||||||
existing.add(row[0])
|
|
||||||
finally:
|
|
||||||
cur.close()
|
|
||||||
return existing
|
|
||||||
|
|
||||||
def _save_lawyers(self, lawyers: List[Dict[str, str]]) -> Tuple[int, int]:
|
|
||||||
if not lawyers:
|
|
||||||
return 0, 0
|
|
||||||
phones = [row["phone"] for row in lawyers if row.get("phone")]
|
|
||||||
existing = self._existing_phones(phones)
|
|
||||||
inserted = 0
|
|
||||||
skipped = 0
|
|
||||||
|
|
||||||
for row in lawyers:
|
|
||||||
phone = row.get("phone", "")
|
|
||||||
if not phone:
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
if phone in existing:
|
|
||||||
skipped += 1
|
|
||||||
print(f" -- 已存在: {row.get('name', '')} ({phone})")
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
self.db.insert_data("lawyer", row)
|
|
||||||
existing.add(phone)
|
|
||||||
inserted += 1
|
|
||||||
print(f" -> 新增: {row.get('name', '')} ({phone})")
|
|
||||||
except Exception as exc:
|
|
||||||
skipped += 1
|
|
||||||
print(f" 插入失败 {row.get('url', '')}: {exc}")
|
|
||||||
return inserted, skipped
|
|
||||||
|
|
||||||
def _crawl_city(self, area: Dict[str, str]) -> Tuple[int, int]:
|
|
||||||
pinyin = (area.get("pinyin") or "").strip()
|
|
||||||
province = area.get("province", "")
|
|
||||||
city = area.get("city", "")
|
|
||||||
if not pinyin:
|
|
||||||
return 0, 0
|
|
||||||
|
|
||||||
total_inserted = 0
|
|
||||||
total_parsed = 0
|
|
||||||
page = 1
|
|
||||||
prev_fingerprint = ""
|
|
||||||
|
|
||||||
while True:
|
|
||||||
if MAX_PAGES_PER_CITY > 0 and page > MAX_PAGES_PER_CITY:
|
|
||||||
print(f"达到分页上限({MAX_PAGES_PER_CITY}),停止 {province}-{city}")
|
|
||||||
break
|
|
||||||
|
|
||||||
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
|
|
||||||
print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
|
|
||||||
html = self._get(list_url)
|
|
||||||
if not html:
|
|
||||||
break
|
|
||||||
|
|
||||||
detail_urls = self._extract_detail_urls(html)
|
|
||||||
if not detail_urls:
|
|
||||||
print(" 列表为空,结束当前城市")
|
|
||||||
break
|
|
||||||
|
|
||||||
fingerprint = "|".join(detail_urls[:8])
|
|
||||||
if fingerprint and fingerprint == prev_fingerprint:
|
|
||||||
print(" 列表页重复,提前停止当前城市")
|
|
||||||
break
|
|
||||||
prev_fingerprint = fingerprint
|
|
||||||
|
|
||||||
lawyers: List[Dict[str, str]] = []
|
|
||||||
for detail_url in detail_urls:
|
|
||||||
row = self._parse_detail(detail_url, province, city, list_url)
|
|
||||||
if row:
|
|
||||||
lawyers.append(row)
|
|
||||||
time.sleep(0.25)
|
|
||||||
|
|
||||||
inserted, skipped = self._save_lawyers(lawyers)
|
|
||||||
total_inserted += inserted
|
|
||||||
total_parsed += len(lawyers)
|
|
||||||
print(
|
|
||||||
f" 第 {page} 页完成: 列表{len(detail_urls)}条, "
|
|
||||||
f"解析{len(lawyers)}条, 新增{inserted}条, 跳过{skipped}条"
|
|
||||||
)
|
|
||||||
|
|
||||||
page += 1
|
|
||||||
time.sleep(0.5)
|
|
||||||
return total_inserted, total_parsed
|
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
print("启动大律师采集...")
|
print("启动大律师采集...")
|
||||||
if not self.areas:
|
if not self.areas:
|
||||||
print("无地区数据")
|
print("无地区数据")
|
||||||
return
|
return
|
||||||
|
|
||||||
all_inserted = 0
|
|
||||||
all_parsed = 0
|
|
||||||
for area in self.areas:
|
for area in self.areas:
|
||||||
inserted, parsed = self._crawl_city(area)
|
pinyin = area.get("pinyin")
|
||||||
all_inserted += inserted
|
province = area.get("province", "")
|
||||||
all_parsed += parsed
|
city = area.get("city", "")
|
||||||
print(f"大律师采集完成: 解析{all_parsed}条, 新增{all_inserted}条")
|
if not pinyin:
|
||||||
|
continue
|
||||||
|
page = 1
|
||||||
|
while True:
|
||||||
|
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
|
||||||
|
print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
|
||||||
|
html = self._get(list_url)
|
||||||
|
if not html:
|
||||||
|
break
|
||||||
|
inserted = self._parse_list(html, province, city, list_url)
|
||||||
|
if inserted == 0:
|
||||||
|
break
|
||||||
|
page += 1
|
||||||
|
print("大律师采集完成")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ if project_root not in sys.path:
|
|||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
from request.requests_client import RequestClientError, RequestsClient
|
from request.requests_client import RequestClientError, RequestsClient
|
||||||
from utils.rate_limiter import wait_for_request
|
from utils.rate_limiter import request_slot
|
||||||
from Db import Db
|
from Db import Db
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
@@ -107,9 +107,9 @@ class DlsFreshCrawler:
|
|||||||
def _get_text(self, url: str, timeout: int = 20, max_retries: int = 3) -> str:
|
def _get_text(self, url: str, timeout: int = 20, max_retries: int = 3) -> str:
|
||||||
last_error: Optional[Exception] = None
|
last_error: Optional[Exception] = None
|
||||||
for attempt in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
wait_for_request()
|
|
||||||
try:
|
try:
|
||||||
resp = self.client.get_text(url, timeout=timeout, verify=False)
|
with request_slot():
|
||||||
|
resp = self.client.get_text(url, timeout=timeout, verify=False)
|
||||||
code = resp.status_code
|
code = resp.status_code
|
||||||
if code == 403:
|
if code == 403:
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
|
|||||||
@@ -0,0 +1,438 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from typing import Dict, List, Optional, Set, Tuple
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
request_dir = os.path.join(project_root, "request")
|
||||||
|
if request_dir not in sys.path:
|
||||||
|
sys.path.insert(0, request_dir)
|
||||||
|
if project_root not in sys.path:
|
||||||
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
from urllib3.util.retry import Retry
|
||||||
|
import urllib3
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
|
from utils.rate_limiter import request_slot
|
||||||
|
from Db import Db
|
||||||
|
|
||||||
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
|
DOMAIN = "大律师"
|
||||||
|
SITE_BASE = "https://www.maxlaw.cn"
|
||||||
|
LIST_URL_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}"
|
||||||
|
PROVINCE_API = "https://js.maxlaw.cn/js/ajax/common/getprovice.js"
|
||||||
|
CITY_API_TEMPLATE = "https://js.maxlaw.cn/js/ajax/common/getcity_{province_id}.js"
|
||||||
|
|
||||||
|
PHONE_RE = re.compile(r"1[3-9]\d{9}")
|
||||||
|
REPLY_RE = re.compile(r"已回复[::]?\s*(\d+)")
|
||||||
|
AREA_PREFIX_RE = re.compile(r"^[A-Za-z]\s*")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_phone(text: str) -> str:
|
||||||
|
compact = re.sub(r"\D", "", text or "")
|
||||||
|
match = PHONE_RE.search(compact)
|
||||||
|
return match.group(0) if match else ""
|
||||||
|
|
||||||
|
|
||||||
|
def clean_area_name(text: str) -> str:
|
||||||
|
value = AREA_PREFIX_RE.sub("", (text or "").strip())
|
||||||
|
return value.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_region_text(text: str) -> str:
|
||||||
|
value = (text or "").strip()
|
||||||
|
value = value.replace("\xa0", " ")
|
||||||
|
value = value.replace("-", "-").replace("—", "-").replace("–", "-")
|
||||||
|
value = re.sub(r"\s*-\s*", "-", value)
|
||||||
|
value = re.sub(r"\s+", "", value)
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
class DlsPcSpider:
|
||||||
|
def __init__(self, db_connection):
|
||||||
|
self.db = db_connection
|
||||||
|
self.session = self._build_session()
|
||||||
|
self.max_pages = int(os.getenv("MAXLAW_PC_MAX_PAGES", "100"))
|
||||||
|
self.areas = self._load_areas()
|
||||||
|
|
||||||
|
def _build_session(self) -> requests.Session:
|
||||||
|
report_proxy_status()
|
||||||
|
session = requests.Session()
|
||||||
|
session.trust_env = False
|
||||||
|
proxies = get_proxies()
|
||||||
|
if proxies:
|
||||||
|
session.proxies.update(proxies)
|
||||||
|
else:
|
||||||
|
session.proxies.clear()
|
||||||
|
|
||||||
|
retries = Retry(
|
||||||
|
total=3,
|
||||||
|
backoff_factor=1,
|
||||||
|
status_forcelist=(429, 500, 502, 503, 504),
|
||||||
|
allowed_methods=frozenset(["GET"]),
|
||||||
|
raise_on_status=False,
|
||||||
|
)
|
||||||
|
adapter = HTTPAdapter(max_retries=retries)
|
||||||
|
session.mount("https://", adapter)
|
||||||
|
session.mount("http://", adapter)
|
||||||
|
session.headers.update({
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/136.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
||||||
|
"Connection": "close",
|
||||||
|
})
|
||||||
|
return session
|
||||||
|
|
||||||
|
def _refresh_session(self) -> None:
|
||||||
|
try:
|
||||||
|
self.session.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self.session = self._build_session()
|
||||||
|
|
||||||
|
def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
|
||||||
|
for attempt in range(max_retries):
|
||||||
|
try:
|
||||||
|
with request_slot():
|
||||||
|
resp = self.session.get(url, timeout=(10, 25), verify=False, headers=headers)
|
||||||
|
status_code = resp.status_code
|
||||||
|
text = resp.text
|
||||||
|
resp.close()
|
||||||
|
if status_code == 403:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
|
print(f"403被拦截,{wait_time:.1f}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
||||||
|
self._refresh_session()
|
||||||
|
time.sleep(wait_time)
|
||||||
|
continue
|
||||||
|
print(f"请求失败 {url}: 403 Forbidden")
|
||||||
|
return None
|
||||||
|
if status_code >= 400:
|
||||||
|
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
|
||||||
|
return text
|
||||||
|
except requests.exceptions.RequestException as exc:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
|
print(f"请求失败,{wait_time:.1f}秒后重试 ({attempt + 1}/{max_retries}): {url} -> {exc}")
|
||||||
|
time.sleep(wait_time)
|
||||||
|
continue
|
||||||
|
print(f"请求失败 {url}: {exc}")
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _get_json(self, url: str) -> Optional[Dict]:
|
||||||
|
text = self._get(url)
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return json.loads(text.strip().lstrip("\ufeff"))
|
||||||
|
except ValueError as exc:
|
||||||
|
print(f"解析JSON失败 {url}: {exc}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _load_areas(self) -> List[Dict[str, str]]:
|
||||||
|
areas = self._load_areas_from_site()
|
||||||
|
if areas:
|
||||||
|
print(f"[大律师PC] 地区来源: site, 地区数: {len(areas)}")
|
||||||
|
return areas
|
||||||
|
|
||||||
|
areas = self._load_areas_from_db()
|
||||||
|
if areas:
|
||||||
|
print(f"[大律师PC] 地区来源: db, 地区数: {len(areas)}")
|
||||||
|
return areas
|
||||||
|
|
||||||
|
print("[大律师PC] 无地区数据")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _load_areas_from_site(self) -> List[Dict[str, str]]:
|
||||||
|
data = self._get_json(PROVINCE_API)
|
||||||
|
if not data or str(data.get("status")) != "1":
|
||||||
|
return []
|
||||||
|
|
||||||
|
result: List[Dict[str, str]] = []
|
||||||
|
seen_pinyin: Set[str] = set()
|
||||||
|
|
||||||
|
for province in data.get("ds", []) or []:
|
||||||
|
province_id = province.get("id")
|
||||||
|
province_name = clean_area_name(province.get("name", ""))
|
||||||
|
province_pinyin = (province.get("py_code") or "").strip()
|
||||||
|
|
||||||
|
city_rows = []
|
||||||
|
if province_id:
|
||||||
|
city_data = self._get_json(CITY_API_TEMPLATE.format(province_id=province_id))
|
||||||
|
if city_data and str(city_data.get("status")) == "1":
|
||||||
|
city_rows = city_data.get("ds", []) or []
|
||||||
|
|
||||||
|
if not city_rows and province_pinyin and province_pinyin not in seen_pinyin:
|
||||||
|
seen_pinyin.add(province_pinyin)
|
||||||
|
result.append({
|
||||||
|
"province": province_name,
|
||||||
|
"city": province_name,
|
||||||
|
"pinyin": province_pinyin,
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
for city in city_rows:
|
||||||
|
city_name = clean_area_name(city.get("name", ""))
|
||||||
|
city_pinyin = (city.get("py_code") or "").strip()
|
||||||
|
if not city_pinyin or city_pinyin in seen_pinyin:
|
||||||
|
continue
|
||||||
|
seen_pinyin.add(city_pinyin)
|
||||||
|
result.append({
|
||||||
|
"province": province_name,
|
||||||
|
"city": city_name or province_name,
|
||||||
|
"pinyin": city_pinyin,
|
||||||
|
})
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _load_areas_from_db(self) -> List[Dict[str, str]]:
|
||||||
|
tables = ("area_new", "area", "area2")
|
||||||
|
last_error = None
|
||||||
|
for table in tables:
|
||||||
|
try:
|
||||||
|
rows = self.db.select_data(
|
||||||
|
table,
|
||||||
|
"province, city, pinyin",
|
||||||
|
"domain='maxlaw' AND level=2",
|
||||||
|
) or []
|
||||||
|
except Exception as exc:
|
||||||
|
last_error = exc
|
||||||
|
continue
|
||||||
|
|
||||||
|
if rows:
|
||||||
|
return rows
|
||||||
|
|
||||||
|
if last_error:
|
||||||
|
print(f"[大律师PC] 加载数据库地区失败: {last_error}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _existing_phones(self, phones: List[str]) -> Set[str]:
|
||||||
|
if not phones:
|
||||||
|
return set()
|
||||||
|
existing: Set[str] = set()
|
||||||
|
cur = self.db.db.cursor()
|
||||||
|
try:
|
||||||
|
chunk_size = 500
|
||||||
|
for i in range(0, len(phones), chunk_size):
|
||||||
|
chunk = phones[i:i + chunk_size]
|
||||||
|
placeholders = ",".join(["%s"] * len(chunk))
|
||||||
|
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
||||||
|
cur.execute(sql, [DOMAIN, *chunk])
|
||||||
|
for row in cur.fetchall():
|
||||||
|
existing.add(row[0])
|
||||||
|
finally:
|
||||||
|
cur.close()
|
||||||
|
return existing
|
||||||
|
|
||||||
|
def _build_list_url(self, pinyin: str, page: int) -> str:
|
||||||
|
return LIST_URL_TEMPLATE.format(pinyin=pinyin, page=page)
|
||||||
|
|
||||||
|
def _parse_location_line(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
fallback_province: str,
|
||||||
|
fallback_city: str,
|
||||||
|
) -> Tuple[str, str, str]:
|
||||||
|
raw = (text or "").replace("\xa0", " ")
|
||||||
|
raw = re.sub(r"\s+", " ", raw).strip()
|
||||||
|
if not raw:
|
||||||
|
return fallback_province, fallback_city or fallback_province, ""
|
||||||
|
|
||||||
|
parts = raw.split(" ", 1)
|
||||||
|
area_text = parts[0].strip()
|
||||||
|
law_firm = parts[1].strip() if len(parts) > 1 else ""
|
||||||
|
|
||||||
|
province = fallback_province
|
||||||
|
city = fallback_city or fallback_province
|
||||||
|
if "-" in area_text:
|
||||||
|
area_parts = [item.strip() for item in area_text.split("-", 1)]
|
||||||
|
if area_parts[0]:
|
||||||
|
province = area_parts[0]
|
||||||
|
if len(area_parts) > 1 and area_parts[1]:
|
||||||
|
city = area_parts[1]
|
||||||
|
elif area_text:
|
||||||
|
province = area_text
|
||||||
|
city = area_text
|
||||||
|
|
||||||
|
return province, city, law_firm
|
||||||
|
|
||||||
|
def _extract_page_region(self, soup: BeautifulSoup) -> str:
|
||||||
|
button = soup.select_one(".filter .filter-btn")
|
||||||
|
if button:
|
||||||
|
return normalize_region_text(button.get_text(" ", strip=True))
|
||||||
|
title = soup.select_one(".findLawyer-title h1")
|
||||||
|
if title:
|
||||||
|
return normalize_region_text(title.get_text(strip=True).replace("律师", ""))
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _page_matches_area(self, soup: BeautifulSoup, province: str, city: str) -> Tuple[bool, str]:
|
||||||
|
current_region = self._extract_page_region(soup)
|
||||||
|
if not current_region:
|
||||||
|
return True, current_region
|
||||||
|
if "全国" in current_region:
|
||||||
|
return False, current_region
|
||||||
|
|
||||||
|
norm_province = normalize_region_text(province)
|
||||||
|
norm_city = normalize_region_text(city or province)
|
||||||
|
|
||||||
|
if norm_city and norm_city != norm_province:
|
||||||
|
matched = norm_province in current_region and norm_city in current_region
|
||||||
|
else:
|
||||||
|
matched = norm_province in current_region
|
||||||
|
|
||||||
|
if matched:
|
||||||
|
return True, current_region
|
||||||
|
|
||||||
|
title = soup.select_one(".findLawyer-title h1")
|
||||||
|
title_text = ""
|
||||||
|
if title:
|
||||||
|
title_text = normalize_region_text(title.get_text(strip=True).replace("律师", ""))
|
||||||
|
|
||||||
|
if norm_city and norm_city != norm_province:
|
||||||
|
matched = norm_city in title_text
|
||||||
|
else:
|
||||||
|
matched = norm_province in title_text
|
||||||
|
|
||||||
|
return matched, current_region or title_text
|
||||||
|
|
||||||
|
def _parse_list(self, html: str, province: str, city: str, list_url: str, area_pinyin: str) -> Tuple[bool, int, int]:
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
matched, current_region = self._page_matches_area(soup, province, city)
|
||||||
|
if not matched:
|
||||||
|
print(f" 页面地区不匹配,停止分页: 目标={province}-{city} 当前={current_region or '未知'}")
|
||||||
|
return False, 0, 0
|
||||||
|
|
||||||
|
cards = []
|
||||||
|
seen_page_phone: Set[str] = set()
|
||||||
|
|
||||||
|
for item in soup.select("ul.findLawyer-list > li.clearfix"):
|
||||||
|
name_link = item.select_one(".findLawyer-list-detail-name a[href]")
|
||||||
|
phone_tag = item.select_one(".findLawyer-list-detail-name span")
|
||||||
|
if not name_link or not phone_tag:
|
||||||
|
continue
|
||||||
|
|
||||||
|
phone = normalize_phone(phone_tag.get_text(" ", strip=True))
|
||||||
|
if not phone or phone in seen_page_phone:
|
||||||
|
continue
|
||||||
|
seen_page_phone.add(phone)
|
||||||
|
|
||||||
|
name = name_link.get_text(strip=True)
|
||||||
|
detail_url = urljoin(SITE_BASE, name_link.get("href", "").strip())
|
||||||
|
|
||||||
|
location_tag = item.select_one(".findLawyer-list-detail-the")
|
||||||
|
card_province, card_city, law_firm = self._parse_location_line(
|
||||||
|
location_tag.get_text(" ", strip=True) if location_tag else "",
|
||||||
|
province,
|
||||||
|
city,
|
||||||
|
)
|
||||||
|
|
||||||
|
specialties = []
|
||||||
|
for dd in item.select(".findLawyer-list-detail-fields dd"):
|
||||||
|
text = dd.get_text(strip=True)
|
||||||
|
if text:
|
||||||
|
specialties.append(text)
|
||||||
|
|
||||||
|
reply_count = None
|
||||||
|
reply_tag = item.select_one(".findLawyer-list-detail-other a")
|
||||||
|
if reply_tag:
|
||||||
|
match = REPLY_RE.search(reply_tag.get_text(" ", strip=True))
|
||||||
|
if match:
|
||||||
|
reply_count = int(match.group(1))
|
||||||
|
|
||||||
|
cards.append({
|
||||||
|
"name": name,
|
||||||
|
"law_firm": law_firm,
|
||||||
|
"province": card_province or province,
|
||||||
|
"city": card_city or city or province,
|
||||||
|
"phone": phone,
|
||||||
|
"url": detail_url,
|
||||||
|
"domain": DOMAIN,
|
||||||
|
"create_time": int(time.time()),
|
||||||
|
"params": json.dumps({
|
||||||
|
"area_pinyin": area_pinyin,
|
||||||
|
"source": list_url,
|
||||||
|
"specialties": specialties,
|
||||||
|
"reply_count": reply_count,
|
||||||
|
}, ensure_ascii=False),
|
||||||
|
})
|
||||||
|
|
||||||
|
if not cards:
|
||||||
|
return True, 0, 0
|
||||||
|
|
||||||
|
phones = [item["phone"] for item in cards if item.get("phone")]
|
||||||
|
existing = self._existing_phones(phones)
|
||||||
|
inserted = 0
|
||||||
|
|
||||||
|
for item in cards:
|
||||||
|
phone = item.get("phone")
|
||||||
|
if not phone:
|
||||||
|
continue
|
||||||
|
if phone in existing:
|
||||||
|
print(f" -- 已存在: {item['name']} ({phone})")
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
self.db.insert_data("lawyer", item)
|
||||||
|
inserted += 1
|
||||||
|
print(f" -> 新增: {item['name']} ({phone})")
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" 插入失败 {item.get('url')}: {exc}")
|
||||||
|
|
||||||
|
return True, inserted, len(cards)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
print("启动大律师 PC 站采集...")
|
||||||
|
if not self.areas:
|
||||||
|
print("无地区数据")
|
||||||
|
return
|
||||||
|
|
||||||
|
for area in self.areas:
|
||||||
|
province = (area.get("province") or "").strip()
|
||||||
|
city = (area.get("city") or province).strip()
|
||||||
|
pinyin = (area.get("pinyin") or "").strip()
|
||||||
|
if not province or not pinyin:
|
||||||
|
continue
|
||||||
|
|
||||||
|
area_label = province if not city or city == province else f"{province}-{city}"
|
||||||
|
print(f"采集地区: {area_label} ({pinyin})")
|
||||||
|
|
||||||
|
for page in range(1, self.max_pages + 1):
|
||||||
|
list_url = self._build_list_url(pinyin, page)
|
||||||
|
print(f" 第 {page} 页: {list_url}")
|
||||||
|
html = self._get(list_url, headers={"Referer": SITE_BASE + "/law"})
|
||||||
|
if not html:
|
||||||
|
break
|
||||||
|
|
||||||
|
page_ok, inserted, parsed_count = self._parse_list(html, province, city, list_url, pinyin)
|
||||||
|
if not page_ok:
|
||||||
|
break
|
||||||
|
if parsed_count == 0:
|
||||||
|
print(" 当前页无律师卡片,停止")
|
||||||
|
break
|
||||||
|
|
||||||
|
if inserted == 0:
|
||||||
|
print(" 当前页无新增数据")
|
||||||
|
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
print("大律师 PC 站采集完成")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
with Db() as db:
|
||||||
|
spider = DlsPcSpider(db)
|
||||||
|
spider.run()
|
||||||
@@ -19,6 +19,9 @@ if project_root not in sys.path:
|
|||||||
from Db import Db
|
from Db import Db
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_EXPORT_START_TS = 1772932103
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
|
parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -30,7 +33,10 @@ def parse_args() -> argparse.Namespace:
|
|||||||
"--start-ts",
|
"--start-ts",
|
||||||
type=int,
|
type=int,
|
||||||
default=None,
|
default=None,
|
||||||
help="create_time 起始时间戳(含),不传时默认取最近7天",
|
help=(
|
||||||
|
"create_time 起始时间戳(含),"
|
||||||
|
f"不传时默认取 {DEFAULT_EXPORT_START_TS} 之后的数据"
|
||||||
|
),
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--end-ts",
|
"--end-ts",
|
||||||
@@ -43,6 +49,11 @@ def parse_args() -> argparse.Namespace:
|
|||||||
default="",
|
default="",
|
||||||
help="按 domain 过滤,例如:大律师 / 找法网 / 华律",
|
help="按 domain 过滤,例如:大律师 / 找法网 / 华律",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--exclude-domain",
|
||||||
|
default="",
|
||||||
|
help="排除指定 domain,例如:高德地图",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--province",
|
"--province",
|
||||||
default="",
|
default="",
|
||||||
@@ -74,13 +85,18 @@ def parse_args() -> argparse.Namespace:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="关闭 params JSON 扩展信息解析(默认开启)",
|
help="关闭 params JSON 扩展信息解析(默认开启)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--douyin-only",
|
||||||
|
action="store_true",
|
||||||
|
help="仅导出抖音采集数据(domain=抖音),并追加抖音专用字段",
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def apply_default_time_filter(args: argparse.Namespace) -> None:
|
def apply_default_time_filter(args: argparse.Namespace) -> None:
|
||||||
# 未显式传时间范围时,默认导出最近7天的数据
|
# 未显式传时间范围时,默认导出指定时间戳之后的数据
|
||||||
if args.start_ts is None and args.end_ts is None:
|
if args.start_ts is None and args.end_ts is None:
|
||||||
args.start_ts = int(time.time()) - 7 * 24 * 3600
|
args.start_ts = DEFAULT_EXPORT_START_TS
|
||||||
args.end_ts = 0
|
args.end_ts = 0
|
||||||
return
|
return
|
||||||
if args.start_ts is None:
|
if args.start_ts is None:
|
||||||
@@ -109,15 +125,23 @@ def build_query(args: argparse.Namespace) -> (str, List):
|
|||||||
where: List[str] = []
|
where: List[str] = []
|
||||||
params: List = []
|
params: List = []
|
||||||
|
|
||||||
|
if args.douyin_only:
|
||||||
|
target_domain = args.domain.strip() or "抖音"
|
||||||
|
where.append("domain = %s")
|
||||||
|
params.append(target_domain)
|
||||||
|
|
||||||
if args.start_ts > 0:
|
if args.start_ts > 0:
|
||||||
where.append("create_time >= %s")
|
where.append("create_time >= %s")
|
||||||
params.append(args.start_ts)
|
params.append(args.start_ts)
|
||||||
if args.end_ts > 0:
|
if args.end_ts > 0:
|
||||||
where.append("create_time <= %s")
|
where.append("create_time <= %s")
|
||||||
params.append(args.end_ts)
|
params.append(args.end_ts)
|
||||||
if args.domain.strip():
|
if args.domain.strip() and not args.douyin_only:
|
||||||
where.append("domain = %s")
|
where.append("domain = %s")
|
||||||
params.append(args.domain.strip())
|
params.append(args.domain.strip())
|
||||||
|
if args.exclude_domain.strip():
|
||||||
|
where.append("domain <> %s")
|
||||||
|
params.append(args.exclude_domain.strip())
|
||||||
if args.province.strip():
|
if args.province.strip():
|
||||||
where.append("province = %s")
|
where.append("province = %s")
|
||||||
params.append(args.province.strip())
|
params.append(args.province.strip())
|
||||||
@@ -161,6 +185,13 @@ def parse_params(params_text: str) -> Dict[str, str]:
|
|||||||
else:
|
else:
|
||||||
specialties_text = ""
|
specialties_text = ""
|
||||||
|
|
||||||
|
user_info = data.get("user_info") or {}
|
||||||
|
if not isinstance(user_info, dict):
|
||||||
|
user_info = {}
|
||||||
|
|
||||||
|
sec_uid = str(data.get("sec_uid") or "")
|
||||||
|
douyin_url = f"https://www.douyin.com/user/{sec_uid}" if sec_uid else ""
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"email": str(profile.get("email") or ""),
|
"email": str(profile.get("email") or ""),
|
||||||
"address": str(profile.get("address") or ""),
|
"address": str(profile.get("address") or ""),
|
||||||
@@ -170,19 +201,34 @@ def parse_params(params_text: str) -> Dict[str, str]:
|
|||||||
"source_site": str(source.get("site") or ""),
|
"source_site": str(source.get("site") or ""),
|
||||||
"detail_url": str(source.get("detail_url") or ""),
|
"detail_url": str(source.get("detail_url") or ""),
|
||||||
"list_url": str(source.get("list_url") or ""),
|
"list_url": str(source.get("list_url") or ""),
|
||||||
|
"api_source": str(data.get("api_source") or ""),
|
||||||
|
"api_url": str(data.get("api_url") or ""),
|
||||||
|
"city_index": str(data.get("city_index") or ""),
|
||||||
|
"captured_at": str(data.get("captured_at") or ""),
|
||||||
|
"sec_uid": sec_uid,
|
||||||
|
"douyin_uid": str(user_info.get("uid") or ""),
|
||||||
|
"douyin_unique_id": str(user_info.get("unique_id") or ""),
|
||||||
|
"douyin_signature": str(user_info.get("signature") or ""),
|
||||||
|
"douyin_nickname": str(user_info.get("nickname") or ""),
|
||||||
|
"douyin_url": douyin_url,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, parse_params_flag: bool) -> int:
|
def export_to_excel(
|
||||||
|
rows: List[Dict],
|
||||||
|
output_path: str,
|
||||||
|
include_extra: bool,
|
||||||
|
parse_params_flag: bool,
|
||||||
|
douyin_only: bool,
|
||||||
|
) -> int:
|
||||||
wb = Workbook()
|
wb = Workbook()
|
||||||
ws = wb.active
|
ws = wb.active
|
||||||
ws.title = "lawyers"
|
ws.title = "lawyers"
|
||||||
|
|
||||||
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"]
|
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain", "URL"]
|
||||||
if include_extra:
|
if include_extra:
|
||||||
headers.extend(
|
headers.extend(
|
||||||
[
|
[
|
||||||
"URL",
|
|
||||||
"站点",
|
"站点",
|
||||||
"create_time",
|
"create_time",
|
||||||
"create_time_text",
|
"create_time_text",
|
||||||
@@ -204,6 +250,22 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
|
|||||||
"list_url",
|
"list_url",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
if parse_params_flag and douyin_only:
|
||||||
|
headers.extend(
|
||||||
|
[
|
||||||
|
"sec_uid",
|
||||||
|
"抖音uid",
|
||||||
|
"抖音号",
|
||||||
|
"抖音昵称",
|
||||||
|
"抖音简介",
|
||||||
|
"抖音主页URL",
|
||||||
|
"api_source",
|
||||||
|
"api_url",
|
||||||
|
"city_index",
|
||||||
|
"captured_at",
|
||||||
|
"captured_at_text",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
ws.append(headers)
|
ws.append(headers)
|
||||||
for cell in ws[1]:
|
for cell in ws[1]:
|
||||||
@@ -221,12 +283,12 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
|
|||||||
row.get("city", "") or "",
|
row.get("city", "") or "",
|
||||||
site_name,
|
site_name,
|
||||||
row.get("domain", "") or "",
|
row.get("domain", "") or "",
|
||||||
|
row.get("url", "") or "",
|
||||||
]
|
]
|
||||||
|
|
||||||
if include_extra:
|
if include_extra:
|
||||||
line.extend(
|
line.extend(
|
||||||
[
|
[
|
||||||
row.get("url", "") or "",
|
|
||||||
row.get("domain", "") or "",
|
row.get("domain", "") or "",
|
||||||
row.get("create_time", "") or "",
|
row.get("create_time", "") or "",
|
||||||
ts_to_text(row.get("create_time")),
|
ts_to_text(row.get("create_time")),
|
||||||
@@ -250,6 +312,29 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if parse_params_flag and douyin_only:
|
||||||
|
captured_at_text = ""
|
||||||
|
try:
|
||||||
|
captured_at_text = ts_to_text(int(info.get("captured_at", "") or 0))
|
||||||
|
except Exception:
|
||||||
|
captured_at_text = ""
|
||||||
|
|
||||||
|
line.extend(
|
||||||
|
[
|
||||||
|
info.get("sec_uid", ""),
|
||||||
|
info.get("douyin_uid", ""),
|
||||||
|
info.get("douyin_unique_id", ""),
|
||||||
|
info.get("douyin_nickname", ""),
|
||||||
|
info.get("douyin_signature", ""),
|
||||||
|
info.get("douyin_url", ""),
|
||||||
|
info.get("api_source", ""),
|
||||||
|
info.get("api_url", ""),
|
||||||
|
info.get("city_index", ""),
|
||||||
|
info.get("captured_at", ""),
|
||||||
|
captured_at_text,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
ws.append(line)
|
ws.append(line)
|
||||||
exported += 1
|
exported += 1
|
||||||
|
|
||||||
@@ -277,6 +362,7 @@ def main() -> None:
|
|||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
include_extra=args.include_extra,
|
include_extra=args.include_extra,
|
||||||
parse_params_flag=not args.no_parse_params,
|
parse_params_flag=not args.no_parse_params,
|
||||||
|
douyin_only=args.douyin_only,
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"[export] 导出完成,共 {count} 条")
|
print(f"[export] 导出完成,共 {count} 条")
|
||||||
|
|||||||
+176
-429
@@ -1,16 +1,9 @@
|
|||||||
import argparse
|
|
||||||
import ast
|
|
||||||
import hashlib
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass
|
import random
|
||||||
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
from typing import Dict, List, Set, Optional
|
||||||
|
|
||||||
import urllib3
|
|
||||||
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
project_root = os.path.dirname(current_dir)
|
project_root = os.path.dirname(current_dir)
|
||||||
@@ -20,460 +13,214 @@ if request_dir not in sys.path:
|
|||||||
if project_root not in sys.path:
|
if project_root not in sys.path:
|
||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
from Db import Db
|
from Db import Db
|
||||||
from request.requests_client import RequestClientError, RequestsClient
|
from utils.rate_limiter import request_slot
|
||||||
from utils.rate_limiter import wait_for_request
|
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
DOMAIN = "找法网"
|
||||||
|
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
|
||||||
SITE_NAME = "findlaw"
|
|
||||||
LEGACY_DOMAIN = "找法网"
|
|
||||||
SITE_BASE = "https://m.findlaw.cn"
|
|
||||||
CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
|
|
||||||
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
|
|
||||||
|
|
||||||
PHONE_RE = re.compile(r"1[3-9]\d{9}")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
class FindlawSpider:
|
||||||
class CityTarget:
|
def __init__(self, db_connection):
|
||||||
province_id: str
|
|
||||||
province_name: str
|
|
||||||
province_py: str
|
|
||||||
city_id: str
|
|
||||||
city_name: str
|
|
||||||
city_py: str
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_phone(text: str) -> str:
|
|
||||||
compact = re.sub(r"\D", "", text or "")
|
|
||||||
match = PHONE_RE.search(compact)
|
|
||||||
return match.group(0) if match else ""
|
|
||||||
|
|
||||||
|
|
||||||
class FindlawCrawler:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
max_pages: int = 9999,
|
|
||||||
sleep_seconds: float = 0.1,
|
|
||||||
use_proxy: bool = True,
|
|
||||||
db_connection=None,
|
|
||||||
):
|
|
||||||
self.max_pages = max_pages
|
|
||||||
self.sleep_seconds = max(0.0, sleep_seconds)
|
|
||||||
self.db = db_connection
|
self.db = db_connection
|
||||||
self.client = RequestsClient(
|
self.session = self._build_session()
|
||||||
headers={
|
self.cities = self._load_cities()
|
||||||
"User-Agent": (
|
|
||||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
|
||||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
|
||||||
"Mobile/15E148 Safari/604.1"
|
|
||||||
),
|
|
||||||
"Accept": "application/json, text/javascript, */*; q=0.01",
|
|
||||||
"X-Requested-With": "XMLHttpRequest",
|
|
||||||
"Connection": "close",
|
|
||||||
},
|
|
||||||
use_proxy=use_proxy,
|
|
||||||
retry_total=2,
|
|
||||||
retry_backoff_factor=1,
|
|
||||||
retry_status_forcelist=(429, 500, 502, 503, 504),
|
|
||||||
retry_allowed_methods=("GET",),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_text(
|
def _build_session(self) -> requests.Session:
|
||||||
self,
|
report_proxy_status()
|
||||||
url: str,
|
session = requests.Session()
|
||||||
timeout: int = 20,
|
session.trust_env = False
|
||||||
max_retries: int = 3,
|
proxies = get_proxies()
|
||||||
referer: str = SITE_BASE,
|
if proxies:
|
||||||
) -> str:
|
session.proxies.update(proxies)
|
||||||
headers = {"Referer": referer}
|
else:
|
||||||
last_error: Optional[Exception] = None
|
session.proxies.clear()
|
||||||
|
session.headers.update({
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
||||||
|
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
||||||
|
"Mobile/15E148 Safari/604.1"
|
||||||
|
),
|
||||||
|
"Accept": "application/json, text/javascript, */*; q=0.01",
|
||||||
|
"X-Requested-With": "XMLHttpRequest",
|
||||||
|
"Connection": "close",
|
||||||
|
})
|
||||||
|
return session
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
def _refresh_session(self) -> None:
|
||||||
wait_for_request()
|
|
||||||
try:
|
|
||||||
resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
|
|
||||||
code = resp.status_code
|
|
||||||
if code == 403:
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
self.client.refresh()
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
raise RequestClientError(f"{code} Error: {url}")
|
|
||||||
if code >= 500 and attempt < max_retries - 1:
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
if code >= 400:
|
|
||||||
raise RequestClientError(f"{code} Error: {url}")
|
|
||||||
return resp.text
|
|
||||||
except Exception as exc:
|
|
||||||
last_error = exc
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
raise
|
|
||||||
|
|
||||||
if last_error is not None:
|
|
||||||
raise last_error
|
|
||||||
raise RequestClientError(f"Unknown request error: {url}")
|
|
||||||
|
|
||||||
def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
|
|
||||||
pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
|
|
||||||
match = re.search(pattern, script_text)
|
|
||||||
if not match:
|
|
||||||
return []
|
|
||||||
raw = match.group(1)
|
|
||||||
try:
|
try:
|
||||||
rows = ast.literal_eval(raw)
|
self.session.close()
|
||||||
return rows if isinstance(rows, list) else []
|
|
||||||
except Exception:
|
except Exception:
|
||||||
return []
|
pass
|
||||||
|
self.session = self._build_session()
|
||||||
|
|
||||||
def discover_cities(self) -> List[CityTarget]:
|
def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
|
||||||
js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
|
headers = {"Referer": referer}
|
||||||
provinces = self._parse_city_js_array(js_text, "iosProvinces")
|
for attempt in range(max_retries):
|
||||||
cities = self._parse_city_js_array(js_text, "iosCitys")
|
|
||||||
|
|
||||||
province_map: Dict[str, Dict] = {}
|
|
||||||
for item in provinces:
|
|
||||||
pid = str(item.get("id") or "").strip()
|
|
||||||
if pid:
|
|
||||||
province_map[pid] = item
|
|
||||||
|
|
||||||
results: List[CityTarget] = []
|
|
||||||
seen_py: Set[str] = set()
|
|
||||||
for city in cities:
|
|
||||||
city_py = str(city.get("pinyin") or "").strip()
|
|
||||||
city_name = str(city.get("value") or "").strip()
|
|
||||||
city_id = str(city.get("id") or "").strip()
|
|
||||||
province_id = str(city.get("parentId") or "").strip()
|
|
||||||
if not city_py or not city_name or not city_id:
|
|
||||||
continue
|
|
||||||
if city_py in seen_py:
|
|
||||||
continue
|
|
||||||
seen_py.add(city_py)
|
|
||||||
|
|
||||||
province_row = province_map.get(province_id, {})
|
|
||||||
province_name = str(province_row.get("value") or city_name).strip()
|
|
||||||
province_py = str(province_row.get("pinyin") or city_py).strip()
|
|
||||||
|
|
||||||
results.append(
|
|
||||||
CityTarget(
|
|
||||||
province_id=province_id,
|
|
||||||
province_name=province_name,
|
|
||||||
province_py=province_py,
|
|
||||||
city_id=city_id,
|
|
||||||
city_name=city_name,
|
|
||||||
city_py=city_py,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return results
|
|
||||||
|
|
||||||
def _parse_list_payload(self, text: str) -> Dict:
|
|
||||||
cleaned = (text or "").strip().lstrip("\ufeff")
|
|
||||||
try:
|
|
||||||
return json.loads(cleaned)
|
|
||||||
except ValueError:
|
|
||||||
start = cleaned.find("{")
|
|
||||||
end = cleaned.rfind("}")
|
|
||||||
if start == -1 or end == -1:
|
|
||||||
return {}
|
|
||||||
return json.loads(cleaned[start:end + 1])
|
|
||||||
|
|
||||||
def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
|
|
||||||
list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
|
|
||||||
referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
|
|
||||||
text = self._get_text(list_url, referer=referer)
|
|
||||||
payload = self._parse_list_payload(text)
|
|
||||||
if payload.get("errcode") != 0:
|
|
||||||
return [], False, list_url
|
|
||||||
|
|
||||||
data = payload.get("data", {}) or {}
|
|
||||||
items = data.get("lawyer_list", []) or []
|
|
||||||
has_more = str(data.get("has_more", "0")) == "1"
|
|
||||||
return items, has_more, list_url
|
|
||||||
|
|
||||||
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
|
|
||||||
for page in range(1, self.max_pages + 1):
|
|
||||||
try:
|
try:
|
||||||
items, has_more, list_url = self.fetch_list_page(target.city_py, page)
|
with request_slot():
|
||||||
except Exception as exc:
|
resp = self.session.get(url, timeout=15, verify=verify, headers=headers)
|
||||||
print(f"[list] 失败 {target.city_py} p{page}: {exc}")
|
status_code = resp.status_code
|
||||||
break
|
text = resp.text
|
||||||
|
resp.close()
|
||||||
|
if status_code == 403:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
|
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
||||||
|
self._refresh_session()
|
||||||
|
time.sleep(wait_time)
|
||||||
|
continue
|
||||||
|
print(f"请求失败 {url}: 403 Forbidden")
|
||||||
|
return None
|
||||||
|
if status_code >= 400:
|
||||||
|
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
|
||||||
|
return text
|
||||||
|
except requests.exceptions.SSLError:
|
||||||
|
if verify:
|
||||||
|
return self._get(url, referer, verify=False, max_retries=max_retries)
|
||||||
|
print(f"SSL错误 {url}")
|
||||||
|
return None
|
||||||
|
except requests.exceptions.RequestException as exc:
|
||||||
|
print(f"请求失败 {url}: {exc}")
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
if not items:
|
def _existing_phones(self, phones: List[str]) -> Set[str]:
|
||||||
break
|
if not phones:
|
||||||
|
|
||||||
for item in items:
|
|
||||||
detail_url = item.get("siteask_m") or item.get("site_url") or ""
|
|
||||||
detail_url = str(detail_url).strip()
|
|
||||||
if not detail_url.startswith("http"):
|
|
||||||
detail_url = list_url
|
|
||||||
|
|
||||||
phone = normalize_phone(item.get("mobile", ""))
|
|
||||||
profile = {
|
|
||||||
"uid": str(item.get("uid") or ""),
|
|
||||||
"name": str(item.get("username") or "").strip(),
|
|
||||||
"law_firm": str(item.get("lawyer_lawroom") or "").strip(),
|
|
||||||
"phone": phone,
|
|
||||||
"lawyer_year": item.get("lawyer_year"),
|
|
||||||
"service_area": str(item.get("service_area") or "").strip(),
|
|
||||||
"address": str(item.get("addr") or "").strip(),
|
|
||||||
"specialties": item.get("professionArr") or [],
|
|
||||||
"answer_count": item.get("ansnum"),
|
|
||||||
"comment_count": item.get("askcommentnum"),
|
|
||||||
}
|
|
||||||
|
|
||||||
now = int(time.time())
|
|
||||||
uid = profile.get("uid", "")
|
|
||||||
record_key = uid or detail_url
|
|
||||||
record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()
|
|
||||||
|
|
||||||
area = item.get("areaInfo", {}) or {}
|
|
||||||
yield {
|
|
||||||
"record_id": record_id,
|
|
||||||
"collected_at": now,
|
|
||||||
"source": {
|
|
||||||
"site": SITE_NAME,
|
|
||||||
"list_url": list_url,
|
|
||||||
"detail_url": detail_url,
|
|
||||||
"province": str(area.get("province") or target.province_name),
|
|
||||||
"province_py": target.province_py,
|
|
||||||
"city": str(area.get("city") or target.city_name),
|
|
||||||
"city_py": target.city_py,
|
|
||||||
"page": page,
|
|
||||||
},
|
|
||||||
"list_snapshot": {
|
|
||||||
"uid": uid,
|
|
||||||
"name": profile["name"],
|
|
||||||
"law_firm": profile["law_firm"],
|
|
||||||
"answer_count": profile["answer_count"],
|
|
||||||
"comment_count": profile["comment_count"],
|
|
||||||
},
|
|
||||||
"profile": profile,
|
|
||||||
"raw": item,
|
|
||||||
}
|
|
||||||
if self.sleep_seconds:
|
|
||||||
time.sleep(self.sleep_seconds)
|
|
||||||
|
|
||||||
if not has_more:
|
|
||||||
break
|
|
||||||
|
|
||||||
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
|
|
||||||
source = record.get("source", {}) or {}
|
|
||||||
profile = record.get("profile", {}) or {}
|
|
||||||
phone = normalize_phone(profile.get("phone", ""))
|
|
||||||
if not phone:
|
|
||||||
return None
|
|
||||||
|
|
||||||
province = (source.get("province") or "").strip()
|
|
||||||
city = (source.get("city") or province).strip()
|
|
||||||
return {
|
|
||||||
"name": (profile.get("name") or "").strip(),
|
|
||||||
"law_firm": (profile.get("law_firm") or "").strip(),
|
|
||||||
"province": province,
|
|
||||||
"city": city,
|
|
||||||
"phone": phone,
|
|
||||||
"url": (source.get("detail_url") or source.get("list_url") or "").strip(),
|
|
||||||
"domain": LEGACY_DOMAIN,
|
|
||||||
"create_time": int(record.get("collected_at") or time.time()),
|
|
||||||
"params": json.dumps(record, ensure_ascii=False),
|
|
||||||
}
|
|
||||||
|
|
||||||
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
|
|
||||||
if not self.db or not phones:
|
|
||||||
return set()
|
return set()
|
||||||
deduped = sorted({p for p in phones if p})
|
|
||||||
if not deduped:
|
|
||||||
return set()
|
|
||||||
|
|
||||||
existing: Set[str] = set()
|
existing: Set[str] = set()
|
||||||
cur = self.db.db.cursor()
|
cur = self.db.db.cursor()
|
||||||
try:
|
try:
|
||||||
chunk_size = 500
|
chunk_size = 500
|
||||||
for i in range(0, len(deduped), chunk_size):
|
for i in range(0, len(phones), chunk_size):
|
||||||
chunk = deduped[i:i + chunk_size]
|
chunk = phones[i:i + chunk_size]
|
||||||
placeholders = ",".join(["%s"] * len(chunk))
|
placeholders = ",".join(["%s"] * len(chunk))
|
||||||
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
||||||
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
|
cur.execute(sql, [DOMAIN, *chunk])
|
||||||
for row in cur.fetchall():
|
for row in cur.fetchall():
|
||||||
existing.add(row[0])
|
existing.add(row[0])
|
||||||
finally:
|
finally:
|
||||||
cur.close()
|
cur.close()
|
||||||
return existing
|
return existing
|
||||||
|
|
||||||
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
|
def _load_cities(self):
|
||||||
if not self.db:
|
condition = "domain='findlaw' AND level=2"
|
||||||
return 0, 0
|
tables = ("area_new", "area2", "area")
|
||||||
|
last_error = None
|
||||||
rows: List[Dict[str, str]] = []
|
for table in tables:
|
||||||
for record in records:
|
|
||||||
row = self._to_legacy_lawyer_row(record)
|
|
||||||
if row:
|
|
||||||
rows.append(row)
|
|
||||||
if not rows:
|
|
||||||
return 0, 0
|
|
||||||
|
|
||||||
existing = self._existing_phones_in_db([row["phone"] for row in rows])
|
|
||||||
inserted = 0
|
|
||||||
skipped = 0
|
|
||||||
for row in rows:
|
|
||||||
phone = row.get("phone", "")
|
|
||||||
if not phone or phone in existing:
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
try:
|
try:
|
||||||
self.db.insert_data("lawyer", row)
|
rows = self.db.select_data(table, "city, province, pinyin", condition) or []
|
||||||
existing.add(phone)
|
|
||||||
inserted += 1
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
skipped += 1
|
last_error = exc
|
||||||
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
|
continue
|
||||||
return inserted, skipped
|
if rows:
|
||||||
|
missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
|
||||||
|
print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
|
||||||
|
return rows
|
||||||
|
|
||||||
def crawl(
|
if last_error:
|
||||||
self,
|
print(f"[找法网] 加载地区数据失败: {last_error}")
|
||||||
output_path: str,
|
print("[找法网] 无城市数据(已尝试 area_new/area2/area)")
|
||||||
max_cities: int = 0,
|
for table in tables:
|
||||||
city_filter: Optional[str] = None,
|
try:
|
||||||
) -> None:
|
cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
|
||||||
cities = self.discover_cities()
|
c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
|
||||||
print(f"[discover] 共发现城市 {len(cities)} 个")
|
print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
|
||||||
if city_filter:
|
except Exception:
|
||||||
key = city_filter.strip().lower()
|
pass
|
||||||
cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
|
return []
|
||||||
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
|
|
||||||
if max_cities > 0:
|
|
||||||
cities = cities[:max_cities]
|
|
||||||
print(f"[discover] 截断城市数 {len(cities)}")
|
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
|
def _fetch_page(self, url: str, referer: str) -> List[Dict]:
|
||||||
|
text = self._get(url, referer, verify=True)
|
||||||
|
if not text:
|
||||||
|
return []
|
||||||
|
|
||||||
seen_ids: Set[str] = set()
|
try:
|
||||||
if os.path.exists(output_path):
|
# 某些返回体前会携带 BOM 或包装脚本,此处做兼容
|
||||||
with open(output_path, "r", encoding="utf-8") as old_file:
|
text = text.strip().lstrip("\ufeff")
|
||||||
for line in old_file:
|
try:
|
||||||
line = line.strip()
|
data = json.loads(text)
|
||||||
if not line:
|
except ValueError:
|
||||||
|
json_start = text.find('{')
|
||||||
|
json_end = text.rfind('}')
|
||||||
|
if json_start == -1 or json_end == -1:
|
||||||
|
print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
|
||||||
|
return []
|
||||||
|
cleaned = text[json_start:json_end + 1]
|
||||||
|
data = json.loads(cleaned)
|
||||||
|
if isinstance(data, str):
|
||||||
|
try:
|
||||||
|
data = json.loads(data)
|
||||||
|
except ValueError:
|
||||||
|
print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}")
|
||||||
|
return []
|
||||||
|
except ValueError as exc:
|
||||||
|
print(f"解析JSON失败 {url}: {exc}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
items = data.get("data", {}).get("lawyer_list", [])
|
||||||
|
parsed = []
|
||||||
|
for item in items:
|
||||||
|
phone = (item.get("mobile") or "").replace("-", "")
|
||||||
|
parsed.append({
|
||||||
|
"name": item.get("username", ""),
|
||||||
|
"law_firm": item.get("lawyer_lawroom", ""),
|
||||||
|
"province": item.get("areaInfo", {}).get("province", ""),
|
||||||
|
"city": item.get("areaInfo", {}).get("city", ""),
|
||||||
|
"phone": phone,
|
||||||
|
"url": url,
|
||||||
|
"domain": DOMAIN,
|
||||||
|
"create_time": int(time.time()),
|
||||||
|
"params": json.dumps(item, ensure_ascii=False)
|
||||||
|
})
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
print("启动找法网采集...")
|
||||||
|
if not self.cities:
|
||||||
|
print("无城市数据")
|
||||||
|
return
|
||||||
|
|
||||||
|
for city in self.cities:
|
||||||
|
pinyin = city.get("pinyin")
|
||||||
|
province = city.get("province", "")
|
||||||
|
city_name = city.get("city", "")
|
||||||
|
if not pinyin:
|
||||||
|
continue
|
||||||
|
print(f"采集 {province}-{city_name}")
|
||||||
|
page = 1
|
||||||
|
while True:
|
||||||
|
url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
|
||||||
|
referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
|
||||||
|
print(f" 第 {page} 页: {url}")
|
||||||
|
items = self._fetch_page(url, referer)
|
||||||
|
if not items:
|
||||||
|
break
|
||||||
|
|
||||||
|
phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
|
||||||
|
existing = self._existing_phones(phones)
|
||||||
|
|
||||||
|
for entry in items:
|
||||||
|
phone = entry.get("phone")
|
||||||
|
if not phone:
|
||||||
|
continue
|
||||||
|
if phone in existing:
|
||||||
|
print(f" -- 已存在: {entry['name']} ({phone})")
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
item = json.loads(line)
|
self.db.insert_data("lawyer", entry)
|
||||||
except Exception:
|
print(f" -> 新增: {entry['name']} ({phone})")
|
||||||
continue
|
except Exception as exc:
|
||||||
rid = item.get("record_id")
|
print(f" 插入失败: {exc}")
|
||||||
if rid:
|
|
||||||
seen_ids.add(rid)
|
|
||||||
print(f"[resume] 已有记录 {len(seen_ids)} 条")
|
|
||||||
|
|
||||||
total_new_json = 0
|
page += 1
|
||||||
total_new_db = 0
|
|
||||||
total_skip_db = 0
|
|
||||||
|
|
||||||
with open(output_path, "a", encoding="utf-8") as out:
|
print("找法网采集完成")
|
||||||
for idx, target in enumerate(cities, start=1):
|
|
||||||
print(
|
|
||||||
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
|
|
||||||
f"({target.city_py})"
|
|
||||||
)
|
|
||||||
city_records = list(self.crawl_city(target))
|
|
||||||
|
|
||||||
city_new_json = 0
|
|
||||||
for record in city_records:
|
|
||||||
rid = record["record_id"]
|
|
||||||
if rid in seen_ids:
|
|
||||||
continue
|
|
||||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
||||||
seen_ids.add(rid)
|
|
||||||
city_new_json += 1
|
|
||||||
total_new_json += 1
|
|
||||||
|
|
||||||
city_new_db, city_skip_db = self._write_records_to_db(city_records)
|
|
||||||
total_new_db += city_new_db
|
|
||||||
total_skip_db += city_skip_db
|
|
||||||
print(
|
|
||||||
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
|
|
||||||
f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
|
|
||||||
)
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
|
|
||||||
f"DB跳过{total_skip_db}条, 输出: {output_path}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)")
|
|
||||||
parser.add_argument(
|
|
||||||
"--output",
|
|
||||||
default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
|
|
||||||
help="输出 jsonl 文件路径",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-cities",
|
|
||||||
type=int,
|
|
||||||
default=0,
|
|
||||||
help="最多采集多少个城市,0 表示不限",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-pages",
|
|
||||||
type=int,
|
|
||||||
default=9999,
|
|
||||||
help="每个城市最多采集多少页",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--city-filter",
|
|
||||||
default="",
|
|
||||||
help="按城市拼音或城市名过滤,如 beijing",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--sleep",
|
|
||||||
type=float,
|
|
||||||
default=0.1,
|
|
||||||
help="每条记录采集间隔秒数",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--direct",
|
|
||||||
action="store_true",
|
|
||||||
help="直连模式,不使用 proxy_settings.json 代理",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--no-db",
|
|
||||||
action="store_true",
|
|
||||||
help="只输出 JSONL,不写入数据库",
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = parse_args()
|
|
||||||
if args.no_db:
|
|
||||||
crawler = FindlawCrawler(
|
|
||||||
max_pages=args.max_pages,
|
|
||||||
sleep_seconds=args.sleep,
|
|
||||||
use_proxy=not args.direct,
|
|
||||||
db_connection=None,
|
|
||||||
)
|
|
||||||
crawler.crawl(
|
|
||||||
output_path=args.output,
|
|
||||||
max_cities=args.max_cities,
|
|
||||||
city_filter=args.city_filter or None,
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
with Db() as db:
|
|
||||||
crawler = FindlawCrawler(
|
|
||||||
max_pages=args.max_pages,
|
|
||||||
sleep_seconds=args.sleep,
|
|
||||||
use_proxy=not args.direct,
|
|
||||||
db_connection=db,
|
|
||||||
)
|
|
||||||
crawler.crawl(
|
|
||||||
output_path=args.output,
|
|
||||||
max_cities=args.max_cities,
|
|
||||||
city_filter=args.city_filter or None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
with Db() as db:
|
||||||
|
spider = FindlawSpider(db)
|
||||||
|
spider.run()
|
||||||
|
|||||||
+291
-606
@@ -1,18 +1,10 @@
|
|||||||
import argparse
|
|
||||||
import ast
|
|
||||||
import hashlib
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass
|
import random
|
||||||
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
from typing import Dict, Optional
|
||||||
from urllib.parse import urljoin
|
|
||||||
|
|
||||||
import urllib3
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
project_root = os.path.dirname(current_dir)
|
project_root = os.path.dirname(current_dir)
|
||||||
@@ -22,638 +14,331 @@ if request_dir not in sys.path:
|
|||||||
if project_root not in sys.path:
|
if project_root not in sys.path:
|
||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
|
|
||||||
from Db import Db
|
from Db import Db
|
||||||
from request.requests_client import RequestClientError, RequestsClient
|
from config import HEADERS
|
||||||
from utils.rate_limiter import wait_for_request
|
from utils.rate_limiter import request_slot
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
|
||||||
|
DOMAIN = "华律"
|
||||||
SITE_NAME = "hualv"
|
|
||||||
LEGACY_DOMAIN = "华律"
|
|
||||||
SITE_BASE = "https://m.66law.cn"
|
|
||||||
CITY_DATA_URL = "https://cache.66law.cn/dist/main-v2.0.js"
|
|
||||||
LIST_API_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
|
|
||||||
|
|
||||||
PHONE_RE = re.compile(r"1[3-9]\d{9}")
|
|
||||||
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
|
||||||
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
class HualvSpider:
|
||||||
class CityTarget:
|
def __init__(self, db_connection):
|
||||||
province_id: int
|
|
||||||
province_name: str
|
|
||||||
city_id: int
|
|
||||||
city_name: str
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_phone(text: str) -> str:
|
|
||||||
compact = re.sub(r"\D", "", text or "")
|
|
||||||
match = PHONE_RE.search(compact)
|
|
||||||
return match.group(0) if match else ""
|
|
||||||
|
|
||||||
|
|
||||||
def strip_html_tags(text: str) -> str:
|
|
||||||
return re.sub(r"<[^>]+>", "", text or "").strip()
|
|
||||||
|
|
||||||
|
|
||||||
class HualvCrawler:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
max_pages: int = 9999,
|
|
||||||
sleep_seconds: float = 0.15,
|
|
||||||
use_proxy: bool = True,
|
|
||||||
db_connection=None,
|
|
||||||
):
|
|
||||||
self.max_pages = max_pages
|
|
||||||
self.sleep_seconds = max(0.0, sleep_seconds)
|
|
||||||
self.db = db_connection
|
self.db = db_connection
|
||||||
self.client = RequestsClient(
|
self.session = self._build_session()
|
||||||
headers={
|
self.areas = self._load_areas()
|
||||||
"User-Agent": (
|
|
||||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
def _build_session(self) -> requests.Session:
|
||||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
report_proxy_status()
|
||||||
"Mobile/15E148 Safari/604.1"
|
session = requests.Session()
|
||||||
),
|
session.trust_env = False
|
||||||
"Accept": "application/json, text/javascript, */*; q=0.01",
|
proxies = get_proxies()
|
||||||
"X-Requested-With": "XMLHttpRequest",
|
if proxies:
|
||||||
"Connection": "close",
|
session.proxies.update(proxies)
|
||||||
},
|
else:
|
||||||
use_proxy=use_proxy,
|
session.proxies.clear()
|
||||||
retry_total=2,
|
custom_headers = HEADERS.copy()
|
||||||
retry_backoff_factor=1,
|
custom_headers['User-Agent'] = (
|
||||||
retry_status_forcelist=(429, 500, 502, 503, 504),
|
'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
|
||||||
retry_allowed_methods=("GET", "POST"),
|
'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
|
||||||
|
'Mobile/15E148 Safari/604.1'
|
||||||
)
|
)
|
||||||
|
custom_headers["Connection"] = "close"
|
||||||
|
session.headers.update(custom_headers)
|
||||||
|
return session
|
||||||
|
|
||||||
def _request_text(
|
def _refresh_session(self) -> None:
|
||||||
self,
|
try:
|
||||||
method: str,
|
self.session.close()
|
||||||
url: str,
|
except Exception:
|
||||||
*,
|
pass
|
||||||
timeout: int = 20,
|
self.session = self._build_session()
|
||||||
max_retries: int = 3,
|
|
||||||
referer: str = SITE_BASE,
|
|
||||||
data: Optional[Dict] = None,
|
|
||||||
) -> str:
|
|
||||||
headers = {"Referer": referer}
|
|
||||||
last_error: Optional[Exception] = None
|
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
def _load_areas(self):
|
||||||
wait_for_request()
|
tables = ("area_new", "area2", "area")
|
||||||
|
last_error = None
|
||||||
|
for table in tables:
|
||||||
try:
|
try:
|
||||||
if method.upper() == "POST":
|
provinces = self.db.select_data(
|
||||||
resp = self.client.post_text(
|
table,
|
||||||
url,
|
"code, province, pinyin, id",
|
||||||
timeout=timeout,
|
"domain='66law' AND level=1"
|
||||||
verify=False,
|
) or []
|
||||||
headers=headers,
|
cities = self.db.select_data(
|
||||||
data=data,
|
table,
|
||||||
)
|
"code, city, province, pid",
|
||||||
else:
|
"domain='66law' AND level=2"
|
||||||
resp = self.client.get_text(
|
) or []
|
||||||
url,
|
|
||||||
timeout=timeout,
|
|
||||||
verify=False,
|
|
||||||
headers=headers,
|
|
||||||
)
|
|
||||||
|
|
||||||
code = resp.status_code
|
|
||||||
if code == 403:
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
self.client.refresh()
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
raise RequestClientError(f"{code} Error: {url}")
|
|
||||||
if code >= 500 and attempt < max_retries - 1:
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
if code >= 400:
|
|
||||||
raise RequestClientError(f"{code} Error: {url}")
|
|
||||||
return resp.text
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
last_error = exc
|
last_error = exc
|
||||||
if attempt < max_retries - 1:
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
raise
|
|
||||||
|
|
||||||
if last_error is not None:
|
|
||||||
raise last_error
|
|
||||||
raise RequestClientError(f"Unknown request error: {url}")
|
|
||||||
|
|
||||||
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
|
|
||||||
return self._request_text(
|
|
||||||
"GET",
|
|
||||||
url,
|
|
||||||
timeout=timeout,
|
|
||||||
max_retries=max_retries,
|
|
||||||
referer=referer,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _post_text(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
*,
|
|
||||||
data: Dict,
|
|
||||||
timeout: int = 20,
|
|
||||||
max_retries: int = 3,
|
|
||||||
referer: str = SITE_BASE,
|
|
||||||
) -> str:
|
|
||||||
return self._request_text(
|
|
||||||
"POST",
|
|
||||||
url,
|
|
||||||
timeout=timeout,
|
|
||||||
max_retries=max_retries,
|
|
||||||
referer=referer,
|
|
||||||
data=data,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _extract_spc_location(self, script_text: str) -> List:
|
|
||||||
# main-v2.js 内置了 sPCLocation=new Array(...),后面紧跟 cateinfo 数组
|
|
||||||
marker = "sPCLocation = new Array("
|
|
||||||
start = script_text.find(marker)
|
|
||||||
if start == -1:
|
|
||||||
marker = "sPCLocation=new Array("
|
|
||||||
start = script_text.find(marker)
|
|
||||||
if start == -1:
|
|
||||||
return []
|
|
||||||
start += len(marker)
|
|
||||||
|
|
||||||
next_marker = script_text.find("cateinfo = new Array(", start)
|
|
||||||
if next_marker == -1:
|
|
||||||
next_marker = script_text.find("cateinfo=new Array(", start)
|
|
||||||
|
|
||||||
if next_marker != -1:
|
|
||||||
end = script_text.rfind(");", start, next_marker)
|
|
||||||
else:
|
|
||||||
end = script_text.find(");", start)
|
|
||||||
|
|
||||||
if end == -1 or end <= start:
|
|
||||||
return []
|
|
||||||
|
|
||||||
raw = "[" + script_text[start:end] + "]"
|
|
||||||
try:
|
|
||||||
data = ast.literal_eval(raw)
|
|
||||||
except Exception:
|
|
||||||
return []
|
|
||||||
return data if isinstance(data, list) else []
|
|
||||||
|
|
||||||
def discover_cities(self) -> List[CityTarget]:
|
|
||||||
script_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyer/")
|
|
||||||
rows = self._extract_spc_location(script_text)
|
|
||||||
|
|
||||||
targets: List[CityTarget] = []
|
|
||||||
seen: Set[Tuple[int, int]] = set()
|
|
||||||
|
|
||||||
for province in rows:
|
|
||||||
if not isinstance(province, list) or len(province) < 3:
|
|
||||||
continue
|
continue
|
||||||
try:
|
|
||||||
province_id = int(province[0])
|
if not cities:
|
||||||
except Exception:
|
|
||||||
continue
|
continue
|
||||||
province_name = str(province[1] or "").strip()
|
|
||||||
city_rows = province[2] if isinstance(province[2], list) else []
|
|
||||||
|
|
||||||
for city in city_rows:
|
province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces}
|
||||||
if not isinstance(city, list) or len(city) < 2:
|
city_map = {}
|
||||||
continue
|
for city in cities:
|
||||||
try:
|
province_info = province_map.get(city.get('pid'), {}) or {}
|
||||||
city_id = int(city[0])
|
province_code = province_info.get('code')
|
||||||
except Exception:
|
city_map[city.get('code')] = {
|
||||||
continue
|
"name": city.get('city'),
|
||||||
city_name = str(city[1] or "").strip()
|
"province": city.get('province'),
|
||||||
if city_id <= 0 or not city_name:
|
"province_code": province_code,
|
||||||
continue
|
|
||||||
|
|
||||||
key = (province_id, city_id)
|
|
||||||
if key in seen:
|
|
||||||
continue
|
|
||||||
seen.add(key)
|
|
||||||
|
|
||||||
targets.append(
|
|
||||||
CityTarget(
|
|
||||||
province_id=province_id,
|
|
||||||
province_name=province_name,
|
|
||||||
city_id=city_id,
|
|
||||||
city_name=city_name,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return targets
|
|
||||||
|
|
||||||
def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[Dict], int]:
|
|
||||||
payload = {
|
|
||||||
"pid": str(target.province_id),
|
|
||||||
"cid": str(target.city_id),
|
|
||||||
"page": str(page),
|
|
||||||
}
|
|
||||||
text = self._post_text(
|
|
||||||
LIST_API_URL,
|
|
||||||
data=payload,
|
|
||||||
referer=SITE_BASE + "/findlawyer/",
|
|
||||||
)
|
|
||||||
data = json.loads((text or "").strip().lstrip("\ufeff") or "{}")
|
|
||||||
items = data.get("lawyerList") or data.get("queryLawyerList") or []
|
|
||||||
if not isinstance(items, list):
|
|
||||||
items = []
|
|
||||||
|
|
||||||
page_count = 0
|
|
||||||
try:
|
|
||||||
page_count = int((data.get("lawyerItems") or {}).get("pageCount") or 0)
|
|
||||||
except Exception:
|
|
||||||
page_count = 0
|
|
||||||
return items, page_count
|
|
||||||
|
|
||||||
def parse_detail(self, detail_url: str) -> Dict:
|
|
||||||
contact_url = detail_url.rstrip("/") + "/lawyer_contact.aspx"
|
|
||||||
html = self._get_text(contact_url, referer=detail_url)
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
|
||||||
full_text = soup.get_text(" ", strip=True)
|
|
||||||
|
|
||||||
name = ""
|
|
||||||
law_firm = ""
|
|
||||||
phone = ""
|
|
||||||
email = ""
|
|
||||||
address = ""
|
|
||||||
license_no = ""
|
|
||||||
practice_years: Optional[int] = None
|
|
||||||
|
|
||||||
name_tag = soup.select_one(".logo-box .title b")
|
|
||||||
if name_tag:
|
|
||||||
name = name_tag.get_text(strip=True).replace("律师", "").strip()
|
|
||||||
if not name and soup.title:
|
|
||||||
match = re.search(r"([^\s,,。_]+?)律师", soup.title.get_text(" ", strip=True))
|
|
||||||
if match:
|
|
||||||
name = match.group(1).strip()
|
|
||||||
|
|
||||||
phone_candidates = [
|
|
||||||
soup.select_one(".logo-box .r-bar .tel").get_text(" ", strip=True)
|
|
||||||
if soup.select_one(".logo-box .r-bar .tel")
|
|
||||||
else "",
|
|
||||||
soup.select_one(".lawyer-show ul.info").get_text(" ", strip=True)
|
|
||||||
if soup.select_one(".lawyer-show ul.info")
|
|
||||||
else "",
|
|
||||||
full_text,
|
|
||||||
]
|
|
||||||
for candidate in phone_candidates:
|
|
||||||
phone = normalize_phone(candidate)
|
|
||||||
if phone:
|
|
||||||
break
|
|
||||||
|
|
||||||
for li in soup.select(".lawyer-show ul.info li"):
|
|
||||||
li_text = li.get_text(" ", strip=True)
|
|
||||||
if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
|
|
||||||
law_firm = li_text
|
|
||||||
|
|
||||||
if not law_firm:
|
|
||||||
match = re.search(r'"affiliation":\{"@type":"Organization","name":"([^"]+)"', html)
|
|
||||||
if match:
|
|
||||||
law_firm = match.group(1).strip()
|
|
||||||
|
|
||||||
match = re.search(r'"identifier":"([^"]+)"', html)
|
|
||||||
if match:
|
|
||||||
license_no = match.group(1).strip()
|
|
||||||
|
|
||||||
match = re.search(r'"streetAddress":"([^"]+)"', html)
|
|
||||||
if match:
|
|
||||||
address = match.group(1).strip()
|
|
||||||
|
|
||||||
email_match = EMAIL_RE.search(html)
|
|
||||||
if email_match:
|
|
||||||
email = email_match.group(0).strip()
|
|
||||||
|
|
||||||
year_match = YEAR_RE.search(full_text)
|
|
||||||
if year_match:
|
|
||||||
try:
|
|
||||||
practice_years = int(year_match.group(1))
|
|
||||||
except Exception:
|
|
||||||
practice_years = None
|
|
||||||
|
|
||||||
specialties = [node.get_text(strip=True) for node in soup.select(".tag-h38 span")]
|
|
||||||
specialties = [x for x in specialties if x]
|
|
||||||
|
|
||||||
return {
|
|
||||||
"name": name,
|
|
||||||
"law_firm": law_firm,
|
|
||||||
"phone": phone,
|
|
||||||
"email": email,
|
|
||||||
"address": address,
|
|
||||||
"license_no": license_no,
|
|
||||||
"practice_years": practice_years,
|
|
||||||
"specialties": specialties,
|
|
||||||
"detail_url": detail_url,
|
|
||||||
"contact_url": contact_url,
|
|
||||||
}
|
|
||||||
|
|
||||||
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
|
|
||||||
seen_details: Set[str] = set()
|
|
||||||
|
|
||||||
for page in range(1, self.max_pages + 1):
|
|
||||||
try:
|
|
||||||
items, page_count = self.fetch_list_page(target, page)
|
|
||||||
except Exception as exc:
|
|
||||||
print(f"[list] 失败 pid={target.province_id} cid={target.city_id} p{page}: {exc}")
|
|
||||||
break
|
|
||||||
|
|
||||||
if not items:
|
|
||||||
break
|
|
||||||
|
|
||||||
for item in items:
|
|
||||||
detail_url = str(item.get("lawyerUrl") or "").strip()
|
|
||||||
if not detail_url:
|
|
||||||
continue
|
|
||||||
if detail_url.startswith("//"):
|
|
||||||
detail_url = "https:" + detail_url
|
|
||||||
if not detail_url.startswith("http"):
|
|
||||||
detail_url = urljoin(SITE_BASE, detail_url)
|
|
||||||
|
|
||||||
if detail_url in seen_details:
|
|
||||||
continue
|
|
||||||
seen_details.add(detail_url)
|
|
||||||
|
|
||||||
try:
|
|
||||||
detail = self.parse_detail(detail_url)
|
|
||||||
except Exception as exc:
|
|
||||||
print(f"[detail] 失败 {detail_url}: {exc}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
now = int(time.time())
|
|
||||||
uid = str(item.get("lawyerId") or item.get("globalUserId") or detail_url)
|
|
||||||
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
|
|
||||||
|
|
||||||
list_name = str(item.get("name") or "").replace("律师", "").strip()
|
|
||||||
category_text = str(item.get("categoryNames") or "").strip()
|
|
||||||
category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()]
|
|
||||||
|
|
||||||
yield {
|
|
||||||
"record_id": record_id,
|
|
||||||
"collected_at": now,
|
|
||||||
"source": {
|
|
||||||
"site": SITE_NAME,
|
|
||||||
"province_id": target.province_id,
|
|
||||||
"province": target.province_name,
|
|
||||||
"city_id": target.city_id,
|
|
||||||
"city": target.city_name,
|
|
||||||
"page": page,
|
|
||||||
"detail_url": detail_url,
|
|
||||||
"contact_url": detail.get("contact_url", ""),
|
|
||||||
},
|
|
||||||
"list_snapshot": {
|
|
||||||
"lawyer_id": item.get("lawyerId"),
|
|
||||||
"name": list_name,
|
|
||||||
"category_names": category_arr,
|
|
||||||
"help_count": strip_html_tags(str(item.get("helpCount") or "")),
|
|
||||||
"comment_score": strip_html_tags(str(item.get("commentScore") or "")),
|
|
||||||
"response_time": str(item.get("responseTime") or "").strip(),
|
|
||||||
"year": item.get("year"),
|
|
||||||
"is_adv": bool(item.get("isAdv")),
|
|
||||||
},
|
|
||||||
"profile": {
|
|
||||||
"name": detail.get("name") or list_name,
|
|
||||||
"law_firm": detail.get("law_firm") or "",
|
|
||||||
"phone": detail.get("phone") or "",
|
|
||||||
"email": detail.get("email") or "",
|
|
||||||
"address": detail.get("address") or "",
|
|
||||||
"license_no": detail.get("license_no") or "",
|
|
||||||
"practice_years": detail.get("practice_years"),
|
|
||||||
"specialties": detail.get("specialties") or category_arr,
|
|
||||||
},
|
|
||||||
"raw": item,
|
|
||||||
}
|
}
|
||||||
|
print(f"[华律] 城市来源表: {table}, 城市数: {len(cities)}")
|
||||||
|
return city_map
|
||||||
|
|
||||||
if self.sleep_seconds:
|
if last_error:
|
||||||
time.sleep(self.sleep_seconds)
|
print(f"[华律] 加载地区数据失败: {last_error}")
|
||||||
|
print("[华律] 无城市数据(已尝试 area_new/area2/area)")
|
||||||
|
return {}
|
||||||
|
|
||||||
if page_count > 0 and page >= page_count:
|
def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
|
||||||
break
|
for attempt in range(max_retries):
|
||||||
|
try:
|
||||||
|
with request_slot():
|
||||||
|
resp = self.session.post(LIST_URL, data=data, timeout=20, verify=False)
|
||||||
|
status_code = resp.status_code
|
||||||
|
text = resp.text
|
||||||
|
resp.close()
|
||||||
|
if status_code == 403:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
|
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||||
|
self._refresh_session()
|
||||||
|
time.sleep(wait_time)
|
||||||
|
continue
|
||||||
|
print("请求失败: 403 Forbidden")
|
||||||
|
return None
|
||||||
|
if status_code >= 400:
|
||||||
|
raise requests.exceptions.HTTPError(f"{status_code} Error")
|
||||||
|
try:
|
||||||
|
return json.loads(text)
|
||||||
|
except ValueError as exc:
|
||||||
|
print(f"解析JSON失败: {exc}")
|
||||||
|
return None
|
||||||
|
except requests.exceptions.RequestException as exc:
|
||||||
|
print(f"请求失败: {exc}")
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
|
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
|
||||||
source = record.get("source", {}) or {}
|
contact_url = f"{url}lawyer_contact.aspx"
|
||||||
profile = record.get("profile", {}) or {}
|
print(f" 详情: {contact_url}")
|
||||||
|
existing = self.db.select_data(
|
||||||
|
"lawyer",
|
||||||
|
"id, avatar_url",
|
||||||
|
f"domain='{DOMAIN}' AND url='{contact_url}'"
|
||||||
|
)
|
||||||
|
existing_id = None
|
||||||
|
if existing:
|
||||||
|
existing_id = existing[0].get("id")
|
||||||
|
avatar = (existing[0].get("avatar_url") or "").strip()
|
||||||
|
if avatar:
|
||||||
|
print(" -- 已存在且头像已补全,跳过")
|
||||||
|
return None
|
||||||
|
|
||||||
phone = normalize_phone(profile.get("phone", ""))
|
html = self._get_detail(contact_url)
|
||||||
if not phone:
|
if not html:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
province = (source.get("province") or "").strip()
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
city = (source.get("city") or province).strip()
|
info_list = soup.find("ul", class_="information-list")
|
||||||
return {
|
if not info_list:
|
||||||
"name": (profile.get("name") or "").strip(),
|
return None
|
||||||
"law_firm": (profile.get("law_firm") or "").strip(),
|
|
||||||
|
phone = ""
|
||||||
|
law_firm = ""
|
||||||
|
for li in info_list.find_all("li"):
|
||||||
|
text = li.get_text(strip=True)
|
||||||
|
if "手机号" in text:
|
||||||
|
cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
|
||||||
|
match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
|
||||||
|
if match:
|
||||||
|
phone = match.group(0)
|
||||||
|
if "执业单位" in text:
|
||||||
|
law_firm = text.replace("执业单位", "").strip()
|
||||||
|
|
||||||
|
name = ""
|
||||||
|
breadcrumb = soup.find("div", class_="weizhi")
|
||||||
|
if breadcrumb:
|
||||||
|
links = breadcrumb.find_all("a")
|
||||||
|
if len(links) > 2:
|
||||||
|
name = links[2].get_text(strip=True)
|
||||||
|
|
||||||
|
phone = phone.replace('-', '').strip()
|
||||||
|
if not phone or not re.fullmatch(r"1\d{10}", phone):
|
||||||
|
print(" 无手机号,跳过")
|
||||||
|
return None
|
||||||
|
|
||||||
|
avatar_url, site_time = self._extract_avatar_and_time(soup)
|
||||||
|
data = {
|
||||||
|
"phone": phone,
|
||||||
"province": province,
|
"province": province,
|
||||||
"city": city,
|
"city": city,
|
||||||
"phone": phone,
|
"law_firm": law_firm,
|
||||||
"url": (source.get("contact_url") or source.get("detail_url") or "").strip(),
|
"url": contact_url,
|
||||||
"domain": LEGACY_DOMAIN,
|
"avatar_url": avatar_url,
|
||||||
"create_time": int(record.get("collected_at") or time.time()),
|
"create_time": int(time.time()),
|
||||||
"params": json.dumps(record, ensure_ascii=False),
|
"site_time": site_time,
|
||||||
|
"domain": DOMAIN,
|
||||||
|
"name": name,
|
||||||
|
"params": json.dumps({"source": url}, ensure_ascii=False)
|
||||||
}
|
}
|
||||||
|
if existing_id:
|
||||||
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
|
update_data = {
|
||||||
if not self.db or not phones:
|
"avatar_url": avatar_url,
|
||||||
return set()
|
"site_time": site_time,
|
||||||
|
}
|
||||||
deduped = sorted({p for p in phones if p})
|
if name:
|
||||||
if not deduped:
|
update_data["name"] = name
|
||||||
return set()
|
if law_firm:
|
||||||
|
update_data["law_firm"] = law_firm
|
||||||
existing: Set[str] = set()
|
if province:
|
||||||
cur = self.db.db.cursor()
|
update_data["province"] = province
|
||||||
try:
|
if city:
|
||||||
chunk_size = 500
|
update_data["city"] = city
|
||||||
for i in range(0, len(deduped), chunk_size):
|
if phone:
|
||||||
chunk = deduped[i:i + chunk_size]
|
update_data["phone"] = phone
|
||||||
placeholders = ",".join(["%s"] * len(chunk))
|
update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
|
||||||
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
|
||||||
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
|
|
||||||
for row in cur.fetchall():
|
|
||||||
existing.add(row[0])
|
|
||||||
finally:
|
|
||||||
cur.close()
|
|
||||||
|
|
||||||
return existing
|
|
||||||
|
|
||||||
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
|
|
||||||
if not self.db:
|
|
||||||
return 0, 0
|
|
||||||
|
|
||||||
rows: List[Dict[str, str]] = []
|
|
||||||
for record in records:
|
|
||||||
row = self._to_legacy_lawyer_row(record)
|
|
||||||
if row:
|
|
||||||
rows.append(row)
|
|
||||||
if not rows:
|
|
||||||
return 0, 0
|
|
||||||
|
|
||||||
existing = self._existing_phones_in_db([row["phone"] for row in rows])
|
|
||||||
inserted = 0
|
|
||||||
skipped = 0
|
|
||||||
|
|
||||||
for row in rows:
|
|
||||||
phone = row.get("phone", "")
|
|
||||||
if not phone or phone in existing:
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
try:
|
try:
|
||||||
self.db.insert_data("lawyer", row)
|
self.db.update_data("lawyer", update_data, f"id={existing_id}")
|
||||||
existing.add(phone)
|
print(" -- 已存在,已补全头像/时间")
|
||||||
inserted += 1
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
skipped += 1
|
print(f" 更新失败: {exc}")
|
||||||
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
|
return None
|
||||||
|
# 若手机号已存在,则更新头像/时间,不再插入新记录
|
||||||
|
existing_phone = self.db.select_data(
|
||||||
|
"lawyer",
|
||||||
|
"id, avatar_url, url",
|
||||||
|
f"domain='{DOMAIN}' AND phone='{phone}'"
|
||||||
|
)
|
||||||
|
if existing_phone:
|
||||||
|
existing_row = existing_phone[0]
|
||||||
|
avatar = (existing_row.get("avatar_url") or "").strip()
|
||||||
|
if avatar:
|
||||||
|
print(" -- 已存在手机号且头像已补全,跳过")
|
||||||
|
return None
|
||||||
|
update_data = {
|
||||||
|
"avatar_url": avatar_url,
|
||||||
|
"site_time": site_time,
|
||||||
|
}
|
||||||
|
if name:
|
||||||
|
update_data["name"] = name
|
||||||
|
if law_firm:
|
||||||
|
update_data["law_firm"] = law_firm
|
||||||
|
if province:
|
||||||
|
update_data["province"] = province
|
||||||
|
if city:
|
||||||
|
update_data["city"] = city
|
||||||
|
if phone:
|
||||||
|
update_data["phone"] = phone
|
||||||
|
if not existing_row.get("url"):
|
||||||
|
update_data["url"] = contact_url
|
||||||
|
update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
|
||||||
|
try:
|
||||||
|
self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
|
||||||
|
print(" -- 已存在手机号,已补全头像/时间")
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" 更新失败: {exc}")
|
||||||
|
return None
|
||||||
|
return data
|
||||||
|
|
||||||
return inserted, skipped
|
def _extract_avatar_and_time(self, soup: BeautifulSoup) -> (str, Optional[int]):
|
||||||
|
avatar_url = ""
|
||||||
|
site_time = None
|
||||||
|
img_tag = soup.select_one(
|
||||||
|
"div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
|
||||||
|
)
|
||||||
|
if img_tag:
|
||||||
|
src = (img_tag.get("src") or "").strip()
|
||||||
|
if src:
|
||||||
|
if src.startswith("//"):
|
||||||
|
avatar_url = f"https:{src}"
|
||||||
|
else:
|
||||||
|
avatar_url = src
|
||||||
|
match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
|
||||||
|
if match:
|
||||||
|
site_time = int(f"{match.group(1)}{match.group(2)}")
|
||||||
|
else:
|
||||||
|
match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
|
||||||
|
if match:
|
||||||
|
site_time = int(f"{match.group(1)}{match.group(2)}")
|
||||||
|
return avatar_url, site_time
|
||||||
|
|
||||||
def crawl(
|
def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
|
||||||
self,
|
for attempt in range(max_retries):
|
||||||
output_path: str,
|
try:
|
||||||
max_cities: int = 0,
|
with request_slot():
|
||||||
city_filter: Optional[str] = None,
|
resp = self.session.get(url, timeout=15, verify=False)
|
||||||
) -> None:
|
status_code = resp.status_code
|
||||||
cities = self.discover_cities()
|
text = resp.text
|
||||||
print(f"[discover] 共发现城市 {len(cities)} 个")
|
resp.close()
|
||||||
|
if status_code == 403:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
|
print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||||
|
self._refresh_session()
|
||||||
|
time.sleep(wait_time)
|
||||||
|
continue
|
||||||
|
print(" 请求失败: 403 Forbidden")
|
||||||
|
return None
|
||||||
|
if status_code >= 400:
|
||||||
|
raise requests.exceptions.HTTPError(f"{status_code} Error")
|
||||||
|
return text
|
||||||
|
except requests.exceptions.RequestException as exc:
|
||||||
|
print(f" 请求失败: {exc}")
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
if city_filter:
|
def run(self):
|
||||||
key = city_filter.strip().lower()
|
print("启动华律网采集...")
|
||||||
cities = [
|
if not self.areas:
|
||||||
c for c in cities
|
print("无城市数据")
|
||||||
if key in c.city_name.lower() or key in str(c.city_id).lower()
|
return
|
||||||
]
|
|
||||||
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
|
|
||||||
|
|
||||||
if max_cities > 0:
|
for city_code, city_info in self.areas.items():
|
||||||
cities = cities[:max_cities]
|
province_code = city_info.get("province_code")
|
||||||
print(f"[discover] 截断城市数 {len(cities)}")
|
if not province_code:
|
||||||
|
continue
|
||||||
|
province_name = city_info.get("province", "")
|
||||||
|
city_name = city_info.get("name", "")
|
||||||
|
print(f"采集 {province_name}-{city_name}")
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
|
page = 1
|
||||||
|
while True:
|
||||||
|
payload = {"pid": province_code, "cid": city_code, "page": str(page)}
|
||||||
|
data = self._post(payload)
|
||||||
|
if not data or not data.get("lawyerList"):
|
||||||
|
break
|
||||||
|
|
||||||
seen_ids: Set[str] = set()
|
for item in data["lawyerList"]:
|
||||||
if os.path.exists(output_path):
|
result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
|
||||||
with open(output_path, "r", encoding="utf-8") as old_file:
|
if not result:
|
||||||
for line in old_file:
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
item = json.loads(line)
|
self.db.insert_data("lawyer", result)
|
||||||
except Exception:
|
print(f" -> 新增: {result['name']} ({result['phone']})")
|
||||||
continue
|
except Exception as exc:
|
||||||
rid = item.get("record_id")
|
print(f" 插入失败: {exc}")
|
||||||
if rid:
|
time.sleep(1)
|
||||||
seen_ids.add(rid)
|
|
||||||
print(f"[resume] 已有记录 {len(seen_ids)} 条")
|
|
||||||
|
|
||||||
total_new_json = 0
|
page_count = data.get("lawyerItems", {}).get("pageCount", page)
|
||||||
total_new_db = 0
|
if page >= page_count:
|
||||||
total_skip_db = 0
|
break
|
||||||
|
page += 1
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
with open(output_path, "a", encoding="utf-8") as out:
|
time.sleep(1)
|
||||||
for idx, target in enumerate(cities, start=1):
|
print("华律网采集完成")
|
||||||
print(
|
|
||||||
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
|
|
||||||
f"(pid={target.province_id}, cid={target.city_id})"
|
|
||||||
)
|
|
||||||
city_records = list(self.crawl_city(target))
|
|
||||||
|
|
||||||
city_new_json = 0
|
|
||||||
for record in city_records:
|
|
||||||
rid = record["record_id"]
|
|
||||||
if rid in seen_ids:
|
|
||||||
continue
|
|
||||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
||||||
seen_ids.add(rid)
|
|
||||||
city_new_json += 1
|
|
||||||
total_new_json += 1
|
|
||||||
|
|
||||||
city_new_db, city_skip_db = self._write_records_to_db(city_records)
|
|
||||||
total_new_db += city_new_db
|
|
||||||
total_skip_db += city_skip_db
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
|
|
||||||
f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
|
|
||||||
)
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
|
|
||||||
f"DB跳过{total_skip_db}条, 输出: {output_path}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="华律网全新采集脚本(站点数据直采)")
|
|
||||||
parser.add_argument(
|
|
||||||
"--output",
|
|
||||||
default="/www/wwwroot/lawyers/data/hualv_records_all.jsonl",
|
|
||||||
help="输出 jsonl 文件路径",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-cities",
|
|
||||||
type=int,
|
|
||||||
default=0,
|
|
||||||
help="最多采集多少个城市,0 表示不限",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-pages",
|
|
||||||
type=int,
|
|
||||||
default=9999,
|
|
||||||
help="每个城市最多采集多少页",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--city-filter",
|
|
||||||
default="",
|
|
||||||
help="按城市名称或城市编码过滤,如 beijing / 110100",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--sleep",
|
|
||||||
type=float,
|
|
||||||
default=0.15,
|
|
||||||
help="详情页请求间隔秒数",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--direct",
|
|
||||||
action="store_true",
|
|
||||||
help="直连模式,不使用 proxy_settings.json 代理",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--no-db",
|
|
||||||
action="store_true",
|
|
||||||
help="只输出 JSONL,不写入数据库",
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
if args.no_db:
|
|
||||||
crawler = HualvCrawler(
|
|
||||||
max_pages=args.max_pages,
|
|
||||||
sleep_seconds=args.sleep,
|
|
||||||
use_proxy=not args.direct,
|
|
||||||
db_connection=None,
|
|
||||||
)
|
|
||||||
crawler.crawl(
|
|
||||||
output_path=args.output,
|
|
||||||
max_cities=args.max_cities,
|
|
||||||
city_filter=args.city_filter or None,
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
with Db() as db:
|
|
||||||
crawler = HualvCrawler(
|
|
||||||
max_pages=args.max_pages,
|
|
||||||
sleep_seconds=args.sleep,
|
|
||||||
use_proxy=not args.direct,
|
|
||||||
db_connection=db,
|
|
||||||
)
|
|
||||||
crawler.crawl(
|
|
||||||
output_path=args.output,
|
|
||||||
max_cities=args.max_cities,
|
|
||||||
city_filter=args.city_filter or None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
with Db() as db:
|
||||||
|
spider = HualvSpider(db)
|
||||||
|
spider.run()
|
||||||
|
|||||||
+238
-586
@@ -1,16 +1,13 @@
|
|||||||
import argparse
|
|
||||||
import hashlib
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
import random
|
||||||
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
from typing import Dict, Optional, List, Set
|
||||||
|
from urllib.parse import urljoin
|
||||||
import urllib3
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from bs4 import BeautifulSoup
|
import threading
|
||||||
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
project_root = os.path.dirname(current_dir)
|
project_root = os.path.dirname(current_dir)
|
||||||
@@ -20,628 +17,283 @@ if request_dir not in sys.path:
|
|||||||
if project_root not in sys.path:
|
if project_root not in sys.path:
|
||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
from Db import Db
|
import requests
|
||||||
from request.requests_client import RequestClientError, RequestsClient
|
import urllib3
|
||||||
from utils.rate_limiter import wait_for_request
|
from bs4 import BeautifulSoup
|
||||||
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
SITE_NAME = "lawtime"
|
from Db import Db
|
||||||
LEGACY_DOMAIN = "法律快车"
|
from config import LAWTIME_CONFIG
|
||||||
SITE_BASE = "https://www.lawtime.cn"
|
from utils.rate_limiter import request_slot
|
||||||
PROVINCE_API = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode=0"
|
|
||||||
CITY_API_TEMPLATE = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode={province_id}"
|
|
||||||
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/lawyer"
|
|
||||||
|
|
||||||
PHONE_RE = re.compile(r"1[3-9]\d{9}")
|
LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
|
||||||
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
|
DETAIL_BASE = "https://m.lawtime.cn"
|
||||||
|
DOMAIN = "法律快车"
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
class LawtimeSpider:
|
||||||
class CityTarget:
|
def __init__(self, db_connection):
|
||||||
province_id: str
|
|
||||||
province_name: str
|
|
||||||
province_py: str
|
|
||||||
city_id: str
|
|
||||||
city_name: str
|
|
||||||
city_py: str
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ListCard:
|
|
||||||
detail_url: str
|
|
||||||
name: str
|
|
||||||
phone: str
|
|
||||||
address: str = ""
|
|
||||||
specialties: List[str] = field(default_factory=list)
|
|
||||||
metric_text: str = ""
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_phone(text: str) -> str:
|
|
||||||
compact = re.sub(r"\D", "", text or "")
|
|
||||||
match = PHONE_RE.search(compact)
|
|
||||||
return match.group(0) if match else ""
|
|
||||||
|
|
||||||
|
|
||||||
class LawtimeCrawler:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
max_pages: int = 9999,
|
|
||||||
sleep_seconds: float = 0.1,
|
|
||||||
use_proxy: bool = True,
|
|
||||||
db_connection=None,
|
|
||||||
):
|
|
||||||
self.max_pages = max_pages
|
|
||||||
self.sleep_seconds = max(0.0, sleep_seconds)
|
|
||||||
self.db = db_connection
|
self.db = db_connection
|
||||||
self.client = RequestsClient(
|
self.session = self._build_session()
|
||||||
headers={
|
self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
|
||||||
"User-Agent": (
|
self._tls = threading.local()
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
||||||
"Chrome/122.0.0.0 Safari/537.36"
|
|
||||||
),
|
|
||||||
"Accept": "text/html,application/json,*/*;q=0.8",
|
|
||||||
"Connection": "close",
|
|
||||||
},
|
|
||||||
use_proxy=use_proxy,
|
|
||||||
retry_total=2,
|
|
||||||
retry_backoff_factor=1,
|
|
||||||
retry_status_forcelist=(429, 500, 502, 503, 504),
|
|
||||||
retry_allowed_methods=("GET",),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_text(
|
def _build_session(self) -> requests.Session:
|
||||||
self,
|
report_proxy_status()
|
||||||
url: str,
|
session = requests.Session()
|
||||||
*,
|
session.trust_env = False
|
||||||
timeout: int = 20,
|
proxies = get_proxies()
|
||||||
max_retries: int = 3,
|
if proxies:
|
||||||
referer: str = SITE_BASE,
|
session.proxies.update(proxies)
|
||||||
) -> str:
|
else:
|
||||||
headers = {"Referer": referer}
|
session.proxies.clear()
|
||||||
last_error: Optional[Exception] = None
|
headers = LAWTIME_CONFIG.get("HEADERS", {})
|
||||||
|
if headers:
|
||||||
|
session.headers.update(headers)
|
||||||
|
session.headers.setdefault("Connection", "close")
|
||||||
|
return session
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
def _refresh_session(self) -> None:
|
||||||
wait_for_request()
|
|
||||||
try:
|
|
||||||
resp = self.client.get_text(
|
|
||||||
url,
|
|
||||||
timeout=timeout,
|
|
||||||
verify=False,
|
|
||||||
headers=headers,
|
|
||||||
)
|
|
||||||
code = resp.status_code
|
|
||||||
if code == 403:
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
self.client.refresh()
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
raise RequestClientError(f"{code} Error: {url}")
|
|
||||||
if code >= 500 and attempt < max_retries - 1:
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
if code >= 400:
|
|
||||||
raise RequestClientError(f"{code} Error: {url}")
|
|
||||||
return resp.text
|
|
||||||
except Exception as exc:
|
|
||||||
last_error = exc
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
raise
|
|
||||||
|
|
||||||
if last_error is not None:
|
|
||||||
raise last_error
|
|
||||||
raise RequestClientError(f"Unknown request error: {url}")
|
|
||||||
|
|
||||||
def _get_json(self, url: str, *, referer: str) -> List[Dict]:
|
|
||||||
text = self._get_text(url, referer=referer)
|
|
||||||
cleaned = (text or "").strip().lstrip("\ufeff")
|
|
||||||
if not cleaned or cleaned.startswith("<"):
|
|
||||||
return []
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(cleaned)
|
self.session.close()
|
||||||
except ValueError:
|
except Exception:
|
||||||
return []
|
pass
|
||||||
return data if isinstance(data, list) else []
|
self.session = self._build_session()
|
||||||
|
|
||||||
def discover_cities(self) -> List[CityTarget]:
|
def _get_thread_session(self) -> requests.Session:
|
||||||
provinces = self._get_json(PROVINCE_API, referer=SITE_BASE)
|
s = getattr(self._tls, "session", None)
|
||||||
if not provinces:
|
if s is not None:
|
||||||
print("[discover] 地区接口未返回有效数据")
|
return s
|
||||||
return []
|
s = self._build_session()
|
||||||
|
s.headers.update(dict(self.session.headers))
|
||||||
|
self._tls.session = s
|
||||||
|
return s
|
||||||
|
|
||||||
results: List[CityTarget] = []
|
def _refresh_thread_session(self) -> None:
|
||||||
seen_py: Set[str] = set()
|
s = getattr(self._tls, "session", None)
|
||||||
|
if s is not None:
|
||||||
for province in provinces:
|
|
||||||
province_id = str(province.get("id") or "").strip()
|
|
||||||
province_name = str(province.get("province") or province.get("city") or "").strip()
|
|
||||||
province_py = str(province.get("pinyin") or "").strip()
|
|
||||||
if not province_id or not province_name:
|
|
||||||
continue
|
|
||||||
|
|
||||||
city_api = CITY_API_TEMPLATE.format(province_id=province_id)
|
|
||||||
try:
|
try:
|
||||||
cities = self._get_json(city_api, referer=LIST_URL_TEMPLATE.format(city_py=province_py or ""))
|
s.close()
|
||||||
except Exception as exc:
|
|
||||||
print(f"[city] 获取失败 province={province_id}: {exc}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not cities:
|
|
||||||
cities = [
|
|
||||||
{
|
|
||||||
"id": province_id,
|
|
||||||
"province": province_name,
|
|
||||||
"city": province_name,
|
|
||||||
"pinyin": province_py,
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
for city in cities:
|
|
||||||
city_id = str(city.get("id") or "").strip()
|
|
||||||
city_name = str(city.get("city") or city.get("province") or "").strip()
|
|
||||||
city_py = str(city.get("pinyin") or "").strip()
|
|
||||||
if not city_id or not city_name or not city_py:
|
|
||||||
continue
|
|
||||||
if city_py in seen_py:
|
|
||||||
continue
|
|
||||||
seen_py.add(city_py)
|
|
||||||
|
|
||||||
results.append(
|
|
||||||
CityTarget(
|
|
||||||
province_id=province_id,
|
|
||||||
province_name=province_name,
|
|
||||||
province_py=province_py,
|
|
||||||
city_id=city_id,
|
|
||||||
city_name=city_name,
|
|
||||||
city_py=city_py,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
def _build_list_url(self, city_py: str, page: int) -> str:
|
|
||||||
base = LIST_URL_TEMPLATE.format(city_py=city_py)
|
|
||||||
if page <= 1:
|
|
||||||
return base
|
|
||||||
return f"{base}?page={page}"
|
|
||||||
|
|
||||||
def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[ListCard], bool, str]:
|
|
||||||
list_url = self._build_list_url(target.city_py, page)
|
|
||||||
html = self._get_text(list_url, referer=SITE_BASE + "/")
|
|
||||||
|
|
||||||
cards = self.parse_list_cards(html)
|
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
|
||||||
next_link = soup.select_one(f"div.page a[href*='page={page + 1}']")
|
|
||||||
has_next = next_link is not None
|
|
||||||
|
|
||||||
return cards, has_next, list_url
|
|
||||||
|
|
||||||
def parse_list_cards(self, html: str) -> List[ListCard]:
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
|
||||||
cards: List[ListCard] = []
|
|
||||||
seen: Set[str] = set()
|
|
||||||
|
|
||||||
for item in soup.select("li.lawyer-item-card"):
|
|
||||||
link_tag = item.select_one("a.name[href]") or item.select_one("a.lawyer-img-box[href]")
|
|
||||||
if not link_tag:
|
|
||||||
continue
|
|
||||||
detail_url = (link_tag.get("href") or "").strip()
|
|
||||||
if not detail_url.startswith("http"):
|
|
||||||
continue
|
|
||||||
if detail_url in seen:
|
|
||||||
continue
|
|
||||||
seen.add(detail_url)
|
|
||||||
|
|
||||||
name = link_tag.get_text(strip=True)
|
|
||||||
phone = ""
|
|
||||||
phone_tag = item.select_one("div.phone")
|
|
||||||
if phone_tag:
|
|
||||||
phone = normalize_phone(phone_tag.get_text(" ", strip=True))
|
|
||||||
|
|
||||||
address = ""
|
|
||||||
addr_tag = item.select_one("div.location .txt")
|
|
||||||
if addr_tag:
|
|
||||||
address = addr_tag.get_text(" ", strip=True)
|
|
||||||
|
|
||||||
specialties: List[str] = []
|
|
||||||
prof_tag = item.select_one("div.prof .txt")
|
|
||||||
if prof_tag:
|
|
||||||
specialties = [
|
|
||||||
x.strip() for x in re.split(r"[、,,]", prof_tag.get_text(" ", strip=True)) if x.strip()
|
|
||||||
]
|
|
||||||
|
|
||||||
metric_text = ""
|
|
||||||
metric_tag = item.select_one("div.num-msg")
|
|
||||||
if metric_tag:
|
|
||||||
metric_text = metric_tag.get_text(" ", strip=True)
|
|
||||||
|
|
||||||
cards.append(
|
|
||||||
ListCard(
|
|
||||||
detail_url=detail_url,
|
|
||||||
name=name,
|
|
||||||
phone=phone,
|
|
||||||
address=address,
|
|
||||||
specialties=specialties,
|
|
||||||
metric_text=metric_text,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return cards
|
|
||||||
|
|
||||||
def parse_detail(self, detail_url: str) -> Dict:
|
|
||||||
html = self._get_text(detail_url, referer=SITE_BASE)
|
|
||||||
if "网站防火墙" in html or "访问被拒绝" in html or "/ipfilter/verify" in html:
|
|
||||||
raise RequestClientError(f"firewall blocked: {detail_url}")
|
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
|
||||||
text = soup.get_text(" ", strip=True)
|
|
||||||
|
|
||||||
name = ""
|
|
||||||
law_firm = ""
|
|
||||||
phone = ""
|
|
||||||
address = ""
|
|
||||||
practice_years: Optional[int] = None
|
|
||||||
specialties: List[str] = []
|
|
||||||
|
|
||||||
if soup.title:
|
|
||||||
title = soup.title.get_text(" ", strip=True)
|
|
||||||
match = re.search(r"([^\s_,,。]+?)律师", title)
|
|
||||||
if match:
|
|
||||||
name = match.group(1).strip()
|
|
||||||
|
|
||||||
phone_candidates = [
|
|
||||||
soup.select_one(".data-w .tel-b b").get_text(" ", strip=True)
|
|
||||||
if soup.select_one(".data-w .tel-b b")
|
|
||||||
else "",
|
|
||||||
soup.select_one(".law-info-b .item .two-r.b").get_text(" ", strip=True)
|
|
||||||
if soup.select_one(".law-info-b .item .two-r.b")
|
|
||||||
else "",
|
|
||||||
text,
|
|
||||||
]
|
|
||||||
for candidate in phone_candidates:
|
|
||||||
phone = normalize_phone(candidate)
|
|
||||||
if phone:
|
|
||||||
break
|
|
||||||
|
|
||||||
law_firm_tag = soup.select_one(".law-info-b .item .two-nowrap")
|
|
||||||
if law_firm_tag:
|
|
||||||
law_firm = law_firm_tag.get_text(" ", strip=True)
|
|
||||||
|
|
||||||
for li in soup.select(".law-info-b .item"):
|
|
||||||
li_text = li.get_text(" ", strip=True)
|
|
||||||
if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
|
|
||||||
law_firm = li_text
|
|
||||||
|
|
||||||
addr_tag = soup.select_one(".law-info-b .item .two-r[title]")
|
|
||||||
if addr_tag:
|
|
||||||
addr_value = (addr_tag.get("title") or "").strip()
|
|
||||||
if len(addr_value) > 8:
|
|
||||||
address = addr_value
|
|
||||||
|
|
||||||
if not address:
|
|
||||||
addr_tag = soup.select_one(".law-info-b .item .two-r")
|
|
||||||
if addr_tag:
|
|
||||||
addr_value = addr_tag.get_text(" ", strip=True)
|
|
||||||
if len(addr_value) > 8 and "律师" not in addr_value:
|
|
||||||
address = addr_value
|
|
||||||
|
|
||||||
year_match = YEAR_RE.search(text)
|
|
||||||
if year_match:
|
|
||||||
try:
|
|
||||||
practice_years = int(year_match.group(1))
|
|
||||||
except Exception:
|
except Exception:
|
||||||
practice_years = None
|
pass
|
||||||
|
self._tls.session = None
|
||||||
|
|
||||||
specialties = [x.get_text(strip=True) for x in soup.select(".profession-b .item") if x.get_text(strip=True)]
|
def _existing_phones(self, phones: List[str]) -> Set[str]:
|
||||||
|
if not phones:
|
||||||
return {
|
|
||||||
"name": name,
|
|
||||||
"law_firm": law_firm,
|
|
||||||
"phone": phone,
|
|
||||||
"address": address,
|
|
||||||
"practice_years": practice_years,
|
|
||||||
"specialties": specialties,
|
|
||||||
"detail_url": detail_url,
|
|
||||||
}
|
|
||||||
|
|
||||||
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
|
|
||||||
seen_details: Set[str] = set()
|
|
||||||
|
|
||||||
for page in range(1, self.max_pages + 1):
|
|
||||||
try:
|
|
||||||
cards, has_next, list_url = self.fetch_list_page(target, page)
|
|
||||||
except Exception as exc:
|
|
||||||
print(f"[list] 失败 {target.city_py} p{page}: {exc}")
|
|
||||||
break
|
|
||||||
|
|
||||||
if not cards:
|
|
||||||
break
|
|
||||||
|
|
||||||
for card in cards:
|
|
||||||
if card.detail_url in seen_details:
|
|
||||||
continue
|
|
||||||
seen_details.add(card.detail_url)
|
|
||||||
|
|
||||||
detail: Dict = {}
|
|
||||||
try:
|
|
||||||
detail = self.parse_detail(card.detail_url)
|
|
||||||
except Exception as exc:
|
|
||||||
print(f"[detail] 失败 {card.detail_url}: {exc}")
|
|
||||||
|
|
||||||
phone = normalize_phone(detail.get("phone") or card.phone)
|
|
||||||
profile_name = (detail.get("name") or card.name).replace("律师", "").strip()
|
|
||||||
|
|
||||||
now = int(time.time())
|
|
||||||
record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()
|
|
||||||
|
|
||||||
yield {
|
|
||||||
"record_id": record_id,
|
|
||||||
"collected_at": now,
|
|
||||||
"source": {
|
|
||||||
"site": SITE_NAME,
|
|
||||||
"province_id": target.province_id,
|
|
||||||
"province": target.province_name,
|
|
||||||
"province_py": target.province_py,
|
|
||||||
"city_id": target.city_id,
|
|
||||||
"city": target.city_name,
|
|
||||||
"city_py": target.city_py,
|
|
||||||
"page": page,
|
|
||||||
"list_url": list_url,
|
|
||||||
"detail_url": card.detail_url,
|
|
||||||
},
|
|
||||||
"list_snapshot": {
|
|
||||||
"name": card.name,
|
|
||||||
"phone": card.phone,
|
|
||||||
"address": card.address,
|
|
||||||
"specialties": card.specialties,
|
|
||||||
"metric_text": card.metric_text,
|
|
||||||
},
|
|
||||||
"profile": {
|
|
||||||
"name": profile_name,
|
|
||||||
"law_firm": (detail.get("law_firm") or "").strip(),
|
|
||||||
"phone": phone,
|
|
||||||
"address": (detail.get("address") or card.address or "").strip(),
|
|
||||||
"practice_years": detail.get("practice_years"),
|
|
||||||
"specialties": detail.get("specialties") or card.specialties,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
if self.sleep_seconds:
|
|
||||||
time.sleep(self.sleep_seconds)
|
|
||||||
|
|
||||||
if not has_next:
|
|
||||||
break
|
|
||||||
|
|
||||||
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
|
|
||||||
source = record.get("source", {}) or {}
|
|
||||||
profile = record.get("profile", {}) or {}
|
|
||||||
|
|
||||||
phone = normalize_phone(profile.get("phone", ""))
|
|
||||||
if not phone:
|
|
||||||
return None
|
|
||||||
|
|
||||||
province = (source.get("province") or "").strip()
|
|
||||||
city = (source.get("city") or province).strip()
|
|
||||||
return {
|
|
||||||
"name": (profile.get("name") or "").strip(),
|
|
||||||
"law_firm": (profile.get("law_firm") or "").strip(),
|
|
||||||
"province": province,
|
|
||||||
"city": city,
|
|
||||||
"phone": phone,
|
|
||||||
"url": (source.get("detail_url") or source.get("list_url") or "").strip(),
|
|
||||||
"domain": LEGACY_DOMAIN,
|
|
||||||
"create_time": int(record.get("collected_at") or time.time()),
|
|
||||||
"params": json.dumps(record, ensure_ascii=False),
|
|
||||||
}
|
|
||||||
|
|
||||||
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
|
|
||||||
if not self.db or not phones:
|
|
||||||
return set()
|
return set()
|
||||||
|
|
||||||
deduped = sorted({p for p in phones if p})
|
|
||||||
if not deduped:
|
|
||||||
return set()
|
|
||||||
|
|
||||||
existing: Set[str] = set()
|
existing: Set[str] = set()
|
||||||
cur = self.db.db.cursor()
|
cur = self.db.db.cursor()
|
||||||
try:
|
try:
|
||||||
chunk_size = 500
|
chunk_size = 500
|
||||||
for i in range(0, len(deduped), chunk_size):
|
for i in range(0, len(phones), chunk_size):
|
||||||
chunk = deduped[i:i + chunk_size]
|
chunk = phones[i:i + chunk_size]
|
||||||
placeholders = ",".join(["%s"] * len(chunk))
|
placeholders = ",".join(["%s"] * len(chunk))
|
||||||
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
||||||
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
|
cur.execute(sql, [DOMAIN, *chunk])
|
||||||
for row in cur.fetchall():
|
for row in cur.fetchall():
|
||||||
existing.add(row[0])
|
existing.add(row[0])
|
||||||
finally:
|
finally:
|
||||||
cur.close()
|
cur.close()
|
||||||
|
|
||||||
return existing
|
return existing
|
||||||
|
|
||||||
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
|
def _load_areas(self):
|
||||||
if not self.db:
|
condition = "level = 2 and domain='法律快车'"
|
||||||
return 0, 0
|
tables = ("area_new", "area", "area2")
|
||||||
|
last_error = None
|
||||||
|
for table in tables:
|
||||||
|
try:
|
||||||
|
rows = self.db.select_data(table, "pinyin, province, city", condition) or []
|
||||||
|
except Exception as exc:
|
||||||
|
last_error = exc
|
||||||
|
continue
|
||||||
|
if rows:
|
||||||
|
missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
|
||||||
|
print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
|
||||||
|
return rows
|
||||||
|
|
||||||
rows: List[Dict[str, str]] = []
|
if last_error:
|
||||||
for record in records:
|
print(f"[法律快车] 加载地区数据失败: {last_error}")
|
||||||
row = self._to_legacy_lawyer_row(record)
|
print("[法律快车] 无城市数据(已尝试 area_new/area/area2)")
|
||||||
if row:
|
return []
|
||||||
rows.append(row)
|
|
||||||
if not rows:
|
|
||||||
return 0, 0
|
|
||||||
|
|
||||||
existing = self._existing_phones_in_db([row["phone"] for row in rows])
|
def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
|
||||||
inserted = 0
|
return self._get_with_session(self.session, url, max_retries=max_retries, is_thread=False)
|
||||||
skipped = 0
|
|
||||||
|
|
||||||
for row in rows:
|
def _get_with_session(self, session: requests.Session, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
|
||||||
phone = row.get("phone", "")
|
for attempt in range(max_retries):
|
||||||
if not phone or phone in existing:
|
try:
|
||||||
skipped += 1
|
with request_slot():
|
||||||
|
resp = session.get(url, timeout=15, verify=False)
|
||||||
|
status_code = resp.status_code
|
||||||
|
text = resp.text
|
||||||
|
resp.close()
|
||||||
|
if status_code == 403:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
|
print(f"请求失败 {url}: 403,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||||
|
if is_thread:
|
||||||
|
self._refresh_thread_session()
|
||||||
|
session = self._get_thread_session()
|
||||||
|
else:
|
||||||
|
self._refresh_session()
|
||||||
|
session = self.session
|
||||||
|
time.sleep(wait_time)
|
||||||
|
continue
|
||||||
|
print(f"请求失败 {url}: 403 Forbidden")
|
||||||
|
return None
|
||||||
|
if status_code >= 400:
|
||||||
|
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
|
||||||
|
return text
|
||||||
|
except requests.exceptions.RequestException as exc:
|
||||||
|
print(f"请求失败 {url}: {exc}")
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _parse_list(self, html: str, province: str, city: str) -> int:
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
links = [a.get("href", "") for a in soup.select("a.hide_link")]
|
||||||
|
links = [link.replace("lll", "int") for link in links if link]
|
||||||
|
if not links:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
detail_urls = [urljoin(DETAIL_BASE, link) for link in links]
|
||||||
|
|
||||||
|
results: List[Dict[str, str]] = []
|
||||||
|
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
|
||||||
|
futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
|
||||||
|
for fut in as_completed(futs):
|
||||||
|
try:
|
||||||
|
data = fut.result()
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" 详情解析异常: {exc}")
|
||||||
|
continue
|
||||||
|
if data and data.get("phone"):
|
||||||
|
results.append(data)
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return len(detail_urls)
|
||||||
|
|
||||||
|
phones = [d["phone"] for d in results if d.get("phone")]
|
||||||
|
existing = self._existing_phones(phones)
|
||||||
|
|
||||||
|
for data in results:
|
||||||
|
phone = data.get("phone")
|
||||||
|
if not phone:
|
||||||
|
continue
|
||||||
|
if phone in existing:
|
||||||
|
print(f" -- 已存在: {data['name']} ({phone})")
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
self.db.insert_data("lawyer", row)
|
self.db.insert_data("lawyer", data)
|
||||||
existing.add(phone)
|
print(f" -> 新增: {data['name']} ({phone})")
|
||||||
inserted += 1
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
skipped += 1
|
print(f" 插入失败 {data.get('url')}: {exc}")
|
||||||
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
|
|
||||||
|
|
||||||
return inserted, skipped
|
return len(detail_urls)
|
||||||
|
|
||||||
def crawl(
|
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
|
||||||
self,
|
html = None
|
||||||
output_path: str,
|
sess = self._get_thread_session()
|
||||||
max_cities: int = 0,
|
html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
|
||||||
city_filter: Optional[str] = None,
|
if not html:
|
||||||
) -> None:
|
return None
|
||||||
cities = self.discover_cities()
|
|
||||||
print(f"[discover] 共发现城市 {len(cities)} 个")
|
|
||||||
|
|
||||||
if city_filter:
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
key = city_filter.strip().lower()
|
text = soup.get_text(" ")
|
||||||
cities = [
|
|
||||||
c for c in cities
|
|
||||||
if key in c.city_py.lower() or key in c.city_name.lower()
|
|
||||||
]
|
|
||||||
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
|
|
||||||
|
|
||||||
if max_cities > 0:
|
name = ""
|
||||||
cities = cities[:max_cities]
|
title_tag = soup.find("title")
|
||||||
print(f"[discover] 截断城市数 {len(cities)}")
|
if title_tag:
|
||||||
|
match = re.search(r"(\S+)律师", title_tag.get_text())
|
||||||
|
if match:
|
||||||
|
name = match.group(1)
|
||||||
|
if not name:
|
||||||
|
intl_div = soup.find("div", class_="intl")
|
||||||
|
if intl_div:
|
||||||
|
match = re.search(r"(\S+)律师", intl_div.get_text())
|
||||||
|
if match:
|
||||||
|
name = match.group(1)
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
|
phone = ""
|
||||||
|
phone_pattern = r"1[3-9]\d{9}"
|
||||||
|
for item in soup.select("div.item.flex"):
|
||||||
|
label = item.find("div", class_="label")
|
||||||
|
desc = item.find("div", class_="desc")
|
||||||
|
if not label or not desc:
|
||||||
|
continue
|
||||||
|
label_text = label.get_text()
|
||||||
|
desc_text = desc.get_text().replace("-", "")
|
||||||
|
if "联系电话" in label_text or "电话" in label_text:
|
||||||
|
matches = re.findall(phone_pattern, desc_text)
|
||||||
|
if matches:
|
||||||
|
phone = matches[0]
|
||||||
|
break
|
||||||
|
if not phone:
|
||||||
|
matches = re.findall(phone_pattern, text.replace("-", ""))
|
||||||
|
if matches:
|
||||||
|
phone = matches[0]
|
||||||
|
if not phone:
|
||||||
|
print(f" 无手机号: {url}")
|
||||||
|
return None
|
||||||
|
|
||||||
seen_ids: Set[str] = set()
|
law_firm = ""
|
||||||
if os.path.exists(output_path):
|
for item in soup.select("div.item.flex"):
|
||||||
with open(output_path, "r", encoding="utf-8") as old_file:
|
label = item.find("div", class_="label")
|
||||||
for line in old_file:
|
desc = item.find("div", class_="desc")
|
||||||
line = line.strip()
|
if not label or not desc:
|
||||||
if not line:
|
continue
|
||||||
continue
|
if "执业律所" in label.get_text() or "律所" in label.get_text():
|
||||||
try:
|
law_firm = desc.get_text(strip=True).replace("已认证", "")
|
||||||
item = json.loads(line)
|
break
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
rid = item.get("record_id")
|
|
||||||
if rid:
|
|
||||||
seen_ids.add(rid)
|
|
||||||
print(f"[resume] 已有记录 {len(seen_ids)} 条")
|
|
||||||
|
|
||||||
total_new_json = 0
|
params = {
|
||||||
total_new_db = 0
|
"list_url": url,
|
||||||
total_skip_db = 0
|
"province": province,
|
||||||
|
"city": city,
|
||||||
|
}
|
||||||
|
|
||||||
with open(output_path, "a", encoding="utf-8") as out:
|
return {
|
||||||
for idx, target in enumerate(cities, start=1):
|
"name": name or "",
|
||||||
print(
|
"law_firm": law_firm,
|
||||||
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
|
"province": province,
|
||||||
f"({target.city_py})"
|
"city": city,
|
||||||
)
|
"phone": phone,
|
||||||
city_records = list(self.crawl_city(target))
|
"url": url,
|
||||||
|
"domain": DOMAIN,
|
||||||
|
"create_time": int(time.time()),
|
||||||
|
"params": json.dumps(params, ensure_ascii=False)
|
||||||
|
}
|
||||||
|
|
||||||
city_new_json = 0
|
def run(self):
|
||||||
for record in city_records:
|
print("启动法律快车采集...")
|
||||||
rid = record["record_id"]
|
areas = self._load_areas()
|
||||||
if rid in seen_ids:
|
if not areas:
|
||||||
continue
|
print("无地区数据")
|
||||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
return
|
||||||
seen_ids.add(rid)
|
|
||||||
city_new_json += 1
|
|
||||||
total_new_json += 1
|
|
||||||
|
|
||||||
city_new_db, city_skip_db = self._write_records_to_db(city_records)
|
for area in areas:
|
||||||
total_new_db += city_new_db
|
pinyin = area.get("pinyin")
|
||||||
total_skip_db += city_skip_db
|
province = area.get("province", "")
|
||||||
|
city = area.get("city", "")
|
||||||
print(
|
if not pinyin:
|
||||||
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
|
continue
|
||||||
f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
|
page = 1
|
||||||
)
|
while True:
|
||||||
|
list_url = LIST_BASE.format(pinyin=pinyin, page=page)
|
||||||
print(
|
print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
|
||||||
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
|
html = self._get(list_url)
|
||||||
f"DB跳过{total_skip_db}条, 输出: {output_path}"
|
if not html:
|
||||||
)
|
break
|
||||||
|
link_count = self._parse_list(html, province, city)
|
||||||
|
if link_count == 0:
|
||||||
def parse_args() -> argparse.Namespace:
|
break
|
||||||
parser = argparse.ArgumentParser(description="法律快车全新采集脚本(站点数据直采)")
|
page += 1
|
||||||
parser.add_argument(
|
print("法律快车采集完成")
|
||||||
"--output",
|
|
||||||
default="/www/wwwroot/lawyers/data/lawtime_records_all.jsonl",
|
|
||||||
help="输出 jsonl 文件路径",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-cities",
|
|
||||||
type=int,
|
|
||||||
default=0,
|
|
||||||
help="最多采集多少个城市,0 表示不限",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-pages",
|
|
||||||
type=int,
|
|
||||||
default=9999,
|
|
||||||
help="每个城市最多采集多少页",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--city-filter",
|
|
||||||
default="",
|
|
||||||
help="按城市拼音或城市名过滤,如 beijing",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--sleep",
|
|
||||||
type=float,
|
|
||||||
default=0.1,
|
|
||||||
help="详情页请求间隔秒数",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--direct",
|
|
||||||
action="store_true",
|
|
||||||
help="直连模式,不使用 proxy_settings.json 代理",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--no-db",
|
|
||||||
action="store_true",
|
|
||||||
help="只输出 JSONL,不写入数据库",
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
if args.no_db:
|
|
||||||
crawler = LawtimeCrawler(
|
|
||||||
max_pages=args.max_pages,
|
|
||||||
sleep_seconds=args.sleep,
|
|
||||||
use_proxy=not args.direct,
|
|
||||||
db_connection=None,
|
|
||||||
)
|
|
||||||
crawler.crawl(
|
|
||||||
output_path=args.output,
|
|
||||||
max_cities=args.max_cities,
|
|
||||||
city_filter=args.city_filter or None,
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
with Db() as db:
|
|
||||||
crawler = LawtimeCrawler(
|
|
||||||
max_pages=args.max_pages,
|
|
||||||
sleep_seconds=args.sleep,
|
|
||||||
use_proxy=not args.direct,
|
|
||||||
db_connection=db,
|
|
||||||
)
|
|
||||||
crawler.crawl(
|
|
||||||
output_path=args.output,
|
|
||||||
max_cities=args.max_cities,
|
|
||||||
city_filter=args.city_filter or None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
with Db() as db:
|
||||||
|
spider = LawtimeSpider(db)
|
||||||
|
spider.run()
|
||||||
|
|||||||
+267
-608
@@ -1,17 +1,11 @@
|
|||||||
import argparse
|
|
||||||
import hashlib
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass
|
import random
|
||||||
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
from typing import Dict, Optional, List, Set
|
||||||
from urllib.parse import urljoin
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
import threading
|
||||||
import urllib3
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
project_root = os.path.dirname(current_dir)
|
project_root = os.path.dirname(current_dir)
|
||||||
@@ -21,237 +15,167 @@ if request_dir not in sys.path:
|
|||||||
if project_root not in sys.path:
|
if project_root not in sys.path:
|
||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
from Db import Db
|
import requests
|
||||||
from request.requests_client import RequestClientError, RequestsClient
|
import urllib3
|
||||||
from utils.rate_limiter import wait_for_request
|
from bs4 import BeautifulSoup
|
||||||
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
SITE_NAME = "64365"
|
from Db import Db
|
||||||
LEGACY_DOMAIN = "律图"
|
from utils.rate_limiter import request_slot
|
||||||
SITE_BASE = "https://m.64365.com"
|
|
||||||
AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js"
|
|
||||||
LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
|
|
||||||
|
|
||||||
PHONE_RE = re.compile(r"1[3-9]\d{9}")
|
DOMAIN = "律图"
|
||||||
YEAR_RE = re.compile(r"(\d+)\s*年")
|
LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
class Six4365Spider:
|
||||||
class CityTarget:
|
def __init__(self, db_connection):
|
||||||
area_id: str
|
|
||||||
province_id: str
|
|
||||||
province_name: str
|
|
||||||
province_py: str
|
|
||||||
city_name: str
|
|
||||||
city_py: str
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ListCard:
|
|
||||||
detail_url: str
|
|
||||||
name: str
|
|
||||||
specialties: List[str]
|
|
||||||
score_text: str
|
|
||||||
service_text: str
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_phone(text: str) -> str:
|
|
||||||
compact = re.sub(r"\D", "", text or "")
|
|
||||||
match = PHONE_RE.search(compact)
|
|
||||||
return match.group(0) if match else ""
|
|
||||||
|
|
||||||
|
|
||||||
class Six4365Crawler:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
max_pages: int = 9999,
|
|
||||||
sleep_seconds: float = 0.1,
|
|
||||||
use_proxy: bool = True,
|
|
||||||
db_connection=None,
|
|
||||||
):
|
|
||||||
self.max_pages = max_pages
|
|
||||||
self.sleep_seconds = max(0.0, sleep_seconds)
|
|
||||||
self.db = db_connection
|
self.db = db_connection
|
||||||
self.client = RequestsClient(
|
self.session = self._build_session()
|
||||||
headers={
|
self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
|
||||||
"User-Agent": (
|
self._tls = threading.local()
|
||||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
self.cities = self._load_cities()
|
||||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
|
||||||
"Mobile/15E148 Safari/604.1"
|
|
||||||
),
|
|
||||||
"Accept": "text/html, */*; q=0.01",
|
|
||||||
"Connection": "close",
|
|
||||||
},
|
|
||||||
use_proxy=use_proxy,
|
|
||||||
retry_total=2,
|
|
||||||
retry_backoff_factor=1,
|
|
||||||
retry_status_forcelist=(429, 500, 502, 503, 504),
|
|
||||||
retry_allowed_methods=("GET", "POST"),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _request_text(
|
def _build_session(self) -> requests.Session:
|
||||||
self,
|
report_proxy_status()
|
||||||
method: str,
|
session = requests.Session()
|
||||||
url: str,
|
session.trust_env = False
|
||||||
*,
|
proxies = get_proxies()
|
||||||
timeout: int = 20,
|
if proxies:
|
||||||
max_retries: int = 3,
|
session.proxies.update(proxies)
|
||||||
referer: str = SITE_BASE,
|
else:
|
||||||
data: Optional[Dict] = None,
|
session.proxies.clear()
|
||||||
) -> str:
|
session.headers.update({
|
||||||
headers = {"Referer": referer}
|
"User-Agent": (
|
||||||
last_error: Optional[Exception] = None
|
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
||||||
|
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
||||||
|
"Mobile/15E148 Safari/604.1"
|
||||||
|
),
|
||||||
|
"Connection": "close",
|
||||||
|
})
|
||||||
|
return session
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
def _refresh_session(self) -> None:
|
||||||
wait_for_request()
|
try:
|
||||||
|
self.session.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self.session = self._build_session()
|
||||||
|
|
||||||
|
def _get_thread_session(self) -> requests.Session:
|
||||||
|
"""requests.Session 不是严格线程安全:每个线程用独立 session(但共享同样代理/headers)"""
|
||||||
|
s = getattr(self._tls, "session", None)
|
||||||
|
if s is not None:
|
||||||
|
return s
|
||||||
|
s = self._build_session()
|
||||||
|
s.headers.update(dict(self.session.headers))
|
||||||
|
self._tls.session = s
|
||||||
|
return s
|
||||||
|
|
||||||
|
def _refresh_thread_session(self) -> None:
|
||||||
|
s = getattr(self._tls, "session", None)
|
||||||
|
if s is not None:
|
||||||
try:
|
try:
|
||||||
if method.upper() == "POST":
|
s.close()
|
||||||
resp = self.client.post_text(
|
except Exception:
|
||||||
url,
|
pass
|
||||||
timeout=timeout,
|
self._tls.session = None
|
||||||
verify=False,
|
|
||||||
headers=headers,
|
|
||||||
data=data,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
resp = self.client.get_text(
|
|
||||||
url,
|
|
||||||
timeout=timeout,
|
|
||||||
verify=False,
|
|
||||||
headers=headers,
|
|
||||||
)
|
|
||||||
|
|
||||||
code = resp.status_code
|
def _existing_urls(self, urls: List[str]) -> Set[str]:
|
||||||
if code == 403:
|
"""批量查重,减少 N 次 is_data_exist"""
|
||||||
if attempt < max_retries - 1:
|
if not urls:
|
||||||
self.client.refresh()
|
return set()
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
existing: Set[str] = set()
|
||||||
continue
|
cur = self.db.db.cursor()
|
||||||
raise RequestClientError(f"{code} Error: {url}")
|
try:
|
||||||
if code >= 500 and attempt < max_retries - 1:
|
# IN 参数过多会失败,分批
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
chunk_size = 500
|
||||||
continue
|
for i in range(0, len(urls), chunk_size):
|
||||||
if code >= 400:
|
chunk = urls[i:i + chunk_size]
|
||||||
raise RequestClientError(f"{code} Error: {url}")
|
placeholders = ",".join(["%s"] * len(chunk))
|
||||||
return resp.text
|
sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
|
||||||
|
cur.execute(sql, chunk)
|
||||||
|
for row in cur.fetchall():
|
||||||
|
# pymysql 默认返回 tuple
|
||||||
|
existing.add(row[0])
|
||||||
|
finally:
|
||||||
|
cur.close()
|
||||||
|
return existing
|
||||||
|
|
||||||
|
def _load_cities(self):
|
||||||
|
tables = ("area_new", "area2", "area")
|
||||||
|
last_error = None
|
||||||
|
for table in tables:
|
||||||
|
try:
|
||||||
|
provinces = self.db.select_data(
|
||||||
|
table,
|
||||||
|
"id, code, province",
|
||||||
|
"domain='64365' AND level=1"
|
||||||
|
) or []
|
||||||
|
cities = self.db.select_data(
|
||||||
|
table,
|
||||||
|
"code, city, province, pid",
|
||||||
|
"domain='64365' AND level=2"
|
||||||
|
) or []
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
last_error = exc
|
last_error = exc
|
||||||
if attempt < max_retries - 1:
|
continue
|
||||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
|
||||||
continue
|
|
||||||
raise
|
|
||||||
|
|
||||||
if last_error is not None:
|
if not cities:
|
||||||
raise last_error
|
continue
|
||||||
raise RequestClientError(f"Unknown request error: {url}")
|
|
||||||
|
|
||||||
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
|
province_map = {row.get('id'): row for row in provinces}
|
||||||
return self._request_text(
|
data = {}
|
||||||
"GET",
|
for city in cities:
|
||||||
url,
|
province_row = province_map.get(city.get('pid'), {}) or {}
|
||||||
timeout=timeout,
|
data[str(city.get('code'))] = {
|
||||||
max_retries=max_retries,
|
"name": city.get('city'),
|
||||||
referer=referer,
|
"province": city.get('province'),
|
||||||
)
|
"province_name": province_row.get('province', city.get('province')),
|
||||||
|
}
|
||||||
|
print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
|
||||||
|
return data
|
||||||
|
|
||||||
def _post_text(
|
if last_error:
|
||||||
self,
|
print(f"[律图] 加载地区数据失败: {last_error}")
|
||||||
url: str,
|
print("[律图] 无城市数据(已尝试 area_new/area2/area)")
|
||||||
*,
|
return {}
|
||||||
data: Dict,
|
|
||||||
timeout: int = 20,
|
|
||||||
max_retries: int = 3,
|
|
||||||
referer: str = SITE_BASE,
|
|
||||||
) -> str:
|
|
||||||
return self._request_text(
|
|
||||||
"POST",
|
|
||||||
url,
|
|
||||||
timeout=timeout,
|
|
||||||
max_retries=max_retries,
|
|
||||||
referer=referer,
|
|
||||||
data=data,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _extract_area_data(self, text: str) -> List[Dict]:
|
def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
|
||||||
match = re.search(
|
for attempt in range(max_retries):
|
||||||
r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
|
try:
|
||||||
text,
|
with request_slot():
|
||||||
re.S,
|
resp = self.session.post(LIST_URL, data=payload, timeout=10, verify=False)
|
||||||
)
|
status_code = resp.status_code
|
||||||
if not match:
|
text = resp.text
|
||||||
return []
|
resp.close()
|
||||||
|
if status_code == 403:
|
||||||
raw = match.group(1)
|
if attempt < max_retries - 1:
|
||||||
try:
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
data = json.loads(raw)
|
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||||
except Exception:
|
self._refresh_session()
|
||||||
return []
|
time.sleep(wait_time)
|
||||||
return data if isinstance(data, list) else []
|
|
||||||
|
|
||||||
def discover_cities(self) -> List[CityTarget]:
|
|
||||||
text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/")
|
|
||||||
provinces = self._extract_area_data(text)
|
|
||||||
|
|
||||||
targets: List[CityTarget] = []
|
|
||||||
seen_area: Set[str] = set()
|
|
||||||
|
|
||||||
for province in provinces:
|
|
||||||
province_id = str(province.get("id") or "").strip()
|
|
||||||
province_name = str(province.get("name") or "").strip()
|
|
||||||
province_py = str(province.get("py") or "").strip()
|
|
||||||
child_rows = province.get("child") or []
|
|
||||||
|
|
||||||
# 常规省份 child 是地级市;直辖市 child 是区县,此时使用省级 id 抓取
|
|
||||||
if child_rows and any((row.get("child") or []) for row in child_rows):
|
|
||||||
for city in child_rows:
|
|
||||||
area_id = str(city.get("id") or "").strip()
|
|
||||||
city_name = str(city.get("name") or "").strip()
|
|
||||||
city_py = str(city.get("py") or "").strip()
|
|
||||||
if not area_id or not city_name:
|
|
||||||
continue
|
continue
|
||||||
if area_id in seen_area:
|
print("请求失败: 403 Forbidden")
|
||||||
continue
|
return None
|
||||||
seen_area.add(area_id)
|
if status_code >= 400:
|
||||||
targets.append(
|
raise requests.exceptions.HTTPError(f"{status_code} Error")
|
||||||
CityTarget(
|
return text
|
||||||
area_id=area_id,
|
except requests.exceptions.RequestException as exc:
|
||||||
province_id=province_id,
|
print(f"请求失败: {exc}")
|
||||||
province_name=province_name,
|
return None
|
||||||
province_py=province_py,
|
return None
|
||||||
city_name=city_name,
|
|
||||||
city_py=city_py,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
if not province_id or not province_name:
|
|
||||||
continue
|
|
||||||
if province_id in seen_area:
|
|
||||||
continue
|
|
||||||
seen_area.add(province_id)
|
|
||||||
targets.append(
|
|
||||||
CityTarget(
|
|
||||||
area_id=province_id,
|
|
||||||
province_id=province_id,
|
|
||||||
province_name=province_name,
|
|
||||||
province_py=province_py,
|
|
||||||
city_name=province_name,
|
|
||||||
city_py=province_py,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return targets
|
def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
|
||||||
|
|
||||||
def _build_payload(self, area_id: str, page: int) -> Dict[str, str]:
|
|
||||||
ua = self.client.headers.get("User-Agent", "")
|
|
||||||
return {
|
return {
|
||||||
"AdCode": "",
|
"AdCode": "",
|
||||||
"RegionId": str(area_id),
|
"RegionId": str(city_code),
|
||||||
"CategoryId": "",
|
"CategoryId": "",
|
||||||
"MaxNumber": "",
|
"MaxNumber": "",
|
||||||
"OnlyData": "true",
|
"OnlyData": "true",
|
||||||
"IgnoreButton": "",
|
"IgnoreButton": "",
|
||||||
"LawyerRecommendRequest[AreaId]": str(area_id),
|
"LawyerRecommendRequest[AreaId]": str(city_code),
|
||||||
"LawyerRecommendRequest[LawCategoryIds]": "",
|
"LawyerRecommendRequest[LawCategoryIds]": "",
|
||||||
"LawyerRecommendRequest[LawFirmPersonCount]": "",
|
"LawyerRecommendRequest[LawFirmPersonCount]": "",
|
||||||
"LawyerRecommendRequest[LawFirmScale]": "",
|
"LawyerRecommendRequest[LawFirmScale]": "",
|
||||||
@@ -268,429 +192,164 @@ class Six4365Crawler:
|
|||||||
"LawyerRecommendRequest[RefferUrl]": "",
|
"LawyerRecommendRequest[RefferUrl]": "",
|
||||||
"LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
|
"LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
|
||||||
"LawyerRecommendRequest[resource_type_name]": "",
|
"LawyerRecommendRequest[resource_type_name]": "",
|
||||||
"LawyerRecommendRequest[UserAgent]": ua,
|
"LawyerRecommendRequest[UserAgent]": self.session.headers["User-Agent"],
|
||||||
"LawyerRecommendRequest[AddLawyerWithNoData]": "false",
|
"LawyerRecommendRequest[AddLawyerWithNoData]": "false",
|
||||||
"ShowCaseButton": "true",
|
"ShowCaseButton": "true",
|
||||||
}
|
}
|
||||||
|
|
||||||
def fetch_list_html(self, target: CityTarget, page: int) -> str:
|
def _parse_list(self, html: str, province: str, city: str) -> int:
|
||||||
payload = self._build_payload(target.area_id, page)
|
|
||||||
return self._post_text(
|
|
||||||
LIST_API_URL,
|
|
||||||
data=payload,
|
|
||||||
referer=SITE_BASE + "/findlawyer/",
|
|
||||||
)
|
|
||||||
|
|
||||||
def parse_list_cards(self, html: str) -> List[ListCard]:
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
cards: List[ListCard] = []
|
lawyers = soup.find_all("a", class_="lawyer")
|
||||||
seen: Set[str] = set()
|
if not lawyers:
|
||||||
|
return 0
|
||||||
|
|
||||||
for anchor in soup.select("a.lawyer[href]"):
|
detail_urls: List[str] = []
|
||||||
href = (anchor.get("href") or "").strip()
|
for lawyer in lawyers:
|
||||||
|
href = lawyer.get("href")
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
detail_url = urljoin(SITE_BASE, href)
|
detail_urls.append(f"{href.rstrip('/')}/info/")
|
||||||
if detail_url in seen:
|
|
||||||
|
if not detail_urls:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
results: List[Dict[str, str]] = []
|
||||||
|
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
|
||||||
|
futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
|
||||||
|
for fut in as_completed(futs):
|
||||||
|
try:
|
||||||
|
data = fut.result()
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" 详情解析异常: {exc}")
|
||||||
|
continue
|
||||||
|
if data:
|
||||||
|
results.append(data)
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return len(detail_urls)
|
||||||
|
|
||||||
|
existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
|
||||||
|
for data in results:
|
||||||
|
if not data:
|
||||||
continue
|
continue
|
||||||
seen.add(detail_url)
|
url = data.get("url", "")
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
if url in existing:
|
||||||
|
print(f" -- 已存在URL: {url}")
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
self.db.insert_data("lawyer", data)
|
||||||
|
print(f" -> 新增: {data['name']} ({data['phone']})")
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" 插入失败 {url}: {exc}")
|
||||||
|
|
||||||
name = ""
|
return len(detail_urls)
|
||||||
name_tag = anchor.select_one("b.name")
|
|
||||||
if name_tag:
|
|
||||||
name = name_tag.get_text(strip=True)
|
|
||||||
|
|
||||||
specialties: List[str] = []
|
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
|
||||||
skill_tag = anchor.select_one("div.skill")
|
html = self._get_detail(url)
|
||||||
if skill_tag:
|
if not html:
|
||||||
raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "")
|
return None
|
||||||
specialties = [x.strip() for x in re.split(r"[、,,]", raw) if x.strip()]
|
|
||||||
|
|
||||||
score_text = ""
|
|
||||||
score_tag = anchor.select_one("div.info span[title='评分'] em")
|
|
||||||
if score_tag:
|
|
||||||
score_text = score_tag.get_text(strip=True)
|
|
||||||
|
|
||||||
service_text = ""
|
|
||||||
service_tag = anchor.select_one("div.info")
|
|
||||||
if service_tag:
|
|
||||||
service_text = service_tag.get_text(" ", strip=True)
|
|
||||||
|
|
||||||
cards.append(
|
|
||||||
ListCard(
|
|
||||||
detail_url=detail_url,
|
|
||||||
name=name,
|
|
||||||
specialties=specialties,
|
|
||||||
score_text=score_text,
|
|
||||||
service_text=service_text,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return cards
|
|
||||||
|
|
||||||
def parse_detail(self, detail_url: str) -> Dict:
|
|
||||||
info_url = detail_url.rstrip("/") + "/info/"
|
|
||||||
html = self._get_text(info_url, referer=detail_url)
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
base_info = soup.find("ul", class_="intro-basic-bar")
|
||||||
|
if not base_info:
|
||||||
|
return None
|
||||||
|
|
||||||
name = ""
|
name = ""
|
||||||
law_firm = ""
|
law_firm = ""
|
||||||
phone = ""
|
phone = ""
|
||||||
practice_years: Optional[int] = None
|
|
||||||
office_area = ""
|
|
||||||
address = ""
|
|
||||||
specialties: List[str] = []
|
|
||||||
|
|
||||||
for li in soup.select("ul.intro-basic-bar li"):
|
for li in base_info.find_all("li"):
|
||||||
label_tag = li.select_one("span.label")
|
label = li.find("span", class_="label")
|
||||||
value_tag = li.select_one("div.txt")
|
txt = li.find("div", class_="txt")
|
||||||
if not label_tag or not value_tag:
|
if not label or not txt:
|
||||||
continue
|
continue
|
||||||
|
label_text = label.get_text(strip=True)
|
||||||
|
if "姓名" in label_text:
|
||||||
|
name = txt.get_text(strip=True)
|
||||||
|
if "执业律所" in label_text:
|
||||||
|
law_firm = txt.get_text(strip=True)
|
||||||
|
|
||||||
label = label_tag.get_text(" ", strip=True).replace(":", "")
|
more_section = soup.find("div", class_="more-intro-basic")
|
||||||
value = value_tag.get_text(" ", strip=True)
|
if more_section:
|
||||||
|
phone_ul = more_section.find("ul", class_="intro-basic-bar")
|
||||||
|
if phone_ul:
|
||||||
|
for li in phone_ul.find_all("li"):
|
||||||
|
label = li.find("span", class_="label")
|
||||||
|
txt = li.find("div", class_="txt")
|
||||||
|
if label and txt and "联系电话" in label.get_text(strip=True):
|
||||||
|
phone = txt.get_text(strip=True).replace(" ", "")
|
||||||
|
break
|
||||||
|
|
||||||
if "姓名" in label and not name:
|
phone = phone.replace('-', '').strip()
|
||||||
name = value
|
if not name or not phone:
|
||||||
elif "执业律所" in label and not law_firm:
|
|
||||||
law_firm = value
|
|
||||||
elif "联系电话" in label and not phone:
|
|
||||||
phone = normalize_phone(value)
|
|
||||||
elif "执业年限" in label and practice_years is None:
|
|
||||||
year_match = YEAR_RE.search(value)
|
|
||||||
if year_match:
|
|
||||||
try:
|
|
||||||
practice_years = int(year_match.group(1))
|
|
||||||
except Exception:
|
|
||||||
practice_years = None
|
|
||||||
elif "办公地区" in label and not office_area:
|
|
||||||
office_area = value
|
|
||||||
elif "办公地址" in label and not address:
|
|
||||||
address = value
|
|
||||||
|
|
||||||
text = soup.get_text(" ", strip=True)
|
|
||||||
if not phone:
|
|
||||||
phone = normalize_phone(text)
|
|
||||||
|
|
||||||
if not name and soup.title:
|
|
||||||
title = soup.title.get_text(" ", strip=True)
|
|
||||||
match = re.search(r"([^\s_,,。]+?)律师", title)
|
|
||||||
if match:
|
|
||||||
name = match.group(1).strip()
|
|
||||||
|
|
||||||
skill_match = re.search(r"擅长:([^\n]+)", text)
|
|
||||||
if skill_match:
|
|
||||||
specialties = [x.strip() for x in re.split(r"[、,,]", skill_match.group(1)) if x.strip()]
|
|
||||||
|
|
||||||
return {
|
|
||||||
"name": name,
|
|
||||||
"law_firm": law_firm,
|
|
||||||
"phone": phone,
|
|
||||||
"practice_years": practice_years,
|
|
||||||
"office_area": office_area,
|
|
||||||
"address": address,
|
|
||||||
"specialties": specialties,
|
|
||||||
"detail_url": detail_url,
|
|
||||||
"info_url": info_url,
|
|
||||||
}
|
|
||||||
|
|
||||||
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
|
|
||||||
seen_detail_urls: Set[str] = set()
|
|
||||||
page_first_seen: Set[str] = set()
|
|
||||||
|
|
||||||
for page in range(1, self.max_pages + 1):
|
|
||||||
try:
|
|
||||||
html = self.fetch_list_html(target, page)
|
|
||||||
except Exception as exc:
|
|
||||||
print(f"[list] 失败 area={target.area_id} p{page}: {exc}")
|
|
||||||
break
|
|
||||||
|
|
||||||
cards = self.parse_list_cards(html)
|
|
||||||
if not cards:
|
|
||||||
break
|
|
||||||
|
|
||||||
first_url = cards[0].detail_url
|
|
||||||
if first_url in page_first_seen:
|
|
||||||
break
|
|
||||||
page_first_seen.add(first_url)
|
|
||||||
|
|
||||||
for card in cards:
|
|
||||||
if card.detail_url in seen_detail_urls:
|
|
||||||
continue
|
|
||||||
seen_detail_urls.add(card.detail_url)
|
|
||||||
|
|
||||||
try:
|
|
||||||
detail = self.parse_detail(card.detail_url)
|
|
||||||
except Exception as exc:
|
|
||||||
print(f"[detail] 失败 {card.detail_url}: {exc}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
now = int(time.time())
|
|
||||||
uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url)
|
|
||||||
uid = uid_match.group(1) if uid_match else card.detail_url
|
|
||||||
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
|
|
||||||
|
|
||||||
yield {
|
|
||||||
"record_id": record_id,
|
|
||||||
"collected_at": now,
|
|
||||||
"source": {
|
|
||||||
"site": SITE_NAME,
|
|
||||||
"province_id": target.province_id,
|
|
||||||
"province": target.province_name,
|
|
||||||
"province_py": target.province_py,
|
|
||||||
"area_id": target.area_id,
|
|
||||||
"city": target.city_name,
|
|
||||||
"city_py": target.city_py,
|
|
||||||
"page": page,
|
|
||||||
"detail_url": card.detail_url,
|
|
||||||
"info_url": detail.get("info_url", ""),
|
|
||||||
},
|
|
||||||
"list_snapshot": {
|
|
||||||
"name": card.name,
|
|
||||||
"specialties": card.specialties,
|
|
||||||
"score_text": card.score_text,
|
|
||||||
"service_text": card.service_text,
|
|
||||||
},
|
|
||||||
"profile": {
|
|
||||||
"name": detail.get("name") or card.name,
|
|
||||||
"law_firm": detail.get("law_firm") or "",
|
|
||||||
"phone": detail.get("phone") or "",
|
|
||||||
"practice_years": detail.get("practice_years"),
|
|
||||||
"office_area": detail.get("office_area") or "",
|
|
||||||
"address": detail.get("address") or "",
|
|
||||||
"specialties": detail.get("specialties") or card.specialties,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
if self.sleep_seconds:
|
|
||||||
time.sleep(self.sleep_seconds)
|
|
||||||
|
|
||||||
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
|
|
||||||
source = record.get("source", {}) or {}
|
|
||||||
profile = record.get("profile", {}) or {}
|
|
||||||
|
|
||||||
phone = normalize_phone(profile.get("phone", ""))
|
|
||||||
if not phone:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
province = (source.get("province") or "").strip()
|
data = {
|
||||||
city = (source.get("city") or province).strip()
|
"phone": phone,
|
||||||
return {
|
|
||||||
"name": (profile.get("name") or "").strip(),
|
|
||||||
"law_firm": (profile.get("law_firm") or "").strip(),
|
|
||||||
"province": province,
|
"province": province,
|
||||||
"city": city,
|
"city": city,
|
||||||
"phone": phone,
|
"law_firm": law_firm,
|
||||||
"url": (source.get("info_url") or source.get("detail_url") or "").strip(),
|
"url": url,
|
||||||
"domain": LEGACY_DOMAIN,
|
"domain": DOMAIN,
|
||||||
"create_time": int(record.get("collected_at") or time.time()),
|
"name": name,
|
||||||
"params": json.dumps(record, ensure_ascii=False),
|
"create_time": int(time.time()),
|
||||||
|
"params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
|
||||||
}
|
}
|
||||||
|
return data
|
||||||
|
|
||||||
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
|
def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
|
||||||
if not self.db or not phones:
|
session = self._get_thread_session()
|
||||||
return set()
|
for attempt in range(max_retries):
|
||||||
|
|
||||||
deduped = sorted({p for p in phones if p})
|
|
||||||
if not deduped:
|
|
||||||
return set()
|
|
||||||
|
|
||||||
existing: Set[str] = set()
|
|
||||||
cur = self.db.db.cursor()
|
|
||||||
try:
|
|
||||||
chunk_size = 500
|
|
||||||
for i in range(0, len(deduped), chunk_size):
|
|
||||||
chunk = deduped[i:i + chunk_size]
|
|
||||||
placeholders = ",".join(["%s"] * len(chunk))
|
|
||||||
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
|
||||||
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
|
|
||||||
for row in cur.fetchall():
|
|
||||||
existing.add(row[0])
|
|
||||||
finally:
|
|
||||||
cur.close()
|
|
||||||
|
|
||||||
return existing
|
|
||||||
|
|
||||||
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
|
|
||||||
if not self.db:
|
|
||||||
return 0, 0
|
|
||||||
|
|
||||||
rows: List[Dict[str, str]] = []
|
|
||||||
for record in records:
|
|
||||||
row = self._to_legacy_lawyer_row(record)
|
|
||||||
if row:
|
|
||||||
rows.append(row)
|
|
||||||
if not rows:
|
|
||||||
return 0, 0
|
|
||||||
|
|
||||||
existing = self._existing_phones_in_db([row["phone"] for row in rows])
|
|
||||||
inserted = 0
|
|
||||||
skipped = 0
|
|
||||||
|
|
||||||
for row in rows:
|
|
||||||
phone = row.get("phone", "")
|
|
||||||
if not phone or phone in existing:
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
try:
|
try:
|
||||||
self.db.insert_data("lawyer", row)
|
with request_slot():
|
||||||
existing.add(phone)
|
resp = session.get(url, timeout=10, verify=False)
|
||||||
inserted += 1
|
status_code = resp.status_code
|
||||||
except Exception as exc:
|
text = resp.text
|
||||||
skipped += 1
|
resp.close()
|
||||||
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
|
if status_code == 403:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
return inserted, skipped
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||||
|
print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||||
def crawl(
|
self._refresh_thread_session()
|
||||||
self,
|
session = self._get_thread_session()
|
||||||
output_path: str,
|
time.sleep(wait_time)
|
||||||
max_cities: int = 0,
|
|
||||||
city_filter: Optional[str] = None,
|
|
||||||
) -> None:
|
|
||||||
cities = self.discover_cities()
|
|
||||||
print(f"[discover] 共发现地区 {len(cities)} 个")
|
|
||||||
|
|
||||||
if city_filter:
|
|
||||||
key = city_filter.strip().lower()
|
|
||||||
cities = [
|
|
||||||
c for c in cities
|
|
||||||
if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id
|
|
||||||
]
|
|
||||||
print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}")
|
|
||||||
|
|
||||||
if max_cities > 0:
|
|
||||||
cities = cities[:max_cities]
|
|
||||||
print(f"[discover] 截断地区数 {len(cities)}")
|
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
|
|
||||||
|
|
||||||
seen_ids: Set[str] = set()
|
|
||||||
if os.path.exists(output_path):
|
|
||||||
with open(output_path, "r", encoding="utf-8") as old_file:
|
|
||||||
for line in old_file:
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
continue
|
||||||
try:
|
print(" 请求失败: 403 Forbidden")
|
||||||
item = json.loads(line)
|
return None
|
||||||
except Exception:
|
if status_code >= 400:
|
||||||
continue
|
raise requests.exceptions.HTTPError(f"{status_code} Error")
|
||||||
rid = item.get("record_id")
|
return text
|
||||||
if rid:
|
except requests.exceptions.RequestException as exc:
|
||||||
seen_ids.add(rid)
|
print(f" 请求失败: {exc}")
|
||||||
print(f"[resume] 已有记录 {len(seen_ids)} 条")
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
total_new_json = 0
|
def run(self):
|
||||||
total_new_db = 0
|
print("启动律图采集...")
|
||||||
total_skip_db = 0
|
if not self.cities:
|
||||||
|
print("无城市数据")
|
||||||
|
return
|
||||||
|
|
||||||
with open(output_path, "a", encoding="utf-8") as out:
|
for city_code, info in self.cities.items():
|
||||||
for idx, target in enumerate(cities, start=1):
|
province = info.get("province_name", "")
|
||||||
print(
|
city = info.get("name", "")
|
||||||
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
|
print(f"采集 {province}-{city}")
|
||||||
f"(area={target.area_id})"
|
page = 1
|
||||||
)
|
while True:
|
||||||
city_records = list(self.crawl_city(target))
|
payload = self._build_payload(city_code, page)
|
||||||
|
html = self._post(payload)
|
||||||
city_new_json = 0
|
if not html:
|
||||||
for record in city_records:
|
break
|
||||||
rid = record["record_id"]
|
link_count = self._parse_list(html, province, city)
|
||||||
if rid in seen_ids:
|
if link_count == 0:
|
||||||
continue
|
break
|
||||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
page += 1
|
||||||
seen_ids.add(rid)
|
print("律图采集完成")
|
||||||
city_new_json += 1
|
|
||||||
total_new_json += 1
|
|
||||||
|
|
||||||
city_new_db, city_skip_db = self._write_records_to_db(city_records)
|
|
||||||
total_new_db += city_new_db
|
|
||||||
total_skip_db += city_skip_db
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
|
|
||||||
f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
|
|
||||||
)
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
|
|
||||||
f"DB跳过{total_skip_db}条, 输出: {output_path}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)")
|
|
||||||
parser.add_argument(
|
|
||||||
"--output",
|
|
||||||
default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl",
|
|
||||||
help="输出 jsonl 文件路径",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-cities",
|
|
||||||
type=int,
|
|
||||||
default=0,
|
|
||||||
help="最多采集多少个地区,0 表示不限",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-pages",
|
|
||||||
type=int,
|
|
||||||
default=9999,
|
|
||||||
help="每个地区最多采集多少页",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--city-filter",
|
|
||||||
default="",
|
|
||||||
help="按城市名称/拼音/编码过滤",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--sleep",
|
|
||||||
type=float,
|
|
||||||
default=0.1,
|
|
||||||
help="详情页请求间隔秒数",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--direct",
|
|
||||||
action="store_true",
|
|
||||||
help="直连模式,不使用 proxy_settings.json 代理",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--no-db",
|
|
||||||
action="store_true",
|
|
||||||
help="只输出 JSONL,不写入数据库",
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
if args.no_db:
|
|
||||||
crawler = Six4365Crawler(
|
|
||||||
max_pages=args.max_pages,
|
|
||||||
sleep_seconds=args.sleep,
|
|
||||||
use_proxy=not args.direct,
|
|
||||||
db_connection=None,
|
|
||||||
)
|
|
||||||
crawler.crawl(
|
|
||||||
output_path=args.output,
|
|
||||||
max_cities=args.max_cities,
|
|
||||||
city_filter=args.city_filter or None,
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
with Db() as db:
|
|
||||||
crawler = Six4365Crawler(
|
|
||||||
max_pages=args.max_pages,
|
|
||||||
sleep_seconds=args.sleep,
|
|
||||||
use_proxy=not args.direct,
|
|
||||||
db_connection=db,
|
|
||||||
)
|
|
||||||
crawler.crawl(
|
|
||||||
output_path=args.output,
|
|
||||||
max_cities=args.max_cities,
|
|
||||||
city_filter=args.city_filter or None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
with Db() as db:
|
||||||
|
spider = Six4365Spider(db)
|
||||||
|
spider.run()
|
||||||
|
|||||||
@@ -0,0 +1,220 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
request_dir = os.path.join(project_root, "request")
|
||||||
|
if request_dir not in sys.path:
|
||||||
|
sys.path.insert(0, request_dir)
|
||||||
|
if project_root not in sys.path:
|
||||||
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CheckResult:
|
||||||
|
site: str
|
||||||
|
url: str
|
||||||
|
method: str
|
||||||
|
ok: bool
|
||||||
|
status_code: Optional[int]
|
||||||
|
error: str
|
||||||
|
hint: str
|
||||||
|
elapsed_ms: int
|
||||||
|
|
||||||
|
|
||||||
|
def _now_ms() -> int:
|
||||||
|
return int(time.time() * 1000)
|
||||||
|
|
||||||
|
|
||||||
|
def _short_hint(text: str) -> str:
|
||||||
|
s = (text or "").strip().lower()
|
||||||
|
flags = []
|
||||||
|
for key, label in [
|
||||||
|
("403", "403"),
|
||||||
|
("429", "429"),
|
||||||
|
("captcha", "captcha"),
|
||||||
|
("验证码", "captcha_cn"),
|
||||||
|
("人机", "bot_check_cn"),
|
||||||
|
("access denied", "access_denied"),
|
||||||
|
("forbidden", "forbidden"),
|
||||||
|
("too many requests", "rate_limited"),
|
||||||
|
("cloudflare", "cloudflare"),
|
||||||
|
("challenge", "challenge"),
|
||||||
|
]:
|
||||||
|
if key in s:
|
||||||
|
flags.append(label)
|
||||||
|
return ",".join(flags)[:120]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_session() -> requests.Session:
|
||||||
|
report_proxy_status()
|
||||||
|
s = requests.Session()
|
||||||
|
s.trust_env = False
|
||||||
|
proxies = get_proxies()
|
||||||
|
if proxies:
|
||||||
|
s.proxies.update(proxies)
|
||||||
|
else:
|
||||||
|
s.proxies.clear()
|
||||||
|
s.headers.update(
|
||||||
|
{
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/136.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
"Accept": "*/*",
|
||||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
||||||
|
"Connection": "close",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def _check(
|
||||||
|
session: requests.Session,
|
||||||
|
*,
|
||||||
|
site: str,
|
||||||
|
method: str,
|
||||||
|
url: str,
|
||||||
|
timeout: Tuple[float, float] = (10.0, 15.0),
|
||||||
|
headers: Optional[Dict[str, str]] = None,
|
||||||
|
data: Optional[Dict[str, Any]] = None,
|
||||||
|
) -> CheckResult:
|
||||||
|
start = _now_ms()
|
||||||
|
try:
|
||||||
|
resp = session.request(
|
||||||
|
method=method,
|
||||||
|
url=url,
|
||||||
|
timeout=timeout,
|
||||||
|
verify=False,
|
||||||
|
headers=headers,
|
||||||
|
data=data,
|
||||||
|
)
|
||||||
|
text = resp.text or ""
|
||||||
|
status = resp.status_code
|
||||||
|
hint = _short_hint(text[:1200])
|
||||||
|
ok = 200 <= status < 400
|
||||||
|
return CheckResult(
|
||||||
|
site=site,
|
||||||
|
url=url,
|
||||||
|
method=method,
|
||||||
|
ok=ok,
|
||||||
|
status_code=status,
|
||||||
|
error="",
|
||||||
|
hint=hint,
|
||||||
|
elapsed_ms=_now_ms() - start,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
return CheckResult(
|
||||||
|
site=site,
|
||||||
|
url=url,
|
||||||
|
method=method,
|
||||||
|
ok=False,
|
||||||
|
status_code=None,
|
||||||
|
error=str(exc)[:200],
|
||||||
|
hint="",
|
||||||
|
elapsed_ms=_now_ms() - start,
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
resp.close() # type: ignore[name-defined]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _tests() -> List[Dict[str, Any]]:
|
||||||
|
# 每个站点选一个“代表性列表/API”作为冒烟:能快速暴露 403/验证码/限频。
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"site": "大律师(m站)",
|
||||||
|
"method": "GET",
|
||||||
|
"url": "https://m.maxlaw.cn/",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"site": "大律师(PC站)",
|
||||||
|
"method": "GET",
|
||||||
|
"url": "https://www.maxlaw.cn/law/beijing?page=1",
|
||||||
|
"headers": {"Referer": "https://www.maxlaw.cn/"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"site": "找法网(m站)",
|
||||||
|
"method": "GET",
|
||||||
|
"url": "https://m.findlaw.cn/beijing/q_lawyer/p1?ajax=1&order=0&sex=-1",
|
||||||
|
"headers": {
|
||||||
|
"Referer": "https://m.findlaw.cn/beijing/q_lawyer/",
|
||||||
|
"X-Requested-With": "XMLHttpRequest",
|
||||||
|
"Accept": "application/json, text/javascript, */*; q=0.01",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"site": "法律快车(m站)",
|
||||||
|
"method": "GET",
|
||||||
|
"url": "https://m.lawtime.cn/beijing/lawyer/?page=1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"site": "律图(m站)",
|
||||||
|
"method": "POST",
|
||||||
|
"url": "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/",
|
||||||
|
"data": {
|
||||||
|
"RegionId": "110100", # 北京市
|
||||||
|
"OnlyData": "true",
|
||||||
|
"LawyerRecommendRequest[AreaId]": "110100",
|
||||||
|
"LawyerRecommendRequest[PageIndex]": "1",
|
||||||
|
"LawyerRecommendRequest[PageSize]": "10",
|
||||||
|
"LawyerRecommendRequest[OrderType]": "0",
|
||||||
|
"LawyerRecommendRequest[Type]": "1",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"site": "华律(m站)",
|
||||||
|
"method": "POST",
|
||||||
|
"url": "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/",
|
||||||
|
"data": {
|
||||||
|
"pid": "110000", # 北京
|
||||||
|
"cid": "110100", # 北京市
|
||||||
|
"page": "1",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
|
||||||
|
mode = os.getenv("PROXY_ENABLED")
|
||||||
|
print(f"[smoke] PROXY_ENABLED={mode!r}")
|
||||||
|
s = _build_session()
|
||||||
|
results: List[CheckResult] = []
|
||||||
|
for item in _tests():
|
||||||
|
res = _check(
|
||||||
|
s,
|
||||||
|
site=item["site"],
|
||||||
|
method=item["method"],
|
||||||
|
url=item["url"],
|
||||||
|
headers=item.get("headers"),
|
||||||
|
data=item.get("data"),
|
||||||
|
)
|
||||||
|
results.append(res)
|
||||||
|
print(
|
||||||
|
f"[smoke] {res.site} {res.method} {res.status_code} ok={res.ok} "
|
||||||
|
f"{res.elapsed_ms}ms hint={res.hint or '-'} err={res.error or '-'}"
|
||||||
|
)
|
||||||
|
time.sleep(0.3)
|
||||||
|
|
||||||
|
summary = {
|
||||||
|
"proxy_enabled": mode,
|
||||||
|
"results": [res.__dict__ for res in results],
|
||||||
|
}
|
||||||
|
print("[smoke] summary_json=" + json.dumps(summary, ensure_ascii=False))
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
raise SystemExit(main())
|
||||||
|
|
||||||
+31
-71
@@ -1,80 +1,40 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
# 切换到脚本所在目录,确保相对路径正确
|
||||||
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
cd "$(dirname "$0")"
|
||||||
LOG_DIR="${PROJECT_ROOT}/logs"
|
|
||||||
DATA_DIR="${PROJECT_ROOT}/data"
|
|
||||||
|
|
||||||
mkdir -p "${LOG_DIR}" "${DATA_DIR}"
|
echo "使用 request/proxy_settings.json 读取代理配置"
|
||||||
|
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
|
||||||
|
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"
|
||||||
|
|
||||||
if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
|
is_job_running() {
|
||||||
PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
|
local script="$1"
|
||||||
else
|
local script_regex="${script//./\\.}"
|
||||||
PYTHON_BIN="python3"
|
pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
|
||||||
fi
|
|
||||||
|
|
||||||
RUN_MODE="${RUN_MODE:-parallel}" # parallel | sequential
|
|
||||||
|
|
||||||
echo "[start] project=${PROJECT_ROOT}"
|
|
||||||
echo "[start] python=${PYTHON_BIN}"
|
|
||||||
echo "[start] mode=${RUN_MODE}"
|
|
||||||
echo "[start] proxy=request/proxy_settings.json"
|
|
||||||
|
|
||||||
# 大律师(新结构采集 + 写库)可通过环境变量控制
|
|
||||||
DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
|
|
||||||
DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
|
|
||||||
DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
|
|
||||||
DLS_SLEEP="${DLS_SLEEP:-0.2}"
|
|
||||||
DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"
|
|
||||||
DLS_EXTRA_ARGS=()
|
|
||||||
|
|
||||||
if [[ "${DLS_MAX_CITIES}" != "0" ]]; then
|
|
||||||
DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
|
|
||||||
fi
|
|
||||||
if [[ "${DLS_MAX_PAGES}" != "0" ]]; then
|
|
||||||
DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
|
|
||||||
fi
|
|
||||||
if [[ -n "${DLS_CITY_FILTER}" ]]; then
|
|
||||||
DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
|
|
||||||
fi
|
|
||||||
DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")
|
|
||||||
|
|
||||||
if [[ "${DLS_DIRECT:-0}" == "1" ]]; then
|
|
||||||
DLS_EXTRA_ARGS+=(--direct)
|
|
||||||
fi
|
|
||||||
if [[ "${DLS_NO_DB:-0}" == "1" ]]; then
|
|
||||||
DLS_EXTRA_ARGS+=(--no-db)
|
|
||||||
fi
|
|
||||||
|
|
||||||
run_bg() {
|
|
||||||
local name="$1"
|
|
||||||
shift
|
|
||||||
local logfile="${LOG_DIR}/${name}.log"
|
|
||||||
nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 &
|
|
||||||
echo "[start] ${name} pid=$! log=${logfile}"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
run_fg() {
|
start_job() {
|
||||||
local name="$1"
|
local script="$1"
|
||||||
shift
|
local log_file="$2"
|
||||||
local logfile="${LOG_DIR}/${name}.log"
|
local label="$3"
|
||||||
echo "[start] ${name} fg log=${logfile}"
|
local existing
|
||||||
env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1
|
|
||||||
|
existing="$(is_job_running "${script}")"
|
||||||
|
if [[ -n "${existing}" ]]; then
|
||||||
|
echo "跳过 ${label}: ${script} 已在运行"
|
||||||
|
echo "${existing}" | head -n 1
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
|
||||||
|
echo "启动 ${label}: ${script} -> ${log_file}"
|
||||||
|
sleep 1
|
||||||
}
|
}
|
||||||
|
|
||||||
if [[ "${RUN_MODE}" == "sequential" ]]; then
|
start_job "dls.py" "dls.log" "大律师"
|
||||||
run_fg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
|
start_job "dls_pc.py" "dls_pc.log" "大律师PC站"
|
||||||
run_fg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
|
start_job "findlaw.py" "findlaw.log" "找法网"
|
||||||
run_fg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
|
start_job "lawtime.py" "lawtime.log" "法律快车"
|
||||||
run_fg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
|
start_job "six4365.py" "six4365.log" "律图"
|
||||||
run_fg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
|
start_job "hualv.py" "hualv.log" "华律"
|
||||||
echo "[done] sequential completed"
|
|
||||||
else
|
|
||||||
run_bg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
|
|
||||||
run_bg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
|
|
||||||
run_bg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
|
|
||||||
run_bg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
|
|
||||||
run_bg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
|
|
||||||
echo "[done] all crawlers started in background"
|
|
||||||
fi
|
|
||||||
|
|||||||
Executable
+48
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# 切换到脚本所在目录,确保相对路径正确
|
||||||
|
cd "$(dirname "$0")"
|
||||||
|
|
||||||
|
# 强制直连:不使用代理 IP
|
||||||
|
export PROXY_ENABLED=0
|
||||||
|
|
||||||
|
# 直连模式建议更保守一些,降低被临时风控的概率
|
||||||
|
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
|
||||||
|
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"
|
||||||
|
|
||||||
|
is_job_running() {
|
||||||
|
local script="$1"
|
||||||
|
local script_regex="${script//./\\.}"
|
||||||
|
pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
|
||||||
|
}
|
||||||
|
|
||||||
|
start_job() {
|
||||||
|
local script="$1"
|
||||||
|
local log_file="$2"
|
||||||
|
local label="$3"
|
||||||
|
local existing
|
||||||
|
|
||||||
|
existing="$(is_job_running "${script}")"
|
||||||
|
if [[ -n "${existing}" ]]; then
|
||||||
|
echo "跳过 ${label}: ${script} 已在运行"
|
||||||
|
echo "${existing}" | head -n 1
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
|
||||||
|
echo "启动 ${label}: ${script} -> ${log_file}"
|
||||||
|
sleep 1
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "直连模式(PROXY_ENABLED=0),每周两次建议用 cron 调度"
|
||||||
|
echo "当前归入直连组:大律师(m/PC)、华律、律图"
|
||||||
|
|
||||||
|
# 直连优先站点:
|
||||||
|
# - 大律师(m站/PC站):当前可直接访问,未见明显强风控
|
||||||
|
# - 华律:当前网页可直接访问,未见明显强风控
|
||||||
|
# - 律图:当前网页可直接访问,未见明显强风控
|
||||||
|
start_job "dls.py" "direct_dls.log" "大律师(直连)"
|
||||||
|
start_job "dls_pc.py" "direct_dls_pc.log" "大律师PC站(直连)"
|
||||||
|
start_job "hualv.py" "direct_hualv.log" "华律(直连)"
|
||||||
|
start_job "six4365.py" "direct_six4365.log" "律图(直连)"
|
||||||
Executable
+53
@@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# 切换到脚本所在目录,确保相对路径正确
|
||||||
|
cd "$(dirname "$0")"
|
||||||
|
|
||||||
|
# 强制开启代理:用于容易被限频/拦截的站点
|
||||||
|
export PROXY_ENABLED=1
|
||||||
|
|
||||||
|
# 代理模式下默认更保守一点,避免冲爆代理与触发风控
|
||||||
|
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
|
||||||
|
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"
|
||||||
|
|
||||||
|
# 可选:开启代理连通性测试输出(部分脚本会打印测试信息/代理状态)
|
||||||
|
export PROXY_TEST="${PROXY_TEST:-0}"
|
||||||
|
|
||||||
|
is_job_running() {
|
||||||
|
local script="$1"
|
||||||
|
local script_regex="${script//./\\.}"
|
||||||
|
pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
|
||||||
|
}
|
||||||
|
|
||||||
|
start_job() {
|
||||||
|
local script="$1"
|
||||||
|
local log_file="$2"
|
||||||
|
local label="$3"
|
||||||
|
local existing
|
||||||
|
|
||||||
|
existing="$(is_job_running "${script}")"
|
||||||
|
if [[ -n "${existing}" ]]; then
|
||||||
|
echo "跳过 ${label}: ${script} 已在运行"
|
||||||
|
echo "${existing}" | head -n 1
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
|
||||||
|
echo "启动 ${label}: ${script} -> ${log_file}"
|
||||||
|
sleep 1
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "代理模式(PROXY_ENABLED=1),每周一次建议用 cron 调度"
|
||||||
|
echo "代理配置读取自 request/proxy_settings.json"
|
||||||
|
echo "每周一次代理任务 = 全量采集所有站点"
|
||||||
|
|
||||||
|
# 每周一次代理任务做全量采集:
|
||||||
|
# - 强风控/更敏感站点:找法网、法律快车
|
||||||
|
# - 其余站点也一并跑,保证每周至少有一次“全量最新数据”刷新
|
||||||
|
start_job "dls.py" "proxy_dls.log" "大律师(代理全量)"
|
||||||
|
start_job "dls_pc.py" "proxy_dls_pc.log" "大律师PC站(代理全量)"
|
||||||
|
start_job "findlaw.py" "proxy_findlaw.log" "找法网(代理)"
|
||||||
|
start_job "lawtime.py" "proxy_lawtime.log" "法律快车(代理)"
|
||||||
|
start_job "hualv.py" "proxy_hualv.log" "华律(代理全量)"
|
||||||
|
start_job "six4365.py" "proxy_six4365.log" "律图(代理全量)"
|
||||||
@@ -0,0 +1,565 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||||
|
|
||||||
|
import pymysql
|
||||||
|
from openpyxl import Workbook, load_workbook
|
||||||
|
from openpyxl.styles import Font
|
||||||
|
|
||||||
|
from config import DB_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class LawyerRecord:
|
||||||
|
id: int
|
||||||
|
name: str
|
||||||
|
phone: str
|
||||||
|
law_firm: str
|
||||||
|
province: str
|
||||||
|
city: str
|
||||||
|
domain: str
|
||||||
|
create_time: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class PhoneBackfill:
|
||||||
|
matched_phones: List[str]
|
||||||
|
records: List[LawyerRecord]
|
||||||
|
best_name: str
|
||||||
|
best_law_firm: str
|
||||||
|
best_domain: str
|
||||||
|
candidate_names: List[str]
|
||||||
|
candidate_firms: List[str]
|
||||||
|
candidate_domains: List[str]
|
||||||
|
|
||||||
|
|
||||||
|
DOMAIN_PRIORITY = {
|
||||||
|
"华律": 90,
|
||||||
|
"大律师": 85,
|
||||||
|
"找法网": 82,
|
||||||
|
"法律快车": 80,
|
||||||
|
"律图": 72,
|
||||||
|
"众法利单页": 68,
|
||||||
|
"众法利": 66,
|
||||||
|
"六四三六五": 64,
|
||||||
|
"智飞律师在线": 40,
|
||||||
|
"高德地图": 10,
|
||||||
|
}
|
||||||
|
|
||||||
|
GENERIC_FIRMS = {"高德搜索"}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description="按律所名从数据库补手机号并导出对比表")
|
||||||
|
parser.add_argument("--input", default="man.xlsx", help="原始 xlsx 文件路径")
|
||||||
|
parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
default="man_firm_phone_compare.xlsx",
|
||||||
|
help="输出 xlsx 文件路径",
|
||||||
|
)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_text(value: object) -> str:
|
||||||
|
text = str(value or "").strip()
|
||||||
|
text = text.replace("(", "(").replace(")", ")")
|
||||||
|
text = re.sub(r"\s+", "", text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_firm(value: object) -> str:
|
||||||
|
text = normalize_text(value)
|
||||||
|
text = text.replace("本地大所", "").replace("特色律所", "")
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_name(value: object) -> str:
|
||||||
|
text = normalize_text(value)
|
||||||
|
return text.replace("律师", "")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_province(value: object) -> str:
|
||||||
|
text = str(value or "").strip()
|
||||||
|
mapping = {
|
||||||
|
"北京市": "北京",
|
||||||
|
"天津市": "天津",
|
||||||
|
"上海市": "上海",
|
||||||
|
"重庆市": "重庆",
|
||||||
|
"内蒙古自治区": "内蒙古",
|
||||||
|
"广西壮族自治区": "广西",
|
||||||
|
"宁夏回族自治区": "宁夏",
|
||||||
|
"新疆维吾尔自治区": "新疆",
|
||||||
|
"西藏自治区": "西藏",
|
||||||
|
"香港特别行政区": "香港",
|
||||||
|
"澳门特别行政区": "澳门",
|
||||||
|
"新疆生产建设兵团": "新疆",
|
||||||
|
}
|
||||||
|
if text in mapping:
|
||||||
|
return mapping[text]
|
||||||
|
if text.endswith("省") and len(text) > 1:
|
||||||
|
return text[:-1]
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_city(value: object) -> str:
|
||||||
|
text = str(value or "").strip()
|
||||||
|
for suffix in ("市", "地区", "盟"):
|
||||||
|
if text.endswith(suffix) and len(text) > len(suffix):
|
||||||
|
return text[: -len(suffix)]
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def split_phones(value: object) -> List[str]:
|
||||||
|
return re.findall(r"1\d{10}", str(value or ""))
|
||||||
|
|
||||||
|
|
||||||
|
def unique_phones(records: Sequence[LawyerRecord]) -> List[str]:
|
||||||
|
output: List[str] = []
|
||||||
|
seen = set()
|
||||||
|
for record in sorted(records, key=lambda item: (item.create_time, item.id), reverse=True):
|
||||||
|
if record.phone and record.phone not in seen:
|
||||||
|
seen.add(record.phone)
|
||||||
|
output.append(record.phone)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def unique_values(records: Sequence[LawyerRecord], attr: str) -> List[str]:
|
||||||
|
output: List[str] = []
|
||||||
|
seen = set()
|
||||||
|
for record in sorted(records, key=lambda item: (item.create_time, item.id), reverse=True):
|
||||||
|
value = getattr(record, attr, "")
|
||||||
|
if value and value not in seen:
|
||||||
|
seen.add(value)
|
||||||
|
output.append(value)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def phone_record_sort_key(
|
||||||
|
record: LawyerRecord,
|
||||||
|
target_name: object,
|
||||||
|
target_province: object,
|
||||||
|
target_city: object,
|
||||||
|
) -> Tuple[int, int, int]:
|
||||||
|
score = 0
|
||||||
|
normalized_target_name = normalize_name(target_name)
|
||||||
|
normalized_target_province = normalize_province(target_province)
|
||||||
|
normalized_target_city = normalize_city(target_city)
|
||||||
|
|
||||||
|
if normalized_target_name:
|
||||||
|
if normalize_name(record.name) == normalized_target_name:
|
||||||
|
score += 400
|
||||||
|
elif record.name:
|
||||||
|
score -= 40
|
||||||
|
|
||||||
|
if record.law_firm and record.law_firm not in GENERIC_FIRMS:
|
||||||
|
score += 220
|
||||||
|
elif record.law_firm:
|
||||||
|
score += 40
|
||||||
|
|
||||||
|
if record.name:
|
||||||
|
score += 100
|
||||||
|
|
||||||
|
if normalized_target_city:
|
||||||
|
if normalize_city(record.city) == normalized_target_city:
|
||||||
|
score += 45
|
||||||
|
elif record.city:
|
||||||
|
score -= 10
|
||||||
|
|
||||||
|
if normalized_target_province:
|
||||||
|
if normalize_province(record.province) == normalized_target_province:
|
||||||
|
score += 25
|
||||||
|
elif record.province:
|
||||||
|
score -= 5
|
||||||
|
|
||||||
|
score += DOMAIN_PRIORITY.get(record.domain, 50)
|
||||||
|
return score, record.create_time, record.id
|
||||||
|
|
||||||
|
|
||||||
|
def compare_result(original_phones: Sequence[str], candidate_phones: Sequence[str]) -> str:
|
||||||
|
if not candidate_phones:
|
||||||
|
return "未匹配"
|
||||||
|
if not original_phones:
|
||||||
|
return "原手机号为空"
|
||||||
|
|
||||||
|
original_set = set(original_phones)
|
||||||
|
candidate_set = set(candidate_phones)
|
||||||
|
if original_set == candidate_set:
|
||||||
|
return "完全一致"
|
||||||
|
if original_set & candidate_set:
|
||||||
|
return "候选包含原手机号"
|
||||||
|
return "不包含原手机号"
|
||||||
|
|
||||||
|
|
||||||
|
def infer_firm_from_address(address: object, ordered_firms: Sequence[str]) -> str:
|
||||||
|
normalized_address = normalize_text(address)
|
||||||
|
if not normalized_address:
|
||||||
|
return ""
|
||||||
|
for firm in ordered_firms:
|
||||||
|
if len(firm) < 4:
|
||||||
|
continue
|
||||||
|
if firm in normalized_address:
|
||||||
|
return firm
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def load_db_indexes() -> Tuple[Dict[str, List[LawyerRecord]], List[str], Dict[str, List[LawyerRecord]]]:
|
||||||
|
conn = pymysql.connect(**DB_CONFIG)
|
||||||
|
firm_index: Dict[str, List[LawyerRecord]] = defaultdict(list)
|
||||||
|
phone_index: Dict[str, List[LawyerRecord]] = defaultdict(list)
|
||||||
|
try:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute(
|
||||||
|
"""
|
||||||
|
SELECT id, name, phone, law_firm, province, city, domain, create_time
|
||||||
|
FROM lawyer
|
||||||
|
WHERE phone IS NOT NULL
|
||||||
|
AND phone <> ''
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
for row in cur.fetchall():
|
||||||
|
record = LawyerRecord(
|
||||||
|
id=int(row[0]),
|
||||||
|
name=str(row[1] or "").strip(),
|
||||||
|
phone=str(row[2] or "").strip(),
|
||||||
|
law_firm=str(row[3] or "").strip(),
|
||||||
|
province=str(row[4] or "").strip(),
|
||||||
|
city=str(row[5] or "").strip(),
|
||||||
|
domain=str(row[6] or "").strip(),
|
||||||
|
create_time=int(row[7] or 0),
|
||||||
|
)
|
||||||
|
phone_index[record.phone].append(record)
|
||||||
|
normalized_firm = normalize_firm(record.law_firm)
|
||||||
|
if normalized_firm:
|
||||||
|
firm_index[normalized_firm].append(record)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
ordered_firms = sorted(firm_index.keys(), key=len, reverse=True)
|
||||||
|
return firm_index, ordered_firms, phone_index
|
||||||
|
|
||||||
|
|
||||||
|
def build_phone_backfill(
|
||||||
|
original_phone: object,
|
||||||
|
name: object,
|
||||||
|
province: object,
|
||||||
|
city: object,
|
||||||
|
phone_index: Dict[str, List[LawyerRecord]],
|
||||||
|
) -> PhoneBackfill:
|
||||||
|
def pick_best_name(records: Sequence[LawyerRecord], target_name: object) -> str:
|
||||||
|
normalized_target_name = normalize_name(target_name)
|
||||||
|
if normalized_target_name:
|
||||||
|
for item in records:
|
||||||
|
if item.name and normalize_name(item.name) == normalized_target_name:
|
||||||
|
return item.name
|
||||||
|
for item in records:
|
||||||
|
if item.name:
|
||||||
|
return item.name
|
||||||
|
return ""
|
||||||
|
|
||||||
|
records: List[LawyerRecord] = []
|
||||||
|
seen_ids = set()
|
||||||
|
for phone in split_phones(original_phone):
|
||||||
|
for record in phone_index.get(phone, []):
|
||||||
|
if record.id in seen_ids:
|
||||||
|
continue
|
||||||
|
seen_ids.add(record.id)
|
||||||
|
records.append(record)
|
||||||
|
|
||||||
|
sorted_records = sorted(
|
||||||
|
records,
|
||||||
|
key=lambda item: phone_record_sort_key(item, name, province, city),
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
|
candidate_names = unique_values(sorted_records, "name")
|
||||||
|
candidate_firms = unique_values(
|
||||||
|
[item for item in sorted_records if item.law_firm and item.law_firm not in GENERIC_FIRMS],
|
||||||
|
"law_firm",
|
||||||
|
)
|
||||||
|
if not candidate_firms:
|
||||||
|
candidate_firms = unique_values(
|
||||||
|
[item for item in sorted_records if item.law_firm],
|
||||||
|
"law_firm",
|
||||||
|
)
|
||||||
|
candidate_domains = unique_values(sorted_records, "domain")
|
||||||
|
matched_phones = unique_values(sorted_records, "phone")
|
||||||
|
|
||||||
|
best_name = pick_best_name(sorted_records, name)
|
||||||
|
best_law_firm = ""
|
||||||
|
best_domain = ""
|
||||||
|
preferred_name = normalize_name(name) or normalize_name(best_name)
|
||||||
|
|
||||||
|
for record in sorted_records:
|
||||||
|
if not record.law_firm or record.law_firm in GENERIC_FIRMS:
|
||||||
|
continue
|
||||||
|
if preferred_name and normalize_name(record.name) != preferred_name:
|
||||||
|
continue
|
||||||
|
best_law_firm = record.law_firm
|
||||||
|
best_domain = record.domain
|
||||||
|
break
|
||||||
|
|
||||||
|
if not best_law_firm:
|
||||||
|
for record in sorted_records:
|
||||||
|
if record.law_firm and record.law_firm not in GENERIC_FIRMS:
|
||||||
|
best_law_firm = record.law_firm
|
||||||
|
best_domain = record.domain
|
||||||
|
break
|
||||||
|
|
||||||
|
if not best_domain and sorted_records:
|
||||||
|
best_domain = sorted_records[0].domain
|
||||||
|
|
||||||
|
return PhoneBackfill(
|
||||||
|
matched_phones=matched_phones,
|
||||||
|
records=sorted_records,
|
||||||
|
best_name=best_name,
|
||||||
|
best_law_firm=best_law_firm,
|
||||||
|
best_domain=best_domain,
|
||||||
|
candidate_names=candidate_names,
|
||||||
|
candidate_firms=candidate_firms,
|
||||||
|
candidate_domains=candidate_domains,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def match_row(
|
||||||
|
name: object,
|
||||||
|
original_phone: object,
|
||||||
|
law_firm: object,
|
||||||
|
province: object,
|
||||||
|
city: object,
|
||||||
|
address: object,
|
||||||
|
phone_backfill: PhoneBackfill,
|
||||||
|
firm_index: Dict[str, List[LawyerRecord]],
|
||||||
|
ordered_firms: Sequence[str],
|
||||||
|
) -> Tuple[str, str, List[LawyerRecord]]:
|
||||||
|
def add_method(part: str, method_parts: List[str]) -> None:
|
||||||
|
if part and part not in method_parts:
|
||||||
|
method_parts.append(part)
|
||||||
|
|
||||||
|
matched_firm = normalize_firm(law_firm)
|
||||||
|
used_phone_backfill_firm = False
|
||||||
|
inferred_from_address = False
|
||||||
|
if not matched_firm:
|
||||||
|
matched_firm = normalize_firm(phone_backfill.best_law_firm)
|
||||||
|
used_phone_backfill_firm = bool(matched_firm)
|
||||||
|
if not matched_firm:
|
||||||
|
matched_firm = infer_firm_from_address(address, ordered_firms)
|
||||||
|
inferred_from_address = bool(matched_firm)
|
||||||
|
if not matched_firm:
|
||||||
|
return "", "无可用律所名", []
|
||||||
|
|
||||||
|
candidates = firm_index.get(matched_firm, [])
|
||||||
|
if not candidates:
|
||||||
|
return matched_firm, "数据库无此律所", []
|
||||||
|
|
||||||
|
method_parts = ["律所"]
|
||||||
|
chosen = list(candidates)
|
||||||
|
|
||||||
|
normalized_name = normalize_name(name)
|
||||||
|
if not normalized_name:
|
||||||
|
normalized_name = normalize_name(phone_backfill.best_name)
|
||||||
|
if normalized_name:
|
||||||
|
name_filtered = [item for item in chosen if normalize_name(item.name) == normalized_name]
|
||||||
|
if name_filtered:
|
||||||
|
chosen = name_filtered
|
||||||
|
add_method("姓名", method_parts)
|
||||||
|
|
||||||
|
if len(unique_phones(chosen)) != 1:
|
||||||
|
normalized_province = normalize_province(province)
|
||||||
|
normalized_city = normalize_city(city)
|
||||||
|
|
||||||
|
if normalized_province and normalized_city:
|
||||||
|
province_city_filtered = [
|
||||||
|
item
|
||||||
|
for item in chosen
|
||||||
|
if normalize_province(item.province) == normalized_province
|
||||||
|
and normalize_city(item.city) == normalized_city
|
||||||
|
]
|
||||||
|
if province_city_filtered:
|
||||||
|
chosen = province_city_filtered
|
||||||
|
add_method("省份", method_parts)
|
||||||
|
add_method("城市", method_parts)
|
||||||
|
|
||||||
|
if len(unique_phones(chosen)) != 1 and normalized_city:
|
||||||
|
city_filtered = [
|
||||||
|
item for item in chosen if normalize_city(item.city) == normalized_city
|
||||||
|
]
|
||||||
|
if city_filtered:
|
||||||
|
chosen = city_filtered
|
||||||
|
add_method("城市", method_parts)
|
||||||
|
|
||||||
|
if len(unique_phones(chosen)) != 1 and normalized_province:
|
||||||
|
province_filtered = [
|
||||||
|
item
|
||||||
|
for item in chosen
|
||||||
|
if normalize_province(item.province) == normalized_province
|
||||||
|
]
|
||||||
|
if province_filtered:
|
||||||
|
chosen = province_filtered
|
||||||
|
add_method("省份", method_parts)
|
||||||
|
|
||||||
|
method = "+".join(method_parts)
|
||||||
|
if used_phone_backfill_firm:
|
||||||
|
method = "手机号回填律所|" + method
|
||||||
|
elif inferred_from_address:
|
||||||
|
method = "地址推断律所|" + method
|
||||||
|
return matched_firm, method, chosen
|
||||||
|
|
||||||
|
|
||||||
|
def autosize_columns(ws) -> None:
|
||||||
|
for column_cells in ws.columns:
|
||||||
|
values = [str(cell.value or "") for cell in column_cells]
|
||||||
|
max_length = min(max((len(value) for value in values), default=0), 60)
|
||||||
|
column_letter = column_cells[0].column_letter
|
||||||
|
ws.column_dimensions[column_letter].width = max_length + 2
|
||||||
|
|
||||||
|
|
||||||
|
def iter_input_rows(ws) -> Iterable[Tuple[int, List[object]]]:
|
||||||
|
for row_idx in range(1, ws.max_row + 1):
|
||||||
|
yield row_idx, [ws.cell(row_idx, col_idx).value for col_idx in range(1, 8)]
|
||||||
|
|
||||||
|
|
||||||
|
def build_output(input_path: str, output_path: str) -> Dict[str, int]:
    """Read the source workbook, match every row against the DB indexes, and write
    a comparison workbook to *output_path*.

    Returns a dict of summary counters (matched/unmatched/unique/... row counts).
    """
    workbook = load_workbook(input_path)
    source_ws = workbook.active

    # Indexes built from the database: firm-name index, insertion-ordered firm
    # list, and phone-number index (all defined by load_db_indexes elsewhere).
    firm_index, ordered_firms, phone_index = load_db_indexes()

    out_wb = Workbook()
    out_ws = out_wb.active
    out_ws.title = "firm_phone_compare"
    # Output column headers (Chinese, user-facing — must stay byte-identical).
    headers = [
        "原始行号",
        "原姓名",
        "原手机号",
        "原律所",
        "原省份",
        "原城市",
        "原地址",
        "原备注",
        "手机号命中记录数",
        "手机号命中手机号",
        "手机号补全姓名",
        "手机号补全律所",
        "手机号补全来源",
        "手机号候选姓名",
        "手机号候选律所",
        "用于匹配的律所",
        "匹配方式",
        "数据库候选手机号",
        "候选数量",
        "原手机号对比",
        "数据库候选姓名",
        "数据库候选省市",
        "数据库来源",
    ]
    out_ws.append(headers)
    for cell in out_ws[1]:
        cell.font = Font(bold=True)

    stats = defaultdict(int)
    # NOTE(review): iter_input_rows starts at row 1, so if the input sheet has a
    # header row it is processed like a data row — confirm against the input format.
    for row_idx, row in iter_input_rows(source_ws):
        name, original_phone, law_firm, province, city, address, remark = row
        # Rows with no usable firm name get phone-based backfill of name/firm.
        needs_phone_completion = not normalize_firm(law_firm)
        phone_backfill = build_phone_backfill(
            original_phone=original_phone,
            name=name,
            province=province,
            city=city,
            phone_index=phone_index,
        )
        matched_firm, method, matched_records = match_row(
            name=name,
            original_phone=original_phone,
            law_firm=law_firm,
            province=province,
            city=city,
            address=address,
            phone_backfill=phone_backfill,
            firm_index=firm_index,
            ordered_firms=ordered_firms,
        )
        candidate_phones = unique_phones(matched_records)
        compare = compare_result(split_phones(original_phone), candidate_phones)
        candidate_names = unique_values(matched_records, "name")
        candidate_domains = unique_values(matched_records, "domain")
        # Deduplicated "province-city" labels, preserving first-seen order.
        city_province_pairs = []
        seen_pairs = set()
        for record in matched_records:
            pair = f"{record.province}-{record.city}".strip("-")
            if pair and pair not in seen_pairs:
                seen_pairs.add(pair)
                city_province_pairs.append(pair)

        # One output row per input row; backfill columns are blank unless the
        # row actually needed phone-based completion.
        out_ws.append(
            [
                row_idx,
                name or "",
                original_phone or "",
                law_firm or "",
                province or "",
                city or "",
                address or "",
                remark or "",
                len(phone_backfill.records) if needs_phone_completion else "",
                " / ".join(phone_backfill.matched_phones) if needs_phone_completion else "",
                phone_backfill.best_name if needs_phone_completion else "",
                phone_backfill.best_law_firm if needs_phone_completion else "",
                phone_backfill.best_domain if needs_phone_completion else "",
                " / ".join(phone_backfill.candidate_names) if needs_phone_completion else "",
                " / ".join(phone_backfill.candidate_firms) if needs_phone_completion else "",
                matched_firm or "",
                method or "",
                " / ".join(candidate_phones) or "",
                len(candidate_phones),
                compare,
                " / ".join(candidate_names) or "",
                " / ".join(city_province_pairs) or "",
                " / ".join(candidate_domains) or "",
            ]
        )

        # --- summary counters ---
        if needs_phone_completion and phone_backfill.records:
            stats["phone_backfill_hit_rows"] += 1
        if needs_phone_completion and phone_backfill.best_name:
            stats["phone_backfill_name_rows"] += 1
        if needs_phone_completion and phone_backfill.best_law_firm:
            stats["phone_backfill_firm_rows"] += 1
        if needs_phone_completion and method.startswith("手机号回填律所|"):
            stats["phone_backfill_used_for_match_rows"] += 1

        if candidate_phones:
            stats["matched_rows"] += 1
            if len(candidate_phones) == 1:
                stats["unique_rows"] += 1
            else:
                stats["multi_rows"] += 1
        else:
            stats["unmatched_rows"] += 1

        # compare_result returns one of four fixed Chinese labels.
        if compare == "完全一致":
            stats["same_rows"] += 1
        elif compare == "候选包含原手机号":
            stats["contains_rows"] += 1
        elif compare == "不包含原手机号":
            stats["diff_rows"] += 1
        elif compare == "原手机号为空":
            stats["blank_phone_rows"] += 1

    out_ws.freeze_panes = "A2"  # keep the header visible while scrolling
    autosize_columns(out_ws)
    out_wb.save(output_path)
    return dict(stats)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: run the comparison and print the summary counters."""
    args = parse_args()
    summary = build_output(args.input, args.output)
    print(f"已生成: {args.output}")
    for key in sorted(summary):
        print(f"{key}={summary[key]}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -1,22 +1,114 @@
|
|||||||
# Database connection configuration.
# NOTE(review): credentials and API keys are hard-coded below — consider moving
# them to environment variables or an untracked secrets file.
DB_CONFIG = {
    "host": "8.134.219.222",  # database host
    "user": "lawyer",  # database user
    "password": "CTxr8yGwsSX3NdfJ",  # database password
    "database": "lawyer",  # database name
    "charset": "utf8mb4",
}

# Amap (Gaode Maps) API configuration.
GAODE_CONFIG = {
    "API_KEY": "f261575fb28003761c433f6c9379e89d",
}

# WeChat (mp.weixin.qq.com) crawler-specific configuration.
WEIXIN_CONFIG = {
    "TOKEN": "553117235",  # account token
    "FINGERPRINT": "3c02c35093184e9a9a668ac3c81e53f9",
    # Session cookies copied from a logged-in browser session.
    "COOKIE": {
        "appmsglist_action_3258147150": "card",
        "_qimei_uuid42": "1a302160d051008226aec905b63f99ff3989f30009",
        "_qimei_i_3": "63b22b84c15204dfc595ac6452d722b1f0bdf0f6145b568ae68a7c0e70947438686637943989e2a1d792",
        "_qimei_h38": "215986ce26aec905b63f99ff0200000e81a302",
        "ua_id": "S7gglu0eZh9NkAzLAAAAADH8dynpnFZVN29lxm7BQo0=",
        "wxuin": "73074968761097",
        "mm_lang": "zh_CN",
        "eas_sid": "91X7I7K4K5k364U2z3k2I980F5",
        "_qimei_q36": "",
        "_qimei_fingerprint": "d895c46d5fda98cab67d9daec00068ed",
        "_qimei_i_1": "4dc76680945f59d3c7c4ab325dd526b3feeea1a31458558bbdd97e582493206c6163629d39d8e1dcd4b2c28f",
        "pgv_pvid": "6923507145",
        "ts_uid": "9585717820",
        "_t_qbtool_uid": "aaaa2vn5byd280l00iglw701zci788cb",
        "_ga": "GA1.1.1323926288.1775838938",
        "_ga_TPFW0KPXC1": "GS2.1.s1775841484$o2$g1$t1775841485$j59$l0$h0",
        "uuid": "20d1cfb540221c6e7b6d665ab1d4a8f7",
        "rand_info": "CAESIA8LYV6dvWh5dYrgQLPhZb8TXwUJoWdcdDzN0TTdztSj",
        "slave_bizuin": "3258147150",
        "data_bizuin": "3258147150",
        "bizuin": "3258147150",
        "data_ticket": "dgLFmSrI8f1q6JnYOd2Y/sKJIWjh6YlLSau1n1+Mv5iOTR5hgsm1qjNLypWflGd6",
        "slave_sid": "VGVnNmM5NmFpV19ESElmVlZOTGZfVVJfWE5HanlHNjN0WEswZVkxVk9vc2FTenQzVGRsWUxDT0xGQVBJRVZzU0JNVV9RckRJVE9jSVUwbjl4Z2VHaEZKSzE5WVc3THRCRW96T0Z1V1VwbnBLSnkxSWdKaHdaN1dYdzI1SmdpZ0IyOFJtUE45OTR2Q2NvM1FB",
        "slave_user": "gh_fe76760560d0",
        "xid": "4893c62dc8518b6a1628fd34bc9aa276",
        "_clck": "3258147150|1|g5g|0",
        "_clsk": "1p4oo3h|1776957001796|5|1|mp.weixin.qq.com/weheat-agent/payload/record"
    },
    "COUNT": 20,  # items per page
    "REQUESTS_PER_SECOND": 8,  # max requests per second (higher is faster but riskier)
    "PAGE_DELAY": 0.8,  # seconds to wait after each page
    "CITY_DELAY": 0.3,  # seconds to wait after each city
}

# Default request headers shared by the crawlers.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    'X-Requested-With': 'XMLHttpRequest',
}

# lawtime.cn crawler configuration (mobile UA to hit the mobile site).
LAWTIME_CONFIG = {
    "HEADERS": {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
    }
}

# Redis configuration — used for crawl indexes and checkpoint/resume state.
REDIS_CONFIG = {
    "host": "127.0.0.1",
    "port": 6379,
    "password": "",
    "db": 0,  # logical database 0
    "decode_responses": True,  # return str instead of bytes
    "socket_timeout": 5,  # read timeout (seconds)
    "socket_connect_timeout": 5,  # connect timeout (seconds)
    "health_check_interval": 30,  # seconds between health checks
    "retry_on_timeout": True,  # retry once on timeout
    "max_connections": 20,  # connection-pool cap
}

# Redis key-name templates (formatted with spider_name / session_id).
REDIS_KEYS = {
    "spider_progress": "lawyer:spider:progress:{spider_name}",  # spider progress
    "url_processed": "lawyer:url:processed:{spider_name}",  # set of processed URLs
    "url_failed": "lawyer:url:failed:{spider_name}",  # set of failed URLs
    "spider_stats": "lawyer:stats:{spider_name}",  # per-spider statistics
    "global_stats": "lawyer:global:stats",  # global statistics
    "session_info": "lawyer:session:{session_id}",  # session info
    "url_queue": "lawyer:queue:{spider_name}",  # URL queue
    "duplicate_filter": "lawyer:duplicate:{spider_name}",  # dedup filter
}

# MongoDB configuration — used for log storage.
MONGO_CONFIG = {
    "uri": "mongodb://127.0.0.1:27017/",
    "database": "lawyer",
    "collections": {
        "logs": "logs",  # general logs
        "spider_logs": "spider_logs",  # spider-specific logs
        "error_logs": "error_logs",  # error logs
        "system_logs": "system_logs",  # system logs
        "performance_logs": "performance_logs"  # performance logs
    },
    "options": {
        "maxPoolSize": 10,  # connection-pool max
        "minPoolSize": 1,  # connection-pool min
        "maxIdleTimeMS": 30000,  # max idle time
        "serverSelectionTimeoutMS": 5000,  # server-selection timeout
        "connectTimeoutMS": 10000,  # connect timeout
        "socketTimeoutMS": 30000,  # socket timeout
    }
}
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
# Local MongoDB used for crawler log storage (see MONGO_CONFIG in config.py).
services:
  mongodb:
    image: mongo:7
    container_name: lawyers_mongodb
    restart: always
    ports:
      # Exposed on the host's default Mongo port; MONGO_CONFIG connects to 127.0.0.1:27017.
      - "27017:27017"
    volumes:
      # Named volume so data survives container recreation.
      - mongodb_data:/data/db
    environment:
      MONGO_INITDB_DATABASE: lawyer

volumes:
  mongodb_data:
|
||||||
@@ -0,0 +1,401 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
from requests.exceptions import RequestException, HTTPError, ConnectionError, Timeout
|
||||||
|
from urllib3.util.retry import Retry
|
||||||
|
|
||||||
|
# 添加项目根目录到系统路径(保留你的原逻辑)
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
if project_root not in sys.path:
|
||||||
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
from Db import Db # 你的 DB 封装
|
||||||
|
import config as project_config
|
||||||
|
|
||||||
|
# logging 配置
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s %(levelname)s %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class GaodeSpider:
    """Collects business phone numbers for lawyers via the Amap (Gaode Maps)
    place-text-search API and stores them in the `lawyer` table.

    Workflow: load city list from the DB, search each city for the keyword,
    page through results, extract/validate phone numbers from each POI, and
    insert records that are not already present (dedup on phone + domain).
    """

    def __init__(
        self,
        db_connection,
        api_key: Optional[str] = None,
        offset: int = 20,
        max_pages_per_city: int = 10,
        sleep_between_pages: float = 2.0,
        sleep_between_cities: float = 3.0,
    ):
        # Args:
        #   db_connection: project Db instance (select/insert/exist helpers).
        #   api_key: Amap key; falls back to AMAP_API_KEY env var, then
        #     config.GAODE_CONFIG["API_KEY"]. Raises ValueError if all empty.
        #   offset: results per page (Amap "offset" parameter).
        #   max_pages_per_city: hard cap on pages fetched per city.
        #   sleep_between_pages / sleep_between_cities: throttling delays (seconds).
        self.db = db_connection
        config_api_key = ""
        gaode_config = getattr(project_config, "GAODE_CONFIG", None)
        if isinstance(gaode_config, dict):
            config_api_key = str(gaode_config.get("API_KEY", "")).strip()

        # Precedence: explicit argument > environment variable > config file.
        self.api_key = (api_key or os.environ.get("AMAP_API_KEY", "") or config_api_key).strip()
        if not self.api_key:
            raise ValueError("高德 API Key 未配置,请在 config.py 的 GAODE_CONFIG.API_KEY 或环境变量 AMAP_API_KEY 中填写")
        self.api_base = "https://restapi.amap.com/v3/place/text"
        self.offset = offset
        self.session = self._build_session()
        self.max_pages_per_city = max_pages_per_city
        self.sleep_between_pages = sleep_between_pages
        self.sleep_between_cities = sleep_between_cities

        # City list loaded once up-front; empty dict means run() will abort.
        self.cities = self._load_area_data()

    def _build_session(self) -> requests.Session:
        # Build a requests Session with automatic retries (3 attempts,
        # exponential backoff) for throttling/server errors on GET and POST.
        s = requests.Session()
        retries = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=(429, 500, 502, 503, 504),
            allowed_methods=frozenset(["GET", "POST"])
        )
        adapter = HTTPAdapter(max_retries=retries)
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        })
        return s

    def _load_area_data(self) -> Dict[int, Dict]:
        """Load the city list from the `area_new` table.

        Expects columns: id, code, city, province, pid, pinyin, domain, level.
        Only rows with domain='maxlaw' and level=2 (city level) are loaded.
        Returns {city_id: {code, name, province, pid, pinyin}}, or {} on error.
        """
        try:
            rows = self.db.select_data("area_new", "id, code, city, province, pid, pinyin", "domain='maxlaw' AND level=2")
            result = {}
            for r in rows:
                cid = r.get("id")
                result[cid] = {
                    "code": r.get("code") or "",
                    "name": r.get("city") or "",
                    "province": r.get("province") or "",
                    "pid": r.get("pid"),
                    "pinyin": r.get("pinyin") or ""
                }
            logger.info("加载城市数量: %d", len(result))
            return result
        except Exception as e:
            # Swallow and return empty so run() can abort gracefully.
            logger.exception("从数据库加载地区数据失败: %s", e)
            return {}

    def _search_gaode_api(self, keywords: str, city: str, page: int = 1) -> Dict:
        """Call the Amap place-text-search API; return the parsed JSON dict, or {} on failure."""
        params = {
            "keywords": keywords,
            "city": city,
            "offset": self.offset,
            "page": page,
            "key": self.api_key,
            "extensions": "all"
        }
        # NOTE(review): debug leftover — prints the request params (including the
        # API key) to stdout on every call; consider removing or demoting to logger.debug.
        print(params)
        try:
            resp = self.session.get(self.api_base, params=params, timeout=15)
            resp.raise_for_status()
            data = resp.json()
            return data
        except (HTTPError, ConnectionError, Timeout) as e:
            logger.warning("高德 API 请求失败(%s %s page=%s): %s", keywords, city, page, e)
            return {}
        except ValueError as e:
            # resp.json() raises ValueError on non-JSON bodies.
            logger.error("高德 API 返回非 JSON 数据: %s", e)
            return {}

    def _split_and_clean_phones(self, raw_tel: str) -> List[str]:
        """Split a raw Amap `tel` string into candidate numbers and normalize each."""
        if not raw_tel:
            return []

        logger.debug("原始电话号码: %s", raw_tel)

        # Split on common separators: ; , / 、 | and whitespace (both ASCII and fullwidth comma).
        parts = re.split(r"[;,/,、\|]+|\s+", raw_tel.strip())
        cleaned = []

        for p in parts:
            if not p:
                continue

            original_p = p
            # Strip parenthesized content and any char other than digits/-/+.
            # NOTE(review): the first alternative `(.*?)` matches the empty string
            # at every position, which in re.sub prevents the other alternatives
            # from ever removing anything — if the intent was CJK fullwidth
            # parentheses (.*?), the pattern needs those characters escaped
            # explicitly. Verify against real API payloads.
            p = re.sub(r"(.*?)|\(.*?\)|[^\d\-+]", "", p)
            # Drop an international prefix such as +86.
            p = p.lstrip("+")
            # Drop a leading country code 86 when the remainder is still >11 digits.
            if p.startswith("86") and len(p) > 11:
                p = p[2:]
            # Finally remove dashes.
            p = p.replace("-", "")

            if p:
                cleaned.append(p)
                logger.debug("清洗后号码: %s -> %s", original_p, p)

        logger.debug("清洗后共 %d 个号码: %s", len(cleaned), cleaned)
        return cleaned

    def _is_valid_phone(self, phone: str) -> bool:
        """True only for mainland-China mobile format: exactly 11 digits starting with 1."""
        if not phone:
            return False
        if re.fullmatch(r"1\d{10}", phone):
            return True
        return False

    def _extract_phones_from_poi(self, poi: Dict) -> List[str]:
        """
        Extract all candidate phone numbers from one POI dict.

        Checks, in order: poi['business']['tel'] (the primary field in current
        Amap responses), then legacy top-level keys, then a few nested extension
        dicts. Returns a deduplicated list of numbers passing _is_valid_phone.
        """
        candidates = []

        # 1) Primary field in current API responses: business.tel.
        business = poi.get("business") or {}
        tel = business.get("tel")
        if tel:
            logger.debug("从 business.tel 提取: %s", tel)
            candidates.extend(self._split_and_clean_phones(str(tel)))

        # 2) Legacy/compat: top-level tel/phone/contact-style keys.
        for key in ("tel", "phone", "contact", "business_area"):
            v = poi.get(key)
            if v:
                logger.debug("从 %s 提取: %s", key, v)
                candidates.extend(self._split_and_clean_phones(str(v)))

        # 3) Extension dicts that sometimes carry a phone as well.
        for nested_key in ("biz_ext", "ext", "attributes"):
            nested = poi.get(nested_key) or {}
            if isinstance(nested, dict):
                for subkey in ("tel", "phone", "contact"):
                    if nested.get(subkey):
                        logger.debug("从 %s.%s 提取: %s", nested_key, subkey, nested.get(subkey))
                        candidates.extend(self._split_and_clean_phones(str(nested.get(subkey))))

        # Deduplicate (order-preserving) and keep only validated numbers.
        unique = []
        for c in candidates:
            if c not in unique and self._is_valid_phone(c):
                unique.append(c)
                logger.debug("有效电话号码: %s", c)

        logger.debug("POI %s 提取到 %d 个有效电话号码", poi.get("name", ""), len(unique))
        return unique


    def _is_duplicate(self, phone: str) -> bool:
        """True if *phone* already exists in the `lawyer` table with domain='高德地图'."""
        try:
            # NOTE(review): SQL condition built by string interpolation — phone
            # comes from cleaned digits only here, but a parameterized query via
            # the Db wrapper would be safer if available.
            condition = f"phone='{phone}' AND domain='高德地图'"
            exists = self.db.is_data_exist("lawyer", condition)
            if exists:
                logger.debug("手机号已存在: %s (domain=高德地图)", phone)
            return exists
        except Exception as e:
            logger.exception("去重检查失败: %s", e)
            # Fail closed: treat as duplicate to avoid inserting dirty data.
            return True

    def _parse_poi_to_record(self, poi: Dict, city_info: Dict, province_info: Dict, used_phone: str) -> Dict:
        """Convert one POI + one selected phone number into a `lawyer` table row dict."""
        # shopinfo may be a dict or a plain string depending on the response.
        shopinfo = poi.get("shopinfo")
        if isinstance(shopinfo, dict):
            law_firm = shopinfo.get("shop_name", "高德搜索")
        else:
            law_firm = "高德搜索"

        record = {
            "name": poi.get("name", "").strip(),
            "phone": used_phone,
            "law_firm": law_firm,
            "province": province_info.get("name", ""),
            "city": city_info.get("name", ""),
            "url": poi.get("website", "") or "",
            "domain": "高德地图",
            "create_time": int(time.time()),
            # Raw POI details preserved as JSON for later auditing.
            "params": json.dumps({
                "address": poi.get("address", ""),
                "location": poi.get("location", ""),
                "type": poi.get("type", ""),
                "business_area": poi.get("business_area", ""),
                "raw_tel": poi.get("tel", "") or "",
                "raw_poi": poi
            }, ensure_ascii=False)
        }
        return record

    def _save_lawyer(self, record: Dict) -> bool:
        """Insert one record into the `lawyer` table; return True on success."""
        try:
            self.db.insert_data("lawyer", record)
            logger.info("新增商户: %s (%s)", record.get("name"), record.get("phone"))
            return True
        except Exception as e:
            logger.exception("存储失败: %s %s", record.get("name"), record.get("phone"))
            return False

    def _search_city(self, keywords: str, city_info: Dict, province_info: Dict) -> int:
        """Search one city, paging through results; store new records; return insert count."""
        # The API's `city` parameter accepts a code or a name; the city name is used here.
        # city_code = city_info.get("code") or city_info.get("name")
        city_code = city_info.get("name")
        total_added = 0

        # Fetch the first page to learn the total result count.
        page = 1
        first_resp = self._search_gaode_api(keywords, city_code, page)
        if not first_resp:
            logger.info(" 未获取到第一页数据: %s", keywords)
            return 0

        # Amap signals success with status == "1".
        status = first_resp.get("status")
        if str(status) != "1":
            logger.warning(" 高德返回错误: %s", first_resp.get("info"))
            return 0

        try:
            count = int(first_resp.get("count", 0))
        except Exception:
            count = 0
        # Total pages from count/offset, capped by max_pages_per_city.
        total_pages = math.ceil(count / self.offset) if count else 1
        total_pages = min(total_pages, self.max_pages_per_city)

        logger.info(" 城市 %s 搜索到 count=%s, pages=%s (限制 %s)", city_code, count, total_pages, self.max_pages_per_city)

        def process_page(page_num: int, page_data: Dict) -> int:
            """Store every new phone found on one page; returns the page's insert count."""
            nonlocal total_added
            if not page_data:
                logger.info(" page %s 未返回数据", page_num)
                return 0
            if str(page_data.get("status")) != "1":
                logger.warning(" page %s 返回状态非1: %s", page_num, page_data.get("info"))
                return 0

            pois = page_data.get("pois") or []
            page_added = 0
            for poi in pois:
                name = (poi.get("name") or "").strip()
                if not name:
                    continue
                phones = self._extract_phones_from_poi(poi)
                if not phones:
                    logger.debug(" 跳过无电话: %s", name)
                    continue
                # A POI with several phone numbers yields one record per number.
                for ph in phones:
                    if self._is_duplicate(ph):
                        logger.debug(" 跳过已存在号码: %s (%s)", name, ph)
                        continue
                    rec = self._parse_poi_to_record(poi, city_info, province_info, ph)
                    ok = self._save_lawyer(rec)
                    if ok:
                        page_added += 1
                        total_added += 1
            return page_added

        # Process the already-fetched first page.
        first_page_added = process_page(1, first_resp)
        logger.info(" 城市 %s 第 1 页新增 %d 条", city_code, first_page_added)

        # Guard against processing the same page twice.
        processed_pages = {1}

        # Fetch remaining pages until exhausted or an error/empty page appears.
        for page_num in range(2, total_pages + 1):
            if page_num in processed_pages:
                continue

            time.sleep(self.sleep_between_pages)
            page_data = self._search_gaode_api(keywords, city_code, page_num)

            if not page_data:
                logger.info(" 第 %s 页无响应数据,停止翻页", page_num)
                break

            if str(page_data.get("status")) != "1":
                logger.info(" 第 %s 页状态异常(%s),停止翻页", page_num, page_data.get("info"))
                break

            pois = page_data.get("pois") or []
            if not pois:
                logger.info(" 第 %s 页返回空pois,提前结束", page_num)
                break

            page_added = process_page(page_num, page_data)
            logger.info(" 城市 %s 第 %s 页新增 %d 条", city_code, page_num, page_added)
            processed_pages.add(page_num)

            # A short page means we've reached the tail of the result set.
            if len(pois) < self.offset:
                logger.info(" 第 %s 页结果不足一页,推测已到尾页,提前结束", page_num)
                break

        return total_added

    def run(self):
        """Iterate all loaded cities and collect records; logs per-city and final totals."""
        logger.info("启动高德地图律师信息采集...")
        if not self.cities:
            logger.error("未加载城市列表,退出")
            return

        total_stored = 0
        keywords_suffix = "律师"

        for city_id, city_info in self.cities.items():
            try:
                province_info = {"name": city_info.get("province", "")}
                city_name = city_info.get("name", "")
                if not city_name:
                    continue
                # The search keyword is just "律师"; the city is passed via the
                # API's `city` parameter rather than being embedded in the keyword.
                search_keywords = f"{keywords_suffix}"
                added = self._search_city(search_keywords, city_info, province_info)
                total_stored += added
                logger.info("城市 %s 完成,新增 %d 条,总计 %d", city_name, added, total_stored)
                time.sleep(self.sleep_between_cities)
            except Exception as e:
                # One failing city must not abort the whole run.
                logger.exception("处理城市 %s 时出错: %s", city_info.get("name", ""), e)

        logger.info("采集完成,共新增 %d 条商户信息。", total_stored)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: open a DB connection (context-managed) and run the spider.
    with Db() as db:
        spider = GaodeSpider(db)
        spider.run()
|
||||||
+832
@@ -0,0 +1,832 @@
|
|||||||
|
// ==UserScript==
|
||||||
|
// @name Douyin Batch City Search + AutoScroll + Capture
|
||||||
|
// @namespace http://tampermonkey.net/
|
||||||
|
// @version 1.1
|
||||||
|
// @description 从 Python 服务获取地区列表,按 city + "律师" 搜索并自动下滑,拦截 /aweme/v1/web/discover/search/ 返回并转发到入库接口。
|
||||||
|
// @author You
|
||||||
|
// @match https://www.douyin.com/*
|
||||||
|
// @grant GM_xmlhttpRequest
|
||||||
|
// @connect *
|
||||||
|
// @run-at document-idle
|
||||||
|
// ==/UserScript==
|
||||||
|
|
||||||
|
(function () {
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
/********************* 配置区(按需修改) *********************/
|
||||||
|
const API_BASE = 'http://127.0.0.1:9002'; // 改成你部署 Python 服务的地址,例如 http://nas.nepiedg.site:9002
|
||||||
|
const AREA_API = `${API_BASE}/api/layer/get_area?server=1`; // 获取城市列表的接口
|
||||||
|
const SEND_TARGETS = [
|
||||||
|
`${API_BASE}/api/layer/index?server=1&save_only=0`
|
||||||
|
];
|
||||||
|
|
||||||
|
// 搜索框与按钮选择器(根据页面更新)
|
||||||
|
const SEARCH_INPUT_SELECTORS = [
|
||||||
|
'input[data-e2e="search-input"]',
|
||||||
|
'input[data-e2e="searchbar-input"]',
|
||||||
|
'form[data-e2e="searchbar"] input',
|
||||||
|
'input[placeholder*="搜索"]'
|
||||||
|
];
|
||||||
|
const SEARCH_BTN_SELECTORS = [
|
||||||
|
'[data-e2e="search-button"]',
|
||||||
|
'button[data-e2e="search-button"]',
|
||||||
|
'span[data-e2e="search-button"]',
|
||||||
|
'button[data-e2e="searchbar-button"]',
|
||||||
|
'span.btn-title'
|
||||||
|
];
|
||||||
|
|
||||||
|
// 每个城市搜索时的自动下滑配置
|
||||||
|
const SCROLL_INTERVAL_MS = 2000;
|
||||||
|
const MAX_STABLE_COUNT = 6;
|
||||||
|
const MAX_SCROLLS_PER_CITY = 120;
|
||||||
|
const SCROLL_BY = 2200;
|
||||||
|
const WAIT_AFTER_SEARCH_MS = 1000;
|
||||||
|
const DELAY_BETWEEN_CITIES_MS = 1500;
|
||||||
|
|
||||||
|
// 断点续跑配置
|
||||||
|
const PROGRESS_STORAGE_KEY = 'dm_batch_progress_v1';
|
||||||
|
const DEVICE_ID_STORAGE_KEY = 'dm_batch_device_id_v1';
|
||||||
|
const PROGRESS_SYNC_ENABLED = true;
|
||||||
|
const PROGRESS_KEY = 'douyin_batch_default';
|
||||||
|
const PROGRESS_API = `${API_BASE}/api/layer/progress?server=1`;
|
||||||
|
|
||||||
|
// 可选:如果希望只发送包含手机号的条目,可在此启用并调整正则
|
||||||
|
const ONLY_SEND_IF_HAS_PHONE = false;
|
||||||
|
const PHONE_REGEX = /(?:\+?86)?1[3-9]\d{9}/g;
|
||||||
|
|
||||||
|
/********************* 运行时状态 *********************/
|
||||||
|
let areaList = [];
|
||||||
|
let stopFlag = false; // 由 UI 控制,true 表示停止整个任务
|
||||||
|
let skipCurrentCityFlag = false; // 由 UI 控制,true 表示跳过当前城市
|
||||||
|
let currentCityIndex = -1;
|
||||||
|
let currentAreaSignature = '';
|
||||||
|
let isLoopRunning = false;
|
||||||
|
let inputEl = null;
|
||||||
|
let btnEl = null;
|
||||||
|
const DEVICE_ID = getOrCreateDeviceId();
|
||||||
|
|
||||||
|
// 节流/去重发送
|
||||||
|
let lastSentHash = null;
|
||||||
|
let lastSentAt = 0;
|
||||||
|
const SEND_MIN_INTERVAL_MS = 800;
|
||||||
|
let progressSyncInFlight = false;
|
||||||
|
let progressSyncPendingPayload = null;
|
||||||
|
|
||||||
|
/********************* 工具函数 *********************/
|
||||||
|
// Namespaced console helpers so this userscript's output is easy to filter.
function log(...args) { console.log('[DouyinBatch] ', ...args); }
function err(...args) { console.error('[DouyinBatch] ', ...args); }
|
||||||
|
|
||||||
|
// 32-bit FNV-1a hash of a string (per UTF-16 code unit), returned as
// lowercase hex without zero-padding.
function hashString(str) {
    let acc = 2166136261 >>> 0;
    let idx = 0;
    while (idx < str.length) {
        acc = Math.imul(acc ^ str.charCodeAt(idx), 16777619) >>> 0;
        idx += 1;
    }
    return acc.toString(16);
}
|
||||||
|
|
||||||
|
// Resolve after `ms` milliseconds — awaitable delay.
function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}
|
||||||
|
|
||||||
|
// Return a stable per-browser device id, creating and persisting one in
// localStorage on first use. Falls back to a fresh (non-persisted) id when
// localStorage is unavailable (e.g. blocked storage access).
function getOrCreateDeviceId() {
    try {
        const old = localStorage.getItem(DEVICE_ID_STORAGE_KEY);
        if (old) return old;
        // Prefer crypto.randomUUID when present; otherwise timestamp+random.
        const generated = (window.crypto && typeof window.crypto.randomUUID === 'function')
            ? window.crypto.randomUUID()
            : `dm-${Date.now()}-${Math.random().toString(16).slice(2, 10)}`;
        localStorage.setItem(DEVICE_ID_STORAGE_KEY, generated);
        return generated;
    } catch (_) {
        return `dm-${Date.now()}-${Math.random().toString(16).slice(2, 10)}`;
    }
}
|
||||||
|
|
||||||
|
// Display name for an area row: prefer `city`, then `province`, then `name`;
// returns '' (trimmed) for non-object input or when all fields are empty.
function getAreaRowName(row) {
    if (!row || typeof row !== 'object') {
        return '';
    }
    const raw = row.city || row.province || row.name || '';
    return String(raw).trim();
}
|
||||||
|
|
||||||
|
// Fingerprint an area list (length + joined row names, hashed) so saved
// progress can be invalidated when the server's city list changes.
// Returns 'empty' for a missing/empty list, 'unknown' on any error.
function buildAreaSignature(list) {
    try {
        if (!Array.isArray(list) || list.length === 0) return 'empty';
        const names = list.map(getAreaRowName).filter(Boolean);
        return hashString(`${list.length}|${names.join('|')}`);
    } catch (e) {
        return 'unknown';
    }
}
|
||||||
|
|
||||||
|
// Read the saved batch progress object from localStorage.
// Returns null when absent, unparsable, or not a plain object.
function readProgress() {
    try {
        const raw = localStorage.getItem(PROGRESS_STORAGE_KEY);
        if (!raw) return null;
        const parsed = JSON.parse(raw);
        if (!parsed || typeof parsed !== 'object') return null;
        return parsed;
    } catch (_) {
        return null;
    }
}
|
||||||
|
|
||||||
|
// Build the progress payload sent to the Python service and mirrored locally.
// `nextCityIndex` is clamped to a non-negative integer; reads the module-level
// areaList / currentCityIndex / currentAreaSignature / stopFlag state.
function buildProgressPayload(nextCityIndex, reason = '') {
    const safeIndex = Number.isFinite(nextCityIndex) ? Math.max(0, Math.floor(nextCityIndex)) : 0;
    // Fall back to the city currently being processed when safeIndex is out of range.
    const currentArea = areaList[safeIndex] || areaList[Math.max(0, currentCityIndex)] || {};
    return {
        progress_key: PROGRESS_KEY,
        device_id: DEVICE_ID,
        next_city_index: safeIndex,
        area_signature: currentAreaSignature || '',
        area_total: Array.isArray(areaList) ? areaList.length : 0,
        current_city: getAreaRowName(currentArea),
        reason,
        status: stopFlag ? 'paused' : 'running',
        extra: {
            path: location.pathname || '',
            href: location.href || '',
        },
    };
}
|
||||||
|
|
||||||
|
function persistProgress(nextCityIndex, reason = '') {
|
||||||
|
try {
|
||||||
|
const payload = buildProgressPayload(nextCityIndex, reason);
|
||||||
|
localStorage.setItem(PROGRESS_STORAGE_KEY, JSON.stringify({
|
||||||
|
nextCityIndex: payload.next_city_index,
|
||||||
|
areaSignature: payload.area_signature,
|
||||||
|
reason: payload.reason,
|
||||||
|
updatedAt: Date.now(),
|
||||||
|
progressKey: payload.progress_key,
|
||||||
|
deviceId: payload.device_id,
|
||||||
|
}));
|
||||||
|
|
||||||
|
enqueueRemoteProgressSync(payload);
|
||||||
|
} catch (e) {
|
||||||
|
err('保存进度失败', e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function restoreProgress(areaSignature, listLength) {
|
||||||
|
const progress = readProgress();
|
||||||
|
if (!progress) return 0;
|
||||||
|
if (!progress.areaSignature || progress.areaSignature !== areaSignature) return 0;
|
||||||
|
const idx = Number.isFinite(progress.nextCityIndex) ? Math.floor(progress.nextCityIndex) : 0;
|
||||||
|
if (idx < 0 || idx >= listLength) return 0;
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
function clearProgress() {
|
||||||
|
try { localStorage.removeItem(PROGRESS_STORAGE_KEY); } catch (_) {}
|
||||||
|
enqueueRemoteProgressSync({
|
||||||
|
action: 'clear',
|
||||||
|
progress_key: PROGRESS_KEY,
|
||||||
|
device_id: DEVICE_ID,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// GET `url` through GM_xmlhttpRequest (bypasses page CORS) and resolve with the
// parsed JSON body. Rejects on network error or a non-JSON response.
function gmGetJson(url) {
  return new Promise((resolve, reject) => {
    GM_xmlhttpRequest({
      method: 'GET',
      url,
      onload(res) {
        try {
          const json = JSON.parse(res.responseText);
          resolve(json);
        } catch (e) {
          reject(e);
        }
      },
      onerror(err) { reject(err); }
    });
  });
}

// POST `data` as JSON to `url` through GM_xmlhttpRequest; resolves with the
// parsed response body (an empty object when the body is blank).
function gmPostJson(url, data) {
  return new Promise((resolve, reject) => {
    GM_xmlhttpRequest({
      method: 'POST',
      url,
      headers: { 'Content-Type': 'application/json' },
      data: JSON.stringify(data || {}),
      onload(res) {
        try {
          const json = JSON.parse(res.responseText || '{}');
          resolve(json);
        } catch (e) {
          reject(e);
        }
      },
      onerror(err) { reject(err); }
    });
  });
}
|
||||||
|
|
||||||
|
// Remember the latest payload and kick off a sync unless one is already in
// flight. Only the most recent payload is kept — intermediate checkpoints are
// deliberately coalesced.
function enqueueRemoteProgressSync(payload) {
  if (!PROGRESS_SYNC_ENABLED) return;
  if (!payload || typeof payload !== 'object') return;
  progressSyncPendingPayload = payload;
  if (progressSyncInFlight) return;
  flushRemoteProgressSync();
}

// Drain pending payloads one POST at a time. Stops on the first failure (the
// failed payload is dropped; the next persistProgress will re-enqueue state).
async function flushRemoteProgressSync() {
  if (!PROGRESS_SYNC_ENABLED) return;
  if (progressSyncInFlight) return;

  progressSyncInFlight = true;
  try {
    while (progressSyncPendingPayload) {
      const payload = progressSyncPendingPayload;
      progressSyncPendingPayload = null;
      try {
        await gmPostJson(PROGRESS_API, payload);
      } catch (e) {
        err('同步远端进度失败', e);
        break;
      }
    }
  } finally {
    progressSyncInFlight = false;
  }
}

// Fetch the remote checkpoint and return its next_city_index when the remote
// area signature matches the current list; returns 0 (start over) on any
// mismatch, bad index, or request failure.
async function restoreRemoteProgress(areaSignature, listLength) {
  if (!PROGRESS_SYNC_ENABLED) return 0;
  try {
    const url = `${PROGRESS_API}&progress_key=${encodeURIComponent(PROGRESS_KEY)}`;
    const response = await gmGetJson(url);
    const data = response && response.data ? response.data : null;
    if (!data || typeof data !== 'object') return 0;

    const remoteSignature = String(data.area_signature || '');
    if (!remoteSignature || remoteSignature !== areaSignature) return 0;

    const idxRaw = data.next_city_index;
    // The API may return the index as a number or a numeric string.
    const idx = Number.isFinite(idxRaw) ? Math.floor(idxRaw) : Math.floor(Number(idxRaw || 0));
    if (!Number.isFinite(idx) || idx < 0 || idx >= listLength) return 0;
    return idx;
  } catch (e) {
    err('读取远端进度失败', e);
    return 0;
  }
}
|
||||||
|
|
||||||
|
// Assign an input's value through the native prototype setter so that
// framework-managed inputs (React/Vue value tracking) register the change as a
// real edit instead of a silent property mutation.
function setNativeValue(el, value) {
  if (!el) return;
  const prototype = el.constructor && el.constructor.prototype ? el.constructor.prototype : window.HTMLInputElement && window.HTMLInputElement.prototype;
  const descriptor = prototype ? Object.getOwnPropertyDescriptor(prototype, 'value') : null;
  if (descriptor && descriptor.set) {
    descriptor.set.call(el, value);
  } else {
    el.value = value;
  }
}

// Type `keyword` into the search input, dispatching the focus/input/change/blur
// event sequence a real user interaction would produce, then settle for 80 ms.
// Errors are logged and swallowed so the batch loop can continue.
async function simulateSearchInput(keyword) {
  if (!inputEl) return;
  try {
    inputEl.focus();
    inputEl.dispatchEvent(new Event('focus', { bubbles: false }));

    // Clear any old value first, firing the matching delete events.
    if (inputEl.value) {
      setNativeValue(inputEl, '');
      if (typeof InputEvent === 'function') {
        inputEl.dispatchEvent(new InputEvent('input', { bubbles: true, inputType: 'deleteContentBackward', data: '' }));
      } else {
        inputEl.dispatchEvent(new Event('input', { bubbles: true }));
      }
    }

    setNativeValue(inputEl, keyword);
    if (typeof InputEvent === 'function') {
      inputEl.dispatchEvent(new InputEvent('beforeinput', { bubbles: true, inputType: 'insertText', data: keyword }));
      inputEl.dispatchEvent(new InputEvent('input', { bubbles: true, inputType: 'insertText', data: keyword }));
    } else {
      // Legacy fallback when the InputEvent constructor is unavailable.
      inputEl.dispatchEvent(new Event('input', { bubbles: true }));
    }
    inputEl.dispatchEvent(new Event('change', { bubbles: true }));
    inputEl.dispatchEvent(new Event('blur', { bubbles: false }));
  } catch (e) {
    err('simulateSearchInput error', e);
  }
  // Give the page's handlers a beat to react before triggering the search.
  await new Promise(r => setTimeout(r, 80));
}
|
||||||
|
|
||||||
|
// Fire the search: prefer a full mousedown/mouseup/click sequence on the search
// button; fall back to a synthetic Enter keydown/keypress/keyup on the input.
// Returns true when either path dispatched without throwing.
function simulateSearchTrigger() {
  let triggered = false;
  if (btnEl && btnEl.isConnected) {
    try {
      btnEl.focus();
      btnEl.dispatchEvent(new MouseEvent('mousedown', { bubbles: true, cancelable: true, view: window }));
      btnEl.dispatchEvent(new MouseEvent('mouseup', { bubbles: true, cancelable: true, view: window }));
      btnEl.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true, view: window }));
      triggered = true;
    } catch (e) {
      err('simulateSearchTrigger click error', e);
    }
  }

  if (!triggered && inputEl) {
    try {
      // keyCode/which are deprecated but still checked by some page handlers.
      const opts = { bubbles: true, cancelable: true, key: 'Enter', code: 'Enter', keyCode: 13, which: 13 };
      inputEl.dispatchEvent(new KeyboardEvent('keydown', opts));
      inputEl.dispatchEvent(new KeyboardEvent('keypress', opts));
      inputEl.dispatchEvent(new KeyboardEvent('keyup', opts));
      triggered = true;
    } catch (e) {
      err('Enter 触发搜索失败', e);
    }
  }

  return triggered;
}
|
||||||
|
|
||||||
|
// POST captured data to every configured endpoint. Skips payloads without a
// phone number (when ONLY_SEND_IF_HAS_PHONE), and de-duplicates: an identical
// body is not re-sent within SEND_MIN_INTERVAL_MS. Never throws.
function sendToTargets(data) {
  try {
    const body = typeof data === 'string' ? data : JSON.stringify(data);
    if (ONLY_SEND_IF_HAS_PHONE) {
      if (!PHONE_REGEX.test(body)) {
        // No phone number matched — skip sending this payload.
        return;
      }
    }
    const hash = hashString(body);
    const now = Date.now();
    if (hash === lastSentHash && now - lastSentAt < SEND_MIN_INTERVAL_MS) {
      return;
    }
    lastSentHash = hash;
    lastSentAt = now;

    // Fire-and-forget to each target; per-target results are only logged.
    for (const target of SEND_TARGETS) {
      GM_xmlhttpRequest({
        method: 'POST',
        url: target,
        headers: { 'Content-Type': 'application/json' },
        data: body,
        onload(res) { log(`sent -> ${target}, status: ${res.status}`); },
        onerror(e) { err(`send error to ${target}`, e); }
      });
    }
  } catch (e) {
    err('sendToTargets error', e);
  }
}
|
||||||
|
|
||||||
|
/********************* Intercept fetch & XHR (capture target API responses) *********************/
// Search-result API path on the host site; responses hitting it are captured.
const TARGET_PATH = '/aweme/v1/web/discover/search/';

// Wrap window.fetch: for requests whose URL contains TARGET_PATH, clone the
// response, parse it as JSON off to the side, and forward it to the collector.
// The original response is always returned untouched to the page.
(function interceptFetch() {
  if (!window.fetch) return;
  const orig = window.fetch.bind(window);
  window.fetch = function (...args) {
    try {
      const resource = args[0];
      // First arg may be a URL string or a Request object.
      const url = (typeof resource === 'string') ? resource : (resource && resource.url) ? resource.url : '';
      if (url && url.includes(TARGET_PATH)) {
        return orig(...args).then((response) => {
          try {
            const cloned = response.clone();
            cloned.json().then((json) => {
              if (json && typeof json === 'object') {
                sendToTargets({ source: 'fetch', url, data: json, ts: Date.now(), cityIndex: currentCityIndex });
              }
            }).catch(()=>{});
          } catch (e) { /* ignore */ }
          return response;
        });
      }
    } catch (e) { err('fetch wrapper error', e); }
    return orig(...args);
  };
})();

// Wrap XMLHttpRequest: remember the URL at open(), and on completion of a
// TARGET_PATH request parse the body and forward it to the collector.
(function interceptXHR() {
  const XHR = window.XMLHttpRequest;
  if (!XHR) return;
  const origOpen = XHR.prototype.open;
  const origSend = XHR.prototype.send;

  XHR.prototype.open = function (method, url, ...rest) {
    try { this.__dm_url = (typeof url === 'string') ? url : ''; } catch(e){}
    return origOpen.apply(this, [method, url, ...rest]);
  };

  XHR.prototype.send = function (body) {
    try {
      const targetUrl = this.__dm_url || '';
      if (targetUrl && targetUrl.includes(TARGET_PATH)) {
        this.addEventListener('readystatechange', function () {
          if (this.readyState === 4) {
            try {
              const text = this.responseText;
              if (!text) return;
              try {
                const json = JSON.parse(text);
                sendToTargets({ source: 'xhr', url: targetUrl, data: json, ts: Date.now(), cityIndex: currentCityIndex });
              } catch (err) {
                // NOTE: this `err` shadows the outer err() logger — non-JSON bodies are ignored.
              }
            } catch (e) { /* ignore */ }
          }
        });
      }
    } catch (e) { err('XHR wrapper error', e); }
    return origSend.apply(this, [body]);
  };
})();
|
||||||
|
|
||||||
|
/********************* Auto-scroll (one search round) *********************/
// Keep scrolling the page until its height stops growing (MAX_STABLE_COUNT
// consecutive unchanged measurements), a scroll cap is reached, or a stop/skip
// flag is raised. `statusNode` receives human-readable progress text.
async function autoScrollUntilStable(statusNode, maxScrolls = MAX_SCROLLS_PER_CITY) {
  let lastHeight = -1;
  let stableCount = 0;
  let scrolls = 0;

  while (!stopFlag) {
    if (skipCurrentCityFlag) {
      statusNode.textContent = '收到跳过指令,结束当前地区滚动。';
      break;
    }

    scrolls++;
    if (scrolls > maxScrolls) {
      statusNode.textContent = `达到单次搜索最大滚动 ${maxScrolls},停止本次自动下滑。`;
      break;
    }

    // Perform one scroll step; fall back to a jump-to-bottom when smooth
    // scrolling is unsupported.
    try {
      window.scrollBy({ top: SCROLL_BY, left: 0, behavior: 'smooth' });
    } catch (e) {
      window.scrollTo(0, (document.body.scrollHeight || document.documentElement.scrollHeight));
    }

    await sleep(SCROLL_INTERVAL_MS);

    // Re-check the skip flag after the wait — it may have been set mid-sleep.
    if (skipCurrentCityFlag) {
      statusNode.textContent = '收到跳过指令,结束当前地区滚动。';
      break;
    }

    const curHeight = document.body.scrollHeight || document.documentElement.scrollHeight || 0;
    if (curHeight === lastHeight) {
      stableCount++;
    } else {
      stableCount = 0;
      lastHeight = curHeight;
    }

    statusNode.textContent = `滚动次数: ${scrolls}, 稳定计数: ${stableCount}/${MAX_STABLE_COUNT}`;

    if (stableCount >= MAX_STABLE_COUNT) {
      statusNode.textContent = `页面高度稳定 (${stableCount}), 本次搜索加载结束。`;
      break;
    }
  }
}
|
||||||
|
|
||||||
|
/********************* DOM helper: wait for an element to appear *********************/
// Resolve with the first element matching any of the given selector(s), using a
// MutationObserver after an immediate check. Rejects with a timeout Error when
// nothing appears within `timeout` ms. `selector` may be a string or an array.
function waitForSelector(selector, timeout = 10000) {
  const selectors = Array.isArray(selector) ? selector.filter(Boolean) : [selector];
  return new Promise((resolve, reject) => {
    let timer;
    const root = document.documentElement || document.body;

    // Disconnect the observer and cancel the timeout — safe to call twice.
    const cleanup = (observer) => {
      try { observer && observer.disconnect(); } catch (_) {}
      if (timer) clearTimeout(timer);
    };

    // Return the first match across all selectors, or null. Invalid selectors
    // are logged and skipped rather than aborting the whole wait.
    const pick = () => {
      for (const sel of selectors) {
        if (!sel) continue;
        try {
          const found = document.querySelector(sel);
          if (found) {
            return found;
          }
        } catch (e) {
          err('query selector error', sel, e);
        }
      }
      return null;
    };

    const immediate = pick();
    if (immediate) {
      return resolve(immediate);
    }

    const observer = new MutationObserver(() => {
      const node = pick();
      if (node) {
        cleanup(observer);
        resolve(node);
      }
    });

    if (root) {
      observer.observe(root, { childList: true, subtree: true });
    }

    timer = setTimeout(() => {
      cleanup(observer);
      reject(new Error('timeout waiting for ' + selectors.join(', ')));
    }, timeout);
  });
}
|
||||||
|
|
||||||
|
// Make sure the module-level `inputEl`/`btnEl` references point at live DOM
// nodes, re-querying when they have been detached (SPA re-renders). The button
// is optional — Enter-key triggering is the fallback — but a missing input is
// fatal. Returns { inputEl, btnEl }; throws when no input can be located.
async function ensureSearchControls(statusNode) {
  // True when the node is still attached to the document.
  const isConnected = (node) => {
    if (!node) return false;
    try {
      if (node.isConnected !== undefined) return node.isConnected;
      return document.contains(node);
    } catch (_) {
      return false;
    }
  };

  // Drop stale references so they get re-resolved below.
  if (!isConnected(inputEl)) inputEl = null;
  if (!isConnected(btnEl)) btnEl = null;

  if (!inputEl) {
    statusNode && (statusNode.textContent = '等待搜索输入框可用...');
    inputEl = await waitForSelector(SEARCH_INPUT_SELECTORS, 10000);
  }

  if (!btnEl) {
    try {
      statusNode && (statusNode.textContent = '等待搜索按钮可用...');
      btnEl = await waitForSelector(SEARCH_BTN_SELECTORS, 8000);
      // Selectors may match an icon/span inside the button — climb to it.
      if (btnEl && btnEl.tagName !== 'BUTTON') {
        const maybeButton = btnEl.closest('button');
        if (maybeButton) btnEl = maybeButton;
      }
    } catch (e) {
      btnEl = null;
      err('未找到搜索按钮,将使用 Enter 键进行触发。');
    }
  }

  if (!inputEl) {
    throw new Error('未定位到搜索输入框');
  }

  return { inputEl, btnEl };
}
|
||||||
|
|
||||||
|
/********************* Control UI (bottom-right corner) *********************/
// Build the three fixed-position widgets: a start/stop toggle, a skip-current
// button (right-click clears the checkpoint), and a status line. Returns
// { btn, skipBtn, status } for the main loop to drive.
function createUI() {
  const css = `
#dm-batch-btn { position: fixed; right: 12px; bottom: 12px; z-index:999999; background: rgba(0,0,0,0.65); color:#fff;
padding:8px 10px; border-radius:8px; font-size:13px; cursor:pointer; user-select:none;}
#dm-batch-skip { position: fixed; right:12px; bottom:50px; z-index:999999; background: rgba(30,30,30,0.72); color:#fff;
padding:7px 10px; border-radius:8px; font-size:12px; cursor:pointer; user-select:none;}
#dm-batch-status { position: fixed; right:12px; bottom:88px; z-index:999999; background: rgba(0,0,0,0.45); color:#fff;
padding:6px 8px; border-radius:6px; font-size:12px; max-width:320px; word-break:break-word;}
`;
  const s = document.createElement('style'); s.textContent = css; document.head && document.head.appendChild(s);

  const btn = document.createElement('div');
  btn.id = 'dm-batch-btn';
  btn.textContent = 'BatchSearch:停止';
  // dataset.running: '1' = running, '0' = paused.
  btn.dataset.running = '1';
  document.body.appendChild(btn);

  const skipBtn = document.createElement('div');
  skipBtn.id = 'dm-batch-skip';
  skipBtn.textContent = 'BatchSearch:跳过当前';
  document.body.appendChild(skipBtn);

  const status = document.createElement('div');
  status.id = 'dm-batch-status';
  status.textContent = '准备中...';
  document.body.appendChild(status);

  // Toggle run/pause. Pausing saves a checkpoint; resuming restarts the loop.
  btn.addEventListener('click', () => {
    const running = btn.dataset.running === '1';
    btn.dataset.running = running ? '0' : '1';
    btn.textContent = running ? 'BatchSearch:已停止' : 'BatchSearch:停止';
    status.textContent = running ? '已手动停止(已保存断点)' : '已启动';
    stopFlag = running; // if was running and clicked -> set stopFlag true; if restarting, set false
    if (running) {
      skipCurrentCityFlag = false;
      persistProgress(Math.max(currentCityIndex, 0), 'manual_pause');
    }
    if (!stopFlag) {
      // restart loop if needed
      runBatchSearchLoop(status).catch(e => err(e));
    }
  });

  // Left-click: skip the city currently being processed.
  skipBtn.addEventListener('click', () => {
    if (currentCityIndex < 0) {
      status.textContent = '当前还未开始处理城市,稍后再跳过。';
      return;
    }
    skipCurrentCityFlag = true;
    const areaName = getAreaRowName(areaList[currentCityIndex] || {});
    status.textContent = `收到跳过指令:${areaName || `索引${currentCityIndex}`}`;
  });

  // Right-click on the skip button: clear the saved checkpoint entirely.
  skipBtn.addEventListener('contextmenu', (event) => {
    event.preventDefault();
    clearProgress();
    currentCityIndex = 0;
    status.textContent = '已清除断点。下次将从第 1 个地区开始。';
  });

  return { btn, skipBtn, status };
}
|
||||||
|
|
||||||
|
/********************* Main flow: fetch cities and loop the searches *********************/
// Drive the whole batch: fetch the area list once, resolve the resume index
// from local and remote checkpoints (the larger wins), then for each city:
// type "<city>律师", trigger the search, auto-scroll until results stop
// loading, and persist progress after every step. Re-entrancy is guarded by
// `isLoopRunning`; `stopFlag`/`skipCurrentCityFlag` are polled throughout.
async function runBatchSearchLoop(statusNode) {
  if (isLoopRunning) {
    statusNode.textContent = '批量任务已在运行中,请勿重复启动。';
    return;
  }

  isLoopRunning = true;
  try {
    // Sync stopFlag with the UI toggle's current state.
    stopFlag = (document.getElementById('dm-batch-btn') && document.getElementById('dm-batch-btn').dataset.running === '0');
    skipCurrentCityFlag = false;

    if (stopFlag) {
      statusNode.textContent = '当前是暂停状态,点击“BatchSearch:停止”可继续。';
      return;
    }

    // Fetch the area list (only when not already cached in memory).
    if (!areaList || !Array.isArray(areaList) || areaList.length === 0) {
      statusNode.textContent = '正在获取城市列表...';
      try {
        const data = await gmGetJson(AREA_API);
        // The API may return either a bare array or { data: [...] }.
        const normalizedAreaList = Array.isArray(data)
          ? data
          : (data && Array.isArray(data.data) ? data.data : []);

        if (normalizedAreaList.length > 0) {
          areaList = normalizedAreaList;
          log('获取城市列表数量:', areaList.length);
          statusNode.textContent = `获取到 ${areaList.length} 个城市,准备开始循环。`;
        } else {
          err('area API returned not array', data);
          statusNode.textContent = '获取城市列表失败(返回格式异常)';
          return;
        }
      } catch (e) {
        err('获取城市列表失败', e);
        statusNode.textContent = '获取城市列表失败: ' + e.message;
        return;
      }
    }

    // Resolve the resume point: an in-memory index takes precedence; otherwise
    // the further-along of the local vs remote checkpoints.
    currentAreaSignature = buildAreaSignature(areaList);
    const restoredIndexLocal = restoreProgress(currentAreaSignature, areaList.length);
    const restoredIndexRemote = await restoreRemoteProgress(currentAreaSignature, areaList.length);
    const restoredIndex = Math.max(restoredIndexLocal, restoredIndexRemote);
    const startIndex = (currentCityIndex >= 0 && currentCityIndex < areaList.length)
      ? currentCityIndex
      : restoredIndex;
    currentCityIndex = startIndex;

    if (startIndex > 0) {
      statusNode.textContent = `检测到断点(本地:${restoredIndexLocal + 1} 远端:${restoredIndexRemote + 1}),将从第 ${startIndex + 1}/${areaList.length} 个地区继续。`;
      await sleep(500);
    }

    // Wait for the search input/button to become available.
    try {
      await ensureSearchControls(statusNode);
    } catch (e) {
      err('未找到搜索输入或按钮', e);
      statusNode.textContent = '未找到搜索输入或按钮,脚本仍会监听接口,但无法自动搜索。';
      return;
    }

    let completedAll = true;

    // Main loop: for each city, search -> scroll -> (results are sent by the
    // fetch/XHR interceptors) -> next city.
    for (let i = startIndex; i < areaList.length; i++) {
      if (stopFlag) {
        completedAll = false;
        persistProgress(i, 'manual_stop');
        statusNode.textContent = '已停止(断点已保存)。';
        break;
      }

      currentCityIndex = i;
      skipCurrentCityFlag = false;
      persistProgress(i, 'start_city');

      const city = (areaList[i].city || areaList[i].province || '').trim();
      if (!city) {
        // Nameless row — record it as done and move on.
        persistProgress(i + 1, 'empty_city');
        continue;
      }

      const keyword = `${city}律师`;
      statusNode.textContent = `正在搜索:${keyword} (${i+1}/${areaList.length})`;
      log(`开始城市[${i+1}/${areaList.length}] 搜索:`, keyword);

      // Re-resolve the controls in case the SPA re-rendered them.
      try {
        await ensureSearchControls(statusNode);
      } catch (e) {
        err('刷新搜索控件失败', e);
        statusNode.textContent = '刷新搜索控件失败,终止批量搜索。';
        completedAll = false;
        persistProgress(i, 'search_control_error');
        break;
      }

      // Type the keyword (fires the full input-event sequence).
      await simulateSearchInput(keyword);

      // Trigger the search; on failure, force-refresh the button and retry once.
      const triggered = simulateSearchTrigger();
      if (!triggered) {
        statusNode.textContent = '搜索触发失败,尝试刷新控件...';
        btnEl = null;
        await ensureSearchControls(statusNode);
        if (!simulateSearchTrigger()) {
          statusNode.textContent = '搜索触发失败,终止批量搜索。';
          completedAll = false;
          persistProgress(i, 'search_trigger_error');
          break;
        }
      }

      // Give the results a moment to start loading.
      await new Promise(r => setTimeout(r, WAIT_AFTER_SEARCH_MS));

      // Auto-scroll until the page height stabilizes or the cap is reached.
      await autoScrollUntilStable(statusNode, MAX_SCROLLS_PER_CITY);

      if (skipCurrentCityFlag) {
        skipCurrentCityFlag = false;
        persistProgress(i + 1, 'skip_city');
        statusNode.textContent = `已跳过 ${keyword},继续下一个地区...`;
        await sleep(Math.min(DELAY_BETWEEN_CITIES_MS, 800));
        continue;
      }

      if (stopFlag) {
        completedAll = false;
        persistProgress(i, 'manual_stop_after_scroll');
        statusNode.textContent = '已停止(断点已保存)。';
        break;
      }

      persistProgress(i + 1, 'city_done');

      // Short pause before moving on to the next city.
      statusNode.textContent = `完成 ${keyword} 的加载,等待 ${DELAY_BETWEEN_CITIES_MS} ms 后继续...`;
      await sleep(DELAY_BETWEEN_CITIES_MS);
    }

    if (completedAll && !stopFlag) {
      clearProgress();
      currentCityIndex = -1;
      statusNode.textContent = '批量搜索完成,已清除断点进度。';
      log('批量搜索循环结束: completed');
    } else {
      log('批量搜索循环结束: paused/broken');
    }
  } catch (e) {
    err('runBatchSearchLoop error', e);
    persistProgress(Math.max(currentCityIndex, 0), 'loop_exception');
  } finally {
    isLoopRunning = false;
  }
}
|
||||||
|
|
||||||
|
/********************* Bootstrap *********************/
// Wire up the checkpoint-on-unload handler, build the control UI, and
// auto-start the batch loop when the current page looks like a search page;
// otherwise wait for a manual start via the UI button.
(function init() {
  // Save a checkpoint when the tab is closed/refreshed mid-run.
  window.addEventListener('beforeunload', () => {
    if (currentCityIndex >= 0) {
      persistProgress(Math.max(currentCityIndex, 0), 'page_unload');
    }
  });

  const ui = createUI();
  ui.status.textContent = '就绪 - 可暂停/跳过,自动保存断点(右键跳过按钮可清除断点)';
  console.log(location.pathname)
  // Auto-start on the target search page (/jingxuan/search/); on any other
  // page the script stays idle but can still be started manually.
  const isAutoPage = location.pathname && location.pathname.indexOf('/search/') !== -1;
  if (isAutoPage) {
    ui.status.textContent = '检测到 /jingxuan/search/ 页面,准备开始批量搜索...';
    // Give the page a moment to load its scripts and DOM first.
    setTimeout(() => {
      runBatchSearchLoop(ui.status).catch(e => err(e));
    }, 800);
  } else {
    // Not the target page — the toggle button still allows a manual start.
    ui.status.textContent = '非 /jingxuan/search/ 页面。导航至该页面或手动控制开始。';
  }
})();
|
||||||
|
|
||||||
|
})();
|
||||||
|
|
||||||
Binary file not shown.
@@ -0,0 +1,411 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
from typing import Dict, List, Optional, Set, Tuple
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
if project_root not in sys.path:
|
||||||
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
from Db import Db
|
||||||
|
|
||||||
|
# Identifier for this one-off crawl, used when tagging scraped records.
SITE_NAME = "zhongfali_group80"
# Legacy source label ("Zhongfali single page") — presumably kept for
# compatibility with existing records; TODO confirm against the writer code.
LEGACY_DOMAIN = "众法利单页"
# Entry page: listing for group 80 on the mobile site.
START_URL = "http://m.zhongfali.com/pg.jsp?groupId=80&pgt=0&pgs=1"
# Default JSONL output path for all scraped records.
DEFAULT_OUTPUT = "/www/wwwroot/lawyers/data/one_off_sites/zhongfali_records_all.jsonl"

# Local SOCKS5 proxy that curl routes requests through (see fetch_html).
SOCKS_PROXY = "127.0.0.1:7891"
# Clash external-controller endpoint and API secret, overridable via env vars.
CLASH_CONTROLLER = os.environ.get("CLASH_CONTROLLER", "http://127.0.0.1:9090")
CLASH_SECRET = os.environ.get("CLASH_SECRET", "")
# Mainland-China mobile number: '1', second digit 3-9, then 9 more digits.
PHONE_RE = re.compile(r"1[3-9]\d{9}")
# Captures the JSON object assigned to window.__INITIAL_STATE__, up to the
# closing </script>; DOTALL so the blob may span multiple lines.
INITIAL_STATE_RE = re.compile(r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})</script>", re.S)
|
||||||
|
|
||||||
|
|
||||||
|
class ProxyRotator:
    """Round-robin rotation of Clash proxy nodes via its external controller.

    ``initialize()`` switches Clash into ``global`` mode, enumerates concrete
    exit nodes (skipping built-in policies and selector groups), and pins the
    first one.  ``rotate()`` is then called between failed fetches to move to
    the next node.  Without an API secret the rotator is a no-op.
    """

    def __init__(self, controller: str, secret: str):
        # Controller base URL with any trailing slash removed.
        self.controller = controller.rstrip("/")
        # Bearer token; an empty string disables initialization (and rotation).
        self.secret = secret.strip()
        self.nodes: List[str] = []
        self.index = 0

    def _api(self, path: str, method: str = "GET", payload: Optional[Dict] = None) -> Dict:
        """Call the controller REST API; return the decoded JSON body ({} when empty).

        Raises whatever ``urllib`` raises on network/HTTP errors — callers
        handle that.
        """
        headers = {}
        if self.secret:
            headers["Authorization"] = f"Bearer {self.secret}"
        body = None
        if payload is not None:
            headers["Content-Type"] = "application/json"
            body = json.dumps(payload).encode("utf-8")
        req = urllib.request.Request(
            f"{self.controller}{path}",
            data=body,
            headers=headers,
            method=method,
        )
        with urllib.request.urlopen(req, timeout=10) as resp:
            raw = resp.read().decode("utf-8", errors="ignore")
            return json.loads(raw) if raw else {}

    def initialize(self) -> None:
        """Discover rotatable nodes and select the first one.

        Any failure is logged and leaves ``nodes`` empty, which silently
        disables rotation instead of aborting the crawl.
        """
        if not self.secret:
            return
        try:
            # Global mode makes the GLOBAL selector control all outbound traffic.
            self._api("/configs", method="PATCH", payload={"mode": "global"})
            proxy_data = self._api("/proxies")
            proxies = proxy_data.get("proxies", {}) or {}
            # Built-in policies plus this config's selector/service groups —
            # none of these are real exit nodes.
            skip = {
                "GLOBAL",
                "DIRECT",
                "REJECT",
                "REJECT-DROP",
                "PASS",
                "COMPATIBLE",
                "🔰 选择节点",
                "☁️ OneDrive",
                "🐟 漏网之鱼",
                "🎯 全球直连",
                "🛑 拦截广告",
                "🌍 爱奇艺&哔哩哔哩",
                "🎮 Steam 登录/下载",
                "🎮 Steam 商店/社区",
                "🌩️ Cloudflare",
                "🎬 动画疯",
                "🎓学术网站",
                "🇨🇳 国内网站",
            }
            self.nodes = [
                name
                for name, info in proxies.items()
                if name not in skip and isinstance(info, dict)
                and info.get("type") not in {"Selector", "URLTest", "Fallback", "LoadBalance"}
            ]
            if self.nodes:
                self.switch_to(self.nodes[0])
        except Exception as exc:
            print(f"[proxy] rotator init failed: {exc}")
            self.nodes = []

    def switch_to(self, node_name: str) -> None:
        """Point Clash's GLOBAL selector at *node_name*."""
        self._api("/proxies/GLOBAL", method="PUT", payload={"name": node_name})

    def rotate(self) -> None:
        """Advance to the next node, wrapping around; no-op when none discovered."""
        if not self.nodes:
            return
        self.index = (self.index + 1) % len(self.nodes)
        node = self.nodes[self.index]
        self.switch_to(node)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_phone(value: str) -> str:
    """Return the first mainland-CN mobile number found in *value*, or ''.

    All non-digit characters are stripped first, so formatted numbers like
    ``138-1234-5678`` are recognized.  ``None`` and empty input yield ''.
    """
    digits_only = "".join(filter(str.isdigit, str(value or "")))
    # Same pattern as the module-level PHONE_RE: 1 + [3-9] + nine digits.
    found = re.search(r"1[3-9]\d{9}", digits_only)
    if found is None:
        return ""
    return found.group(0)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_html(
    url: str,
    rotator: Optional[ProxyRotator] = None,
    max_retries: int = 6,
    timeout_seconds: int = 18,
) -> str:
    """Fetch *url* through the SOCKS proxy using curl, retrying on failure.

    Shells out to ``curl --socks5-hostname`` (DNS resolved proxy-side) with
    ``-L --compressed`` and a ``-w`` trailer of ``\\n__CODE__:<http_code>`` so
    the status code can be split off the body.  After each failed attempt the
    Clash node is rotated (when a rotator with nodes is supplied) and the
    sleep between attempts grows linearly (0.6 s, 1.2 s, ...).

    Returns the response body text.
    Raises RuntimeError with the last error when every attempt fails.
    """
    last_error = ""
    for attempt in range(max_retries):
        cmd = [
            "curl",
            "-sS",
            "--socks5-hostname",
            SOCKS_PROXY,
            "-L",
            "--compressed",
            "--max-time",
            str(timeout_seconds),
            "-w",
            "\n__CODE__:%{http_code}",
            url,
        ]
        proc = subprocess.run(cmd, capture_output=True)
        if proc.returncode == 0:
            raw = proc.stdout.decode("utf-8", errors="ignore")
            # Split the body from the status-code trailer appended via -w.
            marker = "\n__CODE__:"
            split_at = raw.rfind(marker)
            if split_at != -1:
                text = raw[:split_at]
                code_text = raw[split_at + len(marker):].strip()
            else:
                text = raw
                code_text = ""
            # Without a parsed status code, a non-empty body counts as success.
            code_ok = code_text == "200" if code_text else bool(text)
            if text and code_ok:
                return text
            last_error = "empty body"
        else:
            last_error = proc.stderr.decode("utf-8", errors="ignore").strip() or f"exit={proc.returncode}"
        # Move to another exit node before retrying (best effort).
        if rotator and rotator.nodes:
            try:
                rotator.rotate()
            except Exception as exc:
                last_error = f"{last_error}; rotate failed: {exc}"
        if attempt < max_retries - 1:
            time.sleep(0.6 * (attempt + 1))
    raise RuntimeError(f"fetch failed: {url}, reason={last_error}")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_initial_state(html: str) -> Dict:
    """Decode the JSON blob assigned to ``window.__INITIAL_STATE__`` in *html*.

    Raises ValueError when the marker is absent; JSON decoding errors from
    ``json.loads`` propagate unchanged.
    """
    # Same pattern as the module-level INITIAL_STATE_RE constant.
    found = re.search(
        r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})</script>",
        html,
        re.S,
    )
    if found is None:
        raise ValueError("window.__INITIAL_STATE__ not found")
    return json.loads(found.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_group_urls_from_group80(state: Dict) -> List[str]:
    """Collect the unique group URLs nested under secondGroupMap["80"].

    Walks module "21" of the page state, taking each second-level row's own
    URL plus every URL in its ``thirdGroupList``.  Blank/missing URLs are
    dropped; the result is sorted for determinism.
    """
    page_modules = state.get("currentPageModuleIdMap") or {}
    module = page_modules.get("21") or {}
    ext_info = module.get("extInfo", {}) or {}
    rows = (ext_info.get("secondGroupMap", {}) or {}).get("80") or []

    collected: Set[str] = set()
    for row in rows:
        candidates = [row.get("url")]
        candidates.extend(child.get("url") for child in (row.get("thirdGroupList") or []))
        for raw in candidates:
            cleaned = str(raw or "").strip()
            if cleaned:
                collected.add(cleaned)
    return sorted(collected)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_detail_urls_from_group_html(html: str) -> Set[str]:
    """Find every lawyer detail-page id (``h-pd-<id>.html``) in *html* and
    return the corresponding absolute mobile-site URLs."""
    urls: Set[str] = set()
    for match in re.finditer(r"h-pd-(\d+)\.html", html):
        urls.add("http://m.zhongfali.com/h-pd-%s.html" % match.group(1))
    return urls
|
||||||
|
|
||||||
|
|
||||||
|
def parse_location_and_name(product_name: str) -> Tuple[str, str, str]:
    """Heuristically split a product title into (province, city, lawyer name).

    Each component is matched independently by regex and is "" when absent.
    The city pattern skips an optional leading province so "广东省深圳市"
    yields city "深圳市" rather than the whole span.
    """
    text = re.sub(r"\s+", " ", str(product_name or "")).strip()

    def first_group(pattern: str) -> str:
        hit = re.search(pattern, text)
        return hit.group(1) if hit else ""

    province = first_group(r"([\u4e00-\u9fa5]{2,}省)")
    city = first_group(r"(?:[\u4e00-\u9fa5]+省)?([\u4e00-\u9fa5]+(?:市|区|县|州|盟))")
    name = first_group(r"([\u4e00-\u9fa5]{2,4})\s*律师")
    return province, city, name
|
||||||
|
|
||||||
|
|
||||||
|
def parse_detail_record(detail_url: str, html: str, source_list_url: str) -> Optional[Dict]:
    """Parse one lawyer detail page into a normalized record dict.

    Returns None when the page has no productInfo module or no valid
    mobile number ("material" field); the phone is the de-facto dedup key
    downstream, so records without one are useless.
    """
    state = parse_initial_state(html)

    # The product data lives in whichever page module carries extInfo.productInfo.
    module = None
    for mod in (state.get("currentPageModuleIdMap") or {}).values():
        if isinstance(mod, dict) and (mod.get("extInfo") or {}).get("productInfo"):
            module = mod
            break
    if not module:
        return None

    ext_info = module.get("extInfo", {}) or {}
    product_info = ext_info.get("productInfo", {}) or {}

    # "material" holds the contact phone on this site.
    phone = normalize_phone(product_info.get("material", ""))
    if not phone:
        return None

    product_name = str(product_info.get("name") or "").strip()
    province, city, lawyer_name = parse_location_and_name(product_name)
    law_firm = str(product_info.get("prop0") or "").strip()

    # Fall back to the raw product title when no name could be extracted.
    if not lawyer_name:
        lawyer_name = product_name

    now = int(time.time())
    # Stable id derived from the detail URL so re-runs produce the same key.
    record_id = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
    return {
        "record_id": record_id,
        "collected_at": now,
        "source": {
            "site": SITE_NAME,
            "list_url": source_list_url,
            "detail_url": detail_url,
            "province": province,
            "province_py": "",
            "city": city,
            "city_py": "",
            "page": 1,
        },
        "list_snapshot": {
            "name": lawyer_name,
            "law_firm": law_firm,
            "specialties": [],
            "answer_count": None,
        },
        "profile": {
            "name": lawyer_name,
            "law_firm": law_firm,
            "phone": phone,
            # prop1/prop3 are mapped to license number / address here —
            # NOTE(review): field meaning inferred from these assignments
            # only; confirm against a live page.
            "license_no": str(product_info.get("prop1") or "").strip(),
            "practice_years": None,
            "email": "",
            "address": str(product_info.get("prop3") or "").strip(),
            "specialties": [],
        },
        "raw": {
            "product_name": product_name,
            "group_ids": product_info.get("groupIdList") or [],
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
def to_legacy_row(record: Dict) -> Optional[Dict[str, str]]:
    """Flatten a collected record into a legacy `lawyer` table row.

    Returns None when the record carries no valid mobile number, since the
    phone column is the de-facto key downstream.
    """
    profile = record.get("profile", {}) or {}
    source = record.get("source", {}) or {}

    phone = normalize_phone(profile.get("phone", ""))
    if not phone:
        return None

    def clean(value) -> str:
        return str(value or "").strip()

    province = clean(source.get("province"))
    # Fall back to the province when the record has no city.
    city = clean(source.get("city") or province)
    return {
        "name": clean(profile.get("name")),
        "law_firm": clean(profile.get("law_firm")),
        "province": province,
        "city": city,
        "phone": phone,
        "url": clean(source.get("detail_url")),
        "domain": LEGACY_DOMAIN,
        "create_time": int(record.get("collected_at") or time.time()),
        # Keep the full record as JSON so nothing is lost in the flat table.
        "params": json.dumps(record, ensure_ascii=False),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def delete_old_domain_data(db: Db, domain: str) -> int:
    """Remove every `lawyer` row previously collected for *domain*.

    Commits the delete and returns the number of rows removed.
    """
    cursor = db.db.cursor()
    try:
        cursor.execute("DELETE FROM lawyer WHERE domain=%s", (domain,))
        deleted = cursor.rowcount
        db.db.commit()
    finally:
        cursor.close()
    return deleted
|
||||||
|
|
||||||
|
|
||||||
|
def write_records_to_db(db: Db, records: List[Dict]) -> int:
    """Insert every convertible record into the legacy `lawyer` table.

    Records that cannot be flattened (no phone) are skipped; individual
    insert failures are logged and do not abort the batch. Returns the
    number of successful inserts.
    """
    success_count = 0
    for row in (to_legacy_row(record) for record in records):
        if row is None:
            continue
        try:
            db.insert_data("lawyer", row)
        except Exception as exc:
            print(f"[db] insert failed phone={row.get('phone', '')}: {exc}")
        else:
            success_count += 1
    return success_count
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line interface for the group crawler."""
    cli = argparse.ArgumentParser(description="众法利 groupId=80 基础字段采集(姓名/手机号/地区)")
    cli.add_argument("--start-url", default=START_URL, help="入口分组页 URL")
    cli.add_argument("--output", default=DEFAULT_OUTPUT, help="JSONL 输出路径")
    cli.add_argument("--no-db", action="store_true", help="只写 JSON,不写 DB")
    cli.add_argument("--no-reset", action="store_true", help="不清理 domain 旧数据")
    cli.add_argument("--workers", type=int, default=16, help="详情页并发数")
    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Crawl the group-80 area pages, then every detail page, deduplicate by
    phone, write a JSONL dump, and (optionally) refresh the DB."""
    args = parse_args()
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    # Optional Clash proxy rotation; falls back to the current route when
    # the controller exposes no nodes.
    rotator = ProxyRotator(CLASH_CONTROLLER, CLASH_SECRET)
    rotator.initialize()
    if rotator.nodes:
        print(f"[proxy] rotator enabled, nodes={len(rotator.nodes)}")
    else:
        print("[proxy] rotator disabled, using current proxy route")

    # The entry page is critical: allow at least one retry per proxy node.
    start_retries = max(8, len(rotator.nodes) + 2) if rotator.nodes else 8
    group_html = fetch_html(args.start_url, rotator=rotator, max_retries=start_retries)
    group_state = parse_initial_state(group_html)
    group_urls = extract_group_urls_from_group80(group_state)
    print(f"[group] found group urls: {len(group_urls)}")

    # Pass 1: walk every area listing page, collecting detail URLs and
    # remembering which listing each detail URL was first seen on.
    detail_url_to_source: Dict[str, str] = {}
    for idx, rel_url in enumerate(group_urls, start=1):
        list_url = f"http://m.zhongfali.com/{rel_url.lstrip('/')}"
        try:
            html = fetch_html(list_url, rotator=rotator, max_retries=4, timeout_seconds=12)
            detail_urls = extract_detail_urls_from_group_html(html)
        except Exception as exc:
            print(f"[group] failed {list_url}: {exc}")
            continue

        for detail_url in detail_urls:
            detail_url_to_source.setdefault(detail_url, list_url)
        if idx % 10 == 0:
            print(f"[group] {idx}/{len(group_urls)} detail_urls={len(detail_url_to_source)}")

    records: List[Dict] = []
    seen_phones: Set[str] = set()
    detail_urls = sorted(detail_url_to_source.keys())
    print(f"[detail] total detail urls: {len(detail_urls)}")

    def process_detail(detail_url: str) -> Optional[Dict]:
        # Worker: fetch + parse one detail page; never raises (errors are
        # logged and reported as None so the pool keeps going).
        try:
            html = fetch_html(detail_url, rotator=rotator, max_retries=2, timeout_seconds=8)
            record = parse_detail_record(detail_url, html, detail_url_to_source[detail_url])
            return record
        except Exception as exc:
            print(f"[detail] failed {detail_url}: {exc}")
            return None

    # Pass 2: fetch detail pages concurrently, de-duplicating by phone.
    done = 0
    with ThreadPoolExecutor(max_workers=max(1, int(args.workers))) as executor:
        futures = [executor.submit(process_detail, detail_url) for detail_url in detail_urls]
        for future in as_completed(futures):
            done += 1
            record = future.result()
            if record:
                phone = normalize_phone((record.get("profile", {}) or {}).get("phone", ""))
                if phone and phone not in seen_phones:
                    seen_phones.add(phone)
                    records.append(record)
            if done % 50 == 0:
                print(f"[detail] {done}/{len(detail_urls)} valid_records={len(records)}")

    # Full overwrite of the JSONL dump (one JSON object per line).
    with open(args.output, "w", encoding="utf-8") as out:
        for record in records:
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    # Optionally clear the old rows for this domain before re-inserting.
    deleted = 0
    inserted = 0
    if not args.no_db:
        with Db() as db:
            if not args.no_reset:
                deleted = delete_old_domain_data(db, LEGACY_DOMAIN)
            inserted = write_records_to_db(db, records)

    print(
        f"[done] records={len(records)}, db_deleted={deleted}, db_inserted={inserted}, output={args.output}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,501 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from typing import Dict, List, Optional, Set, Tuple
|
||||||
|
|
||||||
|
import urllib3
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
request_dir = os.path.join(project_root, "request")
|
||||||
|
if request_dir not in sys.path:
|
||||||
|
sys.path.insert(0, request_dir)
|
||||||
|
if project_root not in sys.path:
|
||||||
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
from Db import Db
|
||||||
|
from request.requests_client import RequestClientError, RequestsClient
|
||||||
|
|
||||||
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
|
# Identifier stored in each record's source.site field.
SITE_NAME = "zhongfali_single"
# Value written to the legacy `lawyer.domain` column ("Zhongfali single page").
LEGACY_DOMAIN = "众法利单页"
# Default detail page to scrape when --url is not given.
DEFAULT_URL = "http://m.zhongfali.com/h-pd-552.html#mid=3&groupId=196&desc=false"
# Default JSONL output path; the script appends to it across runs.
DEFAULT_OUTPUT = "/www/wwwroot/lawyers/data/one_off_sites/zhongfali_records_all.jsonl"

# Mainland-China mobile number: leading 1, second digit 3-9, 9 more digits.
PHONE_RE = re.compile(r"1[3-9]\d{9}")
# Captures the JSON object assigned to window.__INITIAL_STATE__ in page HTML.
INITIAL_STATE_RE = re.compile(r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})</script>", re.S)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_phone(text: str) -> str:
    """Strip all non-digits from *text* and return the first mainland-China
    mobile number found, or "" when there is none."""
    digits_only = re.sub(r"\D", "", text or "")
    found = PHONE_RE.search(digits_only)
    if found is None:
        return ""
    return found.group(0)
|
||||||
|
|
||||||
|
|
||||||
|
def split_specialties(text: str) -> List[str]:
    """Split a free-form specialty string on Chinese/ASCII delimiters
    (、 , , ; ; and whitespace) and return the unique, non-empty entries
    in their original order."""
    cleaned = (text or "").strip()
    if not cleaned:
        return []
    tokens = (piece.strip() for piece in re.split(r"[、,,;;\s]+", cleaned))
    # dict.fromkeys preserves insertion order, giving an order-stable dedupe.
    return list(dict.fromkeys(token for token in tokens if token))
|
||||||
|
|
||||||
|
|
||||||
|
def strip_html(text: str) -> str:
    """Drop HTML tags and collapse all whitespace runs to single spaces."""
    # Replace each tag with a space so adjacent text nodes do not merge.
    cleaned = re.sub(r"<[^>]+>", " ", text or "")
    # NOTE(review): the first argument here appears to be U+00A0 (a
    # non-breaking space, i.e. a decoded &nbsp;) — verify it is not a plain
    # ASCII space, which would make this replace a no-op.
    cleaned = cleaned.replace(" ", " ")
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_specialties_from_remark(remark: str) -> List[str]:
    """Pull the "专业领域: ..." (specialty) list out of an HTML remark blob.

    Returns [] when the remark is empty or carries no specialty section.
    """
    plain = strip_html(remark)
    if plain:
        found = re.search(r"专业领域[::]\s*([^。;]+)", plain)
        if found:
            return split_specialties(found.group(1))
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def value_at(values: List[str], index: int) -> str:
    """Return the trimmed string at *index*, or "" when the index is out of
    range or the stored value is falsy."""
    if 0 <= index < len(values):
        return str(values[index] or "").strip()
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_initial_state(html: str) -> Dict:
    """Extract and decode the window.__INITIAL_STATE__ JSON blob from *html*.

    Raises ValueError (with a Chinese message, matching the CLI's locale)
    when the marker is absent.
    """
    match = INITIAL_STATE_RE.search(html)
    if not match:
        raise ValueError("未找到 window.__INITIAL_STATE__")
    return json.loads(match.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_location_and_name(product_name: str) -> Tuple[str, str, str]:
    """Heuristically split a product title into (province, city, lawyer name).

    Each component is matched independently and is "" when absent. The city
    pattern skips an optional leading province so only the city is captured.
    """
    text = re.sub(r"\s+", " ", product_name or "").strip()

    def _first(pattern: str) -> str:
        hit = re.search(pattern, text)
        return hit.group(1) if hit else ""

    return (
        _first(r"([\u4e00-\u9fa5]{2,}省)"),
        _first(r"(?:[\u4e00-\u9fa5]+省)?([\u4e00-\u9fa5]+市)"),
        _first(r"([\u4e00-\u9fa5]{2,4})\s*律师"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def pick_product_module(state: Dict) -> Optional[Dict]:
    """Find the page module that carries ``extInfo.productInfo``.

    Modules listed in currentPageModuleIds are preferred (in page order,
    looked up by both string and raw key); otherwise any module in the id
    map qualifies. Returns None when no module holds product data.
    """
    id_map = state.get("currentPageModuleIdMap", {}) or {}
    ordered_ids = state.get("currentPageModuleIds", []) or []

    def _holds_product(candidate) -> bool:
        if not isinstance(candidate, dict):
            return False
        return bool((candidate.get("extInfo", {}) or {}).get("productInfo"))

    for mid in ordered_ids:
        candidate = id_map.get(str(mid)) or id_map.get(mid)
        if _holds_product(candidate):
            return candidate

    for candidate in id_map.values():
        if _holds_product(candidate):
            return candidate

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_group_id_from_url(url: str) -> int:
    """Extract the numeric ``groupId`` parameter from *url*.

    The parameter is recognized at the start of the string or immediately
    after ``?``, ``&`` or ``#`` (the site passes it in the URL fragment).
    Returns 0 when it is missing.
    """
    match = re.search(r"(?:[?&#]|^)groupId=(\d+)", url)
    # No try/except needed: the capture group only matches digit runs,
    # which int() always accepts — the previous ValueError guard was dead.
    return int(match.group(1)) if match else 0
|
||||||
|
|
||||||
|
|
||||||
|
def extract_records(url: str, state: Dict) -> List[Dict]:
    """Build normalized lawyer records from a single page's parsed state.

    Two sources feed the result:
    1. the module's propList columns (one lawyer per column index), and
    2. the page's own productInfo contact (the "current" lawyer),
    de-duplicated by phone across both. Returns [] when the page has no
    product module.
    """
    module = pick_product_module(state)
    if not module:
        return []

    ext_info = module.get("extInfo", {}) or {}
    product_info = ext_info.get("productInfo", {}) or {}
    product_name = str(product_info.get("name") or "").strip()

    province, city, current_name = extract_location_and_name(product_name)
    # Prefer the groupId carried by the product; fall back to the URL.
    group_id = product_info.get("groupId")
    if not group_id:
        group_id = parse_group_id_from_url(url)
    module_id = module.get("id")

    # propList is column-oriented: prop_map[column_name][i] belongs to the
    # i-th lawyer in the pooled listing.
    prop_map: Dict[str, List[str]] = {}
    for prop in ext_info.get("propList", []) or []:
        name = str(prop.get("name") or "").strip()
        values = [str(item or "").strip() for item in (prop.get("valueList") or [])]
        if name:
            prop_map[name] = values

    result: List[Dict] = []
    seen_phones: Set[str] = set()
    now = int(time.time())

    # Source 1: pooled lawyers, keyed by the "电话" (phone) column.
    phone_values = prop_map.get("电话", [])
    for idx, raw_phone in enumerate(phone_values):
        phone = normalize_phone(raw_phone)
        if not phone or phone in seen_phones:
            continue
        seen_phones.add(phone)

        # Pull the i-th value of every known column (safe on short lists).
        law_firm = value_at(prop_map.get("律师所", []), idx)
        area = value_at(prop_map.get("所在地区", []), idx)
        direction = value_at(prop_map.get("主攻方向", []), idx)
        specialty_text = value_at(prop_map.get("专业特长", []), idx)
        license_no = value_at(prop_map.get("执业证号", []), idx)
        address = value_at(prop_map.get("地址", []), idx)
        email = value_at(prop_map.get("电子邮箱", []), idx)
        seat_phone = value_at(prop_map.get("座机", []), idx)
        wechat = value_at(prop_map.get("微信", []), idx)
        qq = value_at(prop_map.get("QQ", []), idx)
        first_practice_date = value_at(prop_map.get("首次执业日期", []), idx)

        # Primary specialty source is 主攻方向; 专业特长 is the fallback.
        specialties = split_specialties(direction)
        if not specialties:
            specialties = split_specialties(specialty_text)

        record = {
            # Stable id: url + phone, so re-runs reproduce the same key.
            "record_id": hashlib.md5(f"{url}|{phone}".encode("utf-8")).hexdigest(),
            "collected_at": now,
            "source": {
                "site": SITE_NAME,
                "list_url": url,
                "detail_url": "",
                "province": province,
                "province_py": "",
                "city": area or city,
                "city_py": "",
                "page": 1,
                "group_id": group_id,
                "module_id": module_id,
                # Pool entries have no resolvable per-lawyer detail URL.
                "detail_url_status": "unresolved_from_pool",
            },
            "list_snapshot": {
                "name": "",
                "law_firm": law_firm,
                "specialties": specialties,
                "answer_count": None,
            },
            "profile": {
                "name": "",
                "law_firm": law_firm,
                "phone": phone,
                "license_no": license_no,
                "practice_years": None,
                "email": email,
                "address": address,
                "specialties": specialties,
            },
            "raw": {
                "source_index": idx,
                "direction": direction,
                "specialty_text": specialty_text,
                "seat_phone": seat_phone,
                "wechat": wechat,
                "qq": qq,
                "first_practice_date": first_practice_date,
            },
        }
        result.append(record)

    # Source 2: the page's own lawyer ("material" holds the phone).
    current_phone = normalize_phone(str(product_info.get("material") or ""))
    if current_phone and current_phone not in seen_phones:
        seen_phones.add(current_phone)
        remark = str(product_info.get("remark") or "")
        specialties = extract_specialties_from_remark(remark)
        result.append(
            {
                "record_id": hashlib.md5(f"{url}|{current_phone}".encode("utf-8")).hexdigest(),
                "collected_at": now,
                "source": {
                    "site": SITE_NAME,
                    "list_url": url,
                    "detail_url": url,
                    "province": province,
                    "province_py": "",
                    "city": city,
                    "city_py": "",
                    "page": 1,
                    "group_id": group_id,
                    "module_id": module_id,
                },
                "list_snapshot": {
                    "name": current_name,
                    "law_firm": str(product_info.get("prop0") or "").strip(),
                    "specialties": specialties,
                    "answer_count": None,
                },
                "profile": {
                    "name": current_name,
                    "law_firm": str(product_info.get("prop0") or "").strip(),
                    "phone": current_phone,
                    "license_no": str(product_info.get("prop1") or "").strip(),
                    "practice_years": None,
                    "email": "",
                    "address": str(product_info.get("prop3") or "").strip(),
                    "specialties": specialties,
                },
                "raw": {
                    "from_product_info": True,
                    "product_name": product_name,
                    "remark": remark,
                },
            }
        )

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def to_legacy_row(record: Dict) -> Optional[Dict[str, str]]:
    """Convert one normalized record into a legacy `lawyer` table row.

    Returns None when no valid mobile number is present; the city column
    falls back to the province when the record has no city.
    """
    source = record.get("source", {}) or {}
    profile = record.get("profile", {}) or {}

    phone = normalize_phone(profile.get("phone", ""))
    if not phone:
        return None

    province = str(source.get("province") or "").strip()
    city_or_province = source.get("city") or province
    return {
        "name": str(profile.get("name") or "").strip(),
        "law_firm": str(profile.get("law_firm") or "").strip(),
        "province": province,
        "city": str(city_or_province).strip(),
        "phone": phone,
        "url": str(source.get("detail_url") or "").strip(),
        "domain": LEGACY_DOMAIN,
        "create_time": int(record.get("collected_at") or time.time()),
        # Preserve the full record as JSON alongside the flat columns.
        "params": json.dumps(record, ensure_ascii=False),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def existing_phones_in_db(db: Db, phones: List[str]) -> Set[str]:
    """Return the subset of *phones* already stored for LEGACY_DOMAIN.

    Queries in chunks of 500 to keep the IN(...) placeholder list bounded.
    Falsy entries are dropped before querying.
    """
    deduped = sorted({phone for phone in phones if phone})
    if not deduped:
        return set()

    existing: Set[str] = set()
    cur = db.db.cursor()
    try:
        chunk_size = 500
        for i in range(0, len(deduped), chunk_size):
            chunk = deduped[i:i + chunk_size]
            # Placeholders only — values are bound separately, so this
            # f-string does not interpolate user data into the SQL.
            placeholders = ",".join(["%s"] * len(chunk))
            sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
            cur.execute(sql, [LEGACY_DOMAIN, *chunk])
            for row in cur.fetchall():
                existing.add(row[0])
    finally:
        cur.close()
    return existing
|
||||||
|
|
||||||
|
|
||||||
|
def write_records_to_db(db: Db, records: List[Dict]) -> Tuple[int, int]:
    """Insert records into the legacy `lawyer` table, skipping phones that
    already exist for this domain.

    Returns (inserted, skipped); records without a phone and failed
    inserts both count as skipped.
    """
    rows = [row for row in (to_legacy_row(record) for record in records) if row]
    if not rows:
        return 0, 0

    known_phones = existing_phones_in_db(db, [row["phone"] for row in rows])
    inserted = 0
    skipped = 0

    for row in rows:
        phone = row.get("phone", "")
        if not phone or phone in known_phones:
            skipped += 1
            continue
        try:
            db.insert_data("lawyer", row)
        except Exception as exc:
            skipped += 1
            print(f"[db] 插入失败 phone={phone}: {exc}")
        else:
            # Remember the phone so a duplicate later in this batch is skipped.
            known_phones.add(phone)
            inserted += 1
    return inserted, skipped
|
||||||
|
|
||||||
|
|
||||||
|
def lookup_name_map_from_db(db: Db, phones: List[str]) -> Dict[str, str]:
    """Map each phone in *phones* to the most recent non-empty name stored
    anywhere in the `lawyer` table (any domain).

    Queries in chunks of 500; within a chunk, rows arrive newest-first
    (ORDER BY create_time DESC) and only the first hit per phone is kept.
    """
    deduped = sorted({phone for phone in phones if phone})
    if not deduped:
        return {}

    name_map: Dict[str, str] = {}
    cur = db.db.cursor()
    try:
        chunk_size = 500
        for i in range(0, len(deduped), chunk_size):
            chunk = deduped[i:i + chunk_size]
            # Placeholders only — values are bound via execute(), so the
            # f-string does not interpolate user data into the SQL.
            placeholders = ",".join(["%s"] * len(chunk))
            sql = (
                "SELECT phone, name, create_time FROM lawyer "
                f"WHERE phone IN ({placeholders}) AND name<>'' "
                "ORDER BY create_time DESC"
            )
            cur.execute(sql, chunk)
            for phone, name, _ in cur.fetchall():
                # First row wins: it is the newest thanks to the DESC order.
                if phone not in name_map and name:
                    name_map[phone] = str(name).strip()
    finally:
        cur.close()
    return name_map
|
||||||
|
|
||||||
|
|
||||||
|
def apply_name_backfill(records: List[Dict], name_map: Dict[str, str]) -> int:
    """Fill in missing lawyer names from *name_map* (phone -> name).

    Only records whose profile name is empty are touched; both the profile
    and the list snapshot get the name. Records are mutated in place.
    Returns the number of records changed.
    """
    if not name_map:
        return 0

    changed = 0
    for record in records:
        profile = record.get("profile", {}) or {}
        snapshot = record.get("list_snapshot", {}) or {}

        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            continue
        replacement = name_map.get(phone, "")
        if not replacement:
            continue
        if str(profile.get("name") or "").strip():
            continue  # already named — never overwrite

        profile["name"] = replacement
        snapshot["name"] = replacement
        record["profile"] = profile
        record["list_snapshot"] = snapshot
        changed += 1

    return changed
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line interface for this collector."""
    cli = argparse.ArgumentParser(description="众法利单页律师电话采集")
    cli.add_argument("--url", default=DEFAULT_URL, help="详情页 URL")
    cli.add_argument("--output", default=DEFAULT_OUTPUT, help="输出 jsonl 文件路径")
    cli.add_argument("--direct", action="store_true", help="直连模式,不使用代理")
    cli.add_argument("--no-db", action="store_true", help="仅输出 JSON,不写入数据库")
    cli.add_argument("--skip-name-backfill", action="store_true", help="跳过按手机号回填姓名")
    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Scrape one detail page, append new records to the JSONL dump, backfill
    names by phone from the DB, and insert new phones into the DB."""
    args = parse_args()
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    # Mobile-Safari UA: the m. site serves the SPA payload to mobile clients.
    client = RequestsClient(
        headers={
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
        },
        use_proxy=not args.direct,
        retry_total=2,
        retry_backoff_factor=1,
        retry_status_forcelist=(429, 500, 502, 503, 504),
        retry_allowed_methods=("GET",),
    )

    # Fetch + parse; the client is closed even when parsing raises.
    try:
        resp = client.get_text(args.url, timeout=30, verify=False)
        if resp.status_code >= 400:
            raise RequestClientError(f"{resp.status_code} Error: {args.url}")
        state = parse_initial_state(resp.text)
        records = extract_records(args.url, state)
    finally:
        client.close()

    if not records:
        print("[done] 未采集到有效手机号")
        return

    # Load record_ids already present in the JSONL so appends stay unique.
    seen_ids: Set[str] = set()
    if os.path.exists(args.output):
        with open(args.output, "r", encoding="utf-8") as old_file:
            for line in old_file:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                except Exception:
                    continue  # tolerate corrupt lines in the existing dump
                record_id = item.get("record_id")
                if record_id:
                    seen_ids.add(record_id)

    # Append only records whose id is new.
    json_new = 0
    with open(args.output, "a", encoding="utf-8") as out:
        for record in records:
            record_id = record["record_id"]
            if record_id in seen_ids:
                continue
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
            seen_ids.add(record_id)
            json_new += 1

    db_new = 0
    db_skip = 0
    name_backfill_count = 0
    # Best-effort name backfill: a DB failure here must not abort the run.
    if not args.skip_name_backfill:
        try:
            with Db() as db:
                name_map = lookup_name_map_from_db(
                    db,
                    [normalize_phone((record.get("profile", {}) or {}).get("phone", "")) for record in records],
                )
                name_backfill_count = apply_name_backfill(records, name_map)
        except Exception as exc:
            print(f"[name-backfill] 跳过,查询失败: {exc}")

    if not args.no_db:
        with Db() as db:
            db_new, db_skip = write_records_to_db(db, records)

    print(
        f"[done] 采集{len(records)}条, 姓名回填{name_backfill_count}条, JSON新增{json_new}条, "
        f"DB新增{db_new}条, DB跳过{db_skip}条, 输出: {args.output}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
+1
-19
@@ -1,19 +1 @@
|
|||||||
from request.requests_client import (
|
# Package marker for request utilities.
|
||||||
RequestClientError,
|
|
||||||
RequestConnectTimeout,
|
|
||||||
RequestConnectionError,
|
|
||||||
RequestSSLError,
|
|
||||||
RequestTimeout,
|
|
||||||
RequestsClient,
|
|
||||||
ResponseData,
|
|
||||||
)
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"RequestsClient",
|
|
||||||
"ResponseData",
|
|
||||||
"RequestClientError",
|
|
||||||
"RequestConnectTimeout",
|
|
||||||
"RequestTimeout",
|
|
||||||
"RequestConnectionError",
|
|
||||||
"RequestSSLError",
|
|
||||||
]
|
|
||||||
|
|||||||
+42
-2
@@ -24,6 +24,19 @@ def _normalize_bool(value, default: bool = True) -> bool:
|
|||||||
return text not in ("0", "false", "no", "off", "")
|
return text not in ("0", "false", "no", "off", "")
|
||||||
|
|
||||||
|
|
||||||
|
def _env_proxy_override() -> Optional[bool]:
    """Read the PROXY_ENABLED environment override for the proxy switch.

    Returns None when the variable is unset (no override: fall back to
    proxy_settings.json), False for 0/false/off (force-disable), and True
    for 1/true/on (force-enable, provided the config fields are complete).
    """
    raw_value = os.getenv("PROXY_ENABLED")
    return None if raw_value is None else _normalize_bool(raw_value, True)
|
||||||
|
|
||||||
|
|
||||||
def _load_config() -> Dict[str, str]:
|
def _load_config() -> Dict[str, str]:
|
||||||
if not os.path.exists(CONFIG_PATH):
|
if not os.path.exists(CONFIG_PATH):
|
||||||
return dict(DEFAULT_CONFIG)
|
return dict(DEFAULT_CONFIG)
|
||||||
@@ -48,7 +61,12 @@ def report_proxy_status() -> None:
|
|||||||
_PROXY_STATUS_REPORTED = True
|
_PROXY_STATUS_REPORTED = True
|
||||||
|
|
||||||
config = _load_config()
|
config = _load_config()
|
||||||
enabled = _normalize_bool(config.get("enabled"), True)
|
override = _env_proxy_override()
|
||||||
|
if override is False:
|
||||||
|
print("[proxy] disabled by env (PROXY_ENABLED=0)")
|
||||||
|
return
|
||||||
|
|
||||||
|
enabled = _normalize_bool(config.get("enabled"), True) if override is None else True
|
||||||
if not enabled:
|
if not enabled:
|
||||||
print("[proxy] disabled by config")
|
print("[proxy] disabled by config")
|
||||||
return
|
return
|
||||||
@@ -66,7 +84,10 @@ def get_proxies() -> Optional[Dict[str, str]]:
|
|||||||
代理配置从 proxy_settings.json 读取,不依赖环境变量。
|
代理配置从 proxy_settings.json 读取,不依赖环境变量。
|
||||||
"""
|
"""
|
||||||
config = _load_config()
|
config = _load_config()
|
||||||
if not _normalize_bool(config.get("enabled"), True):
|
override = _env_proxy_override()
|
||||||
|
if override is False:
|
||||||
|
return None
|
||||||
|
if override is None and not _normalize_bool(config.get("enabled"), True):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
tunnel = str(config.get("tunnel") or "").strip()
|
tunnel = str(config.get("tunnel") or "").strip()
|
||||||
@@ -95,3 +116,22 @@ def apply_proxy(session) -> Optional[Dict[str, str]]:
|
|||||||
|
|
||||||
|
|
||||||
__all__ = ["get_proxies", "apply_proxy", "report_proxy_status"]
|
__all__ = ["get_proxies", "apply_proxy", "report_proxy_status"]
|
||||||
|
|
||||||
|
|
||||||
|
def is_proxy_enabled() -> bool:
    """Report whether this process should route traffic through the proxy.

    The PROXY_ENABLED environment variable wins when set; otherwise the
    `enabled` flag from proxy_settings.json decides (defaulting to on).
    """
    config = _load_config()
    override = _env_proxy_override()
    if override is not None:
        return override
    return _normalize_bool(config.get("enabled"), True)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["get_proxies", "apply_proxy", "report_proxy_status", "is_proxy_enabled"]
|
||||||
|
|||||||
+14
-2
@@ -1,6 +1,18 @@
|
|||||||
|
# 数据库驱动
|
||||||
pymysql>=1.0.2
|
pymysql>=1.0.2
|
||||||
|
pymongo>=4.0.0
|
||||||
|
|
||||||
|
# 调度器
|
||||||
|
schedule>=1.2.0
|
||||||
|
|
||||||
|
# 其他可能需要的依赖
|
||||||
requests>=2.28.0
|
requests>=2.28.0
|
||||||
beautifulsoup4>=4.11.0
|
beautifulsoup4>=4.11.0
|
||||||
urllib3>=1.26.0
|
|
||||||
lxml>=4.9.0
|
lxml>=4.9.0
|
||||||
openpyxl>=3.1.0
|
redis>=4.0.0
|
||||||
|
pyppeteer>=1.0.2
|
||||||
|
# 可选:提升反检测能力
|
||||||
|
pyppeteer-stealth>=2.7.4
|
||||||
|
|
||||||
|
# 日志相关
|
||||||
|
python-dateutil>=2.8.2
|
||||||
|
|||||||
@@ -0,0 +1,849 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||||
|
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
|
||||||
|
from urllib.parse import parse_qs, urlparse
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
if project_root not in sys.path:
|
||||||
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
from Db import Db
|
||||||
|
|
||||||
|
|
||||||
|
AREA_TABLE = os.getenv("AREA_TARGET_TABLE", "area_new")
|
||||||
|
AREA_DOMAIN = os.getenv("AREA_DOMAIN", "maxlaw")
|
||||||
|
DOUYIN_DOMAIN = os.getenv("DOUYIN_DOMAIN", "抖音")
|
||||||
|
DOUYIN_RAW_DIR = os.getenv("DOUYIN_RAW_DIR", os.path.join(project_root, "data", "douyin_raw"))
|
||||||
|
DOUYIN_SAVE_ONLY_ENV = os.getenv("DOUYIN_SAVE_ONLY", "1")
|
||||||
|
LAWYER_KEYWORDS_ENV = os.getenv("DOUYIN_LAWYER_KEYWORDS", "律师,律所")
|
||||||
|
PROGRESS_TABLE = os.getenv("LAYER_PROGRESS_TABLE", "layer_progress")
|
||||||
|
PROGRESS_DEFAULT_KEY = os.getenv("LAYER_PROGRESS_DEFAULT_KEY", "douyin_batch_default")
|
||||||
|
SERVICE_HOST = os.getenv("AREA_SERVICE_HOST", "0.0.0.0")
|
||||||
|
SERVICE_PORT = int(os.getenv("AREA_SERVICE_PORT", "9002"))
|
||||||
|
|
||||||
|
PHONE_REGEX = re.compile(r"(?:\+?86[-\s]?)?(1[3-9]\d{9})")
|
||||||
|
WX_CONTEXT_REGEX = re.compile(r"(?i)(?:微信|微.?信|wx|vx|weixin|v信|v号|v)\s*[::/\-\s]\s*([a-zA-Z0-9._-]{3,40})")
|
||||||
|
LAW_FIRM_REGEX = re.compile(r"([\u4e00-\u9fa5A-Za-z·]{2,40}律师事务所)")
|
||||||
|
RAW_WRITE_LOCK = threading.Lock()
|
||||||
|
|
||||||
|
LAWYER_KEYWORDS: Tuple[str, ...] = tuple(
|
||||||
|
keyword.strip() for keyword in LAWYER_KEYWORDS_ENV.split(",") if keyword.strip()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_safe_table_name(table_name: str) -> bool:
|
||||||
|
return bool(re.fullmatch(r"[A-Za-z0-9_]+", table_name or ""))
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_int(value: Any, default: int = 0) -> int:
|
||||||
|
try:
|
||||||
|
return int(str(value).strip())
|
||||||
|
except Exception:
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_bool(value: Any, default: bool = False) -> bool:
|
||||||
|
if value is None:
|
||||||
|
return default
|
||||||
|
text = str(value).strip().lower()
|
||||||
|
if text in {"1", "true", "yes", "y", "on"}:
|
||||||
|
return True
|
||||||
|
if text in {"0", "false", "no", "n", "off"}:
|
||||||
|
return False
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _first_param(params: Dict[str, List[str]], key: str, default: str = "") -> str:
|
||||||
|
values = params.get(key) or []
|
||||||
|
if not values:
|
||||||
|
return default
|
||||||
|
return values[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _append_jsonl(file_path: str, payload: Dict[str, Any]) -> None:
    """Append *payload* as one JSON line to *file_path*, creating parent dirs.

    Writes are serialized under RAW_WRITE_LOCK so concurrent handler
    threads never interleave partial lines in the same file.
    """
    directory = os.path.dirname(file_path) or "."
    os.makedirs(directory, exist_ok=True)
    record = json.dumps(payload, ensure_ascii=False)
    with RAW_WRITE_LOCK:
        with open(file_path, "a", encoding="utf-8") as out:
            out.write(record)
            out.write("\n")
def _save_raw_index_payload(payload: Dict[str, Any], query: Dict[str, List[str]], client_ip: str) -> str:
    """Persist one raw /api/layer/index request into today's JSONL archive.

    Returns the path of the file that received the record.
    """
    received_at = int(time.time())
    day_stamp = time.strftime("%Y%m%d", time.localtime(received_at))
    target = os.path.join(DOUYIN_RAW_DIR, f"douyin_index_{day_stamp}.jsonl")
    _append_jsonl(
        target,
        {
            "received_at": received_at,
            "client_ip": client_ip,
            "query": query,
            "payload": payload,
        },
    )
    return target
def _ensure_progress_table() -> None:
    """Create the crawl-progress table if it does not exist yet.

    Raises:
        ValueError: if PROGRESS_TABLE is not a safe SQL identifier.
    """
    if not _is_safe_table_name(PROGRESS_TABLE):
        raise ValueError("非法进度表名")

    with Db() as db:
        cursor = db.db.cursor()
        try:
            # Idempotent DDL: safe to run on every call.
            sql = f"""
            CREATE TABLE IF NOT EXISTS `{PROGRESS_TABLE}` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `progress_key` varchar(128) NOT NULL,
                `next_city_index` int(11) DEFAULT 0,
                `area_signature` varchar(128) DEFAULT NULL,
                `area_total` int(11) DEFAULT 0,
                `current_city` varchar(128) DEFAULT NULL,
                `reason` varchar(64) DEFAULT NULL,
                `status` varchar(32) DEFAULT NULL,
                `device_id` varchar(128) DEFAULT NULL,
                `extra_json` longtext,
                `updated_at` bigint(20) DEFAULT NULL,
                `create_time` bigint(20) DEFAULT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uk_progress_key` (`progress_key`),
                KEY `idx_updated_at` (`updated_at`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
            """
            cursor.execute(sql)
            db.db.commit()
        finally:
            # Bug fix: the cursor was previously leaked when execute() raised.
            cursor.close()
def _get_progress(progress_key: str) -> Optional[Dict[str, Any]]:
    """Load one progress row by key.

    Returns None when the key is blank or no row exists; otherwise a dict
    with normalized (never-None) fields.
    """
    key = str(progress_key or "").strip()
    if not key:
        return None

    _ensure_progress_table()
    with Db() as db:
        cursor = db.db.cursor()
        try:
            sql = (
                f"SELECT progress_key, next_city_index, area_signature, area_total, current_city, "
                f"reason, status, device_id, extra_json, updated_at, create_time "
                f"FROM `{PROGRESS_TABLE}` WHERE progress_key=%s LIMIT 1"
            )
            cursor.execute(sql, (key,))
            row = cursor.fetchone()
        finally:
            # Bug fix: the cursor was previously leaked when the query raised.
            cursor.close()

    if not row:
        return None

    # extra_json is stored as text; fall back to the raw string when it is
    # not valid JSON so callers never lose the payload.
    raw_extra = row[8] or ""
    extra_obj: Any = {}
    if raw_extra:
        try:
            extra_obj = json.loads(raw_extra)
        except Exception:
            extra_obj = raw_extra

    return {
        "progress_key": row[0] or "",
        "next_city_index": _parse_int(row[1], 0),
        "area_signature": row[2] or "",
        "area_total": _parse_int(row[3], 0),
        "current_city": row[4] or "",
        "reason": row[5] or "",
        "status": row[6] or "",
        "device_id": row[7] or "",
        "extra": extra_obj,
        "updated_at": _parse_int(row[9], 0),
        "create_time": _parse_int(row[10], 0),
    }
def _upsert_progress(progress_key: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """Insert or update one progress row and return the stored state.

    Raises:
        ValueError: when *progress_key* is blank.
    """
    key = str(progress_key or "").strip()
    if not key:
        raise ValueError("progress_key 不能为空")

    _ensure_progress_table()
    now_ts = int(time.time())
    next_city_index = _parse_int(payload.get("next_city_index"), 0)
    area_signature = str(payload.get("area_signature") or "").strip()
    area_total = _parse_int(payload.get("area_total"), 0)
    current_city = str(payload.get("current_city") or "").strip()
    reason = str(payload.get("reason") or "").strip()
    status = str(payload.get("status") or "").strip()
    device_id = str(payload.get("device_id") or "").strip()

    # "extra" may arrive pre-serialized (key "extra_json") or as an object;
    # either way we persist a JSON string.
    extra = payload.get("extra")
    if extra is None:
        extra = payload.get("extra_json")
    if isinstance(extra, str):
        extra_json = extra
    else:
        try:
            extra_json = json.dumps(extra or {}, ensure_ascii=False)
        except Exception:
            extra_json = "{}"

    with Db() as db:
        cursor = db.db.cursor()
        try:
            sql = (
                f"INSERT INTO `{PROGRESS_TABLE}` "
                "(progress_key, next_city_index, area_signature, area_total, current_city, reason, status, "
                "device_id, extra_json, updated_at, create_time) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
                "ON DUPLICATE KEY UPDATE "
                "next_city_index=VALUES(next_city_index), "
                "area_signature=VALUES(area_signature), "
                "area_total=VALUES(area_total), "
                "current_city=VALUES(current_city), "
                "reason=VALUES(reason), "
                "status=VALUES(status), "
                "device_id=VALUES(device_id), "
                "extra_json=VALUES(extra_json), "
                "updated_at=VALUES(updated_at)"
            )
            cursor.execute(
                sql,
                (
                    key,
                    next_city_index,
                    area_signature,
                    area_total,
                    current_city,
                    reason,
                    status,
                    device_id,
                    extra_json,
                    now_ts,
                    now_ts,
                ),
            )
            db.db.commit()
        finally:
            # Bug fix: the cursor was previously leaked when execute() raised.
            cursor.close()

    # Read back so callers see exactly what the database now holds.
    return _get_progress(key) or {}
def _clear_progress(progress_key: str) -> int:
    """Delete the progress row for *progress_key*; returns affected row count."""
    key = str(progress_key or "").strip()
    if not key:
        return 0

    _ensure_progress_table()
    with Db() as db:
        cursor = db.db.cursor()
        try:
            cursor.execute(f"DELETE FROM `{PROGRESS_TABLE}` WHERE progress_key=%s", (key,))
            affected = cursor.rowcount
            db.db.commit()
        finally:
            # Bug fix: the cursor was previously leaked when execute() raised.
            cursor.close()
    return affected
def _query_area_data(table_name: str, domain: str) -> List[Dict[str, Any]]:
    """Fetch all area rows for *domain*, ordered by id ascending.

    Raises:
        ValueError: if *table_name* is not a safe SQL identifier.
    """
    if not _is_safe_table_name(table_name):
        raise ValueError("非法表名")

    with Db() as db:
        cursor = db.db.cursor()
        try:
            sql = (
                f"SELECT province, city, name, pid, pinyin, code, domain, level, create_time "
                f"FROM `{table_name}` WHERE domain=%s ORDER BY id ASC"
            )
            cursor.execute(sql, (domain,))
            rows = cursor.fetchall()
        finally:
            # Bug fix: the cursor was previously leaked when the query raised.
            cursor.close()

    return [
        {
            "province": row[0] or "",
            "city": row[1] or "",
            "name": row[2] or "",
            "pid": row[3] if row[3] is not None else 0,
            "pinyin": row[4] or "",
            "code": row[5] or "",
            "domain": row[6] or "",
            "level": row[7] if row[7] is not None else 0,
            "create_time": row[8] if row[8] is not None else 0,
        }
        for row in rows
    ]
def _iter_dict_nodes(value: Any) -> Iterable[Dict[str, Any]]:
|
||||||
|
stack: List[Any] = [value]
|
||||||
|
while stack:
|
||||||
|
current = stack.pop()
|
||||||
|
if isinstance(current, dict):
|
||||||
|
yield current
|
||||||
|
stack.extend(current.values())
|
||||||
|
elif isinstance(current, list):
|
||||||
|
stack.extend(current)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_phones_from_text(text: str) -> List[str]:
    """Return phone numbers matched by PHONE_REGEX, deduped, in first-seen order."""
    found: List[str] = []
    seen: Set[str] = set()
    for match in PHONE_REGEX.finditer(text or ""):
        number = match.group(1)
        if number and number not in seen:
            seen.add(number)
            found.append(number)
    return found
def _extract_phones_from_user_info(user_info: Dict[str, Any]) -> List[str]:
    """Collect candidate phone numbers from a douyin user_info record.

    Numbers found directly in the signature win outright; otherwise
    WeChat-style markers and the unique_id / versatile_display fields are
    scanned as fallbacks. The result is sorted for stable output.
    """
    signature = str(user_info.get("signature") or "")
    unique_id = str(user_info.get("unique_id") or "")
    versatile = str(user_info.get("versatile_display") or "")

    # 1) Direct hit in the bio short-circuits everything else.
    collected = set(_extract_phones_from_text(signature))
    if collected:
        return sorted(collected)

    # 2) Numbers hidden behind WeChat markers, then the id fields as fallback.
    for source in (signature, unique_id, versatile):
        for marker in WX_CONTEXT_REGEX.finditer(source):
            collected.update(_extract_phones_from_text(marker.group(1) or ""))

    for source in (unique_id, versatile):
        collected.update(_extract_phones_from_text(source))

    return sorted(collected)
def _extract_law_firm_from_user_info(user_info: Dict[str, Any]) -> str:
    """Return the first law-firm name found in signature / verify / cert fields."""
    candidates: List[str] = []

    signature = str(user_info.get("signature") or "")
    if signature:
        candidates.append(signature)

    verify_reason = str(user_info.get("enterprise_verify_reason") or "")
    if verify_reason:
        candidates.append(verify_reason)

    # account_cert_info is stored as a JSON string; use its label_text when
    # parseable, otherwise fall back to the raw string itself.
    cert_text = ""
    account_cert_info = user_info.get("account_cert_info")
    if isinstance(account_cert_info, str) and account_cert_info.strip():
        try:
            cert_obj = json.loads(account_cert_info)
            if isinstance(cert_obj, dict):
                cert_text = str(cert_obj.get("label_text") or "").strip()
        except Exception:
            cert_text = account_cert_info.strip()
    if cert_text:
        candidates.append(cert_text)

    for text in candidates:
        hit = LAW_FIRM_REGEX.search(text)
        if hit:
            return hit.group(1)
    return ""
def _extract_account_cert_text(user_info: Dict[str, Any]) -> str:
|
||||||
|
account_cert_info = user_info.get("account_cert_info")
|
||||||
|
if isinstance(account_cert_info, str) and account_cert_info.strip():
|
||||||
|
try:
|
||||||
|
cert_obj = json.loads(account_cert_info)
|
||||||
|
if isinstance(cert_obj, dict):
|
||||||
|
return str(cert_obj.get("label_text") or "").strip()
|
||||||
|
except Exception:
|
||||||
|
return account_cert_info.strip()
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _is_lawyer_related_user(user_info: Dict[str, Any], name: str, law_firm: str) -> bool:
    """True when any profile text contains one of LAWYER_KEYWORDS."""
    fields = (
        name,
        str(user_info.get("nickname") or ""),
        str(user_info.get("signature") or ""),
        str(user_info.get("custom_verify") or ""),
        str(user_info.get("enterprise_verify_reason") or ""),
        str(user_info.get("versatile_display") or ""),
        str(user_info.get("unique_id") or ""),
        _extract_account_cert_text(user_info),
        law_firm,
    )
    merged = "\n".join(part for part in fields if part).strip()
    if not merged:
        return False
    return any(keyword in merged for keyword in LAWYER_KEYWORDS)
def _pick_first_str(node: Dict[str, Any], keys: Tuple[str, ...]) -> str:
|
||||||
|
for key in keys:
|
||||||
|
value = node.get(key)
|
||||||
|
if isinstance(value, str):
|
||||||
|
text = value.strip()
|
||||||
|
if text:
|
||||||
|
return text
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_name(node: Dict[str, Any]) -> str:
    """Best-effort display name: direct fields first, then nested profile dicts."""
    name = _pick_first_str(node, ("name", "nickname", "nick_name", "author_name", "title", "account_name"))
    if name:
        return name

    for nested_key in ("author", "user", "user_info", "profile", "account"):
        nested = node.get(nested_key)
        if not isinstance(nested, dict):
            continue
        name = _pick_first_str(nested, ("name", "nickname", "nick_name", "author_name", "title"))
        if name:
            return name

    return ""
def _extract_law_firm(node: Dict[str, Any]) -> str:
    """Law-firm name from direct fields, else the nested enterprise record."""
    firm = _pick_first_str(
        node,
        (
            "law_firm",
            "firm",
            "lawFirm",
            "office",
            "org_name",
            "organization",
            "company",
            "enterprise",
        ),
    )
    if firm:
        return firm

    enterprise = node.get("enterprise")
    if isinstance(enterprise, dict):
        firm = _pick_first_str(enterprise, ("name", "company_name", "enterprise_name"))
        if firm:
            return firm

    return ""
def _extract_detail_url(node: Dict[str, Any], fallback_api_url: str) -> str:
    """Best URL for *node*: share/detail links, then video/user pages, then the API URL."""
    link = _pick_first_str(node, ("share_url", "url", "web_url", "detail_url", "jump_url"))
    if link:
        return link

    aweme_id = node.get("aweme_id") or node.get("item_id")
    if aweme_id:
        aid = str(aweme_id).strip()
        if aid:
            return f"https://www.douyin.com/video/{aid}"

    sec_uid = str(node.get("sec_uid") or "").strip()
    if sec_uid:
        return f"https://www.douyin.com/user/{sec_uid}"

    return fallback_api_url
def _city_from_index(city_index: int, table_name: str, domain: str) -> Tuple[str, str]:
    """Map a city index to (province, city); ("", "") when out of range or on DB errors."""
    if city_index < 0:
        return "", ""
    try:
        areas = _query_area_data(table_name, domain)
    except Exception:
        # Area lookup is best-effort; rows without geography are still useful.
        return "", ""
    if city_index >= len(areas):
        return "", ""
    record = areas[city_index]
    province = str(record.get("province") or "").strip()
    city = str(record.get("city") or province).strip()
    return province, city
def _existing_phones(domain: str, phones: List[str]) -> Set[str]:
    """Return the subset of *phones* already stored for *domain* in the lawyer table.

    Queries in chunks of 500 phones so very large batches stay well within
    MySQL placeholder limits.
    """
    deduped = sorted({p for p in phones if p})
    if not deduped:
        return set()

    existing: Set[str] = set()
    with Db() as db:
        cursor = db.db.cursor()
        try:
            chunk_size = 500
            for start in range(0, len(deduped), chunk_size):
                chunk = deduped[start:start + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cursor.execute(sql, [domain, *chunk])
                existing.update(str(row[0]) for row in cursor.fetchall())
        finally:
            # Bug fix: the cursor was previously leaked when a query raised.
            cursor.close()
    return existing
def _insert_lawyer_rows(rows: List[Dict[str, Any]], domain: str) -> Tuple[int, int]:
    """Insert deduplicated rows into the lawyer table.

    Rows without a phone are dropped; rows sharing a phone keep the most
    informative one; phones already stored for *domain* are skipped.

    Returns:
        (inserted, skipped) counts.
    """
    if not rows:
        return 0, 0

    def row_score(item: Dict[str, Any]) -> int:
        # Higher score == more complete record; nodes that yielded many
        # phones are slightly penalized as less reliable sources.
        score = 0
        if str(item.get("name") or "").strip():
            score += 5
        if str(item.get("law_firm") or "").strip():
            score += 3
        if str(item.get("url") or "").strip():
            score += 1
        if str(item.get("province") or "").strip() or str(item.get("city") or "").strip():
            score += 1
        phone_count_in_node = _parse_int(item.get("phone_count_in_node"), 1)
        if phone_count_in_node > 1:
            score -= (phone_count_in_node - 1)
        return score

    # Keep one best-scoring row per phone.
    deduped_by_phone: Dict[str, Dict[str, Any]] = {}
    skipped = 0
    for row in rows:
        phone = str(row.get("phone") or "").strip()
        if not phone:
            skipped += 1
            continue
        previous = deduped_by_phone.get(phone)
        if previous is not None:
            if row_score(row) > row_score(previous):
                deduped_by_phone[phone] = row
            skipped += 1
            continue
        deduped_by_phone[phone] = row

    existing = _existing_phones(domain, list(deduped_by_phone.keys()))

    inserted = 0
    with Db() as db:
        cursor = db.db.cursor()
        try:
            sql = (
                "INSERT INTO lawyer "
                "(name, phone, law_firm, province, city, url, domain, create_time, site_time, params) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            )
            for phone, row in deduped_by_phone.items():
                if phone in existing:
                    skipped += 1
                    continue
                cursor.execute(
                    sql,
                    (
                        row.get("name") or "",
                        phone,
                        row.get("law_firm") or "",
                        row.get("province") or "",
                        row.get("city") or "",
                        row.get("url") or "",
                        domain,
                        _parse_int(row.get("create_time"), int(time.time())),
                        _parse_int(row.get("site_time"), int(time.time())),
                        row.get("params") or "{}",
                    ),
                )
                inserted += 1
                existing.add(phone)
            db.db.commit()
        finally:
            # Bug fix: the cursor was previously leaked when an insert raised.
            cursor.close()

    return inserted, skipped
def _extract_lawyer_rows_from_payload(
    payload: Dict[str, Any],
    area_table: str,
    area_domain: str,
    save_domain: str,
) -> List[Dict[str, Any]]:
    """Turn one douyin index payload into insert-ready lawyer rows.

    Only accounts that match the lawyer keywords, expose at least one phone
    number and carry a sec_uid are kept; one row is produced per phone.
    """
    now_ts = int(time.time())
    api_url = str(payload.get("url") or "").strip()
    city_index = _parse_int(payload.get("cityIndex"), -1)
    city_province, city_name = _city_from_index(city_index, area_table, area_domain)

    rows: List[Dict[str, Any]] = []
    data = payload.get("data") if isinstance(payload, dict) else None
    user_list = data.get("user_list") if isinstance(data, dict) else None
    if not isinstance(user_list, list):
        return rows

    for entry in user_list:
        if not isinstance(entry, dict):
            continue
        user_info = entry.get("user_info")
        if not isinstance(user_info, dict):
            continue

        name = str(user_info.get("nickname") or "").strip()
        law_firm = _extract_law_firm_from_user_info(user_info)

        # 强约束:必须出现“律师/律所”等关键词,避免非法律相关账号入库
        if not _is_lawyer_related_user(user_info, name, law_firm):
            continue

        phones = _extract_phones_from_user_info(user_info)
        if not phones:
            continue

        sec_uid = str(user_info.get("sec_uid") or "").strip()
        if not sec_uid:
            continue
        url = f"https://www.douyin.com/user/{sec_uid}"

        province = city_province
        city = city_name or city_province

        # Provenance snapshot stored alongside each row in the params column.
        source_record = {
            "source": "douyin",
            "api_source": payload.get("source") or "",
            "api_url": api_url,
            "city_index": city_index,
            "captured_at": now_ts,
            "sec_uid": sec_uid,
            "user_info": {
                "uid": user_info.get("uid"),
                "nickname": user_info.get("nickname"),
                "signature": user_info.get("signature"),
                "unique_id": user_info.get("unique_id"),
                "versatile_display": user_info.get("versatile_display"),
            },
        }

        for phone in phones:
            rows.append(
                {
                    "name": name,
                    "phone": phone,
                    "law_firm": law_firm,
                    "province": province,
                    "city": city,
                    "url": url,
                    "domain": save_domain,
                    "create_time": now_ts,
                    "site_time": _parse_int(payload.get("ts"), now_ts),
                    "phone_count_in_node": len(phones),
                    "params": json.dumps(source_record, ensure_ascii=False),
                }
            )

    return rows
class AreaSyncHandler(BaseHTTPRequestHandler):
    """HTTP endpoints for area data, crawl progress and douyin index ingestion."""

    server_version = "AreaSyncService/2.0"

    def _write_json(self, status: int, payload: Any) -> None:
        """Serialize *payload* as UTF-8 JSON with permissive CORS headers."""
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()
        self.wfile.write(body)

    def _read_json_body(self) -> Any:
        """Parse the request body as JSON; any problem yields an empty dict."""
        content_length = _parse_int(self.headers.get("Content-Length"), 0)
        if content_length <= 0:
            return {}
        raw = self.rfile.read(content_length)
        if not raw:
            return {}
        try:
            return json.loads(raw.decode("utf-8"))
        except Exception:
            return {}

    def do_OPTIONS(self) -> None:
        """CORS preflight: no body, just the allow headers."""
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self) -> None:
        """Routes: /health, /api/layer/get_area, /api/layer/progress."""
        parsed = urlparse(self.path)
        params = parse_qs(parsed.query)

        if parsed.path == "/health":
            self._write_json(200, {"ok": True, "service": "layer-service"})
            return

        if parsed.path == "/api/layer/get_area":
            table_name = _first_param(params, "table", AREA_TABLE).strip() or AREA_TABLE
            domain = _first_param(params, "domain", AREA_DOMAIN).strip() or AREA_DOMAIN
            with_meta = _parse_bool(_first_param(params, "meta", "0"), False)

            try:
                rows = _query_area_data(table_name, domain)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return

            # Bare list by default; wrapped envelope when meta=1 is requested.
            if not with_meta:
                self._write_json(200, rows)
                return
            self._write_json(
                200,
                {
                    "ok": True,
                    "count": len(rows),
                    "table": table_name,
                    "domain": domain,
                    "data": rows,
                },
            )
            return

        if parsed.path == "/api/layer/progress":
            progress_key = _first_param(params, "progress_key", PROGRESS_DEFAULT_KEY).strip() or PROGRESS_DEFAULT_KEY
            try:
                row = _get_progress(progress_key)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return
            self._write_json(
                200,
                {
                    "ok": True,
                    "progress_key": progress_key,
                    "data": row,
                },
            )
            return

        self._write_json(404, {"ok": False, "error": "not found"})

    def do_POST(self) -> None:
        """Routes: /api/layer/progress (upsert/clear), /api/layer/index (ingest)."""
        parsed = urlparse(self.path)
        params = parse_qs(parsed.query)

        if parsed.path == "/api/layer/progress":
            body = self._read_json_body()
            if not isinstance(body, dict):
                body = {}

            # Body fields win over query-string fields; both fall back to defaults.
            progress_key = str(body.get("progress_key") or _first_param(params, "progress_key", PROGRESS_DEFAULT_KEY)).strip() or PROGRESS_DEFAULT_KEY
            action = str(body.get("action") or _first_param(params, "action", "upsert")).strip().lower() or "upsert"

            try:
                if action == "clear":
                    deleted = _clear_progress(progress_key)
                    self._write_json(
                        200,
                        {
                            "ok": True,
                            "action": "clear",
                            "progress_key": progress_key,
                            "deleted": deleted,
                        },
                    )
                    return

                saved = _upsert_progress(progress_key, body)
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "action": "upsert",
                        "progress_key": progress_key,
                        "data": saved,
                    },
                )
                return
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return

        if parsed.path == "/api/layer/index":
            body = self._read_json_body()
            if not isinstance(body, dict) or not body:
                self._write_json(400, {"ok": False, "error": "invalid json body"})
                return

            area_table = _first_param(params, "table", AREA_TABLE).strip() or AREA_TABLE
            area_domain = _first_param(params, "area_domain", AREA_DOMAIN).strip() or AREA_DOMAIN
            save_domain = _first_param(params, "save_domain", DOUYIN_DOMAIN).strip() or DOUYIN_DOMAIN
            save_only_default = _parse_bool(DOUYIN_SAVE_ONLY_ENV, True)
            save_only = _parse_bool(_first_param(params, "save_only", DOUYIN_SAVE_ONLY_ENV), save_only_default)

            # The raw payload is always archived first, even in save-only mode.
            try:
                client_ip = self.client_address[0] if self.client_address else ""
                saved_file = _save_raw_index_payload(body, params, client_ip)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": f"save raw payload failed: {exc}"})
                return

            if save_only:
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "message": "saved_only",
                        "save_only": True,
                        "saved_file": saved_file,
                    },
                )
                return

            try:
                extracted = _extract_lawyer_rows_from_payload(body, area_table, area_domain, save_domain)
                inserted, skipped = _insert_lawyer_rows(extracted, save_domain)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return

            self._write_json(
                200,
                {
                    "ok": True,
                    "message": "received",
                    "save_domain": save_domain,
                    "save_only": False,
                    "saved_file": saved_file,
                    "extracted": len(extracted),
                    "inserted": inserted,
                    "skipped": skipped,
                },
            )
            return

        self._write_json(404, {"ok": False, "error": "not found"})
def run() -> None:
    """Start the layer HTTP service and block until interrupted.

    Improvements over the original: Ctrl-C no longer escapes as a raw
    KeyboardInterrupt traceback, and the listening socket is always closed
    on shutdown.
    """
    try:
        _ensure_progress_table()
    except Exception as exc:
        # Table creation is retried lazily by the request handlers, so
        # startup continues even if the database is briefly unavailable.
        print(f"[layer-service] init progress table failed: {exc}")

    server = ThreadingHTTPServer((SERVICE_HOST, SERVICE_PORT), AreaSyncHandler)
    print(f"[layer-service] running on http://{SERVICE_HOST}:{SERVICE_PORT}")
    print(f"[layer-service] get_area -> table/domain: {AREA_TABLE}/{AREA_DOMAIN}")
    print(f"[layer-service] index -> save domain: {DOUYIN_DOMAIN}")
    print(f"[layer-service] progress table/default key: {PROGRESS_TABLE}/{PROGRESS_DEFAULT_KEY}")
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("[layer-service] shutting down")
    finally:
        server.server_close()


if __name__ == "__main__":
    run()
+183
-49
@@ -1,76 +1,210 @@
|
|||||||
"""
|
"""
|
||||||
全局请求速率限制器
|
全局请求速率限制器
|
||||||
确保代理每秒不超过5次请求
|
|
||||||
|
默认按“所有爬虫进程共享一个桶”来限流,避免 `bash start.sh`
|
||||||
|
同时启动多个进程时,每个进程各自 5 次/秒,叠加后把代理冲爆。
|
||||||
"""
|
"""
|
||||||
|
from contextlib import contextmanager
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import threading
|
import threading
|
||||||
from collections import deque
|
from pathlib import Path
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
import fcntl
|
||||||
|
from request.proxy_config import is_proxy_enabled
|
||||||
|
|
||||||
|
|
||||||
class RateLimiter:
|
class RateLimiter:
|
||||||
"""
|
"""
|
||||||
令牌桶算法实现的速率限制器
|
基于文件锁的跨进程滑动窗口限流器。
|
||||||
|
|
||||||
|
- 同一台机器上的多个 Python 进程会共享同一个状态文件
|
||||||
|
- 同一个进程内的多个线程也会一起走这个限流器
|
||||||
"""
|
"""
|
||||||
def __init__(self, max_requests_per_second: int = 5):
|
|
||||||
"""
|
|
||||||
初始化速率限制器
|
|
||||||
|
|
||||||
Args:
|
def __init__(
|
||||||
max_requests_per_second: 每秒最大请求数
|
self,
|
||||||
"""
|
max_requests_per_second: int = 5,
|
||||||
self.max_requests = max_requests_per_second
|
window_seconds: float = 1.0,
|
||||||
self.requests = deque()
|
state_file: str | None = None,
|
||||||
self.lock = threading.RLock()
|
):
|
||||||
|
self.max_requests = max(1, int(max_requests_per_second))
|
||||||
|
self.max_concurrent = max(
|
||||||
|
1,
|
||||||
|
int(os.getenv("PROXY_MAX_CONCURRENT_REQUESTS", str(self.max_requests))),
|
||||||
|
)
|
||||||
|
self.window_seconds = max(0.1, float(window_seconds))
|
||||||
|
self.lease_seconds = max(
|
||||||
|
5.0,
|
||||||
|
float(os.getenv("PROXY_REQUEST_LEASE_SECONDS", "120")),
|
||||||
|
)
|
||||||
|
default_state = os.path.join(
|
||||||
|
tempfile.gettempdir(),
|
||||||
|
"lawyers_proxy_rate_limiter.json",
|
||||||
|
)
|
||||||
|
self.state_file = Path(
|
||||||
|
state_file or os.getenv("PROXY_RATE_LIMIT_FILE", default_state)
|
||||||
|
)
|
||||||
|
self.lock_file = self.state_file.with_suffix(self.state_file.suffix + ".lock")
|
||||||
|
self._thread_lock = threading.RLock()
|
||||||
|
self.state_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.lock_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
def acquire(self):
|
def _load_state(self) -> dict:
|
||||||
"""
|
if not self.state_file.exists():
|
||||||
获取请求权限,如果需要则等待
|
return {"timestamps": [], "leases": {}}
|
||||||
"""
|
try:
|
||||||
with self.lock:
|
raw = self.state_file.read_text(encoding="utf-8").strip()
|
||||||
now = time.time()
|
if not raw:
|
||||||
|
return {"timestamps": [], "leases": {}}
|
||||||
|
data = json.loads(raw)
|
||||||
|
if isinstance(data, list):
|
||||||
|
return {
|
||||||
|
"timestamps": [float(item) for item in data],
|
||||||
|
"leases": {},
|
||||||
|
}
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
return {"timestamps": [], "leases": {}}
|
||||||
|
timestamps = data.get("timestamps", []) or []
|
||||||
|
leases = data.get("leases", {}) or {}
|
||||||
|
return {
|
||||||
|
"timestamps": [float(item) for item in timestamps],
|
||||||
|
"leases": {str(key): float(value) for key, value in leases.items()},
|
||||||
|
}
|
||||||
|
except Exception:
|
||||||
|
return {"timestamps": [], "leases": {}}
|
||||||
|
|
||||||
# 清理超过1秒的请求记录
|
def _save_state(self, state: dict) -> None:
|
||||||
while self.requests and now - self.requests[0] >= 1.0:
|
payload = json.dumps(state, ensure_ascii=False)
|
||||||
self.requests.popleft()
|
self.state_file.write_text(payload, encoding="utf-8")
|
||||||
|
|
||||||
# 如果当前请求数已达上限,等待
|
def _normalize_state(self, state: dict, now: float) -> dict:
|
||||||
if len(self.requests) >= self.max_requests:
|
timestamps = [
|
||||||
# 计算需要等待的时间
|
float(ts)
|
||||||
wait_time = 1.0 - (now - self.requests[0])
|
for ts in (state.get("timestamps", []) or [])
|
||||||
if wait_time > 0:
|
if now - float(ts) < self.window_seconds
|
||||||
time.sleep(wait_time)
|
]
|
||||||
return self.acquire() # 递归调用以重新检查
|
leases = {
|
||||||
|
str(key): float(value)
|
||||||
|
for key, value in (state.get("leases", {}) or {}).items()
|
||||||
|
if now - float(value) < self.lease_seconds
|
||||||
|
}
|
||||||
|
return {"timestamps": timestamps, "leases": leases}
|
||||||
|
|
||||||
# 记录这次请求
|
def acquire(self) -> None:
|
||||||
self.requests.append(now)
|
token = None
|
||||||
|
while True:
|
||||||
|
token = self.try_acquire_slot()
|
||||||
|
if token:
|
||||||
|
self.release(token)
|
||||||
|
return
|
||||||
|
time.sleep(0.05)
|
||||||
|
|
||||||
|
def try_acquire_slot(self) -> str | None:
|
||||||
|
while True:
|
||||||
|
wait_time = 0.0
|
||||||
|
with self._thread_lock:
|
||||||
|
with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
|
||||||
|
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
|
||||||
|
now = time.time()
|
||||||
|
state = self._normalize_state(self._load_state(), now)
|
||||||
|
timestamps = state["timestamps"]
|
||||||
|
leases = state["leases"]
|
||||||
|
|
||||||
|
if len(timestamps) < self.max_requests and len(leases) < self.max_concurrent:
|
||||||
|
token = uuid4().hex
|
||||||
|
timestamps.append(now)
|
||||||
|
leases[token] = now
|
||||||
|
self._save_state(state)
|
||||||
|
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
|
||||||
|
return token
|
||||||
|
|
||||||
|
wait_candidates = []
|
||||||
|
if len(timestamps) >= self.max_requests and timestamps:
|
||||||
|
wait_candidates.append(self.window_seconds - (now - timestamps[0]))
|
||||||
|
if len(leases) >= self.max_concurrent:
|
||||||
|
wait_candidates.append(0.05)
|
||||||
|
wait_time = max(0.05, min([item for item in wait_candidates if item > 0] or [0.05]))
|
||||||
|
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
|
||||||
|
|
||||||
|
time.sleep(wait_time)
|
||||||
|
|
||||||
|
def release(self, token: str | None) -> None:
|
||||||
|
if not token:
|
||||||
|
return
|
||||||
|
with self._thread_lock:
|
||||||
|
with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
|
||||||
|
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
|
||||||
|
now = time.time()
|
||||||
|
state = self._normalize_state(self._load_state(), now)
|
||||||
|
leases = state["leases"]
|
||||||
|
if token in leases:
|
||||||
|
leases.pop(token, None)
|
||||||
|
self._save_state(state)
|
||||||
|
else:
|
||||||
|
self._save_state(state)
|
||||||
|
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
|
||||||
|
|
||||||
def can_make_request(self) -> bool:
|
def can_make_request(self) -> bool:
|
||||||
"""
|
with self._thread_lock:
|
||||||
检查是否可以立即发起请求(非阻塞)
|
with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
|
||||||
"""
|
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
|
||||||
with self.lock:
|
now = time.time()
|
||||||
now = time.time()
|
state = self._normalize_state(self._load_state(), now)
|
||||||
|
self._save_state(state)
|
||||||
# 清理超过1秒的请求记录
|
allowed = (
|
||||||
while self.requests and now - self.requests[0] >= 1.0:
|
len(state["timestamps"]) < self.max_requests
|
||||||
self.requests.popleft()
|
and len(state["leases"]) < self.max_concurrent
|
||||||
|
)
|
||||||
return len(self.requests) < self.max_requests
|
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
|
||||||
|
return allowed
|
||||||
|
|
||||||
|
|
||||||
# 全局速率限制器实例
|
global_rate_limiter = RateLimiter(
|
||||||
global_rate_limiter = RateLimiter(max_requests_per_second=5)
|
max_requests_per_second=int(os.getenv("PROXY_MAX_REQUESTS_PER_SECOND", "5"))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _should_limit_proxy_requests() -> bool:
|
||||||
|
"""
|
||||||
|
仅在当前进程实际启用代理时启用全局代理限流。
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return is_proxy_enabled()
|
||||||
|
except Exception:
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def wait_for_request():
|
def wait_for_request():
|
||||||
"""
|
"""等待直到可以发起请求。"""
|
||||||
等待直到可以发起请求
|
if not _should_limit_proxy_requests():
|
||||||
"""
|
return
|
||||||
global_rate_limiter.acquire()
|
global_rate_limiter.acquire()
|
||||||
|
|
||||||
|
|
||||||
def can_request_now() -> bool:
|
def can_request_now() -> bool:
|
||||||
"""
|
"""检查是否可以立即发起请求。"""
|
||||||
检查是否可以立即发起请求
|
if not _should_limit_proxy_requests():
|
||||||
"""
|
return True
|
||||||
return global_rate_limiter.can_make_request()
|
return global_rate_limiter.can_make_request()
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def request_slot():
|
||||||
|
"""
|
||||||
|
申请一个跨进程共享的请求槽位,请求结束后自动释放。
|
||||||
|
|
||||||
|
这样既能限制“每秒启动多少请求”,也能限制“同时在飞多少请求”。
|
||||||
|
"""
|
||||||
|
if not _should_limit_proxy_requests():
|
||||||
|
yield
|
||||||
|
return
|
||||||
|
|
||||||
|
token = global_rate_limiter.try_acquire_slot()
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
global_rate_limiter.release(token)
|
||||||
|
|||||||
@@ -0,0 +1,377 @@
|
|||||||
|
import copy
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from html import unescape
|
||||||
|
from http.cookies import SimpleCookie
|
||||||
|
from typing import Dict, Optional
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import urllib3
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
for path in (current_dir, project_root):
|
||||||
|
if path not in sys.path:
|
||||||
|
sys.path.append(path)
|
||||||
|
|
||||||
|
import config as project_config
|
||||||
|
from utils.rate_limiter import wait_for_request, global_rate_limiter
|
||||||
|
|
||||||
|
API_ENDPOINT = "https://mp.weixin.qq.com/cgi-bin/videosnap"
|
||||||
|
DOMAIN = "mp.weixin.qq.com"
|
||||||
|
DEFAULT_HEADERS = {
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/146.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
"Accept": "*/*",
|
||||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
|
||||||
|
"DNT": "1",
|
||||||
|
"Priority": "u=1, i",
|
||||||
|
"Sec-CH-UA": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"',
|
||||||
|
"Sec-CH-UA-Mobile": "?0",
|
||||||
|
"Sec-CH-UA-Platform": '"Windows"',
|
||||||
|
"Sec-Fetch-Dest": "empty",
|
||||||
|
"Sec-Fetch-Mode": "cors",
|
||||||
|
"Sec-Fetch-Site": "same-origin",
|
||||||
|
"X-Requested-With": "XMLHttpRequest",
|
||||||
|
}
|
||||||
|
DEFAULT_WEIXIN_CONFIG = {
|
||||||
|
"TOKEN": "609153506",
|
||||||
|
"FINGERPRINT": "46a7e6ac6ccf205986adc0aa99127860",
|
||||||
|
"COOKIE": {
|
||||||
|
"appmsglist_action_3258147150": "card",
|
||||||
|
"_qimei_uuid42": "1a302160d051008226aec905b63f99ff3989f30009",
|
||||||
|
"_qimei_i_3": "63b22b84c15204dfc595ac6452d722b1f0bdf0f6145b568ae68a7c0e70947438686637943989e2a1d792",
|
||||||
|
"_qimei_h38": "215986ce26aec905b63f99ff0200000e81a302",
|
||||||
|
"ua_id": "S7gglu0eZh9NkAzLAAAAADH8dynpnFZVN29lxm7BQo0=",
|
||||||
|
"wxuin": "73074968761097",
|
||||||
|
"mm_lang": "zh_CN",
|
||||||
|
"eas_sid": "91X7I7K4K5k364U2z3k2I980F5",
|
||||||
|
"_qimei_q36": "",
|
||||||
|
"_qimei_fingerprint": "d895c46d5fda98cab67d9daec00068ed",
|
||||||
|
"_clck": "501quy|1|g4t|0",
|
||||||
|
"uuid": "210d1c199a63afd4c774eccd9a06a27f",
|
||||||
|
"rand_info": "CAESIE4WqrFFVVjqrrNflbCUM7wPD5NXjuGbjfHolAEsMmEm",
|
||||||
|
"slave_bizuin": "3258147150",
|
||||||
|
"data_bizuin": "3258147150",
|
||||||
|
"bizuin": "3258147150",
|
||||||
|
"data_ticket": "tpcLjRB7B7AlUY3rFe/ILEjtCKs7dEEGsn8kXnHVzdTb9dgIpSPN1aP8FlE6FDhj",
|
||||||
|
"slave_sid": "U3hfU1Z0UV91N0U5d0lkRDhyTzh3d3hmbnBHMjBnbmFNdzVJeGlJeTJ6OTVxRjJQVVE2VkNhejYzTkxETVVSZkF3eWRORmtRS01XWFBjdnFZZWFLNjR2ZGtwdUJ2MzByclg0NjF4SHlDeVJneEhsczdSYUJVNE45VEhNRWVTQXg1dlpGdWQ0bU5VM3pnRzJN",
|
||||||
|
"slave_user": "gh_fe76760560d0",
|
||||||
|
"xid": "ef503a6864cceaef225c615a45606e4a",
|
||||||
|
"_clsk": "12arnf1|1774975723874|4|1|mp.weixin.qq.com/weheat-agent/payload/record",
|
||||||
|
"_qimei_i_1": "2ddc6a80945f59d3c7c4ab325dd526b3feeea1a31458558bbdd97e582493206c6163629d39d8e1dcd49fddc7"
|
||||||
|
},
|
||||||
|
"COUNT": 21,
|
||||||
|
"REFERER": "https://mp.weixin.qq.com/",
|
||||||
|
"HEADERS": {},
|
||||||
|
"REQUEST_PARAMS": {
|
||||||
|
"action": "search",
|
||||||
|
"scene": "1",
|
||||||
|
"lang": "zh_CN",
|
||||||
|
"f": "json",
|
||||||
|
"ajax": "1",
|
||||||
|
},
|
||||||
|
"REQUESTS_PER_SECOND": 5,
|
||||||
|
"PAGE_DELAY": 5,
|
||||||
|
"CITY_DELAY": 2,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _deep_merge_dict(base: Dict, incoming: Dict) -> Dict:
|
||||||
|
merged = copy.deepcopy(base)
|
||||||
|
for key, value in incoming.items():
|
||||||
|
if isinstance(value, dict) and isinstance(merged.get(key), dict):
|
||||||
|
merged[key] = _deep_merge_dict(merged[key], value)
|
||||||
|
else:
|
||||||
|
merged[key] = value
|
||||||
|
return merged
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_cookie_value(cookie_value) -> Dict[str, str]:
|
||||||
|
if isinstance(cookie_value, dict):
|
||||||
|
return {str(key): str(value) for key, value in cookie_value.items()}
|
||||||
|
|
||||||
|
if not cookie_value:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
if isinstance(cookie_value, str):
|
||||||
|
text = cookie_value.strip()
|
||||||
|
if not text:
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
parsed = json.loads(text)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
parsed = None
|
||||||
|
if isinstance(parsed, dict):
|
||||||
|
return {str(key): str(value) for key, value in parsed.items()}
|
||||||
|
|
||||||
|
cookie = SimpleCookie()
|
||||||
|
cookie.load(text)
|
||||||
|
return {key: morsel.value for key, morsel in cookie.items()}
|
||||||
|
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _load_weixin_config() -> Dict:
|
||||||
|
config = copy.deepcopy(DEFAULT_WEIXIN_CONFIG)
|
||||||
|
module_config = getattr(project_config, "WEIXIN_CONFIG", None)
|
||||||
|
if isinstance(module_config, dict):
|
||||||
|
config = _deep_merge_dict(config, module_config)
|
||||||
|
|
||||||
|
env_mapping = {
|
||||||
|
"TOKEN": os.getenv("WEIXIN_TOKEN"),
|
||||||
|
"FINGERPRINT": os.getenv("WEIXIN_FINGERPRINT"),
|
||||||
|
"COOKIE": os.getenv("WEIXIN_COOKIE"),
|
||||||
|
"REFERER": os.getenv("WEIXIN_REFERER"),
|
||||||
|
"COUNT": os.getenv("WEIXIN_COUNT"),
|
||||||
|
"REQUESTS_PER_SECOND": os.getenv("WEIXIN_REQUESTS_PER_SECOND"),
|
||||||
|
"PAGE_DELAY": os.getenv("WEIXIN_PAGE_DELAY"),
|
||||||
|
"CITY_DELAY": os.getenv("WEIXIN_CITY_DELAY"),
|
||||||
|
}
|
||||||
|
for key, value in env_mapping.items():
|
||||||
|
if value not in (None, ""):
|
||||||
|
config[key] = value
|
||||||
|
|
||||||
|
config["COOKIE"] = _parse_cookie_value(config.get("COOKIE"))
|
||||||
|
|
||||||
|
for key in ("COUNT", "REQUESTS_PER_SECOND"):
|
||||||
|
try:
|
||||||
|
config[key] = int(config[key])
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
config[key] = DEFAULT_WEIXIN_CONFIG[key]
|
||||||
|
|
||||||
|
for key in ("PAGE_DELAY", "CITY_DELAY"):
|
||||||
|
try:
|
||||||
|
config[key] = float(config[key])
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
config[key] = DEFAULT_WEIXIN_CONFIG[key]
|
||||||
|
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_html(text: str) -> str:
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
return re.sub(r"<[^>]+>", "", unescape(text)).strip()
|
||||||
|
|
||||||
|
|
||||||
|
class WeixinSpider:
|
||||||
|
"""基于 requests 的微信视频号采集器"""
|
||||||
|
|
||||||
|
def __init__(self, db_connection):
|
||||||
|
self.db = db_connection
|
||||||
|
self.config = _load_weixin_config()
|
||||||
|
self.token = str(self.config.get("TOKEN", "")).strip()
|
||||||
|
self.fingerprint = str(self.config.get("FINGERPRINT", "")).strip()
|
||||||
|
self.cookies = self.config.get("COOKIE", {})
|
||||||
|
self.count = str(self.config.get("COUNT", DEFAULT_WEIXIN_CONFIG["COUNT"]))
|
||||||
|
self.referer = str(self.config.get("REFERER", DEFAULT_WEIXIN_CONFIG["REFERER"])).strip()
|
||||||
|
self.request_params = {
|
||||||
|
str(key): str(value)
|
||||||
|
for key, value in (self.config.get("REQUEST_PARAMS", {}) or {}).items()
|
||||||
|
if value is not None
|
||||||
|
}
|
||||||
|
self.page_delay = max(0.0, float(self.config.get("PAGE_DELAY", DEFAULT_WEIXIN_CONFIG["PAGE_DELAY"])))
|
||||||
|
self.city_delay = max(0.0, float(self.config.get("CITY_DELAY", DEFAULT_WEIXIN_CONFIG["CITY_DELAY"])))
|
||||||
|
max_rps = self.config.get("REQUESTS_PER_SECOND")
|
||||||
|
if max_rps:
|
||||||
|
global_rate_limiter.max_requests = int(max_rps)
|
||||||
|
|
||||||
|
headers = DEFAULT_HEADERS.copy()
|
||||||
|
project_headers = getattr(project_config, "HEADERS", None)
|
||||||
|
if isinstance(project_headers, dict):
|
||||||
|
headers.update(project_headers)
|
||||||
|
config_headers = self.config.get("HEADERS", {})
|
||||||
|
if isinstance(config_headers, dict):
|
||||||
|
headers.update({str(key): str(value) for key, value in config_headers.items()})
|
||||||
|
if self.referer:
|
||||||
|
headers["Referer"] = self.referer
|
||||||
|
self.session = requests.Session()
|
||||||
|
self.session.trust_env = False
|
||||||
|
self.session.headers.update(headers)
|
||||||
|
if self.cookies:
|
||||||
|
self.session.cookies.update(self.cookies)
|
||||||
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
|
def _validate_runtime_config(self) -> bool:
|
||||||
|
missing = []
|
||||||
|
if not self.token:
|
||||||
|
missing.append("TOKEN")
|
||||||
|
if not self.fingerprint:
|
||||||
|
missing.append("FINGERPRINT")
|
||||||
|
if not self.cookies:
|
||||||
|
missing.append("COOKIE")
|
||||||
|
|
||||||
|
if not missing:
|
||||||
|
return True
|
||||||
|
|
||||||
|
print(
|
||||||
|
"[微信] 配置不完整,缺少: "
|
||||||
|
+ ", ".join(missing)
|
||||||
|
+ "。请在 config.py 的 WEIXIN_CONFIG 中补齐,"
|
||||||
|
+ "或通过环境变量 WEIXIN_TOKEN / WEIXIN_FINGERPRINT / WEIXIN_COOKIE 提供。"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _load_areas(self):
|
||||||
|
condition = "domain='maxlaw' AND level=2"
|
||||||
|
tables = ("area_new", "area", "area2")
|
||||||
|
last_error = None
|
||||||
|
for table in tables:
|
||||||
|
try:
|
||||||
|
rows = self.db.select_data(table, "province, city", condition) or []
|
||||||
|
except Exception as exc:
|
||||||
|
last_error = exc
|
||||||
|
continue
|
||||||
|
if rows:
|
||||||
|
print(f"[微信] 城市来源表: {table}, 城市数: {len(rows)}")
|
||||||
|
return rows
|
||||||
|
|
||||||
|
if last_error:
|
||||||
|
print(f"[微信] 加载地区数据失败: {last_error}")
|
||||||
|
print("[微信] 无城市数据(已尝试 area_new/area/area2)")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _build_query_url(self, query: str, buffer: str) -> str:
|
||||||
|
params = self.request_params.copy()
|
||||||
|
params.update({
|
||||||
|
"query": query,
|
||||||
|
"count": self.count,
|
||||||
|
"buffer": buffer,
|
||||||
|
"fingerprint": self.fingerprint,
|
||||||
|
"token": self.token,
|
||||||
|
})
|
||||||
|
return f"{API_ENDPOINT}?{urlencode(params)}"
|
||||||
|
|
||||||
|
def _extract_phone(self, text: str) -> Optional[str]:
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
match = re.search(r"1[3-9]\d{9}", text)
|
||||||
|
return match.group(0) if match else None
|
||||||
|
|
||||||
|
def _parse_name(self, acct: Dict) -> str:
|
||||||
|
highlight = _strip_html(acct.get("highlight_nickname", ""))
|
||||||
|
if highlight:
|
||||||
|
return highlight
|
||||||
|
return _strip_html(acct.get("nickname", ""))
|
||||||
|
|
||||||
|
def _store_account(self, acct: Dict, province: str, city: str) -> None:
|
||||||
|
signature = acct.get("signature", "")
|
||||||
|
phone = self._extract_phone(signature)
|
||||||
|
if not phone:
|
||||||
|
return
|
||||||
|
|
||||||
|
if self.db.is_data_exist("lawyer", f"phone='{phone}' and domain='{DOMAIN}'"):
|
||||||
|
name = self._parse_name(acct)
|
||||||
|
print(f" -- 已存在律师: {name} ({phone})")
|
||||||
|
return
|
||||||
|
|
||||||
|
params = json.dumps(acct, ensure_ascii=False)
|
||||||
|
lawyer_data = {
|
||||||
|
"phone": phone,
|
||||||
|
"province": province,
|
||||||
|
"city": city,
|
||||||
|
"law_firm": acct.get("auth_info", {}).get("auth_profession"),
|
||||||
|
"url": f"https://channels.weixin.qq.com/finder/creator/feeds?finder_username={acct.get('username', '')}",
|
||||||
|
"create_time": int(time.time()),
|
||||||
|
"domain": DOMAIN,
|
||||||
|
"name": self._parse_name(acct),
|
||||||
|
"params": params,
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
inserted_id = self.db.insert_data("lawyer", lawyer_data)
|
||||||
|
print(f" -> 新增律师: {lawyer_data['name']} ({phone}), ID: {inserted_id}")
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" 插入失败 {lawyer_data['name']} ({phone}): {exc}")
|
||||||
|
|
||||||
|
def _search_city(self, province: str, city: str) -> None:
|
||||||
|
city_name = city.replace('市', '')
|
||||||
|
query = f"{city_name}律所"
|
||||||
|
print(f"--- [微信] 开始采集城市: {province} - {city_name} ---")
|
||||||
|
|
||||||
|
buffer = ""
|
||||||
|
has_more = True
|
||||||
|
page_no = 0
|
||||||
|
|
||||||
|
while has_more:
|
||||||
|
page_no += 1
|
||||||
|
url = self._build_query_url(query, buffer)
|
||||||
|
print(f"正在采集 '{query}' 第 {page_no} 页: {url}")
|
||||||
|
|
||||||
|
wait_for_request()
|
||||||
|
try:
|
||||||
|
response = self.session.get(
|
||||||
|
url,
|
||||||
|
timeout=15,
|
||||||
|
cookies=self.cookies,
|
||||||
|
proxies={}, # 明确禁用代理
|
||||||
|
verify=False,
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
except requests.exceptions.RequestException as exc:
|
||||||
|
print(f"网络请求失败: {exc}")
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
print("解析返回的JSON失败。返回内容:", response.text[:200])
|
||||||
|
break
|
||||||
|
|
||||||
|
base_resp = data.get("base_resp", {})
|
||||||
|
if base_resp.get("ret") != 0:
|
||||||
|
print(f"API返回错误: {base_resp.get('err_msg')}")
|
||||||
|
if "invalid ticket" in (base_resp.get('err_msg') or ""):
|
||||||
|
print("Token 或 Cookie 可能失效,请更新配置。")
|
||||||
|
break
|
||||||
|
|
||||||
|
accounts = data.get("acct_list", [])
|
||||||
|
if not accounts:
|
||||||
|
print("本页未找到更多律师信息。")
|
||||||
|
break
|
||||||
|
|
||||||
|
for acct in accounts:
|
||||||
|
self._store_account(acct, province, city_name)
|
||||||
|
|
||||||
|
has_more = bool(data.get("acct_continue_flag"))
|
||||||
|
buffer = data.get("last_buff", "")
|
||||||
|
time.sleep(self.page_delay)
|
||||||
|
|
||||||
|
print(f"--- [微信] 城市: {city_name} 采集完成 ---\n")
|
||||||
|
|
||||||
|
def run(self) -> None:
|
||||||
|
print("启动微信视频号律师信息采集...")
|
||||||
|
if not self._validate_runtime_config():
|
||||||
|
return
|
||||||
|
|
||||||
|
areas = self._load_areas()
|
||||||
|
if not areas:
|
||||||
|
print("[微信] 未能从 `area_new` 表获取到地区信息。")
|
||||||
|
return
|
||||||
|
|
||||||
|
for area in areas:
|
||||||
|
province = area.get("province", "")
|
||||||
|
city = area.get("city", "")
|
||||||
|
if not city:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
self._search_city(province, city)
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"采集 {province}-{city} 时发生错误: {exc}")
|
||||||
|
time.sleep(self.city_delay)
|
||||||
|
|
||||||
|
print("微信视频号律师信息采集完成。")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
from Db import Db
|
||||||
|
|
||||||
|
with Db() as db:
|
||||||
|
spider = WeixinSpider(db)
|
||||||
|
spider.run()
|
||||||
Reference in New Issue
Block a user