19cf9ce901
- Unify the collection logic and launch scripts across the five sites
- Add the dls_fresh collection flow and improve logging
- Add time-conditioned export to export_lawyers_excel
- Default to exporting the last 7 days and support parsing of extended fields
- Tidy .gitignore to ignore local data/logs artifacts
480 lines
17 KiB
Python
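"""Crawler for lawyer listings on m.findlaw.cn (找法网).

Discovers crawl targets from the site's areaDataTwo.js city data, pages through
each city's mobile lawyer-list endpoint, appends de-duplicated records to a
JSONL file, and optionally inserts new rows (keyed by phone number) into the
legacy ``lawyer`` table. Typical flags: ``--city-filter beijing --max-pages 3
--no-db --direct`` (see ``parse_args`` below).
"""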
import argparse
import ast
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple

import urllib3

# Make the sibling "request" package and the project root importable when the
# script is run directly.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SITE_NAME = "findlaw"
LEGACY_DOMAIN = "找法网"  # domain label used by the legacy lawyer table; kept as-is to match existing rows
SITE_BASE = "https://m.findlaw.cn"
CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"

# Mainland-China mobile numbers: "1", a second digit 3-9, then nine more digits.
PHONE_RE = re.compile(r"1[3-9]\d{9}")


@dataclass
class CityTarget:
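    """One crawlable city resolved from the site's area data."""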
    province_id: str
    province_name: str
    province_py: str
    city_id: str
    city_name: str
    city_py: str


def normalize_phone(text: str) -> str:
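    """Return the first mainland-China mobile number found in ``text``, or an empty string."""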
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


class FindlawCrawler:
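    """Paginated crawler for m.findlaw.cn lawyer lists, writing JSONL and optionally the DB."""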
    def __init__(
        self,
        max_pages: int = 9999,
        sleep_seconds: float = 0.1,
        use_proxy: bool = True,
        db_connection=None,
    ):
        self.max_pages = max_pages
        self.sleep_seconds = max(0.0, sleep_seconds)
        self.db = db_connection
        self.client = RequestsClient(
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                    "Mobile/15E148 Safari/604.1"
                ),
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Connection": "close",
            },
            use_proxy=use_proxy,
            retry_total=2,
            retry_backoff_factor=1,
            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET",),
        )

    def _get_text(
        self,
        url: str,
        timeout: int = 20,
        max_retries: int = 3,
        referer: str = SITE_BASE,
    ) -> str:
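        """Fetch ``url`` as text, retrying on top of the client's built-in retry policy.

        A 403 response triggers ``self.client.refresh()`` before retrying; 5xx responses
        and transport errors back off exponentially with jitter; anything else >= 400 raises.
        """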
        headers = {"Referer": referer}
        last_error: Optional[Exception] = None

        for attempt in range(max_retries):
            wait_for_request()
            try:
                resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
                code = resp.status_code
                if code == 403:
                    if attempt < max_retries - 1:
                        self.client.refresh()
                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                        continue
                    raise RequestClientError(f"{code} Error: {url}")
                if code >= 500 and attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                if code >= 400:
                    raise RequestClientError(f"{code} Error: {url}")
                return resp.text
            except Exception as exc:
                last_error = exc
                if attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                raise

        if last_error is not None:
            raise last_error
        raise RequestClientError(f"Unknown request error: {url}")

    def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
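        """Extract ``var <var_name> = [...];`` from the JS snippet and parse it as a Python literal."""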
        pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
        match = re.search(pattern, script_text)
        if not match:
            return []
        raw = match.group(1)
        try:
            rows = ast.literal_eval(raw)
            return rows if isinstance(rows, list) else []
        except Exception:
            return []

    def discover_cities(self) -> List[CityTarget]:
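        """Build one CityTarget per unique city pinyin from the site's province/city data."""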
        js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
        provinces = self._parse_city_js_array(js_text, "iosProvinces")
        cities = self._parse_city_js_array(js_text, "iosCitys")

        province_map: Dict[str, Dict] = {}
        for item in provinces:
            pid = str(item.get("id") or "").strip()
            if pid:
                province_map[pid] = item

        results: List[CityTarget] = []
        seen_py: Set[str] = set()
        for city in cities:
            city_py = str(city.get("pinyin") or "").strip()
            city_name = str(city.get("value") or "").strip()
            city_id = str(city.get("id") or "").strip()
            province_id = str(city.get("parentId") or "").strip()
            if not city_py or not city_name or not city_id:
                continue
            if city_py in seen_py:
                continue
            seen_py.add(city_py)

            province_row = province_map.get(province_id, {})
            province_name = str(province_row.get("value") or city_name).strip()
            province_py = str(province_row.get("pinyin") or city_py).strip()

            results.append(
                CityTarget(
                    province_id=province_id,
                    province_name=province_name,
                    province_py=province_py,
                    city_id=city_id,
                    city_name=city_name,
                    city_py=city_py,
                )
            )
        return results

    def _parse_list_payload(self, text: str) -> Dict:
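        """Parse a list-page response as JSON, tolerating a BOM or stray text around the object."""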
        cleaned = (text or "").strip().lstrip("\ufeff")
        try:
            return json.loads(cleaned)
        except ValueError:
            start = cleaned.find("{")
            end = cleaned.rfind("}")
            if start == -1 or end == -1:
                return {}
            return json.loads(cleaned[start:end + 1])

    def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
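        """Fetch one list page and return ``(items, has_more, list_url)``."""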
        list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
        referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
        text = self._get_text(list_url, referer=referer)
        payload = self._parse_list_payload(text)
        if payload.get("errcode") != 0:
            return [], False, list_url

        data = payload.get("data", {}) or {}
        items = data.get("lawyer_list", []) or []
        has_more = str(data.get("has_more", "0")) == "1"
        return items, has_more, list_url

    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
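        """Yield one normalized record per lawyer listed for ``target``, page by page."""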
        for page in range(1, self.max_pages + 1):
            try:
                items, has_more, list_url = self.fetch_list_page(target.city_py, page)
            except Exception as exc:
                print(f"[list] failed {target.city_py} p{page}: {exc}")
                break

            if not items:
                break

            for item in items:
                detail_url = item.get("siteask_m") or item.get("site_url") or ""
                detail_url = str(detail_url).strip()
                if not detail_url.startswith("http"):
                    detail_url = list_url

                phone = normalize_phone(item.get("mobile", ""))
                profile = {
                    "uid": str(item.get("uid") or ""),
                    "name": str(item.get("username") or "").strip(),
                    "law_firm": str(item.get("lawyer_lawroom") or "").strip(),
                    "phone": phone,
                    "lawyer_year": item.get("lawyer_year"),
                    "service_area": str(item.get("service_area") or "").strip(),
                    "address": str(item.get("addr") or "").strip(),
                    "specialties": item.get("professionArr") or [],
                    "answer_count": item.get("ansnum"),
                    "comment_count": item.get("askcommentnum"),
                }

                now = int(time.time())
                uid = profile.get("uid", "")
                record_key = uid or detail_url
                record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()

                area = item.get("areaInfo", {}) or {}
                yield {
                    "record_id": record_id,
                    "collected_at": now,
                    "source": {
                        "site": SITE_NAME,
                        "list_url": list_url,
                        "detail_url": detail_url,
                        "province": str(area.get("province") or target.province_name),
                        "province_py": target.province_py,
                        "city": str(area.get("city") or target.city_name),
                        "city_py": target.city_py,
                        "page": page,
                    },
                    "list_snapshot": {
                        "uid": uid,
                        "name": profile["name"],
                        "law_firm": profile["law_firm"],
                        "answer_count": profile["answer_count"],
                        "comment_count": profile["comment_count"],
                    },
                    "profile": profile,
                    "raw": item,
                }
                if self.sleep_seconds:
                    time.sleep(self.sleep_seconds)

            if not has_more:
                break

    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
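        """Map a crawl record onto the legacy ``lawyer`` table schema; return None if no phone."""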
        source = record.get("source", {}) or {}
        profile = record.get("profile", {}) or {}
        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            return None

        province = (source.get("province") or "").strip()
        city = (source.get("city") or province).strip()
        return {
            "name": (profile.get("name") or "").strip(),
            "law_firm": (profile.get("law_firm") or "").strip(),
            "province": province,
            "city": city,
            "phone": phone,
            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
            "domain": LEGACY_DOMAIN,
            "create_time": int(record.get("collected_at") or time.time()),
            "params": json.dumps(record, ensure_ascii=False),
        }

    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
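        """Return the subset of ``phones`` already stored for this domain, queried in chunks of 500."""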
        if not self.db or not phones:
            return set()
        deduped = sorted({p for p in phones if p})
        if not deduped:
            return set()

        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(deduped), chunk_size):
                chunk = deduped[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
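        """Insert rows into ``lawyer``, de-duplicating by phone; return ``(inserted, skipped)``."""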
        if not self.db:
            return 0, 0

        rows: List[Dict[str, str]] = []
        for record in records:
            row = self._to_legacy_lawyer_row(record)
            if row:
                rows.append(row)
        if not rows:
            return 0, 0

        existing = self._existing_phones_in_db([row["phone"] for row in rows])
        inserted = 0
        skipped = 0
        for row in rows:
            phone = row.get("phone", "")
            if not phone or phone in existing:
                skipped += 1
                continue
            try:
                self.db.insert_data("lawyer", row)
                existing.add(phone)
                inserted += 1
            except Exception as exc:
                skipped += 1
                print(f"[db] insert failed phone={phone} url={row.get('url', '')}: {exc}")
        return inserted, skipped

    def crawl(
        self,
        output_path: str,
        max_cities: int = 0,
        city_filter: Optional[str] = None,
    ) -> None:
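        """Crawl all (filtered) cities: resume from ``output_path``, append new JSONL lines, write to the DB."""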
        cities = self.discover_cities()
        print(f"[discover] found {len(cities)} cities")
        if city_filter:
            key = city_filter.strip().lower()
            cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
            print(f"[discover] {len(cities)} cities after filtering, filter={city_filter}")
        if max_cities > 0:
            cities = cities[:max_cities]
            print(f"[discover] truncated to {len(cities)} cities")

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

        seen_ids: Set[str] = set()
        if os.path.exists(output_path):
            with open(output_path, "r", encoding="utf-8") as old_file:
                for line in old_file:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        item = json.loads(line)
                    except Exception:
                        continue
                    rid = item.get("record_id")
                    if rid:
                        seen_ids.add(rid)
            print(f"[resume] {len(seen_ids)} existing records")

        total_new_json = 0
        total_new_db = 0
        total_skip_db = 0

        with open(output_path, "a", encoding="utf-8") as out:
            for idx, target in enumerate(cities, start=1):
                print(
                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
                    f"({target.city_py})"
                )
                city_records = list(self.crawl_city(target))

                city_new_json = 0
                for record in city_records:
                    rid = record["record_id"]
                    if rid in seen_ids:
                        continue
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                    seen_ids.add(rid)
                    city_new_json += 1
                    total_new_json += 1

                city_new_db, city_skip_db = self._write_records_to_db(city_records)
                total_new_db += city_new_db
                total_skip_db += city_skip_db
                print(
                    f"[city] collected {len(city_records)}, {city_new_json} new in JSON, "
                    f"{city_new_db} new in DB, {city_skip_db} skipped for DB"
                )

        print(
            f"[done] {total_new_json} new JSON records, {total_new_db} new DB rows, "
            f"{total_skip_db} skipped for DB, output: {output_path}"
        )


def parse_args() -> argparse.Namespace:
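    """Define the command-line interface of the crawler."""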
    parser = argparse.ArgumentParser(description="Findlaw (找法网) lawyer crawl script (rewritten)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
        help="path of the output JSONL file",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="maximum number of cities to crawl; 0 means unlimited",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="maximum number of pages to crawl per city",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="filter by city pinyin or city name, e.g. beijing",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.1,
        help="delay in seconds between collected records",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="direct mode: do not use the proxy from proxy_settings.json",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="output JSONL only, do not write to the database",
    )
    return parser.parse_args()


def main():
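    """Entry point: crawl without DB writes under --no-db, otherwise inside a Db() context."""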
    args = parse_args()
    if args.no_db:
        crawler = FindlawCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=None,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        return

    with Db() as db:
        crawler = FindlawCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )


if __name__ == "__main__":
    main()