chore: commit remaining local changes
This commit is contained in:
@@ -0,0 +1,411 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
|
||||
# Make the project root importable so the local Db helper resolves when this
# script is executed directly from its own directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db  # project-local database helper; import must follow sys.path setup

# Site identifier stored inside each collected record.
SITE_NAME = "zhongfali_group80"
# Value written to the legacy `domain` column of the lawyer table.
LEGACY_DOMAIN = "众法利单页"
# Entry page listing all province/city sub-groups of group 80.
START_URL = "http://m.zhongfali.com/pg.jsp?groupId=80&pgt=0&pgs=1"
# Default JSONL snapshot location.
DEFAULT_OUTPUT = "/www/wwwroot/lawyers/data/one_off_sites/zhongfali_records_all.jsonl"

# Local SOCKS5 proxy that curl tunnels through (presumably Clash's socks port — confirm).
SOCKS_PROXY = "127.0.0.1:7891"
# Clash controller REST endpoint and secret, overridable via environment variables.
CLASH_CONTROLLER = os.environ.get("CLASH_CONTROLLER", "http://127.0.0.1:9090")
CLASH_SECRET = os.environ.get("CLASH_SECRET", "")
# Mainland-China mobile number pattern (11 digits, 13x-19x prefixes).
PHONE_RE = re.compile(r"1[3-9]\d{9}")
# Captures the JSON blob assigned to window.__INITIAL_STATE__ in the page HTML.
INITIAL_STATE_RE = re.compile(r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})</script>", re.S)
||||
class ProxyRotator:
    """Round-robin switcher over Clash proxy nodes via the controller REST API.

    All controller access is best-effort: when no secret is configured or the
    API is unreachable, the rotator degrades to an empty node list and every
    rotate() becomes a no-op.
    """

    def __init__(self, controller: str, secret: str):
        self.controller = controller.rstrip("/")
        self.secret = secret.strip()
        self.nodes: List[str] = []
        self.index = 0

    def _api(self, path: str, method: str = "GET", payload: Optional[Dict] = None) -> Dict:
        """Issue one request to the Clash controller and decode the JSON reply."""
        headers: Dict[str, str] = {}
        if self.secret:
            headers["Authorization"] = f"Bearer {self.secret}"
        if payload is None:
            body = None
        else:
            headers["Content-Type"] = "application/json"
            body = json.dumps(payload).encode("utf-8")
        request = urllib.request.Request(
            f"{self.controller}{path}",
            data=body,
            headers=headers,
            method=method,
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            text = response.read().decode("utf-8", errors="ignore")
        if not text:
            return {}
        return json.loads(text)

    def initialize(self) -> None:
        """Switch Clash to global mode and collect the selectable leaf nodes."""
        if not self.secret:
            return
        try:
            self._api("/configs", method="PATCH", payload={"mode": "global"})
            all_proxies = self._api("/proxies").get("proxies", {}) or {}
            # Built-in policies and group entries that must never be selected.
            skip = {
                "GLOBAL",
                "DIRECT",
                "REJECT",
                "REJECT-DROP",
                "PASS",
                "COMPATIBLE",
                "🔰 选择节点",
                "☁️ OneDrive",
                "🐟 漏网之鱼",
                "🎯 全球直连",
                "🛑 拦截广告",
                "🌍 爱奇艺&哔哩哔哩",
                "🎮 Steam 登录/下载",
                "🎮 Steam 商店/社区",
                "🌩️ Cloudflare",
                "🎬 动画疯",
                "🎓学术网站",
                "🇨🇳 国内网站",
            }
            group_types = {"Selector", "URLTest", "Fallback", "LoadBalance"}
            collected: List[str] = []
            for node_name, info in all_proxies.items():
                if node_name in skip or not isinstance(info, dict):
                    continue
                if info.get("type") in group_types:
                    continue
                collected.append(node_name)
            self.nodes = collected
            if self.nodes:
                self.switch_to(self.nodes[0])
        except Exception as exc:
            # Rotation is optional; fall back to whatever route is active.
            print(f"[proxy] rotator init failed: {exc}")
            self.nodes = []

    def switch_to(self, node_name: str) -> None:
        """Point the GLOBAL selector at *node_name*."""
        self._api("/proxies/GLOBAL", method="PUT", payload={"name": node_name})

    def rotate(self) -> None:
        """Advance to the next node in the list; no-op when no nodes are known."""
        if not self.nodes:
            return
        self.index = (self.index + 1) % len(self.nodes)
        self.switch_to(self.nodes[self.index])
|
||||
|
||||
def normalize_phone(value: str) -> str:
    """Strip *value* to digits and return the first CN mobile number, or ""."""
    digits_only = "".join(filter(str.isdigit, str(value or "")))
    found = PHONE_RE.search(digits_only)
    if found is None:
        return ""
    return found.group(0)
||||
|
||||
|
||||
def fetch_html(
    url: str,
    rotator: Optional[ProxyRotator] = None,
    max_retries: int = 6,
    timeout_seconds: int = 18,
) -> str:
    """Fetch *url* through the local SOCKS proxy using curl, with retries.

    The proxy node is rotated between attempts when *rotator* has nodes.
    Returns the response body text; raises RuntimeError with the last
    failure reason after *max_retries* attempts.
    """
    last_error = ""
    # curl appends "\n__CODE__:<status>" so we can split body from HTTP code.
    marker = "\n__CODE__:"
    for attempt in range(max_retries):
        cmd = [
            "curl",
            "-sS",
            "--socks5-hostname",
            SOCKS_PROXY,
            "-L",
            "--compressed",
            "--max-time",
            str(timeout_seconds),
            "-w",
            "\n__CODE__:%{http_code}",
            url,
        ]
        proc = subprocess.run(cmd, capture_output=True)
        if proc.returncode == 0:
            raw = proc.stdout.decode("utf-8", errors="ignore")
            split_at = raw.rfind(marker)
            if split_at != -1:
                text = raw[:split_at]
                code_text = raw[split_at + len(marker):].strip()
            else:
                text = raw
                code_text = ""
            # Trust the trailing status code when present; otherwise fall back
            # to "non-empty body means success".
            code_ok = code_text == "200" if code_text else bool(text)
            if text and code_ok:
                return text
            # Bug fix: previously a non-200 response with a body was reported
            # as "empty body"; report the actual HTTP status instead.
            if code_text and code_text != "200":
                last_error = f"http_code={code_text}"
            else:
                last_error = "empty body"
        else:
            last_error = proc.stderr.decode("utf-8", errors="ignore").strip() or f"exit={proc.returncode}"
        if rotator and rotator.nodes:
            try:
                rotator.rotate()
            except Exception as exc:
                last_error = f"{last_error}; rotate failed: {exc}"
        if attempt < max_retries - 1:
            time.sleep(0.6 * (attempt + 1))  # linear backoff between retries
    raise RuntimeError(f"fetch failed: {url}, reason={last_error}")
|
||||
|
||||
|
||||
def parse_initial_state(html: str) -> Dict:
    """Extract and decode the embedded window.__INITIAL_STATE__ JSON blob.

    Raises ValueError when the page carries no such assignment.
    """
    found = INITIAL_STATE_RE.search(html)
    if found is None:
        raise ValueError("window.__INITIAL_STATE__ not found")
    return json.loads(found.group(1))
|
||||
|
||||
|
||||
def extract_group_urls_from_group80(state: Dict) -> List[str]:
    """Collect the sorted, deduplicated province/city page URLs under group 80.

    Reads module "21" of the page state; both the province row itself and
    every entry of its thirdGroupList contribute a URL.
    """
    module_map = state.get("currentPageModuleIdMap") or {}
    module = module_map.get("21") or {}
    ext_info = module.get("extInfo", {}) or {}
    rows = (ext_info.get("secondGroupMap", {}) or {}).get("80") or []

    collected: Set[str] = set()
    for province_row in rows:
        candidates = [province_row] + list(province_row.get("thirdGroupList") or [])
        for entry in candidates:
            candidate = str(entry.get("url") or "").strip()
            if candidate:
                collected.add(candidate)
    return sorted(collected)
|
||||
|
||||
|
||||
def extract_detail_urls_from_group_html(html: str) -> Set[str]:
    """Turn every h-pd-<id>.html reference in *html* into an absolute detail URL."""
    return {
        f"http://m.zhongfali.com/h-pd-{pid}.html"
        for pid in re.findall(r"h-pd-(\d+)\.html", html)
    }
|
||||
|
||||
|
||||
def parse_location_and_name(product_name: str) -> Tuple[str, str, str]:
    """Split a product title into (province, city, lawyer name).

    Each component is "" when the corresponding pattern is absent.
    """
    text = re.sub(r"\s+", " ", str(product_name or "")).strip()

    def first_group(pattern: str) -> str:
        # First capture of the leftmost match, or "" when not found.
        found = re.search(pattern, text)
        return found.group(1) if found else ""

    province = first_group(r"([\u4e00-\u9fa5]{2,}省)")
    # Skip an optional leading "...省" so the city capture starts after it.
    city = first_group(r"(?:[\u4e00-\u9fa5]+省)?([\u4e00-\u9fa5]+(?:市|区|县|州|盟))")
    # Chinese personal names are 2-4 characters, directly before "律师".
    name = first_group(r"([\u4e00-\u9fa5]{2,4})\s*律师")
    return province, city, name
|
||||
|
||||
|
||||
def parse_detail_record(detail_url: str, html: str, source_list_url: str) -> Optional[Dict]:
    """Parse one lawyer detail page into a normalized record dict.

    Returns None when the page carries no productInfo module or no valid
    mainland mobile number.  Raises ValueError (via parse_initial_state)
    when the page lacks the embedded state blob entirely.
    """
    state = parse_initial_state(html)

    # Module ids vary per page: pick the first module carrying productInfo.
    module = None
    for mod in (state.get("currentPageModuleIdMap") or {}).values():
        if isinstance(mod, dict) and (mod.get("extInfo") or {}).get("productInfo"):
            module = mod
            break
    if not module:
        return None

    ext_info = module.get("extInfo", {}) or {}
    product_info = ext_info.get("productInfo", {}) or {}

    # The phone number lives in "material"; records without one are useless.
    phone = normalize_phone(product_info.get("material", ""))
    if not phone:
        return None

    product_name = str(product_info.get("name") or "").strip()
    province, city, lawyer_name = parse_location_and_name(product_name)
    law_firm = str(product_info.get("prop0") or "").strip()

    # Fall back to the full title when no short personal name was extracted.
    if not lawyer_name:
        lawyer_name = product_name

    now = int(time.time())
    # Stable id derived from the URL so re-runs produce the same record_id.
    record_id = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
    return {
        "record_id": record_id,
        "collected_at": now,
        "source": {
            "site": SITE_NAME,
            "list_url": source_list_url,
            "detail_url": detail_url,
            "province": province,
            "province_py": "",
            "city": city,
            "city_py": "",
            "page": 1,
        },
        "list_snapshot": {
            "name": lawyer_name,
            "law_firm": law_firm,
            "specialties": [],
            "answer_count": None,
        },
        "profile": {
            "name": lawyer_name,
            "law_firm": law_firm,
            "phone": phone,
            # NOTE(review): prop1/prop3 appear to hold license number and
            # address in this site's product schema — confirm on more pages.
            "license_no": str(product_info.get("prop1") or "").strip(),
            "practice_years": None,
            "email": "",
            "address": str(product_info.get("prop3") or "").strip(),
            "specialties": [],
        },
        "raw": {
            "product_name": product_name,
            "group_ids": product_info.get("groupIdList") or [],
        },
    }
|
||||
|
||||
|
||||
def to_legacy_row(record: Dict) -> Optional[Dict[str, str]]:
    """Flatten a collected record into a legacy `lawyer` table row.

    Returns None when the record carries no valid phone number.  The full
    record is preserved as JSON in the "params" column.
    """
    profile = record.get("profile", {}) or {}
    source = record.get("source", {}) or {}
    phone = normalize_phone(profile.get("phone", ""))
    if not phone:
        return None

    def clean(value) -> str:
        return str(value or "").strip()

    province = clean(source.get("province"))
    # City falls back to the province when the detail page gave none.
    city = clean(source.get("city") or province)
    return {
        "name": clean(profile.get("name")),
        "law_firm": clean(profile.get("law_firm")),
        "province": province,
        "city": city,
        "phone": phone,
        "url": clean(source.get("detail_url")),
        "domain": LEGACY_DOMAIN,
        "create_time": int(record.get("collected_at") or time.time()),
        "params": json.dumps(record, ensure_ascii=False),
    }
|
||||
|
||||
|
||||
def delete_old_domain_data(db: Db, domain: str) -> int:
    """Delete previously imported rows for *domain* from the lawyer table.

    Commits on success and always closes the cursor; returns the number of
    deleted rows.
    """
    cursor = db.db.cursor()
    try:
        cursor.execute("DELETE FROM lawyer WHERE domain=%s", (domain,))
        removed = cursor.rowcount
        db.db.commit()
    finally:
        cursor.close()
    return removed
|
||||
|
||||
|
||||
def write_records_to_db(db: Db, records: List[Dict]) -> int:
    """Insert every convertible record into the lawyer table.

    Records without a valid phone are skipped; individual insert failures
    are logged and do not abort the batch.  Returns the insert count.
    """
    inserted = 0
    for record in records:
        legacy_row = to_legacy_row(record)
        if legacy_row is None:
            continue
        try:
            db.insert_data("lawyer", legacy_row)
        except Exception as exc:
            # Best-effort batch: report the failing phone and keep going.
            print(f"[db] insert failed phone={legacy_row.get('phone', '')}: {exc}")
        else:
            inserted += 1
    return inserted
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line options of this collector."""
    cli = argparse.ArgumentParser(description="众法利 groupId=80 基础字段采集(姓名/手机号/地区)")
    cli.add_argument("--start-url", default=START_URL, help="入口分组页 URL")
    cli.add_argument("--output", default=DEFAULT_OUTPUT, help="JSONL 输出路径")
    cli.add_argument("--no-db", action="store_true", help="只写 JSON,不写 DB")
    cli.add_argument("--no-reset", action="store_true", help="不清理 domain 旧数据")
    cli.add_argument("--workers", type=int, default=16, help="详情页并发数")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> None:
    """Crawl group-80 list pages, then detail pages concurrently; dedupe by
    phone number; write a JSONL snapshot and optionally replace the DB rows."""
    args = parse_args()
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    # Best-effort node rotation through the local Clash controller.
    rotator = ProxyRotator(CLASH_CONTROLLER, CLASH_SECRET)
    rotator.initialize()
    if rotator.nodes:
        print(f"[proxy] rotator enabled, nodes={len(rotator.nodes)}")
    else:
        print("[proxy] rotator disabled, using current proxy route")

    # Give the entry page at least one attempt per node (plus slack).
    start_retries = max(8, len(rotator.nodes) + 2) if rotator.nodes else 8
    group_html = fetch_html(args.start_url, rotator=rotator, max_retries=start_retries)
    group_state = parse_initial_state(group_html)
    group_urls = extract_group_urls_from_group80(group_state)
    print(f"[group] found group urls: {len(group_urls)}")

    # Map each detail URL to the first list page it was discovered on.
    detail_url_to_source: Dict[str, str] = {}
    for idx, rel_url in enumerate(group_urls, start=1):
        list_url = f"http://m.zhongfali.com/{rel_url.lstrip('/')}"
        try:
            html = fetch_html(list_url, rotator=rotator, max_retries=4, timeout_seconds=12)
            detail_urls = extract_detail_urls_from_group_html(html)
        except Exception as exc:
            # A failed list page loses only its own links; keep crawling.
            print(f"[group] failed {list_url}: {exc}")
            continue

        for detail_url in detail_urls:
            detail_url_to_source.setdefault(detail_url, list_url)
        if idx % 10 == 0:
            print(f"[group] {idx}/{len(group_urls)} detail_urls={len(detail_url_to_source)}")

    records: List[Dict] = []
    seen_phones: Set[str] = set()
    detail_urls = sorted(detail_url_to_source.keys())
    print(f"[detail] total detail urls: {len(detail_urls)}")

    def process_detail(detail_url: str) -> Optional[Dict]:
        # Worker: fetch and parse one detail page; None on any failure.
        try:
            html = fetch_html(detail_url, rotator=rotator, max_retries=2, timeout_seconds=8)
            record = parse_detail_record(detail_url, html, detail_url_to_source[detail_url])
            return record
        except Exception as exc:
            print(f"[detail] failed {detail_url}: {exc}")
            return None

    done = 0
    with ThreadPoolExecutor(max_workers=max(1, int(args.workers))) as executor:
        futures = [executor.submit(process_detail, detail_url) for detail_url in detail_urls]
        for future in as_completed(futures):
            done += 1
            record = future.result()
            if record:
                # Deduplicate across the whole run by normalized phone number.
                phone = normalize_phone((record.get("profile", {}) or {}).get("phone", ""))
                if phone and phone not in seen_phones:
                    seen_phones.add(phone)
                    records.append(record)
            if done % 50 == 0:
                print(f"[detail] {done}/{len(detail_urls)} valid_records={len(records)}")

    # Always (re)write the JSONL snapshot, one record per line.
    with open(args.output, "w", encoding="utf-8") as out:
        for record in records:
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    deleted = 0
    inserted = 0
    if not args.no_db:
        with Db() as db:
            if not args.no_reset:
                # Replace-style import: drop this domain's old rows first.
                deleted = delete_old_domain_data(db, LEGACY_DOMAIN)
            inserted = write_records_to_db(db, records)

    print(
        f"[done] records={len(records)}, db_deleted={deleted}, db_inserted={inserted}, output={args.output}"
    )


if __name__ == "__main__":
    main()
|
||||
@@ -191,7 +191,7 @@ def extract_records(url: str, state: Dict) -> List[Dict]:
|
||||
"source": {
|
||||
"site": SITE_NAME,
|
||||
"list_url": url,
|
||||
"detail_url": url,
|
||||
"detail_url": "",
|
||||
"province": province,
|
||||
"province_py": "",
|
||||
"city": area or city,
|
||||
@@ -199,6 +199,7 @@ def extract_records(url: str, state: Dict) -> List[Dict]:
|
||||
"page": 1,
|
||||
"group_id": group_id,
|
||||
"module_id": module_id,
|
||||
"detail_url_status": "unresolved_from_pool",
|
||||
},
|
||||
"list_snapshot": {
|
||||
"name": "",
|
||||
@@ -347,12 +348,68 @@ def write_records_to_db(db: Db, records: List[Dict]) -> Tuple[int, int]:
|
||||
return inserted, skipped
|
||||
|
||||
|
||||
def lookup_name_map_from_db(db: Db, phones: List[str]) -> Dict[str, str]:
    """Map each phone to its most recently created non-empty name in `lawyer`.

    Phones are deduplicated and queried in bounded IN(...) batches; rows come
    back newest-first, so the first name seen per phone wins.
    """
    wanted = sorted({phone for phone in phones if phone})
    if not wanted:
        return {}

    mapping: Dict[str, str] = {}
    cursor = db.db.cursor()
    try:
        batch = 500  # keep the IN(...) clause a manageable size
        for start in range(0, len(wanted), batch):
            subset = wanted[start:start + batch]
            marks = ",".join(["%s"] * len(subset))
            query = (
                "SELECT phone, name, create_time FROM lawyer "
                f"WHERE phone IN ({marks}) AND name<>'' "
                "ORDER BY create_time DESC"
            )
            cursor.execute(query, subset)
            for phone, name, _ in cursor.fetchall():
                # Newest row wins: keep the first occurrence per phone.
                if name and phone not in mapping:
                    mapping[phone] = str(name).strip()
    finally:
        cursor.close()
    return mapping
|
||||
|
||||
|
||||
def apply_name_backfill(records: List[Dict], name_map: Dict[str, str]) -> int:
    """Fill empty lawyer names in *records* from *name_map* (phone -> name).

    Never overwrites a name the page itself provided; mutates records in
    place and returns the number of records updated.
    """
    if not name_map:
        return 0

    updated = 0
    for record in records:
        profile = record.get("profile", {}) or {}
        snapshot = record.get("list_snapshot", {}) or {}
        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            continue

        replacement = name_map.get(phone, "")
        if not replacement:
            continue

        if str(profile.get("name") or "").strip():
            continue  # an existing name takes precedence over the backfill

        profile["name"] = replacement
        snapshot["name"] = replacement
        record["profile"] = profile
        record["list_snapshot"] = snapshot
        updated += 1

    return updated
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line options of the single-page collector."""
    cli = argparse.ArgumentParser(description="众法利单页律师电话采集")
    cli.add_argument("--url", default=DEFAULT_URL, help="详情页 URL")
    cli.add_argument("--output", default=DEFAULT_OUTPUT, help="输出 jsonl 文件路径")
    cli.add_argument("--direct", action="store_true", help="直连模式,不使用代理")
    cli.add_argument("--no-db", action="store_true", help="仅输出 JSON,不写入数据库")
    cli.add_argument("--skip-name-backfill", action="store_true", help="跳过按手机号回填姓名")
    return cli.parse_args()
|
||||
|
||||
|
||||
@@ -418,12 +475,24 @@ def main() -> None:
|
||||
|
||||
db_new = 0
|
||||
db_skip = 0
|
||||
name_backfill_count = 0
|
||||
if not args.skip_name_backfill:
|
||||
try:
|
||||
with Db() as db:
|
||||
name_map = lookup_name_map_from_db(
|
||||
db,
|
||||
[normalize_phone((record.get("profile", {}) or {}).get("phone", "")) for record in records],
|
||||
)
|
||||
name_backfill_count = apply_name_backfill(records, name_map)
|
||||
except Exception as exc:
|
||||
print(f"[name-backfill] 跳过,查询失败: {exc}")
|
||||
|
||||
if not args.no_db:
|
||||
with Db() as db:
|
||||
db_new, db_skip = write_records_to_db(db, records)
|
||||
|
||||
print(
|
||||
f"[done] 采集{len(records)}条, JSON新增{json_new}条, "
|
||||
f"[done] 采集{len(records)}条, 姓名回填{name_backfill_count}条, JSON新增{json_new}条, "
|
||||
f"DB新增{db_new}条, DB跳过{db_skip}条, 输出: {args.output}"
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user