重构采集脚本并新增按时间导出Excel

- 统一五个站点采集逻辑与启动脚本
- 新增 dls_fresh 采集流程与日志优化
- 新增 export_lawyers_excel 按时间条件导出
- 默认导出近7天并支持扩展字段解析
- 整理 .gitignore，忽略 data/logs 本地产物
2026-03-02 11:46:05 +08:00
parent 03847a4b8e
commit 19cf9ce901
12 changed files with 3545 additions and 1059 deletions
@@ -1,9 +1,16 @@
+import argparse
+import ast
+import hashlib
 import json
 import os
+import random
+import re
 import sys
 import time
-import random
-from typing import Dict, List, Set, Optional
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+
+import urllib3

 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -13,197 +20,460 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
    sys.path.append(project_root)

-from request.requests_client import RequestClientError, RequestSSLError, RequestsClient
 from Db import Db
+from request.requests_client import RequestClientError, RequestsClient
+from utils.rate_limiter import wait_for_request

-DOMAIN = "找法网"
-LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
+# Requests in this module are sent with verify=False, so silence the
+# InsecureRequestWarning urllib3 would otherwise emit for each of them.
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# Site identifier stored under source["site"] in every emitted record.
+SITE_NAME = "findlaw"
+# Value used for the `domain` column when reading/writing the `lawyer` table.
+LEGACY_DOMAIN = "找法网"
+# Base URL of the mobile site; also the default Referer for requests.
+SITE_BASE = "https://m.findlaw.cn"
+# JavaScript file from which the province and city arrays are parsed.
+CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
+# Paginated per-city list endpoint; formatted with the city pinyin and page number.
+LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
+
+# Eleven digits: a leading 1, a second digit in 3-9, then nine more digits.
+PHONE_RE = re.compile(r"1[3-9]\d{9}")


-class FindlawSpider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    """One city to crawl, together with the province it belongs to.
+
+    Instances are built by ``FindlawCrawler.discover_cities`` from the
+    province and city arrays found in the site's area-data script.
+    """
+
+    # Province id, taken from the city's ``parentId`` field.
+    province_id: str
+    # Province display name; the city name is used when no province row matches.
+    province_name: str
+    # Province pinyin; the city pinyin is used when no province row matches.
+    province_py: str
+    # City id, taken from the city's ``id`` field.
+    city_id: str
+    # City display name, taken from the city's ``value`` field.
+    city_name: str
+    # City pinyin; this is the path segment used in the list URL.
+    city_py: str
+
+def normalize_phone(text: str) -> str:
+    """Return the first mainland-China mobile number found in *text*.
+
+    All non-digit characters are removed first, so separators such as
+    ``-`` or spaces inside the number do not prevent a match.
+
+    Args:
+        text: Raw phone value. Normally a string, but any value is accepted;
+            falsy values (``None``, ``""``, ``0``) are treated as empty.
+
+    Returns:
+        The 11-digit number matched by ``PHONE_RE``, or ``""`` when there is
+        none.
+    """
+    # The value comes straight from a remote JSON payload and may be numeric;
+    # re.sub() raises TypeError on non-string input, so coerce it first.
+    compact = re.sub(r"\D", "", str(text or ""))
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
+class FindlawCrawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.1,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        """Configure the crawler and build its HTTP client.
+
+        Args:
+            max_pages: Upper bound on the number of list pages requested per
+                city.
+            sleep_seconds: Pause applied after each yielded record; negative
+                values are clamped to 0.
+            use_proxy: Forwarded unchanged to ``RequestsClient``.
+            db_connection: Stored as ``self.db``. When it is falsy the
+                database lookups and inserts are skipped.
+        """
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.cities = self._load_cities()
+        # The client sends mobile-Safari, XHR-style headers with every request.
+        self.client = RequestsClient(
+            headers={
+                "User-Agent": (
+                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+                    "Mobile/15E148 Safari/604.1"
+                ),
+                "Accept": "application/json, text/javascript, */*; q=0.01",
+                "X-Requested-With": "XMLHttpRequest",
+                "Connection": "close",
+            },
+            use_proxy=use_proxy,
+            # NOTE(review): these retry_* options presumably configure
+            # transport-level retries inside RequestsClient, in addition to the
+            # retry loop in _get_text -- confirm against the client.
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET",),
+        )

-    def _build_session(self) -> RequestsClient:
-        return RequestsClient(headers={
-            "User-Agent": (
-                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
-                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
-                "Mobile/15E148 Safari/604.1"
-            ),
-            "Accept": "application/json, text/javascript, */*; q=0.01",
-            "X-Requested-With": "XMLHttpRequest",
-            "Connection": "close",
-        })
-
-    def _refresh_session(self) -> None:
-        self.client.refresh()
-
-    def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
+    def _get_text(
+        self,
+        url: str,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
+        """GET *url* and return the response body, retrying on failure.
+
+        Args:
+            url: Address to fetch.
+            timeout: Timeout passed to the client for each attempt.
+            max_retries: Total number of attempts.
+            referer: Value sent in the ``Referer`` header.
+
+        Returns:
+            The response text of the first attempt whose status is below 400.
+
+        Raises:
+            RequestClientError: the final attempt returned a status >= 400, or
+                ``max_retries`` is not positive.
+            Exception: any other error raised by the final attempt.
+        """
         headers = {"Referer": referer}
-        for attempt in range(max_retries):
-            try:
-                resp = self.client.get_text(url, timeout=15, verify=verify, headers=headers)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
-                        self._refresh_session()
-                        time.sleep(wait_time)
-                        continue
-                    print(f"请求失败 {url}: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error: {url}")
-                return text
-            except RequestSSLError:
-                if verify:
-                    return self._get(url, referer, verify=False, max_retries=max_retries)
-                print(f"SSL错误 {url}")
-                return None
-            except RequestClientError as exc:
-                print(f"请求失败 {url}: {exc}")
-                return None
-        return None
+        last_error: Optional[Exception] = None

-    def _existing_phones(self, phones: List[str]) -> Set[str]:
-        if not phones:
+        for attempt in range(max_retries):
+            # NOTE(review): presumably blocks until the shared rate limiter
+            # allows another request -- confirm in utils.rate_limiter.
+            wait_for_request()
+            try:
+                resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
+                code = resp.status_code
+                if code == 403:
+                    if attempt < max_retries - 1:
+                        # Refresh the client, then back off exponentially with jitter.
+                        self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                        continue
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            # NOTE(review): this handler also catches the RequestClientError
+            # raised above, so a non-403 4xx response is retried with backoff
+            # until the last attempt as well.
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise
+
+        # Every iteration returns, continues, or raises on the last attempt, so
+        # this point is reached only when the loop body never ran
+        # (max_retries <= 0).
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")
+
+    def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
+        """Extract the array literal assigned to ``var <var_name>`` in a script.
+
+        The text between ``var <var_name> =`` and the first following ``];`` is
+        evaluated as a Python literal.
+
+        Returns:
+            The parsed list, or ``[]`` when the variable is not found, the
+            literal cannot be evaluated, or it does not evaluate to a list.
+        """
+        found = re.search(
+            rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);",
+            script_text,
+        )
+        if found is None:
+            return []
+
+        try:
+            parsed = ast.literal_eval(found.group(1))
+        except Exception:
+            return []
+
+        if isinstance(parsed, list):
+            return parsed
+        return []
+
+    def discover_cities(self) -> List[CityTarget]:
+        """Download the site's area data and build the list of cities to crawl.
+
+        Fetches ``CITY_DATA_URL``, parses the ``iosProvinces`` and ``iosCitys``
+        arrays out of it, and joins each city to its province through the
+        city's ``parentId``. Cities missing a pinyin, name or id are dropped,
+        as are repeats of an already-seen pinyin. When no province row
+        matches, the city's own name and pinyin stand in for the province
+        fields.
+
+        Returns:
+            Crawl targets in the order the cities appear in the data; an empty
+            list when the city array could not be parsed.
+
+        Raises:
+            Exception: whatever ``_get_text`` raises when the download fails.
+        """
+        js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
+        provinces = self._parse_city_js_array(js_text, "iosProvinces")
+        cities = self._parse_city_js_array(js_text, "iosCitys")
+
+        province_map: Dict[str, Dict] = {}
+        for item in provinces:
+            # The arrays are evaluated from remote text and may hold any
+            # literal; ignore entries that are not objects.
+            if not isinstance(item, dict):
+                continue
+            pid = str(item.get("id") or "").strip()
+            if pid:
+                province_map[pid] = item
+
+        results: List[CityTarget] = []
+        seen_py: Set[str] = set()
+        for city in cities:
+            if not isinstance(city, dict):
+                continue
+            city_py = str(city.get("pinyin") or "").strip()
+            city_name = str(city.get("value") or "").strip()
+            city_id = str(city.get("id") or "").strip()
+            province_id = str(city.get("parentId") or "").strip()
+            if not city_py or not city_name or not city_id:
+                continue
+            # The list URL is keyed by pinyin only, so a repeated pinyin would
+            # crawl the same pages twice.
+            if city_py in seen_py:
+                continue
+            seen_py.add(city_py)
+
+            province_row = province_map.get(province_id, {})
+            province_name = str(province_row.get("value") or city_name).strip()
+            province_py = str(province_row.get("pinyin") or city_py).strip()
+
+            results.append(
+                CityTarget(
+                    province_id=province_id,
+                    province_name=province_name,
+                    province_py=province_py,
+                    city_id=city_id,
+                    city_name=city_name,
+                    city_py=city_py,
+                )
+            )
+        return results
+
+    def _parse_list_payload(self, text: str) -> Dict:
+        """Decode the body of a list-page response into a dict.
+
+        Tolerates a leading BOM, non-JSON text wrapped around the outermost
+        ``{...}``, and a double-encoded body (a JSON string whose content is
+        itself the JSON document).
+
+        Args:
+            text: Raw response body; ``None`` is treated as empty.
+
+        Returns:
+            The decoded object, or ``{}`` when the body is not valid JSON and
+            contains no ``{``/``}`` pair to fall back on.
+
+        Raises:
+            ValueError: the body (or its embedded object) is not valid JSON,
+                or it decodes to something other than an object.
+        """
+        cleaned = (text or "").strip().lstrip("\ufeff")
+        try:
+            payload = json.loads(cleaned)
+        except ValueError:
+            start = cleaned.find("{")
+            end = cleaned.rfind("}")
+            if start == -1 or end == -1:
+                return {}
+            payload = json.loads(cleaned[start:end + 1])
+
+        # A JSON string that wraps the real document needs a second decode.
+        if isinstance(payload, str):
+            payload = json.loads(payload)
+        # Callers use .get() on the result; fail with a clear message instead
+        # of an AttributeError further down the line.
+        if not isinstance(payload, dict):
+            raise ValueError(f"unexpected payload type: {type(payload).__name__}")
+        return payload
+
+    def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
+        """Request one list page for a city and unpack its payload.
+
+        Returns:
+            A ``(items, has_more, list_url)`` tuple. ``items`` is the
+            ``lawyer_list`` of the payload and ``has_more`` is true when the
+            payload's ``has_more`` field stringifies to ``"1"``. When the
+            payload's ``errcode`` is not 0 the result is
+            ``([], False, list_url)``.
+        """
+        list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
+        body = self._get_text(list_url, referer=f"{SITE_BASE}/{city_py}/q_lawyer/")
+        payload = self._parse_list_payload(body)
+
+        if payload.get("errcode") == 0:
+            data = payload.get("data", {}) or {}
+            lawyers = data.get("lawyer_list", []) or []
+            more = str(data.get("has_more", "0")) == "1"
+            return lawyers, more, list_url
+
+        return [], False, list_url
+
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        """Yield one record per lawyer listed for *target*, page by page.
+
+        Paging stops at ``self.max_pages``, at the first page that fails to
+        load (the error is printed), at the first empty page, or when the
+        payload reports no further pages.
+
+        Each record carries a ``record_id`` that is the MD5 of, in order of
+        preference: the lawyer's ``uid``; the lawyer's own detail URL; or a
+        composite of site, city, name, law firm and phone.
+
+        Args:
+            target: City whose list pages are crawled.
+
+        Yields:
+            Dicts with the keys ``record_id``, ``collected_at``, ``source``,
+            ``list_snapshot``, ``profile`` and ``raw``.
+        """
+        for page in range(1, self.max_pages + 1):
+            try:
+                items, has_more, list_url = self.fetch_list_page(target.city_py, page)
+            except Exception as exc:
+                print(f"[list] 失败 {target.city_py} p{page}: {exc}")
+                break
+
+            if not items:
+                break
+
+            for item in items:
+                # Items come from a remote payload; an exception raised here
+                # would propagate out of the generator, so skip anything that
+                # is not an object.
+                if not isinstance(item, dict):
+                    continue
+
+                own_url = str(item.get("siteask_m") or item.get("site_url") or "").strip()
+                has_own_url = own_url.startswith("http")
+                # Without a usable URL of its own, the record points at the list page.
+                detail_url = own_url if has_own_url else list_url
+
+                phone = normalize_phone(item.get("mobile", ""))
+                profile = {
+                    "uid": str(item.get("uid") or ""),
+                    "name": str(item.get("username") or "").strip(),
+                    "law_firm": str(item.get("lawyer_lawroom") or "").strip(),
+                    "phone": phone,
+                    "lawyer_year": item.get("lawyer_year"),
+                    "service_area": str(item.get("service_area") or "").strip(),
+                    "address": str(item.get("addr") or "").strip(),
+                    "specialties": item.get("professionArr") or [],
+                    "answer_count": item.get("ansnum"),
+                    "comment_count": item.get("askcommentnum"),
+                }
+
+                now = int(time.time())
+                uid = profile["uid"]
+                if uid:
+                    record_key = uid
+                elif has_own_url:
+                    record_key = detail_url
+                else:
+                    # The list URL is shared by every item on the page, so
+                    # hashing it would give all such items the same id and
+                    # dedup would keep only one of them. Build the key from
+                    # the item's own fields instead.
+                    record_key = "|".join(
+                        (SITE_NAME, target.city_py, profile["name"], profile["law_firm"], phone)
+                    )
+                record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()
+
+                area = item.get("areaInfo", {}) or {}
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "list_url": list_url,
+                        "detail_url": detail_url,
+                        "province": str(area.get("province") or target.province_name),
+                        "province_py": target.province_py,
+                        "city": str(area.get("city") or target.city_name),
+                        "city_py": target.city_py,
+                        "page": page,
+                    },
+                    "list_snapshot": {
+                        "uid": uid,
+                        "name": profile["name"],
+                        "law_firm": profile["law_firm"],
+                        "answer_count": profile["answer_count"],
+                        "comment_count": profile["comment_count"],
+                    },
+                    "profile": profile,
+                    "raw": item,
+                }
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)
+
+            if not has_more:
+                break
+
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        """Flatten a crawl record into a row for the ``lawyer`` table.
+
+        Returns:
+            The row dict, or ``None`` when the record has no valid phone
+            number. The whole record is stored as JSON under ``params``.
+        """
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+
+        phone = normalize_phone(profile.get("phone", ""))
+        if phone == "":
+            return None
+
+        province = (source.get("province") or "").strip()
+        # A missing city falls back to the province name.
+        city = (source.get("city") or province).strip()
+
+        row = {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
+        }
+        return row
+
+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        """Return the subset of *phones* already stored for this site.
+
+        Looks the numbers up in the ``lawyer`` table, restricted to rows whose
+        ``domain`` equals ``LEGACY_DOMAIN``, in chunks of 500.
+
+        Returns:
+            The phones found in the table; an empty set when there is no
+            database connection or no non-empty phone to look up.
+        """
+        if not self.db or not phones:
             return set()
+        # Drop empty values and duplicates before building the IN (...) lists.
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
         existing: Set[str] = set()
         cur = self.db.db.cursor()
         try:
             chunk_size = 500
-            for i in range(0, len(phones), chunk_size):
-                chunk = phones[i:i + chunk_size]
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
                 placeholders = ",".join(["%s"] * len(chunk))
                 sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
-                cur.execute(sql, [DOMAIN, *chunk])
+                # Only the placeholder list is interpolated into the SQL text;
+                # the values themselves are passed as query parameters.
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                 for row in cur.fetchall():
                     existing.add(row[0])
         finally:
             cur.close()
         return existing

-    def _load_cities(self):
-        condition = "domain='findlaw' AND level=2"
-        tables = ("area_new", "area2", "area")
-        last_error = None
-        for table in tables:
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        """Insert the records whose phone is not yet in the ``lawyer`` table.
+
+        Records without a valid phone are dropped before counting, so they
+        appear in neither number of the result.
+
+        Returns:
+            ``(inserted, skipped)``. ``skipped`` counts rows whose phone was
+            already present (in the table or earlier in this batch) and rows
+            whose insert raised. ``(0, 0)`` when there is no database
+            connection or no row to write.
+        """
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
             try:
-                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
+                self.db.insert_data("lawyer", row)
+                # Remember the phone so a repeat later in this batch is skipped.
+                existing.add(phone)
+                inserted += 1
             except Exception as exc:
-                last_error = exc
-                continue
-            if rows:
-                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
-                print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
-                return rows
+                # A failed insert is reported and counted, not re-raised.
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+        return inserted, skipped

-        if last_error:
-            print(f"[找法网] 加载地区数据失败: {last_error}")
-        print("[找法网] 无城市数据（已尝试 area_new/area2/area）")
-        for table in tables:
-            try:
-                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
-                c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
-                print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
-            except Exception:
-                pass
-        return []
+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        """Crawl the selected cities, appending records to a JSONL file and the database.
+
+        Args:
+            output_path: JSONL file to append to. Its parent directory is
+                created if needed. Record ids already present in the file are
+                not written again.
+            max_cities: When greater than 0, only the first ``max_cities``
+                cities (after filtering) are crawled.
+            city_filter: Case-insensitive substring matched against each
+                city's pinyin and name; a falsy value disables filtering.
+
+        Raises:
+            Exception: whatever ``discover_cities`` raises when the city list
+                cannot be downloaded.
+        """
+        cities = self.discover_cities()
+        print(f"[discover] 共发现城市 {len(cities)} 个")
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
+            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断城市数 {len(cities)}")

-    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
-        text = self._get(url, referer, verify=True)
-        if not text:
-            return []
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

-        try:
-            # 某些返回体前会携带 BOM 或包装脚本，此处做兼容
-            text = text.strip().lstrip("\ufeff")
-            try:
-                data = json.loads(text)
-            except ValueError:
-                json_start = text.find('{')
-                json_end = text.rfind('}')
-                if json_start == -1 or json_end == -1:
-                    print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
-                    return []
-                cleaned = text[json_start:json_end + 1]
-                data = json.loads(cleaned)
-            if isinstance(data, str):
-                try:
-                    data = json.loads(data)
-                except ValueError:
-                    print(f"解析JSON失败 {url}: 二次解析仍为字符串，开头: {str(data)[:80]!r}")
-                    return []
-        except ValueError as exc:
-            print(f"解析JSON失败 {url}: {exc}")
-            return []
-
-        items = data.get("data", {}).get("lawyer_list", [])
-        parsed = []
-        for item in items:
-            phone = (item.get("mobile") or "").replace("-", "")
-            parsed.append({
-                "name": item.get("username", ""),
-                "law_firm": item.get("lawyer_lawroom", ""),
-                "province": item.get("areaInfo", {}).get("province", ""),
-                "city": item.get("areaInfo", {}).get("city", ""),
-                "phone": phone,
-                "url": url,
-                "domain": DOMAIN,
-                "create_time": int(time.time()),
-                "params": json.dumps(item, ensure_ascii=False)
-            })
-        return parsed
-
-    def run(self):
-        print("启动找法网采集...")
-        if not self.cities:
-            print("无城市数据")
-            return
-
-        for city in self.cities:
-            pinyin = city.get("pinyin")
-            province = city.get("province", "")
-            city_name = city.get("city", "")
-            if not pinyin:
-                continue
-            print(f"采集 {province}-{city_name}")
-            page = 1
-            while True:
-                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
-                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
-                print(f"  第 {page} 页: {url}")
-                items = self._fetch_page(url, referer)
-                if not items:
-                    break
-
-                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
-                existing = self._existing_phones(phones)
-
-                for entry in items:
-                    phone = entry.get("phone")
-                    if not phone:
-                        continue
-                    if phone in existing:
-                        print(f"    -- 已存在: {entry['name']} ({phone})")
+        # Resume support: collect the record ids already present in the output
+        # file so they are not appended a second time. Blank and undecodable
+        # lines are ignored.
+        # NOTE(review): a line that decodes to a non-object JSON value would
+        # make item.get() raise, since that call sits outside the try below.
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
                         continue
                     try:
-                        self.db.insert_data("lawyer", entry)
-                        print(f"    -> 新增: {entry['name']} ({phone})")
-                    except Exception as exc:
-                        print(f"    插入失败: {exc}")
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+            print(f"[resume] 已有记录 {len(seen_ids)} 条")

-                page += 1
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0

-        print("找法网采集完成")
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"({target.city_py})"
+                )
+                # The whole city is collected in memory before anything is written.
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                # The database writer receives every record of the city, not
+                # only the ones new to the JSONL file; it deduplicates by phone.
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the crawler's command-line options from ``sys.argv``.
+
+    Returns:
+        A namespace with ``output``, ``max_cities``, ``max_pages``,
+        ``city_filter``, ``sleep``, ``direct`` and ``no_db``.
+    """
+    cli = argparse.ArgumentParser(description="找法网全新采集脚本（重写版）")
+    add = cli.add_argument
+
+    add("--output", default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl", help="输出 jsonl 文件路径")
+    add("--max-cities", type=int, default=0, help="最多采集多少个城市，0 表示不限")
+    add("--max-pages", type=int, default=9999, help="每个城市最多采集多少页")
+    add("--city-filter", default="", help="按城市拼音或城市名过滤，如 beijing")
+    add("--sleep", type=float, default=0.1, help="每条记录采集间隔秒数")
+    add("--direct", action="store_true", help="直连模式，不使用 proxy_settings.json 代理")
+    add("--no-db", action="store_true", help="只输出 JSONL，不写入数据库")
+
+    return cli.parse_args()
+
+
+def main():
+    """Run the crawler with the options given on the command line.
+
+    With ``--no-db`` the crawler runs without a database connection;
+    otherwise a ``Db`` context is opened for the duration of the crawl.
+    """
+    args = parse_args()
+
+    def run(db) -> None:
+        # Same construction and crawl call for both modes; only `db` differs.
+        crawler = FindlawCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=db,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
+
+    if args.no_db:
+        run(None)
+        return
+
+    with Db() as db:
+        run(db)


 if __name__ == "__main__":
-    with Db() as db:
-        spider = FindlawSpider(db)
-        spider.run()
+    main()