chore: initialize lawyers crawler project

2026-03-02 00:19:48 +08:00
commit 03847a4b8e
17 changed files with 1928 additions and 0 deletions
@@ -0,0 +1,209 @@
+import json
+import os
+import sys
+import time
+import random
+from typing import Dict, List, Set, Optional
+
+# Resolve the directory holding this file, its parent directory (used as the
+# project root) and the "request" directory underneath that root.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(current_dir)
+request_dir = os.path.join(project_root, "request")
+# Register both directories on sys.path exactly once: request_dir goes to the
+# front, project_root to the back. Presumably this is what lets the
+# project-local imports that follow resolve when the file is run as a script.
+if request_dir not in sys.path:
+    sys.path.insert(0, request_dir)
+if project_root not in sys.path:
+    sys.path.append(project_root)
+
+from request.requests_client import RequestClientError, RequestSSLError, RequestsClient
+from Db import Db
+
+# Source tag: written to the "domain" field of every scraped record and used
+# to scope the duplicate-phone lookup against the lawyer table.
+DOMAIN = "找法网"
+# Listing URL template; {pinyin} (city slug) and {page} (page number) are
+# filled in for each request.
+LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
+
+
+class FindlawSpider:
+    def __init__(self, db_connection):
+        self.db = db_connection
+        self.client = self._build_session()
+        self.cities = self._load_cities()
+
+    def _build_session(self) -> RequestsClient:
+        return RequestsClient(headers={
+            "User-Agent": (
+                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+                "Mobile/15E148 Safari/604.1"
+            ),
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "X-Requested-With": "XMLHttpRequest",
+            "Connection": "close",
+        })
+
+    def _refresh_session(self) -> None:
+        self.client.refresh()
+
+    def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
+        headers = {"Referer": referer}
+        for attempt in range(max_retries):
+            try:
+                resp = self.client.get_text(url, timeout=15, verify=verify, headers=headers)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt < max_retries - 1:
+                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                        print(f"403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                        self._refresh_session()
+                        time.sleep(wait_time)
+                        continue
+                    print(f"请求失败 {url}: 403 Forbidden")
+                    return None
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error: {url}")
+                return text
+            except RequestSSLError:
+                if verify:
+                    return self._get(url, referer, verify=False, max_retries=max_retries)
+                print(f"SSL错误 {url}")
+                return None
+            except RequestClientError as exc:
+                print(f"请求失败 {url}: {exc}")
+                return None
+        return None
+
+    def _existing_phones(self, phones: List[str]) -> Set[str]:
+        if not phones:
+            return set()
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            chunk_size = 500
+            for i in range(0, len(phones), chunk_size):
+                chunk = phones[i:i + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    existing.add(row[0])
+        finally:
+            cur.close()
+        return existing
+
+    def _load_cities(self):
+        condition = "domain='findlaw' AND level=2"
+        tables = ("area_new", "area2", "area")
+        last_error = None
+        for table in tables:
+            try:
+                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
+            except Exception as exc:
+                last_error = exc
+                continue
+            if rows:
+                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
+                print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
+                return rows
+
+        if last_error:
+            print(f"[找法网] 加载地区数据失败: {last_error}")
+        print("[找法网] 无城市数据（已尝试 area_new/area2/area）")
+        for table in tables:
+            try:
+                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
+                c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
+                print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
+            except Exception:
+                pass
+        return []
+
+    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
+        text = self._get(url, referer, verify=True)
+        if not text:
+            return []
+
+        try:
+            # 某些返回体前会携带 BOM 或包装脚本，此处做兼容
+            text = text.strip().lstrip("\ufeff")
+            try:
+                data = json.loads(text)
+            except ValueError:
+                json_start = text.find('{')
+                json_end = text.rfind('}')
+                if json_start == -1 or json_end == -1:
+                    print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
+                    return []
+                cleaned = text[json_start:json_end + 1]
+                data = json.loads(cleaned)
+            if isinstance(data, str):
+                try:
+                    data = json.loads(data)
+                except ValueError:
+                    print(f"解析JSON失败 {url}: 二次解析仍为字符串，开头: {str(data)[:80]!r}")
+                    return []
+        except ValueError as exc:
+            print(f"解析JSON失败 {url}: {exc}")
+            return []
+
+        items = data.get("data", {}).get("lawyer_list", [])
+        parsed = []
+        for item in items:
+            phone = (item.get("mobile") or "").replace("-", "")
+            parsed.append({
+                "name": item.get("username", ""),
+                "law_firm": item.get("lawyer_lawroom", ""),
+                "province": item.get("areaInfo", {}).get("province", ""),
+                "city": item.get("areaInfo", {}).get("city", ""),
+                "phone": phone,
+                "url": url,
+                "domain": DOMAIN,
+                "create_time": int(time.time()),
+                "params": json.dumps(item, ensure_ascii=False)
+            })
+        return parsed
+
+    def run(self):
+        print("启动找法网采集...")
+        if not self.cities:
+            print("无城市数据")
+            return
+
+        for city in self.cities:
+            pinyin = city.get("pinyin")
+            province = city.get("province", "")
+            city_name = city.get("city", "")
+            if not pinyin:
+                continue
+            print(f"采集 {province}-{city_name}")
+            page = 1
+            while True:
+                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
+                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
+                print(f"  第 {page} 页: {url}")
+                items = self._fetch_page(url, referer)
+                if not items:
+                    break
+
+                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
+                existing = self._existing_phones(phones)
+
+                for entry in items:
+                    phone = entry.get("phone")
+                    if not phone:
+                        continue
+                    if phone in existing:
+                        print(f"    -- 已存在: {entry['name']} ({phone})")
+                        continue
+                    try:
+                        self.db.insert_data("lawyer", entry)
+                        print(f"    -> 新增: {entry['name']} ({phone})")
+                    except Exception as exc:
+                        print(f"    插入失败: {exc}")
+
+                page += 1
+
+        print("找法网采集完成")
+
+
+if __name__ == "__main__":
+    with Db() as db:
+        spider = FindlawSpider(db)
+        spider.run()