feat: enhance project configuration and improve data export functionality

- Updated `.gitignore` to streamline ignored files and added logging for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to utilize a session-based approach for HTTP requests, improving error handling and proxy management.
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports (see the sketch below).
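
A minimal sketch of the default-timestamp export behaviour described above, assuming the timestamp lands in the output filename; the helper name `resolve_output_path` and the filename pattern are illustrative only and are not taken from `export_lawyers_excel.py`:

import time

def resolve_output_path(output_path=None):
    # Hypothetical helper: when no explicit path is supplied, fall back to a
    # timestamped filename so repeated exports do not overwrite each other.
    if not output_path:
        stamp = time.strftime("%Y%m%d_%H%M%S")
        output_path = f"lawyers_export_{stamp}.xlsx"
    return output_path

Called with no argument, this would yield something like lawyers_export_20260318_100225.xlsx.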
This commit is contained in:
hello-dd-code
2026-03-18 10:02:25 +08:00
parent c2b77975c1
commit 38e7c284e8
14 changed files with 1665 additions and 3004 deletions
+174 -429
@@ -1,16 +1,9 @@
import argparse
import ast
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
import urllib3
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -20,460 +13,212 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
import requests
from request.proxy_config import get_proxies, report_proxy_status
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "findlaw"
LEGACY_DOMAIN = "找法网"
SITE_BASE = "https://m.findlaw.cn"
CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
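# PHONE_RE matches mainland mobile numbers: 11 digits, a leading 1 and a second digit of 3-9.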
PHONE_RE = re.compile(r"1[3-9]\d{9}")
DOMAIN = "找法网"
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
@dataclass
class CityTarget:
province_id: str
province_name: str
province_py: str
city_id: str
city_name: str
city_py: str
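# Strip every non-digit character, then return the first mobile number found ("" if none).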
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
class FindlawCrawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class FindlawSpider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET",),
)
self.session = self._build_session()
self.cities = self._load_cities()
def _get_text(
self,
url: str,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
session.headers.update({
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Connection": "close",
})
return session
for attempt in range(max_retries):
wait_for_request()
try:
resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
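# The city data endpoint serves a JS bundle; pull the "var <name> = [...];" array literal out with a regex and parse it via ast.literal_eval.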
def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
match = re.search(pattern, script_text)
if not match:
return []
raw = match.group(1)
try:
rows = ast.literal_eval(raw)
return rows if isinstance(rows, list) else []
except Exception:
return []
def _refresh_session(self) -> None:
try:
self.session.close()
except Exception:
pass
self.session = self._build_session()
def discover_cities(self) -> List[CityTarget]:
js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
provinces = self._parse_city_js_array(js_text, "iosProvinces")
cities = self._parse_city_js_array(js_text, "iosCitys")
province_map: Dict[str, Dict] = {}
for item in provinces:
pid = str(item.get("id") or "").strip()
if pid:
province_map[pid] = item
results: List[CityTarget] = []
seen_py: Set[str] = set()
for city in cities:
city_py = str(city.get("pinyin") or "").strip()
city_name = str(city.get("value") or "").strip()
city_id = str(city.get("id") or "").strip()
province_id = str(city.get("parentId") or "").strip()
if not city_py or not city_name or not city_id:
continue
if city_py in seen_py:
continue
seen_py.add(city_py)
province_row = province_map.get(province_id, {})
province_name = str(province_row.get("value") or city_name).strip()
province_py = str(province_row.get("pinyin") or city_py).strip()
results.append(
CityTarget(
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_id=city_id,
city_name=city_name,
city_py=city_py,
)
)
return results
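# List responses may carry a BOM or stray wrapper text around the JSON; fall back to slicing the outermost {...}.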
def _parse_list_payload(self, text: str) -> Dict:
cleaned = (text or "").strip().lstrip("\ufeff")
try:
return json.loads(cleaned)
except ValueError:
start = cleaned.find("{")
end = cleaned.rfind("}")
if start == -1 or end == -1:
return {}
return json.loads(cleaned[start:end + 1])
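# One AJAX list page: errcode 0 means success, lawyer_list holds the rows, has_more == "1" signals another page.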
def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
text = self._get_text(list_url, referer=referer)
payload = self._parse_list_payload(text)
if payload.get("errcode") != 0:
return [], False, list_url
data = payload.get("data", {}) or {}
items = data.get("lawyer_list", []) or []
has_more = str(data.get("has_more", "0")) == "1"
return items, has_more, list_url
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
for page in range(1, self.max_pages + 1):
def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
headers = {"Referer": referer}
for attempt in range(max_retries):
try:
items, has_more, list_url = self.fetch_list_page(target.city_py, page)
except Exception as exc:
print(f"[list] 失败 {target.city_py} p{page}: {exc}")
break
resp = self.session.get(url, timeout=15, verify=verify, headers=headers)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
self._refresh_session()
time.sleep(wait_time)
continue
print(f"请求失败 {url}: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
return text
except requests.exceptions.SSLError:
if verify:
return self._get(url, referer, verify=False, max_retries=max_retries)
print(f"SSL错误 {url}")
return None
except requests.exceptions.RequestException as exc:
print(f"请求失败 {url}: {exc}")
return None
return None
if not items:
break
for item in items:
detail_url = item.get("siteask_m") or item.get("site_url") or ""
detail_url = str(detail_url).strip()
if not detail_url.startswith("http"):
detail_url = list_url
phone = normalize_phone(item.get("mobile", ""))
profile = {
"uid": str(item.get("uid") or ""),
"name": str(item.get("username") or "").strip(),
"law_firm": str(item.get("lawyer_lawroom") or "").strip(),
"phone": phone,
"lawyer_year": item.get("lawyer_year"),
"service_area": str(item.get("service_area") or "").strip(),
"address": str(item.get("addr") or "").strip(),
"specialties": item.get("professionArr") or [],
"answer_count": item.get("ansnum"),
"comment_count": item.get("askcommentnum"),
}
now = int(time.time())
uid = profile.get("uid", "")
record_key = uid or detail_url
record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()
area = item.get("areaInfo", {}) or {}
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"list_url": list_url,
"detail_url": detail_url,
"province": str(area.get("province") or target.province_name),
"province_py": target.province_py,
"city": str(area.get("city") or target.city_name),
"city_py": target.city_py,
"page": page,
},
"list_snapshot": {
"uid": uid,
"name": profile["name"],
"law_firm": profile["law_firm"],
"answer_count": profile["answer_count"],
"comment_count": profile["comment_count"],
},
"profile": profile,
"raw": item,
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
if not has_more:
break
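# Map a crawled record onto the legacy "lawyer" table schema; rows without a usable phone are dropped.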
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
"province": province,
"city": city,
"phone": phone,
"url": (source.get("detail_url") or source.get("list_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
}
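# Look up already-stored phones in chunks of 500 so the IN (...) clause stays bounded.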
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
def _existing_phones(self, phones: List[str]) -> Set[str]:
if not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
for i in range(0, len(phones), chunk_size):
chunk = phones[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
cur.execute(sql, [DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
continue
def _load_cities(self):
condition = "domain='findlaw' AND level=2"
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
rows = self.db.select_data(table, "city, province, pinyin", condition) or []
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
return inserted, skipped
last_error = exc
continue
if rows:
missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
return rows
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现城市 {len(cities)}")
if city_filter:
key = city_filter.strip().lower()
cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断城市数 {len(cities)}")
if last_error:
print(f"[找法网] 加载地区数据失败: {last_error}")
print("[找法网] 无城市数据(已尝试 area_new/area2/area")
for table in tables:
try:
cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
except Exception:
pass
return []
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
def _fetch_page(self, url: str, referer: str) -> List[Dict]:
text = self._get(url, referer, verify=True)
if not text:
return []
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
try:
# Some response bodies carry a leading BOM or a wrapper script; handle that here for compatibility
text = text.strip().lstrip("\ufeff")
try:
data = json.loads(text)
except ValueError:
json_start = text.find('{')
json_end = text.rfind('}')
if json_start == -1 or json_end == -1:
print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
return []
cleaned = text[json_start:json_end + 1]
data = json.loads(cleaned)
if isinstance(data, str):
try:
data = json.loads(data)
except ValueError:
print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}")
return []
except ValueError as exc:
print(f"解析JSON失败 {url}: {exc}")
return []
items = data.get("data", {}).get("lawyer_list", [])
parsed = []
for item in items:
phone = (item.get("mobile") or "").replace("-", "")
parsed.append({
"name": item.get("username", ""),
"law_firm": item.get("lawyer_lawroom", ""),
"province": item.get("areaInfo", {}).get("province", ""),
"city": item.get("areaInfo", {}).get("city", ""),
"phone": phone,
"url": url,
"domain": DOMAIN,
"create_time": int(time.time()),
"params": json.dumps(item, ensure_ascii=False)
})
return parsed
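# Main loop: for each city loaded from the area tables, walk list pages until an empty page, inserting only phones not already in the lawyer table.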
def run(self):
print("启动找法网采集...")
if not self.cities:
print("无城市数据")
return
for city in self.cities:
pinyin = city.get("pinyin")
province = city.get("province", "")
city_name = city.get("city", "")
if not pinyin:
continue
print(f"采集 {province}-{city_name}")
page = 1
while True:
url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
print(f"{page} 页: {url}")
items = self._fetch_page(url, referer)
if not items:
break
phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
existing = self._existing_phones(phones)
for entry in items:
phone = entry.get("phone")
if not phone:
continue
if phone in existing:
print(f" -- 已存在: {entry['name']} ({phone})")
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
self.db.insert_data("lawyer", entry)
print(f" -> 新增: {entry['name']} ({phone})")
except Exception as exc:
print(f" 插入失败: {exc}")
total_new_json = 0
total_new_db = 0
total_skip_db = 0
page += 1
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"({target.city_py})"
)
city_records = list(self.crawl_city(target))
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
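# CLI: JSONL output path, city/page caps, city filter, per-record sleep, --direct to skip the proxy and --no-db for JSONL-only runs.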
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个城市,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个城市最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市拼音或城市名过滤,如 beijing",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="每条记录采集间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = FindlawCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = FindlawCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
print("找法网采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = FindlawSpider(db)
spider.run()