38e7c284e8
- Updated `.gitignore` to streamline ignored files and added logging for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to use a session-based approach for HTTP requests, improving error handling and proxy management.
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
225 lines
8.3 KiB
Python
import json
import os
import sys
import time
import random
from typing import Dict, List, Set, Optional

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
from request.proxy_config import get_proxies, report_proxy_status
from Db import Db

DOMAIN = "找法网"
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"


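# FindlawSpider crawls lawyer listings from m.findlaw.cn's mobile ajax
# endpoint city by city and inserts new records into the lawyer table,
# deduplicating by phone number within the 找法网 domain.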
class FindlawSpider:
    def __init__(self, db_connection):
        self.db = db_connection
        self.session = self._build_session()
        self.cities = self._load_cities()

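    # The session is rebuilt (rather than patched) whenever a block is
    # detected, so every retry starts from a fresh proxy and cookie jar.
    # trust_env=False keeps system-level HTTP(S)_PROXY variables from
    # overriding the proxies supplied by request.proxy_config.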
    def _build_session(self) -> requests.Session:
        report_proxy_status()
        session = requests.Session()
        session.trust_env = False
        proxies = get_proxies()
        if proxies:
            session.proxies.update(proxies)
        else:
            session.proxies.clear()
        session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "Connection": "close",
        })
        return session

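    # Closing the old session may itself fail on a dead proxy connection;
    # that error is deliberately swallowed since a new session replaces it.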
    def _refresh_session(self) -> None:
        try:
            self.session.close()
        except Exception:
            pass
        self.session = self._build_session()

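    # _get centralizes all HTTP access: a 403 triggers a session refresh plus
    # exponential backoff (2**attempt seconds with random jitter), an SSL
    # error retries once with certificate verification disabled, and any
    # other request failure returns None so callers can skip the page.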
    def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
        headers = {"Referer": referer}
        for attempt in range(max_retries):
            try:
                resp = self.session.get(url, timeout=15, verify=verify, headers=headers)
                status_code = resp.status_code
                text = resp.text
                resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"Blocked with 403, retrying in {wait_time}s ({attempt + 1}/{max_retries}): {url}")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(f"Request failed {url}: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
                return text
            except requests.exceptions.SSLError:
                if verify:
                    return self._get(url, referer, verify=False, max_retries=max_retries)
                print(f"SSL error {url}")
                return None
            except requests.exceptions.RequestException as exc:
                print(f"Request failed {url}: {exc}")
                return None
        return None

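    # Deduplication is done per batch: the phones scraped from one page are
    # checked against the lawyer table in chunks of 500 so the SQL IN clause
    # stays a reasonable size, and only unseen numbers are inserted.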
    def _existing_phones(self, phones: List[str]) -> Set[str]:
        if not phones:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(phones), chunk_size):
                chunk = phones[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

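    # City rows are expected in one of several area tables; the first table
    # that yields rows wins. If none do, the counts printed below are purely
    # a diagnostic to show which table (if any) matched the condition.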
    def _load_cities(self):
        condition = "domain='findlaw' AND level=2"
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
            except Exception as exc:
                last_error = exc
                continue
            if rows:
                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
                print(f"[找法网] city source table: {table}, cities: {len(rows)}, missing pinyin: {missing_pinyin}")
                return rows

        if last_error:
            print(f"[找法网] failed to load area data: {last_error}")
        print("[找法网] no city data (tried area_new/area2/area)")
        for table in tables:
            try:
                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
                c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
                print(f"[找法网] check: {table} rows matching condition: {c}")
            except Exception:
                pass
        return []

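    # The ajax endpoint nominally returns JSON, but responses have been seen
    # with a leading BOM, wrapper text around the JSON object, or a JSON
    # string containing JSON (double-encoded), so parsing happens in stages.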
    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
        text = self._get(url, referer, verify=True)
        if not text:
            return []

        try:
            # Some response bodies carry a BOM or a wrapper script; be lenient here.
            text = text.strip().lstrip("\ufeff")
            try:
                data = json.loads(text)
            except ValueError:
                json_start = text.find('{')
                json_end = text.rfind('}')
                if json_start == -1 or json_end == -1:
                    print(f"JSON parse failed {url}: response starts with: {text[:80]!r}")
                    return []
                cleaned = text[json_start:json_end + 1]
                data = json.loads(cleaned)
            if isinstance(data, str):
                try:
                    data = json.loads(data)
                except ValueError:
                    print(f"JSON parse failed {url}: still a string after second decode, starts with: {str(data)[:80]!r}")
                    return []
        except ValueError as exc:
            print(f"JSON parse failed {url}: {exc}")
            return []

        items = data.get("data", {}).get("lawyer_list", [])
        parsed = []
        for item in items:
            phone = (item.get("mobile") or "").replace("-", "")
            parsed.append({
                "name": item.get("username", ""),
                "law_firm": item.get("lawyer_lawroom", ""),
                "province": item.get("areaInfo", {}).get("province", ""),
                "city": item.get("areaInfo", {}).get("city", ""),
                "phone": phone,
                "url": url,
                "domain": DOMAIN,
                "create_time": int(time.time()),
                "params": json.dumps(item, ensure_ascii=False)
            })
        return parsed

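    # Pagination has no explicit page count: each city is walked page by page
    # until a page yields no lawyers, which also means a single failed request
    # ends that city's crawl early.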
    def run(self):
        print("Starting 找法网 crawl...")
        if not self.cities:
            print("No city data")
            return

        for city in self.cities:
            pinyin = city.get("pinyin")
            province = city.get("province", "")
            city_name = city.get("city", "")
            if not pinyin:
                continue
            print(f"Crawling {province}-{city_name}")
            page = 1
            while True:
                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
                print(f" page {page}: {url}")
                items = self._fetch_page(url, referer)
                if not items:
                    break

                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
                existing = self._existing_phones(phones)

                for entry in items:
                    phone = entry.get("phone")
                    if not phone:
                        continue
                    if phone in existing:
                        print(f" -- exists: {entry['name']} ({phone})")
                        continue
                    try:
                        self.db.insert_data("lawyer", entry)
                        print(f" -> added: {entry['name']} ({phone})")
                    except Exception as exc:
                        print(f" insert failed: {exc}")

                page += 1

        print("找法网 crawl finished")


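# Entry point: Db is used as a context manager; assuming its __exit__ closes
# the underlying connection, the crawl cleans up even when it aborts early.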
if __name__ == "__main__":
    with Db() as db:
        spider = FindlawSpider(db)
        spider.run()
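# Run directly with no CLI arguments; proxy settings come from
# request.proxy_config and database credentials from the Db class defaults,
# assuming Db() can connect without explicit parameters.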