feat: enhance project configuration and improve data export functionality

- Updated `.gitignore` to streamline ignored files and cover log files for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to use a session-based approach for HTTP requests, improving error handling and proxy management; a minimal sketch of the shared pattern follows this list.
- Updated `export_lawyers_excel.py` to use a default start timestamp for data exports when no time range is given.
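The session-based refactor mentioned above follows the same shape in every spider touched by this commit. Below is a minimal, self-contained sketch of that shared pattern, assembled from the diffs further down; the proxy dict and User-Agent value are illustrative, not taken from project config.

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_session(proxies=None):
    """Minimal sketch of the shared session setup used by the refactored spiders."""
    s = requests.Session()
    s.trust_env = False                      # ignore system proxy environment variables
    if proxies:                              # e.g. {"https": "http://user:pass@host:port"}
        s.proxies.update(proxies)
    retries = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET", "POST"]),
        raise_on_status=False,               # let the caller inspect resp.status_code itself
    )
    adapter = HTTPAdapter(max_retries=retries)
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    s.headers.update({"User-Agent": "Mozilla/5.0", "Connection": "close"})
    return s

# Usage (timeout tuple = connect/read, as in the spiders below):
# session = build_session()
# resp = session.get("https://m.maxlaw.cn/law/beijing?page=1", timeout=(10, 30))
```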
hello-dd-code
2026-03-18 10:02:25 +08:00
parent c2b77975c1
commit 38e7c284e8
14 changed files with 1665 additions and 3004 deletions
+185 -286
@@ -1,14 +1,9 @@
import json
import os
import random
import re
import sys
import time
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin
import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -18,144 +13,191 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from request.requests_client import (
RequestClientError,
RequestConnectTimeout,
RequestConnectionError,
RequestTimeout,
RequestsClient,
)
from utils.rate_limiter import wait_for_request
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status
# 禁用 SSL 警告
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from Db import Db
from utils.rate_limiter import wait_for_request
DOMAIN = "大律师"
SITE_BASE = "https://m.maxlaw.cn"
LIST_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}"
PHONE_PATTERN = re.compile(r"1[3-9]\d{9}")
MAX_PAGES_PER_CITY = int(os.getenv("MAXLAW_MAX_PAGES", "0"))
PROXY_TESTED = False
LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
_PROXY_TESTED = False
class DlsSpider:
def __init__(self, db_connection):
self.db = db_connection
self.client = self._build_client()
self.session = self._build_session()
self.areas = self._load_areas()
def _build_client(self) -> RequestsClient:
client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Host": "m.maxlaw.cn",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "close",
},
retry_total=3,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET", "POST"),
def _build_session(self) -> requests.Session:
"""构建带重试机制的 session"""
report_proxy_status()
s = requests.Session()
s.trust_env = False
proxies = get_proxies()
if proxies:
s.proxies.update(proxies)
else:
s.proxies.clear()
self._proxy_test(s, proxies)
# 配置重试策略
retries = Retry(
total=3, # 总共重试3次
backoff_factor=1, # 重试间隔:1s, 2s, 4s
status_forcelist=(429, 500, 502, 503, 504), # 对这些状态码进行重试
allowed_methods=frozenset(["GET", "POST"]),
raise_on_status=False # 不立即抛出异常,让代码处理
)
self._proxy_test(client, client.proxies or None)
return client
adapter = HTTPAdapter(max_retries=retries)
s.mount("https://", adapter)
s.mount("http://", adapter)
s.headers.update({
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
"Host": "m.maxlaw.cn",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "close",
})
return s
def _refresh_client(self) -> None:
self.client.refresh()
self._proxy_test(self.client, self.client.proxies or None)
def _refresh_session(self) -> None:
try:
self.session.close()
except Exception:
pass
self.session = self._build_session()
def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None:
global PROXY_TESTED
if PROXY_TESTED or not os.getenv("PROXY_TEST"):
def _proxy_test(self, session: requests.Session, proxies: Optional[Dict[str, str]]) -> None:
global _PROXY_TESTED
if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
return
PROXY_TESTED = True
_PROXY_TESTED = True
if not proxies:
print("[proxy] test skipped: no proxy configured")
return
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
try:
resp = client.get_text(test_url, timeout=timeout, headers={"Connection": "close"})
resp = session.get(
test_url,
timeout=timeout,
headers={"Connection": "close"},
)
print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
except Exception as exc:
print(f"[proxy] test failed: {exc}")
def _load_areas(self) -> List[Dict[str, str]]:
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
rows = self.db.select_data(table, "province, city, pinyin", "domain='maxlaw'") or []
except Exception as exc:
last_error = exc
continue
if rows:
missing_pinyin = sum(1 for row in rows if not (row.get("pinyin") or "").strip())
print(f"[大律师] 地区来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
return rows
if last_error:
print(f"[大律师] 加载地区失败: {last_error}")
print("[大律师] 无地区数据(已尝试 area_new/area2/area")
return []
def _load_areas(self):
try:
return self.db.select_data(
"area_new",
"province, city, pinyin",
"domain='maxlaw'"
) or []
except Exception as exc:
print(f"加载地区失败: {exc}")
return []
def _get(
self,
url: str,
*,
headers: Optional[Dict[str, str]] = None,
max_retries: int = 3,
timeout: Tuple[int, int] = (10, 30),
) -> Optional[str]:
def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
"""发送 GET 请求,带重试机制"""
wait_for_request()
for attempt in range(max_retries):
try:
resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
if resp.status_code == 403:
# 使用更长的超时时间,分别设置连接和读取超时
resp = self.session.get(
url,
timeout=(10, 30), # (connect_timeout, read_timeout)
verify=False,
headers=headers,
)
status_code = resp.status_code
content = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = (2 ** attempt) + random.uniform(0.3, 1.0)
print(f"请求403{wait_time:.2f}s后重试 ({attempt + 1}/{max_retries}): {url}")
self._refresh_client()
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截{wait_time}后重试 ({attempt + 1}/{max_retries}): {url}")
self._refresh_session()
time.sleep(wait_time)
continue
print(f"请求失败 {url}: 403 Forbidden")
return None
if resp.status_code >= 400:
raise RequestClientError(f"{resp.status_code} Error: {url}")
return resp.text
except RequestConnectTimeout as exc:
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
return content
except requests.exceptions.ConnectTimeout as exc:
if attempt < max_retries - 1:
wait_time = 2 ** attempt # 指数退避:2s, 4s, 8s
print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
time.sleep(wait_time)
else:
print(f"连接超时,已达到最大重试次数 {url}: {exc}")
return None
except requests.exceptions.Timeout as exc:
if attempt < max_retries - 1:
wait_time = 2 ** attempt
print(f"连接超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
print(f"请求超时,{wait_time}后重试 ({attempt + 1}/{max_retries}): {url}")
time.sleep(wait_time)
continue
print(f"连接超时,已达到最大重试次数 {url}: {exc}")
return None
except RequestTimeout as exc:
else:
print(f"请求超时,已达到最大重试次数 {url}: {exc}")
return None
except requests.exceptions.ConnectionError as exc:
if attempt < max_retries - 1:
wait_time = 2 ** attempt
print(f"请求超时{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
print(f"连接错误{wait_time}后重试 ({attempt + 1}/{max_retries}): {url}")
time.sleep(wait_time)
continue
print(f"请求超时,已达到最大重试次数 {url}: {exc}")
return None
except RequestConnectionError as exc:
if attempt < max_retries - 1:
wait_time = 2 ** attempt
print(f"连接错误,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
time.sleep(wait_time)
continue
print(f"连接错误,已达到最大重试次数 {url}: {exc}")
return None
except RequestClientError as exc:
else:
print(f"连接错误,已达到最大重试次数 {url}: {exc}")
return None
except requests.exceptions.RequestException as exc:
print(f"请求失败 {url}: {exc}")
return None
return None
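For reference, the 403 branch above backs off exponentially with a small random jitter before refreshing the session. A quick sketch of the wait schedule it produces, assuming attempt numbers start at 0 as in the loop above:

```python
import random

for attempt in range(3):
    wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
    print(f"attempt {attempt}: sleep ~{wait_time:.2f}s")  # roughly 1-2s, 2-3s, 4-5s
```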
def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
soup = BeautifulSoup(html, "html.parser")
cards = soup.find_all("div", class_="lstx")
if not cards:
return 0
inserted = 0
for card in cards:
link = card.find("a")
if not link or not link.get("href"):
continue
detail = self._parse_detail(link['href'], province, city, list_url)
if not detail:
continue
phone = detail.get("phone")
if not phone:
continue
condition = f"phone='{phone}' and domain='{DOMAIN}'"
if self.db.is_data_exist("lawyer", condition):
print(f" -- 已存在: {detail['name']} ({phone})")
time.sleep(0.3)
continue
try:
self.db.insert_data("lawyer", detail)
inserted += 1
print(f" -> 新增: {detail['name']} ({phone})")
except Exception as exc:
print(f" 插入失败: {exc}")
time.sleep(1)
time.sleep(0.3)
# 列表页结束后再缓一缓,降低风控
time.sleep(0.6)
return inserted
def _detail_headers(self, referer: str) -> Dict[str, str]:
return {
"Referer": referer,
@@ -166,215 +208,72 @@ class DlsSpider:
"Upgrade-Insecure-Requests": "1",
}
def _extract_detail_urls(self, html: str) -> List[str]:
soup = BeautifulSoup(html, "html.parser")
urls: List[str] = []
seen: Set[str] = set()
# 主选择器:当前站点列表卡片
for a_tag in soup.select("div.lstx a[href]"):
href = (a_tag.get("href") or "").strip()
if not href:
continue
url = urljoin(SITE_BASE, href)
if url in seen:
continue
seen.add(url)
urls.append(url)
# 回退选择器:页面结构轻微变化时尽量保活
if not urls:
for a_tag in soup.select("a[href]"):
href = (a_tag.get("href") or "").strip()
if "/lawyer/" not in href:
continue
url = urljoin(SITE_BASE, href)
if url in seen:
continue
seen.add(url)
urls.append(url)
return urls
def _extract_name(self, soup: BeautifulSoup) -> str:
for selector in ("h2.lawyerName", "h1.lawyerName", "h1", "h2"):
tag = soup.select_one(selector)
if tag:
name = tag.get_text(strip=True)
if name:
return name
title = soup.title.get_text(strip=True) if soup.title else ""
match = re.search(r"(\S+律师)", title)
return match.group(1) if match else ""
def _extract_law_firm(self, soup: BeautifulSoup) -> str:
for selector in ("p.law-firm", "div.law-firm", "p[class*=firm]"):
tag = soup.select_one(selector)
if tag:
text = tag.get_text(strip=True)
if text:
return text
page_text = soup.get_text(" ", strip=True)
match = re.search(r"(执业机构|律所)\s*[:]?\s*([^\s,。,;]{2,40})", page_text)
if match:
return match.group(2).strip()
return ""
def _normalize_phone(self, text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_PATTERN.search(compact)
return match.group(0) if match else ""
def _extract_phone(self, soup: BeautifulSoup) -> str:
contact = soup.select_one("ul.contact-content")
if contact:
phone = self._normalize_phone(contact.get_text(" ", strip=True))
if phone:
return phone
for selector in ("a[href^='tel:']", "span.phone", "p.phone"):
tag = soup.select_one(selector)
if tag:
phone = self._normalize_phone(tag.get_text(" ", strip=True))
if phone:
return phone
return self._normalize_phone(soup.get_text(" ", strip=True))
def _parse_detail(self, detail_url: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
print(f" 详情: {detail_url}")
html = self._get(detail_url, headers=self._detail_headers(list_url))
def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
url = f"https://m.maxlaw.cn{path}"
print(f" 详情: {url}")
html = self._get(url, headers=self._detail_headers(list_url))
if not html:
return None
soup = BeautifulSoup(html, "html.parser")
name = self._extract_name(soup)
phone = self._extract_phone(soup)
name_tag = soup.find("h2", class_="lawyerName")
law_firm_tag = soup.find("p", class_="law-firm")
contact_list = soup.find("ul", class_="contact-content")
name = name_tag.get_text(strip=True) if name_tag else ""
law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
phone = ""
if contact_list:
items = contact_list.find_all("li")
if len(items) > 2:
phone_tag = items[2].find("p")
if phone_tag:
phone = phone_tag.get_text(strip=True)
phone = phone.split("咨询请说明来自大律师网")[0].strip()
phone = phone.replace('-', '').strip()
if not name or not phone:
print(" 信息不完整,跳过")
return None
safe_city = city or province
safe_city = city if city else province
return {
"name": name,
"law_firm": self._extract_law_firm(soup),
"law_firm": law_firm,
"province": province,
"city": safe_city,
"phone": phone,
"url": detail_url,
"url": url,
"domain": DOMAIN,
"create_time": int(time.time()),
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False),
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
}
def _existing_phones(self, phones: List[str]) -> Set[str]:
if not phones:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for idx in range(0, len(phones), chunk_size):
chunk = phones[idx:idx + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _save_lawyers(self, lawyers: List[Dict[str, str]]) -> Tuple[int, int]:
if not lawyers:
return 0, 0
phones = [row["phone"] for row in lawyers if row.get("phone")]
existing = self._existing_phones(phones)
inserted = 0
skipped = 0
for row in lawyers:
phone = row.get("phone", "")
if not phone:
skipped += 1
continue
if phone in existing:
skipped += 1
print(f" -- 已存在: {row.get('name', '')} ({phone})")
continue
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
print(f" -> 新增: {row.get('name', '')} ({phone})")
except Exception as exc:
skipped += 1
print(f" 插入失败 {row.get('url', '')}: {exc}")
return inserted, skipped
def _crawl_city(self, area: Dict[str, str]) -> Tuple[int, int]:
pinyin = (area.get("pinyin") or "").strip()
province = area.get("province", "")
city = area.get("city", "")
if not pinyin:
return 0, 0
total_inserted = 0
total_parsed = 0
page = 1
prev_fingerprint = ""
while True:
if MAX_PAGES_PER_CITY > 0 and page > MAX_PAGES_PER_CITY:
print(f"达到分页上限({MAX_PAGES_PER_CITY}),停止 {province}-{city}")
break
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
print(f"采集 {province}-{city}{page} 页: {list_url}")
html = self._get(list_url)
if not html:
break
detail_urls = self._extract_detail_urls(html)
if not detail_urls:
print(" 列表为空,结束当前城市")
break
fingerprint = "|".join(detail_urls[:8])
if fingerprint and fingerprint == prev_fingerprint:
print(" 列表页重复,提前停止当前城市")
break
prev_fingerprint = fingerprint
lawyers: List[Dict[str, str]] = []
for detail_url in detail_urls:
row = self._parse_detail(detail_url, province, city, list_url)
if row:
lawyers.append(row)
time.sleep(0.25)
inserted, skipped = self._save_lawyers(lawyers)
total_inserted += inserted
total_parsed += len(lawyers)
print(
f"{page} 页完成: 列表{len(detail_urls)}条, "
f"解析{len(lawyers)}条, 新增{inserted}条, 跳过{skipped}"
)
page += 1
time.sleep(0.5)
return total_inserted, total_parsed
def run(self):
print("启动大律师采集...")
if not self.areas:
print("无地区数据")
return
all_inserted = 0
all_parsed = 0
for area in self.areas:
inserted, parsed = self._crawl_city(area)
all_inserted += inserted
all_parsed += parsed
print(f"大律师采集完成: 解析{all_parsed}条, 新增{all_inserted}")
pinyin = area.get("pinyin")
province = area.get("province", "")
city = area.get("city", "")
if not pinyin:
continue
page = 1
while True:
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
print(f"采集 {province}-{city}{page} 页: {list_url}")
html = self._get(list_url)
if not html:
break
inserted = self._parse_list(html, province, city, list_url)
if inserted == 0:
break
page += 1
print("大律师采集完成")
if __name__ == "__main__":
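The remainder of this file is collapsed in the diff. Based on the sibling spiders later in this commit, the entry point presumably follows the same shape; a sketch, assuming `Db` supports the context-manager protocol as it does elsewhere in the diff (the guard line is repeated so the snippet stands on its own):

```python
if __name__ == "__main__":        # guard repeated from the line above for completeness
    with Db() as db:              # Db is imported at the top of this file
        spider = DlsSpider(db)
        spider.run()
```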
+11 -6
@@ -19,6 +19,9 @@ if project_root not in sys.path:
from Db import Db
DEFAULT_EXPORT_START_TS = 1772932103
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
parser.add_argument(
@@ -30,7 +33,10 @@ def parse_args() -> argparse.Namespace:
"--start-ts",
type=int,
default=None,
help="create_time 起始时间戳(含),不传时默认取最近7天",
help=(
"create_time 起始时间戳(含),"
f"不传时默认取 {DEFAULT_EXPORT_START_TS} 之后的数据"
),
)
parser.add_argument(
"--end-ts",
@@ -83,9 +89,9 @@ def parse_args() -> argparse.Namespace:
def apply_default_time_filter(args: argparse.Namespace) -> None:
# 未显式传时间范围时,默认导出最近7天的数据
# 未显式传时间范围时,默认导出指定时间戳之后的数据
if args.start_ts is None and args.end_ts is None:
args.start_ts = int(time.time()) - 7 * 24 * 3600
args.start_ts = DEFAULT_EXPORT_START_TS
args.end_ts = 0
return
if args.start_ts is None:
@@ -211,11 +217,10 @@ def export_to_excel(
ws = wb.active
ws.title = "lawyers"
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"]
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain", "URL"]
if include_extra:
headers.extend(
[
"URL",
"站点",
"create_time",
"create_time_text",
@@ -270,12 +275,12 @@ def export_to_excel(
row.get("city", "") or "",
site_name,
row.get("domain", "") or "",
row.get("url", "") or "",
]
if include_extra:
line.extend(
[
row.get("url", "") or "",
row.get("domain", "") or "",
row.get("create_time", "") or "",
ts_to_text(row.get("create_time")),
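To make the new default concrete, this is roughly how `apply_default_time_filter` behaves after this change. A sketch using the constant introduced above; `argparse.Namespace` stands in for the parsed CLI arguments, and the meaning of `end_ts = 0` is an assumption about downstream handling.

```python
import argparse

DEFAULT_EXPORT_START_TS = 1772932103  # constant introduced in this commit

def apply_default_time_filter(args: argparse.Namespace) -> None:
    # No explicit time range given: fall back to the fixed default start timestamp.
    if args.start_ts is None and args.end_ts is None:
        args.start_ts = DEFAULT_EXPORT_START_TS
        args.end_ts = 0   # presumably interpreted downstream as "no upper bound"

args = argparse.Namespace(start_ts=None, end_ts=None)
apply_default_time_filter(args)
print(args.start_ts, args.end_ts)   # -> 1772932103 0
```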
+174 -429
@@ -1,16 +1,9 @@
import argparse
import ast
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
import urllib3
import random
from typing import Dict, List, Set, Optional
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -20,460 +13,212 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
import requests
from request.proxy_config import get_proxies, report_proxy_status
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "findlaw"
LEGACY_DOMAIN = "找法网"
SITE_BASE = "https://m.findlaw.cn"
CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
PHONE_RE = re.compile(r"1[3-9]\d{9}")
DOMAIN = "找法网"
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
@dataclass
class CityTarget:
province_id: str
province_name: str
province_py: str
city_id: str
city_name: str
city_py: str
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
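For a self-contained check of the helper above (the function body is copied verbatim so the snippet runs on its own): it strips everything that is not a digit, then matches a mainland mobile number.

```python
import re

PHONE_RE = re.compile(r"1[3-9]\d{9}")

def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")   # keep digits only
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""

print(normalize_phone("138-0013-8000 咨询请说明来自大律师网"))  # -> 13800138000
print(normalize_phone("010-12345678"))                          # -> "" (landline, no match)
```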
class FindlawCrawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class FindlawSpider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET",),
)
self.session = self._build_session()
self.cities = self._load_cities()
def _get_text(
self,
url: str,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
session.headers.update({
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Connection": "close",
})
return session
for attempt in range(max_retries):
wait_for_request()
try:
resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
match = re.search(pattern, script_text)
if not match:
return []
raw = match.group(1)
def _refresh_session(self) -> None:
try:
rows = ast.literal_eval(raw)
return rows if isinstance(rows, list) else []
self.session.close()
except Exception:
return []
pass
self.session = self._build_session()
def discover_cities(self) -> List[CityTarget]:
js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
provinces = self._parse_city_js_array(js_text, "iosProvinces")
cities = self._parse_city_js_array(js_text, "iosCitys")
province_map: Dict[str, Dict] = {}
for item in provinces:
pid = str(item.get("id") or "").strip()
if pid:
province_map[pid] = item
results: List[CityTarget] = []
seen_py: Set[str] = set()
for city in cities:
city_py = str(city.get("pinyin") or "").strip()
city_name = str(city.get("value") or "").strip()
city_id = str(city.get("id") or "").strip()
province_id = str(city.get("parentId") or "").strip()
if not city_py or not city_name or not city_id:
continue
if city_py in seen_py:
continue
seen_py.add(city_py)
province_row = province_map.get(province_id, {})
province_name = str(province_row.get("value") or city_name).strip()
province_py = str(province_row.get("pinyin") or city_py).strip()
results.append(
CityTarget(
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_id=city_id,
city_name=city_name,
city_py=city_py,
)
)
return results
def _parse_list_payload(self, text: str) -> Dict:
cleaned = (text or "").strip().lstrip("\ufeff")
try:
return json.loads(cleaned)
except ValueError:
start = cleaned.find("{")
end = cleaned.rfind("}")
if start == -1 or end == -1:
return {}
return json.loads(cleaned[start:end + 1])
def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
text = self._get_text(list_url, referer=referer)
payload = self._parse_list_payload(text)
if payload.get("errcode") != 0:
return [], False, list_url
data = payload.get("data", {}) or {}
items = data.get("lawyer_list", []) or []
has_more = str(data.get("has_more", "0")) == "1"
return items, has_more, list_url
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
for page in range(1, self.max_pages + 1):
def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
headers = {"Referer": referer}
for attempt in range(max_retries):
try:
items, has_more, list_url = self.fetch_list_page(target.city_py, page)
except Exception as exc:
print(f"[list] 失败 {target.city_py} p{page}: {exc}")
break
resp = self.session.get(url, timeout=15, verify=verify, headers=headers)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
self._refresh_session()
time.sleep(wait_time)
continue
print(f"请求失败 {url}: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
return text
except requests.exceptions.SSLError:
if verify:
return self._get(url, referer, verify=False, max_retries=max_retries)
print(f"SSL错误 {url}")
return None
except requests.exceptions.RequestException as exc:
print(f"请求失败 {url}: {exc}")
return None
return None
if not items:
break
for item in items:
detail_url = item.get("siteask_m") or item.get("site_url") or ""
detail_url = str(detail_url).strip()
if not detail_url.startswith("http"):
detail_url = list_url
phone = normalize_phone(item.get("mobile", ""))
profile = {
"uid": str(item.get("uid") or ""),
"name": str(item.get("username") or "").strip(),
"law_firm": str(item.get("lawyer_lawroom") or "").strip(),
"phone": phone,
"lawyer_year": item.get("lawyer_year"),
"service_area": str(item.get("service_area") or "").strip(),
"address": str(item.get("addr") or "").strip(),
"specialties": item.get("professionArr") or [],
"answer_count": item.get("ansnum"),
"comment_count": item.get("askcommentnum"),
}
now = int(time.time())
uid = profile.get("uid", "")
record_key = uid or detail_url
record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()
area = item.get("areaInfo", {}) or {}
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"list_url": list_url,
"detail_url": detail_url,
"province": str(area.get("province") or target.province_name),
"province_py": target.province_py,
"city": str(area.get("city") or target.city_name),
"city_py": target.city_py,
"page": page,
},
"list_snapshot": {
"uid": uid,
"name": profile["name"],
"law_firm": profile["law_firm"],
"answer_count": profile["answer_count"],
"comment_count": profile["comment_count"],
},
"profile": profile,
"raw": item,
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
if not has_more:
break
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
"province": province,
"city": city,
"phone": phone,
"url": (source.get("detail_url") or source.get("list_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
}
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
def _existing_phones(self, phones: List[str]) -> Set[str]:
if not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
for i in range(0, len(phones), chunk_size):
chunk = phones[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
cur.execute(sql, [DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
continue
def _load_cities(self):
condition = "domain='findlaw' AND level=2"
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
rows = self.db.select_data(table, "city, province, pinyin", condition) or []
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
return inserted, skipped
last_error = exc
continue
if rows:
missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
return rows
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现城市 {len(cities)}")
if city_filter:
key = city_filter.strip().lower()
cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断城市数 {len(cities)}")
if last_error:
print(f"[找法网] 加载地区数据失败: {last_error}")
print("[找法网] 无城市数据(已尝试 area_new/area2/area")
for table in tables:
try:
cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
except Exception:
pass
return []
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
def _fetch_page(self, url: str, referer: str) -> List[Dict]:
text = self._get(url, referer, verify=True)
if not text:
return []
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
try:
# 某些返回体前会携带 BOM 或包装脚本,此处做兼容
text = text.strip().lstrip("\ufeff")
try:
data = json.loads(text)
except ValueError:
json_start = text.find('{')
json_end = text.rfind('}')
if json_start == -1 or json_end == -1:
print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
return []
cleaned = text[json_start:json_end + 1]
data = json.loads(cleaned)
if isinstance(data, str):
try:
data = json.loads(data)
except ValueError:
print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}")
return []
except ValueError as exc:
print(f"解析JSON失败 {url}: {exc}")
return []
items = data.get("data", {}).get("lawyer_list", [])
parsed = []
for item in items:
phone = (item.get("mobile") or "").replace("-", "")
parsed.append({
"name": item.get("username", ""),
"law_firm": item.get("lawyer_lawroom", ""),
"province": item.get("areaInfo", {}).get("province", ""),
"city": item.get("areaInfo", {}).get("city", ""),
"phone": phone,
"url": url,
"domain": DOMAIN,
"create_time": int(time.time()),
"params": json.dumps(item, ensure_ascii=False)
})
return parsed
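The payload handling above tolerates a UTF-8 BOM, junk wrapped around the JSON object, and double-encoded responses. A condensed, self-contained sketch of that fallback chain:

```python
import json
from typing import Optional

def parse_payload(text: str) -> Optional[dict]:
    """Best-effort JSON extraction mirroring the fallbacks in _fetch_page above."""
    cleaned = (text or "").strip().lstrip("\ufeff")    # drop a leading BOM if present
    try:
        data = json.loads(cleaned)
    except ValueError:
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end == -1:
            return None
        data = json.loads(cleaned[start:end + 1])      # slice out the outermost object
    if isinstance(data, str):                          # response was double-encoded JSON
        data = json.loads(data)
    return data if isinstance(data, dict) else None

print(parse_payload('\ufeff{"errcode": 0, "data": {"lawyer_list": []}}'))
```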
def run(self):
print("启动找法网采集...")
if not self.cities:
print("无城市数据")
return
for city in self.cities:
pinyin = city.get("pinyin")
province = city.get("province", "")
city_name = city.get("city", "")
if not pinyin:
continue
print(f"采集 {province}-{city_name}")
page = 1
while True:
url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
print(f"{page} 页: {url}")
items = self._fetch_page(url, referer)
if not items:
break
phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
existing = self._existing_phones(phones)
for entry in items:
phone = entry.get("phone")
if not phone:
continue
if phone in existing:
print(f" -- 已存在: {entry['name']} ({phone})")
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
self.db.insert_data("lawyer", entry)
print(f" -> 新增: {entry['name']} ({phone})")
except Exception as exc:
print(f" 插入失败: {exc}")
total_new_json = 0
total_new_db = 0
total_skip_db = 0
page += 1
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"({target.city_py})"
)
city_records = list(self.crawl_city(target))
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个城市,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个城市最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市拼音或城市名过滤,如 beijing",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="每条记录采集间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = FindlawCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = FindlawCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
print("找法网采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = FindlawSpider(db)
spider.run()
+288 -788
File diff suppressed because it is too large.
+236 -586
@@ -1,16 +1,13 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Set, Tuple
import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional, List, Set
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -20,628 +17,281 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
import requests
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "lawtime"
LEGACY_DOMAIN = "法律快车"
SITE_BASE = "https://www.lawtime.cn"
PROVINCE_API = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode=0"
CITY_API_TEMPLATE = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode={province_id}"
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/lawyer"
from Db import Db
from config import LAWTIME_CONFIG
PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
DETAIL_BASE = "https://m.lawtime.cn"
DOMAIN = "法律快车"
@dataclass
class CityTarget:
province_id: str
province_name: str
province_py: str
city_id: str
city_name: str
city_py: str
@dataclass
class ListCard:
detail_url: str
name: str
phone: str
address: str = ""
specialties: List[str] = field(default_factory=list)
metric_text: str = ""
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
class LawtimeCrawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class LawtimeSpider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/122.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/json,*/*;q=0.8",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET",),
)
self.session = self._build_session()
self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
self._tls = threading.local()
def _get_text(
self,
url: str,
*,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
headers = LAWTIME_CONFIG.get("HEADERS", {})
if headers:
session.headers.update(headers)
session.headers.setdefault("Connection", "close")
return session
for attempt in range(max_retries):
wait_for_request()
try:
resp = self.client.get_text(
url,
timeout=timeout,
verify=False,
headers=headers,
)
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
def _get_json(self, url: str, *, referer: str) -> List[Dict]:
text = self._get_text(url, referer=referer)
cleaned = (text or "").strip().lstrip("\ufeff")
if not cleaned or cleaned.startswith("<"):
return []
def _refresh_session(self) -> None:
try:
data = json.loads(cleaned)
except ValueError:
return []
return data if isinstance(data, list) else []
self.session.close()
except Exception:
pass
self.session = self._build_session()
def discover_cities(self) -> List[CityTarget]:
provinces = self._get_json(PROVINCE_API, referer=SITE_BASE)
if not provinces:
print("[discover] 地区接口未返回有效数据")
return []
def _get_thread_session(self) -> requests.Session:
s = getattr(self._tls, "session", None)
if s is not None:
return s
s = self._build_session()
s.headers.update(dict(self.session.headers))
self._tls.session = s
return s
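Because `requests.Session` is not strictly thread-safe, the crawler gives each worker thread its own session via `threading.local()`, as shown above. A minimal standalone sketch of the same idea; the spider's `_build_session` is replaced here by a plain `requests.Session()` and the URL is illustrative.

```python
import threading
import requests
from concurrent.futures import ThreadPoolExecutor

_tls = threading.local()

def get_thread_session() -> requests.Session:
    # One Session per worker thread; sessions are not shared across threads.
    s = getattr(_tls, "session", None)
    if s is None:
        s = requests.Session()
        _tls.session = s
    return s

def fetch(url: str) -> int:
    return get_thread_session().get(url, timeout=15).status_code

with ThreadPoolExecutor(max_workers=8) as ex:
    print(list(ex.map(fetch, ["https://www.example.com"] * 3)))
```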
results: List[CityTarget] = []
seen_py: Set[str] = set()
for province in provinces:
province_id = str(province.get("id") or "").strip()
province_name = str(province.get("province") or province.get("city") or "").strip()
province_py = str(province.get("pinyin") or "").strip()
if not province_id or not province_name:
continue
city_api = CITY_API_TEMPLATE.format(province_id=province_id)
def _refresh_thread_session(self) -> None:
s = getattr(self._tls, "session", None)
if s is not None:
try:
cities = self._get_json(city_api, referer=LIST_URL_TEMPLATE.format(city_py=province_py or ""))
except Exception as exc:
print(f"[city] 获取失败 province={province_id}: {exc}")
continue
if not cities:
cities = [
{
"id": province_id,
"province": province_name,
"city": province_name,
"pinyin": province_py,
}
]
for city in cities:
city_id = str(city.get("id") or "").strip()
city_name = str(city.get("city") or city.get("province") or "").strip()
city_py = str(city.get("pinyin") or "").strip()
if not city_id or not city_name or not city_py:
continue
if city_py in seen_py:
continue
seen_py.add(city_py)
results.append(
CityTarget(
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_id=city_id,
city_name=city_name,
city_py=city_py,
)
)
return results
def _build_list_url(self, city_py: str, page: int) -> str:
base = LIST_URL_TEMPLATE.format(city_py=city_py)
if page <= 1:
return base
return f"{base}?page={page}"
def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[ListCard], bool, str]:
list_url = self._build_list_url(target.city_py, page)
html = self._get_text(list_url, referer=SITE_BASE + "/")
cards = self.parse_list_cards(html)
soup = BeautifulSoup(html, "html.parser")
next_link = soup.select_one(f"div.page a[href*='page={page + 1}']")
has_next = next_link is not None
return cards, has_next, list_url
def parse_list_cards(self, html: str) -> List[ListCard]:
soup = BeautifulSoup(html, "html.parser")
cards: List[ListCard] = []
seen: Set[str] = set()
for item in soup.select("li.lawyer-item-card"):
link_tag = item.select_one("a.name[href]") or item.select_one("a.lawyer-img-box[href]")
if not link_tag:
continue
detail_url = (link_tag.get("href") or "").strip()
if not detail_url.startswith("http"):
continue
if detail_url in seen:
continue
seen.add(detail_url)
name = link_tag.get_text(strip=True)
phone = ""
phone_tag = item.select_one("div.phone")
if phone_tag:
phone = normalize_phone(phone_tag.get_text(" ", strip=True))
address = ""
addr_tag = item.select_one("div.location .txt")
if addr_tag:
address = addr_tag.get_text(" ", strip=True)
specialties: List[str] = []
prof_tag = item.select_one("div.prof .txt")
if prof_tag:
specialties = [
x.strip() for x in re.split(r"[、,]", prof_tag.get_text(" ", strip=True)) if x.strip()
]
metric_text = ""
metric_tag = item.select_one("div.num-msg")
if metric_tag:
metric_text = metric_tag.get_text(" ", strip=True)
cards.append(
ListCard(
detail_url=detail_url,
name=name,
phone=phone,
address=address,
specialties=specialties,
metric_text=metric_text,
)
)
return cards
def parse_detail(self, detail_url: str) -> Dict:
html = self._get_text(detail_url, referer=SITE_BASE)
if "网站防火墙" in html or "访问被拒绝" in html or "/ipfilter/verify" in html:
raise RequestClientError(f"firewall blocked: {detail_url}")
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text(" ", strip=True)
name = ""
law_firm = ""
phone = ""
address = ""
practice_years: Optional[int] = None
specialties: List[str] = []
if soup.title:
title = soup.title.get_text(" ", strip=True)
match = re.search(r"([^\s_,。]+?)律师", title)
if match:
name = match.group(1).strip()
phone_candidates = [
soup.select_one(".data-w .tel-b b").get_text(" ", strip=True)
if soup.select_one(".data-w .tel-b b")
else "",
soup.select_one(".law-info-b .item .two-r.b").get_text(" ", strip=True)
if soup.select_one(".law-info-b .item .two-r.b")
else "",
text,
]
for candidate in phone_candidates:
phone = normalize_phone(candidate)
if phone:
break
law_firm_tag = soup.select_one(".law-info-b .item .two-nowrap")
if law_firm_tag:
law_firm = law_firm_tag.get_text(" ", strip=True)
for li in soup.select(".law-info-b .item"):
li_text = li.get_text(" ", strip=True)
if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
law_firm = li_text
addr_tag = soup.select_one(".law-info-b .item .two-r[title]")
if addr_tag:
addr_value = (addr_tag.get("title") or "").strip()
if len(addr_value) > 8:
address = addr_value
if not address:
addr_tag = soup.select_one(".law-info-b .item .two-r")
if addr_tag:
addr_value = addr_tag.get_text(" ", strip=True)
if len(addr_value) > 8 and "律师" not in addr_value:
address = addr_value
year_match = YEAR_RE.search(text)
if year_match:
try:
practice_years = int(year_match.group(1))
s.close()
except Exception:
practice_years = None
pass
self._tls.session = None
specialties = [x.get_text(strip=True) for x in soup.select(".profession-b .item") if x.get_text(strip=True)]
return {
"name": name,
"law_firm": law_firm,
"phone": phone,
"address": address,
"practice_years": practice_years,
"specialties": specialties,
"detail_url": detail_url,
}
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
seen_details: Set[str] = set()
for page in range(1, self.max_pages + 1):
try:
cards, has_next, list_url = self.fetch_list_page(target, page)
except Exception as exc:
print(f"[list] 失败 {target.city_py} p{page}: {exc}")
break
if not cards:
break
for card in cards:
if card.detail_url in seen_details:
continue
seen_details.add(card.detail_url)
detail: Dict = {}
try:
detail = self.parse_detail(card.detail_url)
except Exception as exc:
print(f"[detail] 失败 {card.detail_url}: {exc}")
phone = normalize_phone(detail.get("phone") or card.phone)
profile_name = (detail.get("name") or card.name).replace("律师", "").strip()
now = int(time.time())
record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"province_id": target.province_id,
"province": target.province_name,
"province_py": target.province_py,
"city_id": target.city_id,
"city": target.city_name,
"city_py": target.city_py,
"page": page,
"list_url": list_url,
"detail_url": card.detail_url,
},
"list_snapshot": {
"name": card.name,
"phone": card.phone,
"address": card.address,
"specialties": card.specialties,
"metric_text": card.metric_text,
},
"profile": {
"name": profile_name,
"law_firm": (detail.get("law_firm") or "").strip(),
"phone": phone,
"address": (detail.get("address") or card.address or "").strip(),
"practice_years": detail.get("practice_years"),
"specialties": detail.get("specialties") or card.specialties,
},
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
if not has_next:
break
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
"province": province,
"city": city,
"phone": phone,
"url": (source.get("detail_url") or source.get("list_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
}
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
def _existing_phones(self, phones: List[str]) -> Set[str]:
if not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
for i in range(0, len(phones), chunk_size):
chunk = phones[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
cur.execute(sql, [DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
def _load_areas(self):
condition = "level = 2 and domain='法律快车'"
tables = ("area_new", "area", "area2")
last_error = None
for table in tables:
try:
rows = self.db.select_data(table, "pinyin, province, city", condition) or []
except Exception as exc:
last_error = exc
continue
if rows:
missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
return rows
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
if last_error:
print(f"[法律快车] 加载地区数据失败: {last_error}")
print("[法律快车] 无城市数据(已尝试 area_new/area/area2")
return []
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
return self._get_with_session(self.session, url, max_retries=max_retries, is_thread=False)
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
def _get_with_session(self, session: requests.Session, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
for attempt in range(max_retries):
try:
resp = session.get(url, timeout=15, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"请求失败 {url}: 403{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
if is_thread:
self._refresh_thread_session()
session = self._get_thread_session()
else:
self._refresh_session()
session = self.session
time.sleep(wait_time)
continue
print(f"请求失败 {url}: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
return text
except requests.exceptions.RequestException as exc:
print(f"请求失败 {url}: {exc}")
return None
return None
def _parse_list(self, html: str, province: str, city: str) -> int:
soup = BeautifulSoup(html, "html.parser")
links = [a.get("href", "") for a in soup.select("a.hide_link")]
links = [link.replace("lll", "int") for link in links if link]
if not links:
return 0
detail_urls = [urljoin(DETAIL_BASE, link) for link in links]
results: List[Dict[str, str]] = []
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
for fut in as_completed(futs):
try:
data = fut.result()
except Exception as exc:
print(f" 详情解析异常: {exc}")
continue
if data and data.get("phone"):
results.append(data)
if not results:
return len(detail_urls)
phones = [d["phone"] for d in results if d.get("phone")]
existing = self._existing_phones(phones)
for data in results:
phone = data.get("phone")
if not phone:
continue
if phone in existing:
print(f" -- 已存在: {data['name']} ({phone})")
continue
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
self.db.insert_data("lawyer", data)
print(f" -> 新增: {data['name']} ({phone})")
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
print(f" 插入失败 {data.get('url')}: {exc}")
return inserted, skipped
return len(detail_urls)
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现城市 {len(cities)}")
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
html = None
sess = self._get_thread_session()
html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
if not html:
return None
if city_filter:
key = city_filter.strip().lower()
cities = [
c for c in cities
if key in c.city_py.lower() or key in c.city_name.lower()
]
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text(" ")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断城市数 {len(cities)}")
name = ""
title_tag = soup.find("title")
if title_tag:
match = re.search(r"(\S+)律师", title_tag.get_text())
if match:
name = match.group(1)
if not name:
intl_div = soup.find("div", class_="intl")
if intl_div:
match = re.search(r"(\S+)律师", intl_div.get_text())
if match:
name = match.group(1)
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
phone = ""
phone_pattern = r"1[3-9]\d{9}"
for item in soup.select("div.item.flex"):
label = item.find("div", class_="label")
desc = item.find("div", class_="desc")
if not label or not desc:
continue
label_text = label.get_text()
desc_text = desc.get_text().replace("-", "")
if "联系电话" in label_text or "电话" in label_text:
matches = re.findall(phone_pattern, desc_text)
if matches:
phone = matches[0]
break
if not phone:
matches = re.findall(phone_pattern, text.replace("-", ""))
if matches:
phone = matches[0]
if not phone:
print(f" 无手机号: {url}")
return None
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
law_firm = ""
for item in soup.select("div.item.flex"):
label = item.find("div", class_="label")
desc = item.find("div", class_="desc")
if not label or not desc:
continue
if "执业律所" in label.get_text() or "律所" in label.get_text():
law_firm = desc.get_text(strip=True).replace("已认证", "")
break
total_new_json = 0
total_new_db = 0
total_skip_db = 0
params = {
"list_url": url,
"province": province,
"city": city,
}
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"({target.city_py})"
)
city_records = list(self.crawl_city(target))
return {
"name": name or "",
"law_firm": law_firm,
"province": province,
"city": city,
"phone": phone,
"url": url,
"domain": DOMAIN,
"create_time": int(time.time()),
"params": json.dumps(params, ensure_ascii=False)
}
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
def run(self):
print("启动法律快车采集...")
areas = self._load_areas()
if not areas:
print("无地区数据")
return
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="法律快车全新采集脚本(站点数据直采)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/lawtime_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个城市,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个城市最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市拼音或城市名过滤,如 beijing",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="详情页请求间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = LawtimeCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = LawtimeCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
for area in areas:
pinyin = area.get("pinyin")
province = area.get("province", "")
city = area.get("city", "")
if not pinyin:
continue
page = 1
while True:
list_url = LIST_BASE.format(pinyin=pinyin, page=page)
print(f"采集 {province}-{city}{page} 页: {list_url}")
html = self._get(list_url)
if not html:
break
link_count = self._parse_list(html, province, city)
if link_count == 0:
break
page += 1
print("法律快车采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = LawtimeSpider(db)
spider.run()
+264 -608
@@ -1,17 +1,11 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin
import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional, List, Set
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -21,237 +15,165 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
import requests
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "64365"
LEGACY_DOMAIN = "律图"
SITE_BASE = "https://m.64365.com"
AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js"
LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
from Db import Db
PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"(\d+)\s*年")
DOMAIN = "律图"
LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
@dataclass
class CityTarget:
area_id: str
province_id: str
province_name: str
province_py: str
city_name: str
city_py: str
@dataclass
class ListCard:
detail_url: str
name: str
specialties: List[str]
score_text: str
service_text: str
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
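# normalize_phone strips every non-digit before matching, so separators and
# labels around the number do not matter; for illustration:
#
#     normalize_phone("电话:138-0013-8000(微信同号)")  ->  "13800138000"
#     normalize_phone("暂无联系方式")                   ->  ""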
class Six4365Crawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class Six4365Spider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "text/html, */*; q=0.01",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET", "POST"),
)
self.session = self._build_session()
self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
self._tls = threading.local()
self.cities = self._load_cities()
def _request_text(
self,
method: str,
url: str,
*,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
data: Optional[Dict] = None,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
session.headers.update({
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Connection": "close",
})
return session
for attempt in range(max_retries):
wait_for_request()
def _refresh_session(self) -> None:
try:
self.session.close()
except Exception:
pass
self.session = self._build_session()
def _get_thread_session(self) -> requests.Session:
"""requests.Session 不是严格线程安全:每个线程用独立 session(但共享同样代理/headers"""
s = getattr(self._tls, "session", None)
if s is not None:
return s
s = self._build_session()
s.headers.update(dict(self.session.headers))
self._tls.session = s
return s
def _refresh_thread_session(self) -> None:
s = getattr(self._tls, "session", None)
if s is not None:
try:
if method.upper() == "POST":
resp = self.client.post_text(
url,
timeout=timeout,
verify=False,
headers=headers,
data=data,
)
else:
resp = self.client.get_text(
url,
timeout=timeout,
verify=False,
headers=headers,
)
s.close()
except Exception:
pass
self._tls.session = None
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
def _existing_urls(self, urls: List[str]) -> Set[str]:
"""批量查重,减少 N 次 is_data_exist"""
if not urls:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
# IN 参数过多会失败,分批
chunk_size = 500
for i in range(0, len(urls), chunk_size):
chunk = urls[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
cur.execute(sql, chunk)
for row in cur.fetchall():
# pymysql 默认返回 tuple
existing.add(row[0])
finally:
cur.close()
return existing
def _load_cities(self):
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
provinces = self.db.select_data(
table,
"id, code, province",
"domain='64365' AND level=1"
) or []
cities = self.db.select_data(
table,
"code, city, province, pid",
"domain='64365' AND level=2"
) or []
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
continue
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
if not cities:
continue
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
return self._request_text(
"GET",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
)
province_map = {row.get('id'): row for row in provinces}
data = {}
for city in cities:
province_row = province_map.get(city.get('pid'), {}) or {}
data[str(city.get('code'))] = {
"name": city.get('city'),
"province": city.get('province'),
"province_name": province_row.get('province', city.get('province')),
}
print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
return data
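# _load_cities joins level-2 city rows to their level-1 province rows purely in
# Python via an {id: row} map, so one query per table is enough. The same join
# in isolation (the rows here are invented for illustration):
provinces = [{"id": 1, "province": "广东省"}]
cities = [{"code": "4401", "city": "广州市", "province": "广东", "pid": 1}]
province_map = {row["id"]: row for row in provinces}
lookup = {
    str(c["code"]): {
        "name": c["city"],
        "province": c["province"],
        "province_name": province_map.get(c["pid"], {}).get("province", c["province"]),
    }
    for c in cities
}
print(lookup)  # {'4401': {'name': '广州市', 'province': '广东', 'province_name': '广东省'}}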
def _post_text(
self,
url: str,
*,
data: Dict,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
return self._request_text(
"POST",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
data=data,
)
if last_error:
print(f"[律图] 加载地区数据失败: {last_error}")
print("[律图] 无城市数据(已尝试 area_new/area2/area")
return {}
def _extract_area_data(self, text: str) -> List[Dict]:
match = re.search(
r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
text,
re.S,
)
if not match:
return []
raw = match.group(1)
try:
data = json.loads(raw)
except Exception:
return []
return data if isinstance(data, list) else []
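# _extract_area_data lifts a JSON array straight out of the site's JS bundle.
# A minimal sketch of the same regex-plus-json.loads idea on an invented
# one-line payload:
import json
import re

js_text = 'lvtuData.areaData = [{"id": "110000", "name": "北京", "py": "beijing", "child": []}] ; lvtuData.categroyData = [];'
match = re.search(r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData", js_text)
provinces = json.loads(match.group(1)) if match else []
print(provinces[0]["name"])  # 北京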
def discover_cities(self) -> List[CityTarget]:
text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/")
provinces = self._extract_area_data(text)
targets: List[CityTarget] = []
seen_area: Set[str] = set()
for province in provinces:
province_id = str(province.get("id") or "").strip()
province_name = str(province.get("name") or "").strip()
province_py = str(province.get("py") or "").strip()
child_rows = province.get("child") or []
# 常规省份 child 是地级市;直辖市 child 是区县,此时使用省级 id 抓取
if child_rows and any((row.get("child") or []) for row in child_rows):
for city in child_rows:
area_id = str(city.get("id") or "").strip()
city_name = str(city.get("name") or "").strip()
city_py = str(city.get("py") or "").strip()
if not area_id or not city_name:
def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
for attempt in range(max_retries):
try:
resp = self.session.post(LIST_URL, data=payload, timeout=10, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
self._refresh_session()
time.sleep(wait_time)
continue
if area_id in seen_area:
continue
seen_area.add(area_id)
targets.append(
CityTarget(
area_id=area_id,
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_name=city_name,
city_py=city_py,
)
)
else:
if not province_id or not province_name:
continue
if province_id in seen_area:
continue
seen_area.add(province_id)
targets.append(
CityTarget(
area_id=province_id,
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_name=province_name,
city_py=province_py,
)
)
print("请求失败: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error")
return text
except requests.exceptions.RequestException as exc:
print(f"请求失败: {exc}")
return None
return None
return targets
def _build_payload(self, area_id: str, page: int) -> Dict[str, str]:
ua = self.client.headers.get("User-Agent", "")
def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
return {
"AdCode": "",
"RegionId": str(area_id),
"RegionId": str(city_code),
"CategoryId": "",
"MaxNumber": "",
"OnlyData": "true",
"IgnoreButton": "",
"LawyerRecommendRequest[AreaId]": str(area_id),
"LawyerRecommendRequest[AreaId]": str(city_code),
"LawyerRecommendRequest[LawCategoryIds]": "",
"LawyerRecommendRequest[LawFirmPersonCount]": "",
"LawyerRecommendRequest[LawFirmScale]": "",
@@ -268,429 +190,163 @@ class Six4365Crawler:
"LawyerRecommendRequest[RefferUrl]": "",
"LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
"LawyerRecommendRequest[resource_type_name]": "",
"LawyerRecommendRequest[UserAgent]": ua,
"LawyerRecommendRequest[UserAgent]": self.session.headers["User-Agent"],
"LawyerRecommendRequest[AddLawyerWithNoData]": "false",
"ShowCaseButton": "true",
}
def fetch_list_html(self, target: CityTarget, page: int) -> str:
payload = self._build_payload(target.area_id, page)
return self._post_text(
LIST_API_URL,
data=payload,
referer=SITE_BASE + "/findlawyer/",
)
def parse_list_cards(self, html: str) -> List[ListCard]:
def _parse_list(self, html: str, province: str, city: str) -> int:
soup = BeautifulSoup(html, "html.parser")
cards: List[ListCard] = []
seen: Set[str] = set()
lawyers = soup.find_all("a", class_="lawyer")
if not lawyers:
return 0
for anchor in soup.select("a.lawyer[href]"):
href = (anchor.get("href") or "").strip()
detail_urls: List[str] = []
for lawyer in lawyers:
href = lawyer.get("href")
if not href:
continue
detail_url = urljoin(SITE_BASE, href)
if detail_url in seen:
detail_urls.append(f"{href.rstrip('/')}/info/")
if not detail_urls:
return 0
results: List[Dict[str, str]] = []
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
for fut in as_completed(futs):
try:
data = fut.result()
except Exception as exc:
print(f" 详情解析异常: {exc}")
continue
if data:
results.append(data)
if not results:
return len(detail_urls)
existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
for data in results:
if not data:
continue
seen.add(detail_url)
url = data.get("url", "")
if not url:
continue
if url in existing:
print(f" -- 已存在URL: {url}")
continue
try:
self.db.insert_data("lawyer", data)
print(f" -> 新增: {data['name']} ({data['phone']})")
except Exception as exc:
print(f" 插入失败 {url}: {exc}")
name = ""
name_tag = anchor.select_one("b.name")
if name_tag:
name = name_tag.get_text(strip=True)
return len(detail_urls)
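# requests.Session is not guaranteed to be thread-safe, which is why the pool
# above gives every worker its own session via threading.local. A standalone
# sketch of that pattern (fetch_status and the URL list are illustrative):
import threading
from concurrent.futures import ThreadPoolExecutor

import requests

_tls = threading.local()


def _thread_session() -> requests.Session:
    session = getattr(_tls, "session", None)
    if session is None:
        session = requests.Session()
        _tls.session = session
    return session


def fetch_status(url: str) -> int:
    resp = _thread_session().get(url, timeout=10)
    return resp.status_code


urls = ["https://example.com"] * 4
with ThreadPoolExecutor(max_workers=2) as pool:
    print(list(pool.map(fetch_status, urls)))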
specialties: List[str] = []
skill_tag = anchor.select_one("div.skill")
if skill_tag:
raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "")
specialties = [x.strip() for x in re.split(r"[、,]", raw) if x.strip()]
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
html = self._get_detail(url)
if not html:
return None
score_text = ""
score_tag = anchor.select_one("div.info span[title='评分'] em")
if score_tag:
score_text = score_tag.get_text(strip=True)
service_text = ""
service_tag = anchor.select_one("div.info")
if service_tag:
service_text = service_tag.get_text(" ", strip=True)
cards.append(
ListCard(
detail_url=detail_url,
name=name,
specialties=specialties,
score_text=score_text,
service_text=service_text,
)
)
return cards
def parse_detail(self, detail_url: str) -> Dict:
info_url = detail_url.rstrip("/") + "/info/"
html = self._get_text(info_url, referer=detail_url)
soup = BeautifulSoup(html, "html.parser")
base_info = soup.find("ul", class_="intro-basic-bar")
if not base_info:
return None
name = ""
law_firm = ""
phone = ""
practice_years: Optional[int] = None
office_area = ""
address = ""
specialties: List[str] = []
for li in soup.select("ul.intro-basic-bar li"):
label_tag = li.select_one("span.label")
value_tag = li.select_one("div.txt")
if not label_tag or not value_tag:
for li in base_info.find_all("li"):
label = li.find("span", class_="label")
txt = li.find("div", class_="txt")
if not label or not txt:
continue
label_text = label.get_text(strip=True)
if "姓名" in label_text:
name = txt.get_text(strip=True)
if "执业律所" in label_text:
law_firm = txt.get_text(strip=True)
label = label_tag.get_text(" ", strip=True).replace(":", "")
value = value_tag.get_text(" ", strip=True)
more_section = soup.find("div", class_="more-intro-basic")
if more_section:
phone_ul = more_section.find("ul", class_="intro-basic-bar")
if phone_ul:
for li in phone_ul.find_all("li"):
label = li.find("span", class_="label")
txt = li.find("div", class_="txt")
if label and txt and "联系电话" in label.get_text(strip=True):
phone = txt.get_text(strip=True).replace(" ", "")
break
if "姓名" in label and not name:
name = value
elif "执业律所" in label and not law_firm:
law_firm = value
elif "联系电话" in label and not phone:
phone = normalize_phone(value)
elif "执业年限" in label and practice_years is None:
year_match = YEAR_RE.search(value)
if year_match:
try:
practice_years = int(year_match.group(1))
except Exception:
practice_years = None
elif "办公地区" in label and not office_area:
office_area = value
elif "办公地址" in label and not address:
address = value
text = soup.get_text(" ", strip=True)
if not phone:
phone = normalize_phone(text)
if not name and soup.title:
title = soup.title.get_text(" ", strip=True)
match = re.search(r"([^\s_,。]+?)律师", title)
if match:
name = match.group(1).strip()
skill_match = re.search(r"擅长:([^\n]+)", text)
if skill_match:
specialties = [x.strip() for x in re.split(r"[、,]", skill_match.group(1)) if x.strip()]
return {
"name": name,
"law_firm": law_firm,
"phone": phone,
"practice_years": practice_years,
"office_area": office_area,
"address": address,
"specialties": specialties,
"detail_url": detail_url,
"info_url": info_url,
}
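# parse_detail reads the label/value pairs inside ul.intro-basic-bar. A minimal
# sketch on a hand-written fragment (the HTML itself is invented for
# illustration):
from bs4 import BeautifulSoup

fragment = (
    '<ul class="intro-basic-bar">'
    '<li><span class="label">姓名:</span><div class="txt">张三</div></li>'
    '<li><span class="label">执业律所:</span><div class="txt">某某律师事务所</div></li>'
    '</ul>'
)
soup = BeautifulSoup(fragment, "html.parser")
info = {}
for li in soup.select("ul.intro-basic-bar li"):
    label_tag = li.select_one("span.label")
    value_tag = li.select_one("div.txt")
    if label_tag and value_tag:
        label = label_tag.get_text(strip=True).replace(":", "")
        info[label] = value_tag.get_text(strip=True)
print(info)  # {'姓名': '张三', '执业律所': '某某律师事务所'}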
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
seen_detail_urls: Set[str] = set()
page_first_seen: Set[str] = set()
for page in range(1, self.max_pages + 1):
try:
html = self.fetch_list_html(target, page)
except Exception as exc:
print(f"[list] 失败 area={target.area_id} p{page}: {exc}")
break
cards = self.parse_list_cards(html)
if not cards:
break
first_url = cards[0].detail_url
if first_url in page_first_seen:
break
page_first_seen.add(first_url)
for card in cards:
if card.detail_url in seen_detail_urls:
continue
seen_detail_urls.add(card.detail_url)
try:
detail = self.parse_detail(card.detail_url)
except Exception as exc:
print(f"[detail] 失败 {card.detail_url}: {exc}")
continue
now = int(time.time())
uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url)
uid = uid_match.group(1) if uid_match else card.detail_url
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"province_id": target.province_id,
"province": target.province_name,
"province_py": target.province_py,
"area_id": target.area_id,
"city": target.city_name,
"city_py": target.city_py,
"page": page,
"detail_url": card.detail_url,
"info_url": detail.get("info_url", ""),
},
"list_snapshot": {
"name": card.name,
"specialties": card.specialties,
"score_text": card.score_text,
"service_text": card.service_text,
},
"profile": {
"name": detail.get("name") or card.name,
"law_firm": detail.get("law_firm") or "",
"phone": detail.get("phone") or "",
"practice_years": detail.get("practice_years"),
"office_area": detail.get("office_area") or "",
"address": detail.get("address") or "",
"specialties": detail.get("specialties") or card.specialties,
},
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
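# crawl_city stops paging when a page's first detail URL repeats, which guards
# against list endpoints that keep serving the last page. The loop shape in
# isolation (fetch_page and parse_first_url stand in for the methods above):
def paginate(fetch_page, parse_first_url, max_pages=9999):
    seen_first = set()
    for page in range(1, max_pages + 1):
        html = fetch_page(page)
        if not html:
            break
        first_url = parse_first_url(html)
        if not first_url or first_url in seen_first:
            break
        seen_first.add(first_url)
        yield page, html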
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
phone = phone.replace('-', '').strip()
if not name or not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
data = {
"phone": phone,
"province": province,
"city": city,
"phone": phone,
"url": (source.get("info_url") or source.get("detail_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
"law_firm": law_firm,
"url": url,
"domain": DOMAIN,
"name": name,
"create_time": int(time.time()),
"params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
}
return data
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
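# The IN-clause lookups above are chunked so a city with thousands of phones
# never produces an oversized statement. The slicing is the whole trick; a
# minimal helper sketch (chunked itself is not part of the diff):
def chunked(items, size=500):
    for start in range(0, len(items), size):
        yield items[start:start + size]


# e.g. list(chunked(list(range(1200)))) -> three chunks of 500, 500 and 200 items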
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
continue
def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
session = self._get_thread_session()
for attempt in range(max_retries):
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
return inserted, skipped
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现地区 {len(cities)}")
if city_filter:
key = city_filter.strip().lower()
cities = [
c for c in cities
if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id
]
print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断地区数 {len(cities)}")
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
resp = session.get(url, timeout=10, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
self._refresh_thread_session()
session = self._get_thread_session()
time.sleep(wait_time)
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
print(" 请求失败: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error")
return text
except requests.exceptions.RequestException as exc:
print(f" 请求失败: {exc}")
return None
return None
total_new_json = 0
total_new_db = 0
total_skip_db = 0
def run(self):
print("启动律图采集...")
if not self.cities:
print("无城市数据")
return
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"(area={target.area_id})"
)
city_records = list(self.crawl_city(target))
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
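# Appending to JSONL plus a record_id set makes the run resumable: re-running
# the crawler only adds records it has not written before. The resume step in
# isolation (mirrors the seen_ids loading above; treat it as a sketch):
import json
import os


def load_seen_ids(output_path: str) -> set:
    seen = set()
    if not os.path.exists(output_path):
        return seen
    with open(output_path, "r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                continue
            rid = item.get("record_id")
            if rid:
                seen.add(rid)
    return seen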
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个地区,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个地区最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市名称/拼音/编码过滤",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="详情页请求间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = Six4365Crawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = Six4365Crawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
for city_code, info in self.cities.items():
province = info.get("province_name", "")
city = info.get("name", "")
print(f"采集 {province}-{city}")
page = 1
while True:
payload = self._build_payload(city_code, page)
html = self._post(payload)
if not html:
break
link_count = self._parse_list(html, province, city)
if link_count == 0:
break
page += 1
print("律图采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = Six4365Spider(db)
spider.run()
+8 -75
View File
@@ -1,80 +1,13 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/logs"
DATA_DIR="${PROJECT_ROOT}/data"
# 切换到脚本所在目录,确保相对路径正确
cd "$(dirname "$0")"
mkdir -p "${LOG_DIR}" "${DATA_DIR}"
echo "使用 request/proxy_settings.json 读取代理配置"
if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
else
PYTHON_BIN="python3"
fi
RUN_MODE="${RUN_MODE:-parallel}" # parallel | sequential
echo "[start] project=${PROJECT_ROOT}"
echo "[start] python=${PYTHON_BIN}"
echo "[start] mode=${RUN_MODE}"
echo "[start] proxy=request/proxy_settings.json"
# 大律师(新结构采集 + 写库)可通过环境变量控制
DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
DLS_SLEEP="${DLS_SLEEP:-0.2}"
DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"
DLS_EXTRA_ARGS=()
if [[ "${DLS_MAX_CITIES}" != "0" ]]; then
DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
fi
if [[ "${DLS_MAX_PAGES}" != "0" ]]; then
DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
fi
if [[ -n "${DLS_CITY_FILTER}" ]]; then
DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
fi
DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")
if [[ "${DLS_DIRECT:-0}" == "1" ]]; then
DLS_EXTRA_ARGS+=(--direct)
fi
if [[ "${DLS_NO_DB:-0}" == "1" ]]; then
DLS_EXTRA_ARGS+=(--no-db)
fi
run_bg() {
local name="$1"
shift
local logfile="${LOG_DIR}/${name}.log"
nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 &
echo "[start] ${name} pid=$! log=${logfile}"
}
run_fg() {
local name="$1"
shift
local logfile="${LOG_DIR}/${name}.log"
echo "[start] ${name} fg log=${logfile}"
env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1
}
if [[ "${RUN_MODE}" == "sequential" ]]; then
run_fg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
run_fg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
run_fg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
run_fg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
run_fg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
echo "[done] sequential completed"
else
run_bg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
run_bg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
run_bg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
run_bg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
run_bg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
echo "[done] all crawlers started in background"
fi
nohup python ../common_sites/dls.py > dls.log 2>&1 & # 大律师
nohup python ../common_sites/findlaw.py > findlaw.log 2>&1 & # 找法网
nohup python ../common_sites/lawtime.py > lawtime.log 2>&1 & # 法律快车
nohup python ../common_sites/six4365.py > six4365.log 2>&1 & # 律图
nohup python ../common_sites/hualv.py > hualv.log 2>&1 & # 华律
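# The replacement script above launches each crawler with nohup and a per-site
# log file. A rough Python equivalent of that launcher, for reference only
# (crawler names, paths and the python3 binary are assumptions):
import os
import subprocess

CRAWLERS = ["dls_fresh.py", "findlaw.py", "lawtime.py", "six4365.py", "hualv.py"]


def start_all(script_dir: str, log_dir: str, python_bin: str = "python3"):
    os.makedirs(log_dir, exist_ok=True)
    procs = []
    for name in CRAWLERS:
        log_path = os.path.join(log_dir, name.replace(".py", ".log"))
        log_file = open(log_path, "a")
        proc = subprocess.Popen(
            [python_bin, os.path.join(script_dir, name)],
            stdout=log_file,
            stderr=subprocess.STDOUT,
            env={**os.environ, "PYTHONUNBUFFERED": "1"},
        )
        log_file.close()  # the child process keeps its own copy of the descriptor
        procs.append(proc)
        print(f"[start] {name} pid={proc.pid} log={log_path}")
    return procs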