Refactor the crawler scripts and add time-filtered Excel export

- Unify the crawl logic and startup script across the five sites
- Add the dls_fresh crawl flow and improve logging
- Add export_lawyers_excel for exporting by time condition
- Default to exporting the last 7 days; support parsing extended fields
- Tidy .gitignore to ignore local data/ and logs/ artifacts
@@ -29,3 +29,8 @@ Thumbs.db
 
 # Local runtime files
 *.log
+logs/
+data/
+
+# accidental local files
+=*
@@ -14,7 +14,49 @@
 
 ```bash
 cd /www/wwwroot/lawyers
-python3 -m pip install -r requirements.txt
-cd common_sites
-./start.sh
+python3 -m venv .venv
+.venv/bin/pip install -r requirements.txt
+./common_sites/start.sh
 ```
+
+## Startup options
+
+`start.sh` launches the five site crawlers in parallel by default (the 大律师 site uses `dls_fresh.py`).
+
+- Log directory: `/www/wwwroot/lawyers/logs`
+- 大律师 JSON output: `/www/wwwroot/lawyers/data/dls_records.jsonl`
+
+Common environment variables:
+
+```bash
+# Run sequentially (default is parallel)
+RUN_MODE=sequential ./common_sites/start.sh
+
+# Limit the 大律师 crawl scope
+DLS_CITY_FILTER=beijing DLS_MAX_CITIES=1 DLS_MAX_PAGES=1 ./common_sites/start.sh
+
+# 大律师 direct connection (no proxy) / export JSON only, skip the DB
+DLS_DIRECT=1 DLS_NO_DB=1 ./common_sites/start.sh
+```
+
+## Exporting to Excel
+
+New export script: `common_sites/export_lawyers_excel.py`
+
+```bash
+# No arguments: export the last 7 days by default (phone/name/law firm/province/city/site name),
+# parsing the params JSON extras (email/address/license no./years in practice/specialties, etc.)
+./.venv/bin/python ./common_sites/export_lawyers_excel.py
+
+# Export by create_time timestamp range
+./.venv/bin/python ./common_sites/export_lawyers_excel.py \
+  --start-ts 1772380000 --end-ts 1772429999 \
+  --output ./data/lawyers_20260302.xlsx
+
+# Export a single site only, with technical fields (url/domain/timestamps)
+./.venv/bin/python ./common_sites/export_lawyers_excel.py \
+  --domain 大律师 --include-extra
+
+# Skip parsing the params JSON extras
+./.venv/bin/python ./common_sites/export_lawyers_excel.py --no-parse-params
+```
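If you need the `--start-ts`/`--end-ts` bounds for a specific calendar day, a small helper like the following (illustrative only, not part of the commit) prints suitable epoch values:

```python
# Illustrative helper (not part of the commit): print create_time bounds
# for one local calendar day, suitable for --start-ts / --end-ts.
from datetime import datetime, timedelta

day = datetime(2026, 3, 2)                               # the day to export
start_ts = int(day.timestamp())                          # 00:00:00 local time
end_ts = int((day + timedelta(days=1)).timestamp()) - 1  # 23:59:59 local time
print(start_ts, end_ts)
```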
+252 -137

@@ -1,9 +1,14 @@
 import json
 import os
+import random
+import re
 import sys
 import time
-import random
-from typing import Dict, Optional
+from typing import Dict, List, Optional, Set, Tuple
+from urllib.parse import urljoin
 
+import urllib3
+from bs4 import BeautifulSoup
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -13,8 +18,7 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
     sys.path.append(project_root)
 
-import urllib3
-from bs4 import BeautifulSoup
+from Db import Db
 from request.requests_client import (
     RequestClientError,
     RequestConnectTimeout,
@@ -22,168 +26,136 @@ from request.requests_client import (
     RequestTimeout,
     RequestsClient,
 )
 
-# Disable SSL warnings
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-from Db import Db
 from utils.rate_limiter import wait_for_request
 
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
 DOMAIN = "大律师"
-LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
-_PROXY_TESTED = False
+SITE_BASE = "https://m.maxlaw.cn"
+LIST_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}"
+PHONE_PATTERN = re.compile(r"1[3-9]\d{9}")
+MAX_PAGES_PER_CITY = int(os.getenv("MAXLAW_MAX_PAGES", "0"))
+PROXY_TESTED = False
 
 
 class DlsSpider:
     def __init__(self, db_connection):
         self.db = db_connection
-        self.client = self._build_session()
+        self.client = self._build_client()
         self.areas = self._load_areas()
 
-    def _build_session(self) -> RequestsClient:
-        """构建带重试机制的 session"""
+    def _build_client(self) -> RequestsClient:
         client = RequestsClient(
             headers={
-                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
+                "User-Agent": (
+                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+                    "Mobile/15E148 Safari/604.1"
+                ),
                 "Host": "m.maxlaw.cn",
                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                 "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                 "Connection": "close",
             },
-            retry_total=3,  # retry 3 times in total
-            retry_backoff_factor=1,  # retry intervals: 1s, 2s, 4s
-            retry_status_forcelist=(429, 500, 502, 503, 504),  # retry on these status codes
+            retry_total=3,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET", "POST"),
         )
         self._proxy_test(client, client.proxies or None)
         return client
 
-    def _refresh_session(self) -> None:
+    def _refresh_client(self) -> None:
         self.client.refresh()
         self._proxy_test(self.client, self.client.proxies or None)
 
     def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None:
-        global _PROXY_TESTED
-        if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
+        global PROXY_TESTED
+        if PROXY_TESTED or not os.getenv("PROXY_TEST"):
             return
-        _PROXY_TESTED = True
+        PROXY_TESTED = True
         if not proxies:
             print("[proxy] test skipped: no proxy configured")
             return
         test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
         timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
         try:
-            resp = client.get_text(
-                test_url,
-                timeout=timeout,
-                headers={"Connection": "close"},
-            )
+            resp = client.get_text(test_url, timeout=timeout, headers={"Connection": "close"})
             print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
         except Exception as exc:
             print(f"[proxy] test failed: {exc}")
 
-    def _load_areas(self):
-        try:
-            return self.db.select_data(
-                "area_new",
-                "province, city, pinyin",
-                "domain='maxlaw'"
-            ) or []
-        except Exception as exc:
-            print(f"加载地区失败: {exc}")
+    def _load_areas(self) -> List[Dict[str, str]]:
+        tables = ("area_new", "area2", "area")
+        last_error = None
+        for table in tables:
+            try:
+                rows = self.db.select_data(table, "province, city, pinyin", "domain='maxlaw'") or []
+            except Exception as exc:
+                last_error = exc
+                continue
+            if rows:
+                missing_pinyin = sum(1 for row in rows if not (row.get("pinyin") or "").strip())
+                print(f"[大律师] 地区来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
+                return rows
+        if last_error:
+            print(f"[大律师] 加载地区失败: {last_error}")
+        print("[大律师] 无地区数据(已尝试 area_new/area2/area)")
         return []
 
-    def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
-        """发送 GET 请求,带重试机制"""
+    def _get(
+        self,
+        url: str,
+        *,
+        headers: Optional[Dict[str, str]] = None,
+        max_retries: int = 3,
+        timeout: Tuple[int, int] = (10, 30),
+    ) -> Optional[str]:
         wait_for_request()
 
         for attempt in range(max_retries):
             try:
-                # Use longer timeouts; set connect and read timeouts separately
-                resp = self.client.get_text(
-                    url,
-                    timeout=(10, 30),  # (connect_timeout, read_timeout)
-                    verify=False,
-                    headers=headers,
-                )
-                status_code = resp.status_code
-                content = resp.text
-                if status_code == 403:
+                resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
+                if resp.status_code == 403:
                     if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
-                        self._refresh_session()
+                        wait_time = (2 ** attempt) + random.uniform(0.3, 1.0)
+                        print(f"请求403,{wait_time:.2f}s后重试 ({attempt + 1}/{max_retries}): {url}")
+                        self._refresh_client()
                         time.sleep(wait_time)
                         continue
                     print(f"请求失败 {url}: 403 Forbidden")
                     return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error: {url}")
-                return content
+                if resp.status_code >= 400:
+                    raise RequestClientError(f"{resp.status_code} Error: {url}")
+                return resp.text
             except RequestConnectTimeout as exc:
                 if attempt < max_retries - 1:
-                    wait_time = 2 ** attempt  # exponential backoff: 2s, 4s, 8s
-                    print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    wait_time = 2 ** attempt
+                    print(f"连接超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
                     time.sleep(wait_time)
-                else:
-                    print(f"连接超时,已达到最大重试次数 {url}: {exc}")
-                    return None
+                    continue
+                print(f"连接超时,已达到最大重试次数 {url}: {exc}")
+                return None
             except RequestTimeout as exc:
                 if attempt < max_retries - 1:
                     wait_time = 2 ** attempt
-                    print(f"请求超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    print(f"请求超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
                     time.sleep(wait_time)
-                else:
-                    print(f"请求超时,已达到最大重试次数 {url}: {exc}")
-                    return None
+                    continue
+                print(f"请求超时,已达到最大重试次数 {url}: {exc}")
+                return None
             except RequestConnectionError as exc:
                 if attempt < max_retries - 1:
                     wait_time = 2 ** attempt
-                    print(f"连接错误,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    print(f"连接错误,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
                    time.sleep(wait_time)
-                else:
-                    print(f"连接错误,已达到最大重试次数 {url}: {exc}")
-                    return None
+                    continue
+                print(f"连接错误,已达到最大重试次数 {url}: {exc}")
+                return None
             except RequestClientError as exc:
                 print(f"请求失败 {url}: {exc}")
                 return None
 
         return None
 
-    def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
-        soup = BeautifulSoup(html, "html.parser")
-        cards = soup.find_all("div", class_="lstx")
-        if not cards:
-            return 0
-
-        inserted = 0
-        for card in cards:
-            link = card.find("a")
-            if not link or not link.get("href"):
-                continue
-            detail = self._parse_detail(link['href'], province, city, list_url)
-            if not detail:
-                continue
-            phone = detail.get("phone")
-            if not phone:
-                continue
-            condition = f"phone='{phone}' and domain='{DOMAIN}'"
-            if self.db.is_data_exist("lawyer", condition):
-                print(f" -- 已存在: {detail['name']} ({phone})")
-                time.sleep(0.3)
-                continue
-            try:
-                self.db.insert_data("lawyer", detail)
-                inserted += 1
-                print(f" -> 新增: {detail['name']} ({phone})")
-            except Exception as exc:
-                print(f" 插入失败: {exc}")
-                time.sleep(1)
-            time.sleep(0.3)
-        # Pause after each list page to lower anti-bot risk
-        time.sleep(0.6)
-        return inserted
-
     def _detail_headers(self, referer: str) -> Dict[str, str]:
         return {
             "Referer": referer,
@@ -194,72 +166,215 @@ class DlsSpider:
             "Upgrade-Insecure-Requests": "1",
         }
 
-    def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
-        url = f"https://m.maxlaw.cn{path}"
-        print(f" 详情: {url}")
-        html = self._get(url, headers=self._detail_headers(list_url))
+    def _extract_detail_urls(self, html: str) -> List[str]:
+        soup = BeautifulSoup(html, "html.parser")
+        urls: List[str] = []
+        seen: Set[str] = set()
+
+        # Primary selector: the site's current list cards
+        for a_tag in soup.select("div.lstx a[href]"):
+            href = (a_tag.get("href") or "").strip()
+            if not href:
+                continue
+            url = urljoin(SITE_BASE, href)
+            if url in seen:
+                continue
+            seen.add(url)
+            urls.append(url)
+
+        # Fallback selector: survive minor page-structure changes
+        if not urls:
+            for a_tag in soup.select("a[href]"):
+                href = (a_tag.get("href") or "").strip()
+                if "/lawyer/" not in href:
+                    continue
+                url = urljoin(SITE_BASE, href)
+                if url in seen:
+                    continue
+                seen.add(url)
+                urls.append(url)
+        return urls
+
+    def _extract_name(self, soup: BeautifulSoup) -> str:
+        for selector in ("h2.lawyerName", "h1.lawyerName", "h1", "h2"):
+            tag = soup.select_one(selector)
+            if tag:
+                name = tag.get_text(strip=True)
+                if name:
+                    return name
+        title = soup.title.get_text(strip=True) if soup.title else ""
+        match = re.search(r"(\S+律师)", title)
+        return match.group(1) if match else ""
+
+    def _extract_law_firm(self, soup: BeautifulSoup) -> str:
+        for selector in ("p.law-firm", "div.law-firm", "p[class*=firm]"):
+            tag = soup.select_one(selector)
+            if tag:
+                text = tag.get_text(strip=True)
+                if text:
+                    return text
+        page_text = soup.get_text(" ", strip=True)
+        match = re.search(r"(执业机构|律所)\s*[::]?\s*([^\s,。,;;]{2,40})", page_text)
+        if match:
+            return match.group(2).strip()
+        return ""
+
+    def _normalize_phone(self, text: str) -> str:
+        compact = re.sub(r"\D", "", text or "")
+        match = PHONE_PATTERN.search(compact)
+        return match.group(0) if match else ""
+
+    def _extract_phone(self, soup: BeautifulSoup) -> str:
+        contact = soup.select_one("ul.contact-content")
+        if contact:
+            phone = self._normalize_phone(contact.get_text(" ", strip=True))
+            if phone:
+                return phone
+        for selector in ("a[href^='tel:']", "span.phone", "p.phone"):
+            tag = soup.select_one(selector)
+            if tag:
+                phone = self._normalize_phone(tag.get_text(" ", strip=True))
+                if phone:
+                    return phone
+        return self._normalize_phone(soup.get_text(" ", strip=True))
+
+    def _parse_detail(self, detail_url: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
+        print(f" 详情: {detail_url}")
+        html = self._get(detail_url, headers=self._detail_headers(list_url))
         if not html:
             return None
 
         soup = BeautifulSoup(html, "html.parser")
-        name_tag = soup.find("h2", class_="lawyerName")
-        law_firm_tag = soup.find("p", class_="law-firm")
-        contact_list = soup.find("ul", class_="contact-content")
-
-        name = name_tag.get_text(strip=True) if name_tag else ""
-        law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
-        phone = ""
-
-        if contact_list:
-            items = contact_list.find_all("li")
-            if len(items) > 2:
-                phone_tag = items[2].find("p")
-                if phone_tag:
-                    phone = phone_tag.get_text(strip=True)
-                    phone = phone.split("咨询请说明来自大律师网")[0].strip()
-
-        phone = phone.replace('-', '').strip()
+        name = self._extract_name(soup)
+        phone = self._extract_phone(soup)
         if not name or not phone:
             print(" 信息不完整,跳过")
             return None
 
-        safe_city = city if city else province
+        safe_city = city or province
         return {
             "name": name,
-            "law_firm": law_firm,
+            "law_firm": self._extract_law_firm(soup),
             "province": province,
             "city": safe_city,
             "phone": phone,
-            "url": url,
+            "url": detail_url,
             "domain": DOMAIN,
             "create_time": int(time.time()),
-            "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
+            "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False),
         }
 
+    def _existing_phones(self, phones: List[str]) -> Set[str]:
+        if not phones:
+            return set()
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            chunk_size = 500
+            for idx in range(0, len(phones), chunk_size):
+                chunk = phones[idx:idx + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    existing.add(row[0])
+        finally:
+            cur.close()
+        return existing
+
+    def _save_lawyers(self, lawyers: List[Dict[str, str]]) -> Tuple[int, int]:
+        if not lawyers:
+            return 0, 0
+        phones = [row["phone"] for row in lawyers if row.get("phone")]
+        existing = self._existing_phones(phones)
+        inserted = 0
+        skipped = 0
+
+        for row in lawyers:
+            phone = row.get("phone", "")
+            if not phone:
+                skipped += 1
+                continue
+            if phone in existing:
+                skipped += 1
+                print(f" -- 已存在: {row.get('name', '')} ({phone})")
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+                print(f" -> 新增: {row.get('name', '')} ({phone})")
+            except Exception as exc:
+                skipped += 1
+                print(f" 插入失败 {row.get('url', '')}: {exc}")
+        return inserted, skipped
+
+    def _crawl_city(self, area: Dict[str, str]) -> Tuple[int, int]:
+        pinyin = (area.get("pinyin") or "").strip()
+        province = area.get("province", "")
+        city = area.get("city", "")
+        if not pinyin:
+            return 0, 0
+
+        total_inserted = 0
+        total_parsed = 0
+        page = 1
+        prev_fingerprint = ""
+
+        while True:
+            if MAX_PAGES_PER_CITY > 0 and page > MAX_PAGES_PER_CITY:
+                print(f"达到分页上限({MAX_PAGES_PER_CITY}),停止 {province}-{city}")
+                break
+
+            list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
+            print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
+            html = self._get(list_url)
+            if not html:
+                break
+
+            detail_urls = self._extract_detail_urls(html)
+            if not detail_urls:
+                print(" 列表为空,结束当前城市")
+                break
+
+            fingerprint = "|".join(detail_urls[:8])
+            if fingerprint and fingerprint == prev_fingerprint:
+                print(" 列表页重复,提前停止当前城市")
+                break
+            prev_fingerprint = fingerprint
+
+            lawyers: List[Dict[str, str]] = []
+            for detail_url in detail_urls:
+                row = self._parse_detail(detail_url, province, city, list_url)
+                if row:
+                    lawyers.append(row)
+                time.sleep(0.25)
+
+            inserted, skipped = self._save_lawyers(lawyers)
+            total_inserted += inserted
+            total_parsed += len(lawyers)
+            print(
+                f" 第 {page} 页完成: 列表{len(detail_urls)}条, "
+                f"解析{len(lawyers)}条, 新增{inserted}条, 跳过{skipped}条"
+            )
+
+            page += 1
+            time.sleep(0.5)
+        return total_inserted, total_parsed
+
     def run(self):
         print("启动大律师采集...")
         if not self.areas:
             print("无地区数据")
             return
 
+        all_inserted = 0
+        all_parsed = 0
         for area in self.areas:
-            pinyin = area.get("pinyin")
-            province = area.get("province", "")
-            city = area.get("city", "")
-            if not pinyin:
-                continue
-            page = 1
-            while True:
-                list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
-                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
-                html = self._get(list_url)
-                if not html:
-                    break
-                inserted = self._parse_list(html, province, city, list_url)
-                if inserted == 0:
-                    break
-                page += 1
-        print("大律师采集完成")
+            inserted, parsed = self._crawl_city(area)
+            all_inserted += inserted
+            all_parsed += parsed
+        print(f"大律师采集完成: 解析{all_parsed}条, 新增{all_inserted}条")
 
 
 if __name__ == "__main__":
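The rewritten `_crawl_city` stops paging when the site keeps serving the same list page: it fingerprints the first eight detail URLs and compares the result against the previous page. A minimal standalone sketch of that check (illustrative, mirroring the diff's logic with made-up URLs):

```python
# Illustrative sketch of the page-fingerprint stop used in _crawl_city:
# when a page serves the same leading detail URLs as the previous page,
# pagination has looped and the city can be closed out early.
def page_fingerprint(detail_urls: list) -> str:
    return "|".join(detail_urls[:8])

prev = ""
pages = [[f"/lawyer/{i}" for i in range(10)],
         [f"/lawyer/{i}" for i in range(10)]]   # second page repeats the first
for page_urls in pages:
    fp = page_fingerprint(page_urls)
    if fp and fp == prev:
        print("repeated list page, stopping early")
        break
    prev = fp
```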
@@ -0,0 +1,621 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import urllib3

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
from Db import Db

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SITE_NAME = "maxlaw"
LEGACY_DOMAIN = "大律师"
SITE_BASE = "https://m.maxlaw.cn"
CITY_API = "https://js.maxlaw.cn/js/ajax/common/getprovice.js"
CITY_DETAIL_API = "https://js.maxlaw.cn/js/ajax/common/getcity_{province_id}.js"
LIST_URL_TEMPLATE = SITE_BASE + "/law/{city_py}?page={page}"

PHONE_RE = re.compile(r"1[3-9]\d{9}")
ANSWER_RE = re.compile(r"已解答\s*(\d+)\s*次")


@dataclass
class CityTarget:
    province_id: int
    province_name: str
    province_py: str
    city_id: int
    city_name: str
    city_py: str


@dataclass
class ListCard:
    detail_url: str
    name: str = ""
    law_firm: str = ""
    specialties: List[str] = field(default_factory=list)
    answered_count: Optional[int] = None


def clean_prefixed_name(value: str) -> str:
    text = (value or "").strip()
    # The API commonly returns names like "B 北京"
    text = re.sub(r"^[A-Za-z]\s*", "", text)
    return text.strip()


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def parse_json_with_bom(text: str) -> Dict:
    cleaned = (text or "").strip().lstrip("\ufeff")
    return json.loads(cleaned)


class DlsFreshCrawler:
    def __init__(
        self,
        max_pages: int = 3,
        sleep_seconds: float = 0.2,
        use_proxy: bool = True,
        db_connection=None,
    ):
        self.max_pages = max_pages
        self.sleep_seconds = max(0.0, sleep_seconds)
        self.db = db_connection
        self.client = RequestsClient(
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                    "Mobile/15E148 Safari/604.1"
                ),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Connection": "close",
            },
            use_proxy=use_proxy,
            retry_total=2,
            retry_backoff_factor=1,
            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET",),
        )

    def _get_text(self, url: str, timeout: int = 20, max_retries: int = 3) -> str:
        last_error: Optional[Exception] = None
        for attempt in range(max_retries):
            wait_for_request()
            try:
                resp = self.client.get_text(url, timeout=timeout, verify=False)
                code = resp.status_code
                if code == 403:
                    if attempt < max_retries - 1:
                        self.client.refresh()
                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                        continue
                    raise RequestClientError(f"{code} Error: {url}")
                if code >= 500 and attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                if code >= 400:
                    raise RequestClientError(f"{code} Error: {url}")
                return resp.text
            except Exception as exc:
                last_error = exc
                if attempt < max_retries - 1:
                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                    continue
                raise
        if last_error is not None:
            raise last_error
        raise RequestClientError(f"Unknown request error: {url}")

    def discover_cities(self) -> List[CityTarget]:
        province_text = self._get_text(CITY_API)
        province_data = parse_json_with_bom(province_text)
        province_rows = province_data.get("ds", []) or []

        cities: List[CityTarget] = []
        seen_py: Set[str] = set()

        for province in province_rows:
            province_id = int(province.get("id"))
            province_name = clean_prefixed_name(province.get("name", ""))
            province_py = (province.get("py_code") or "").strip()
            if not province_py:
                continue

            city_api = CITY_DETAIL_API.format(province_id=province_id)
            try:
                city_text = self._get_text(city_api)
                city_data = parse_json_with_bom(city_text)
            except Exception as exc:
                print(f"[city] 获取失败 pid={province_id}: {exc}")
                continue

            for city in city_data.get("ds", []) or []:
                city_py = (city.get("py_code") or "").strip()
                if not city_py or city_py in seen_py:
                    continue
                seen_py.add(city_py)
                cities.append(
                    CityTarget(
                        province_id=province_id,
                        province_name=province_name,
                        province_py=province_py,
                        city_id=int(city.get("id")),
                        city_name=clean_prefixed_name(city.get("name", "")),
                        city_py=city_py,
                    )
                )

        return cities

    def parse_list_cards(self, html: str) -> List[ListCard]:
        soup = BeautifulSoup(html, "html.parser")
        cards: List[ListCard] = []
        seen: Set[str] = set()

        for item in soup.select("div.lawyer_list ul.lawyer_ul > li"):
            link = item.select_one("div.lstx a[href]")
            if not link:
                continue
            detail_url = urljoin(SITE_BASE, link.get("href", "").strip())
            if not detail_url or detail_url in seen:
                continue
            seen.add(detail_url)

            name = ""
            law_firm = ""
            specialties: List[str] = []
            answered_count = None

            name_tag = item.select_one("p.name")
            if name_tag:
                name = name_tag.get_text(strip=True)

            firm_tag = item.select_one("div.li_r h2")
            if firm_tag:
                law_firm = firm_tag.get_text(strip=True)

            for span in item.select("div.zc span"):
                text = span.get_text(strip=True)
                if text:
                    specialties.append(text)

            distance_text = item.select_one("div.distance i")
            if distance_text:
                match = ANSWER_RE.search(distance_text.get_text(" ", strip=True))
                if match:
                    answered_count = int(match.group(1))

            cards.append(
                ListCard(
                    detail_url=detail_url,
                    name=name,
                    law_firm=law_firm,
                    specialties=specialties,
                    answered_count=answered_count,
                )
            )
        return cards

    def has_next_page(self, html: str) -> bool:
        soup = BeautifulSoup(html, "html.parser")
        return soup.select_one("a.mnext") is not None

    def parse_detail(self, detail_url: str) -> Dict:
        html = self._get_text(detail_url)
        soup = BeautifulSoup(html, "html.parser")

        name = ""
        law_firm = ""
        license_no = ""
        practice_years = None
        phone = ""
        email = ""
        address = ""
        specialties: List[str] = []

        name_tag = soup.select_one("h2.lawyerName")
        if name_tag:
            name = name_tag.get_text(strip=True)

        firm_tag = soup.select_one("p.law-firm")
        if firm_tag:
            law_firm = firm_tag.get_text(strip=True)

        license_tag = soup.select_one("p.card-zyz")
        if license_tag:
            license_no = (
                license_tag.get_text(" ", strip=True)
                .replace("执业证号:", "")
                .replace("执业证号:", "")
                .strip()
            )

        years_tag = soup.select_one("div#practice i")
        if years_tag:
            year_text = years_tag.get_text(strip=True)
            if year_text.isdigit():
                practice_years = int(year_text)

        tel_tag = soup.select_one("a[href^='tel:']")
        if tel_tag:
            phone = normalize_phone(tel_tag.get("href", ""))

        for li in soup.select("ul.contact-content > li"):
            key = li.select_one("i")
            val = li.select_one("p")
            if not key or not val:
                continue
            k = key.get_text(strip=True).replace(":", ":")
            v = val.get_text(" ", strip=True)
            if "电话" in k and not phone:
                phone = normalize_phone(v)
            elif "邮箱" in k and not email:
                email = v.strip()
            elif "地址" in k and not address:
                address = v.strip()

        for node in soup.select("div.exp-main li.on"):
            text = node.get_text(strip=True)
            if text:
                specialties.append(text)

        return {
            "name": name,
            "law_firm": law_firm,
            "license_no": license_no,
            "practice_years": practice_years,
            "phone": phone,
            "email": email,
            "address": address,
            "specialties": specialties,
            "detail_url": detail_url,
        }

    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
        profile = record.get("profile", {}) or {}
        source = record.get("source", {}) or {}

        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            return None

        province = (source.get("province") or "").strip()
        city = (source.get("city") or province).strip()
        return {
            "name": (profile.get("name") or "").strip(),
            "law_firm": (profile.get("law_firm") or "").strip(),
            "province": province,
            "city": city,
            "phone": phone,
            "url": (source.get("detail_url") or "").strip(),
            "domain": LEGACY_DOMAIN,
            "create_time": int(record.get("collected_at") or time.time()),
            "params": json.dumps(record, ensure_ascii=False),
        }

    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
        if not self.db or not phones:
            return set()
        deduped = sorted({p for p in phones if p})
        if not deduped:
            return set()

        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(deduped), chunk_size):
                chunk = deduped[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
        if not self.db:
            return 0, 0

        rows: List[Dict[str, str]] = []
        for record in records:
            row = self._to_legacy_lawyer_row(record)
            if row:
                rows.append(row)
        if not rows:
            return 0, 0

        existing = self._existing_phones_in_db([row["phone"] for row in rows])
        inserted = 0
        skipped = 0

        for row in rows:
            phone = row.get("phone", "")
            if not phone or phone in existing:
                skipped += 1
                continue
            try:
                self.db.insert_data("lawyer", row)
                existing.add(phone)
                inserted += 1
            except Exception as exc:
                skipped += 1
                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
        return inserted, skipped

    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
        # Dedupe within a city so pagination loops do not re-fetch the same lawyers
        seen_detail_urls: Set[str] = set()
        last_page_signature: Tuple[str, ...] = tuple()
        repeated_signature_pages = 0
        no_new_pages = 0

        for page in range(1, self.max_pages + 1):
            list_url = LIST_URL_TEMPLATE.format(city_py=target.city_py, page=page)
            try:
                html = self._get_text(list_url)
            except Exception as exc:
                print(f"[list] 失败 {list_url}: {exc}")
                break

            cards = self.parse_list_cards(html)
            if not cards:
                break

            page_signature = tuple(sorted(card.detail_url for card in cards if card.detail_url))
            if page_signature and page_signature == last_page_signature:
                repeated_signature_pages += 1
            else:
                repeated_signature_pages = 0
            last_page_signature = page_signature

            if repeated_signature_pages >= 2:
                print(
                    f"[list] 城市 {target.city_py} 第{page}页列表签名重复,提前结束,"
                    f"list_url={list_url}"
                )
                break

            fresh_cards: List[ListCard] = []
            for card in cards:
                if not card.detail_url:
                    continue
                if card.detail_url in seen_detail_urls:
                    continue
                seen_detail_urls.add(card.detail_url)
                fresh_cards.append(card)

            if not fresh_cards:
                no_new_pages += 1
                if no_new_pages >= 3:
                    print(
                        f"[list] 城市 {target.city_py} 连续{no_new_pages}页无新增律师,提前结束,"
                        f"list_url={list_url}"
                    )
                    break
            else:
                no_new_pages = 0

            print(
                f"[page] city={target.city_py} page={page} cards={len(cards)} "
                f"fresh={len(fresh_cards)} next={self.has_next_page(html)}"
            )

            for card in fresh_cards:
                try:
                    detail = self.parse_detail(card.detail_url)
                except Exception as exc:
                    print(f"[detail] 失败 {card.detail_url}: {exc}")
                    continue

                now = int(time.time())
                record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()
                yield {
                    "record_id": record_id,
                    "collected_at": now,
                    "source": {
                        "site": SITE_NAME,
                        "list_url": list_url,
                        "detail_url": card.detail_url,
                        "province": target.province_name,
                        "province_py": target.province_py,
                        "city": target.city_name,
                        "city_py": target.city_py,
                        "page": page,
                    },
                    "list_snapshot": {
                        "name": card.name,
                        "law_firm": card.law_firm,
                        "specialties": card.specialties,
                        "answered_count": card.answered_count,
                    },
                    "profile": {
                        "name": detail.get("name") or card.name,
                        "law_firm": detail.get("law_firm") or card.law_firm,
                        "phone": detail.get("phone", ""),
                        "license_no": detail.get("license_no", ""),
                        "practice_years": detail.get("practice_years"),
                        "email": detail.get("email", ""),
                        "address": detail.get("address", ""),
                        "specialties": detail.get("specialties") or card.specialties,
                    },
                }
                if self.sleep_seconds:
                    time.sleep(self.sleep_seconds)

            if not self.has_next_page(html):
                break

    def crawl(
        self,
        output_path: str,
        max_cities: int = 0,
        city_filter: Optional[str] = None,
    ) -> None:
        cities = self.discover_cities()
        print(f"[discover] 共发现城市 {len(cities)} 个")
        if city_filter:
            key = city_filter.strip().lower()
            cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
        if max_cities > 0:
            cities = cities[:max_cities]
            print(f"[discover] 截断城市数 {len(cities)}")

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

        seen_ids: Set[str] = set()
        if os.path.exists(output_path):
            with open(output_path, "r", encoding="utf-8") as old_file:
                for line in old_file:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        item = json.loads(line)
                    except Exception:
                        continue
                    rid = item.get("record_id")
                    if rid:
                        seen_ids.add(rid)
            print(f"[resume] 已有记录 {len(seen_ids)} 条")

        total_new_json = 0
        total_new_db = 0
        total_skip_db = 0
        with open(output_path, "a", encoding="utf-8") as out:
            for idx, target in enumerate(cities, start=1):
                print(
                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
                    f"({target.city_py})"
                )
                city_records = list(self.crawl_city(target))

                city_new_json = 0
                for record in city_records:
                    rid = record["record_id"]
                    if rid in seen_ids:
                        continue
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                    seen_ids.add(rid)
                    city_new_json += 1
                    total_new_json += 1

                city_new_db, city_skip_db = self._write_records_to_db(city_records)
                total_new_db += city_new_db
                total_skip_db += city_skip_db

                print(
                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
                )
        print(
            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
            f"DB跳过{total_skip_db}条, 输出: {output_path}"
        )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="大律师全新采集脚本(新数据结构)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/dls_records_all.jsonl",
        help="输出 jsonl 文件路径",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="最多采集多少个城市,0 表示不限",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="每个城市最多采集多少页",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="按城市拼音或城市名过滤,如 beijing",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.2,
        help="详情页请求间隔秒数",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="直连模式,不使用 proxy_settings.json 代理",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="只输出 JSONL,不写入数据库",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    if args.no_db:
        crawler = DlsFreshCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=None,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        return

    with Db() as db:
        crawler = DlsFreshCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )


if __name__ == "__main__":
    main()
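The crawler resumes by replaying `record_id`s from the existing JSONL file, where `record_id` is the md5 hex digest of the detail URL. A small reader sketch (illustrative; the path is the script's `--output` default):

```python
# Illustrative reader for the crawler's JSONL output (path is the default).
import hashlib
import json

seen = set()
with open("/www/wwwroot/lawyers/data/dls_records_all.jsonl", encoding="utf-8") as fh:
    for line in fh:
        record = json.loads(line)
        # record_id is the md5 hex digest of the detail URL
        url = record["source"]["detail_url"]
        assert record["record_id"] == hashlib.md5(url.encode("utf-8")).hexdigest()
        seen.add(record["record_id"])
print(f"{len(seen)} unique lawyer records")
```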
@@ -0,0 +1,290 @@
#!/usr/bin/env python3
import argparse
import json
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Optional

import pymysql
from openpyxl import Workbook
from openpyxl.styles import Font

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from Db import Db


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
    parser.add_argument(
        "--output",
        default="",
        help="输出 xlsx 文件路径,默认输出到 data/export_lawyers_时间戳.xlsx",
    )
    parser.add_argument(
        "--start-ts",
        type=int,
        default=None,
        help="create_time 起始时间戳(含),不传时默认取最近7天",
    )
    parser.add_argument(
        "--end-ts",
        type=int,
        default=None,
        help="create_time 结束时间戳(含),默认不限制上限",
    )
    parser.add_argument(
        "--domain",
        default="",
        help="按 domain 过滤,例如:大律师 / 找法网 / 华律",
    )
    parser.add_argument(
        "--province",
        default="",
        help="按省份过滤,例如:北京、广东",
    )
    parser.add_argument(
        "--city",
        default="",
        help="按城市过滤,例如:北京、深圳",
    )
    parser.add_argument(
        "--keyword",
        default="",
        help="关键词过滤(匹配姓名/律所/手机号)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="最多导出多少条,0 表示不限",
    )
    parser.add_argument(
        "--include-extra",
        action="store_true",
        help="导出更多扩展字段(url/domain/create_time/site_time 等)",
    )
    parser.add_argument(
        "--no-parse-params",
        action="store_true",
        help="关闭 params JSON 扩展信息解析(默认开启)",
    )
    return parser.parse_args()


def apply_default_time_filter(args: argparse.Namespace) -> None:
    # Default to the last 7 days when no explicit time range is given
    if args.start_ts is None and args.end_ts is None:
        args.start_ts = int(time.time()) - 7 * 24 * 3600
        args.end_ts = 0
        return
    if args.start_ts is None:
        args.start_ts = 0
    if args.end_ts is None:
        args.end_ts = 0


def build_output_path(user_output: str) -> str:
    if user_output:
        return os.path.abspath(user_output)
    ts = int(time.time())
    return os.path.abspath(f"/www/wwwroot/lawyers/data/export_lawyers_{ts}.xlsx")


def ts_to_text(ts_value: Optional[int]) -> str:
    if ts_value in (None, 0, ""):
        return ""
    try:
        return datetime.fromtimestamp(int(ts_value)).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return ""


def build_query(args: argparse.Namespace) -> (str, List):
    where: List[str] = []
    params: List = []

    if args.start_ts > 0:
        where.append("create_time >= %s")
        params.append(args.start_ts)
    if args.end_ts > 0:
        where.append("create_time <= %s")
        params.append(args.end_ts)
    if args.domain.strip():
        where.append("domain = %s")
        params.append(args.domain.strip())
    if args.province.strip():
        where.append("province = %s")
        params.append(args.province.strip())
    if args.city.strip():
        where.append("city = %s")
        params.append(args.city.strip())
    if args.keyword.strip():
        like = f"%{args.keyword.strip()}%"
        where.append("(name LIKE %s OR law_firm LIKE %s OR phone LIKE %s)")
        params.extend([like, like, like])

    where_sql = f"WHERE {' AND '.join(where)}" if where else ""
    limit_sql = f"LIMIT {int(args.limit)}" if args.limit and args.limit > 0 else ""
    sql = (
        "SELECT id, name, phone, law_firm, province, city, url, domain, "
        "create_time, site_time, params "
        f"FROM lawyer {where_sql} ORDER BY id ASC {limit_sql}"
    )
    return sql, params


def parse_params(params_text: str) -> Dict[str, str]:
    if not params_text:
        return {}
    try:
        data = json.loads(params_text)
    except Exception:
        return {}
    if not isinstance(data, dict):
        return {}

    profile = data.get("profile") or {}
    source = data.get("source") or {}
    if not isinstance(profile, dict):
        profile = {}
    if not isinstance(source, dict):
        source = {}
    specialties = profile.get("specialties")
    if isinstance(specialties, list):
        specialties_text = ",".join(str(x) for x in specialties if x)
    else:
        specialties_text = ""

    return {
        "email": str(profile.get("email") or ""),
        "address": str(profile.get("address") or ""),
        "license_no": str(profile.get("license_no") or ""),
        "practice_years": str(profile.get("practice_years") or ""),
        "specialties": specialties_text,
        "source_site": str(source.get("site") or ""),
        "detail_url": str(source.get("detail_url") or ""),
        "list_url": str(source.get("list_url") or ""),
    }


def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, parse_params_flag: bool) -> int:
    wb = Workbook()
    ws = wb.active
    ws.title = "lawyers"

    headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"]
    if include_extra:
        headers.extend(
            [
                "URL",
                "站点",
                "create_time",
                "create_time_text",
                "site_time",
                "site_time_text",
                "ID",
            ]
        )
    if parse_params_flag:
        headers.extend(
            [
                "邮箱",
                "地址",
                "执业证号",
                "执业年限",
                "擅长领域",
                "source_site",
                "detail_url",
                "list_url",
            ]
        )

    ws.append(headers)
    for cell in ws[1]:
        cell.font = Font(bold=True)

    exported = 0
    for row in rows:
        info = parse_params(row.get("params", "") or "") if parse_params_flag else {}
        site_name = info.get("source_site") or (row.get("domain", "") or "")
        line = [
            row.get("phone", "") or "",
            row.get("name", "") or "",
            row.get("law_firm", "") or "",
            row.get("province", "") or "",
            row.get("city", "") or "",
            site_name,
            row.get("domain", "") or "",
        ]

        if include_extra:
            line.extend(
                [
                    row.get("url", "") or "",
                    row.get("domain", "") or "",
                    row.get("create_time", "") or "",
                    ts_to_text(row.get("create_time")),
                    row.get("site_time", "") or "",
                    ts_to_text(row.get("site_time")),
                    row.get("id", "") or "",
                ]
            )

        if parse_params_flag:
            line.extend(
                [
                    info.get("email", ""),
                    info.get("address", ""),
                    info.get("license_no", ""),
                    info.get("practice_years", ""),
                    info.get("specialties", ""),
                    info.get("source_site", ""),
                    info.get("detail_url", ""),
                    info.get("list_url", ""),
                ]
            )

        ws.append(line)
        exported += 1

    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    wb.save(output_path)
    return exported


def main() -> None:
    args = parse_args()
    apply_default_time_filter(args)
    output_path = build_output_path(args.output)
    sql, sql_params = build_query(args)

    with Db() as db:
        cursor = db.db.cursor(pymysql.cursors.DictCursor)
        try:
            cursor.execute(sql, sql_params)
            rows = cursor.fetchall()
        finally:
            cursor.close()

    count = export_to_excel(
        rows=rows,
        output_path=output_path,
        include_extra=args.include_extra,
        parse_params_flag=not args.no_parse_params,
    )

    print(f"[export] 导出完成,共 {count} 条")
    print(f"[export] 文件路径: {output_path}")
    print(
        f"[export] 时间筛选 create_time: start={args.start_ts or '-'} end={args.end_ts or '-'}"
    )


if __name__ == "__main__":
    main()
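To sanity-check an export, the workbook can be reopened with openpyxl (illustrative; the filename matches the README example):

```python
# Illustrative check of an exported workbook (filename is an example).
from openpyxl import load_workbook

wb = load_workbook("./data/lawyers_20260302.xlsx", read_only=True)
ws = wb["lawyers"]
print(f"{ws.max_row - 1} data rows")    # minus the bold header row
for row in ws.iter_rows(min_row=2, max_row=4, values_only=True):
    print(row[:3])                       # phone, name, law firm columns
```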
+420
-150
@@ -1,9 +1,16 @@
+import argparse
+import ast
+import hashlib
 import json
 import os
+import random
+import re
 import sys
 import time
-import random
-from typing import Dict, List, Set, Optional
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+
+import urllib3
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -13,21 +20,50 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
     sys.path.append(project_root)
 
-from request.requests_client import RequestClientError, RequestSSLError, RequestsClient
 from Db import Db
+from request.requests_client import RequestClientError, RequestsClient
+from utils.rate_limiter import wait_for_request
 
-DOMAIN = "找法网"
-LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+SITE_NAME = "findlaw"
+LEGACY_DOMAIN = "找法网"
+SITE_BASE = "https://m.findlaw.cn"
+CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
+LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
+
+PHONE_RE = re.compile(r"1[3-9]\d{9}")
+
+
-class FindlawSpider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    province_id: str
+    province_name: str
+    province_py: str
+    city_id: str
+    city_name: str
+    city_py: str
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
+class FindlawCrawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.1,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.cities = self._load_cities()
-
-    def _build_session(self) -> RequestsClient:
-        return RequestsClient(headers={
+        self.client = RequestsClient(
+            headers={
                 "User-Agent": (
                     "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                     "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
@@ -36,174 +72,408 @@ class FindlawSpider:
                 "Accept": "application/json, text/javascript, */*; q=0.01",
                 "X-Requested-With": "XMLHttpRequest",
                 "Connection": "close",
-        })
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET",),
+        )
 
-    def _refresh_session(self) -> None:
-        self.client.refresh()
-
-    def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
+    def _get_text(
+        self,
+        url: str,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
         headers = {"Referer": referer}
+        last_error: Optional[Exception] = None
+
         for attempt in range(max_retries):
+            wait_for_request()
             try:
-                resp = self.client.get_text(url, timeout=15, verify=verify, headers=headers)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
+                resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
+                code = resp.status_code
+                if code == 403:
                     if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
-                        self._refresh_session()
-                        time.sleep(wait_time)
+                        self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                         continue
-                    print(f"请求失败 {url}: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error: {url}")
-                return text
-            except RequestSSLError:
-                if verify:
-                    return self._get(url, referer, verify=False, max_retries=max_retries)
-                print(f"SSL错误 {url}")
-                return None
-            except RequestClientError as exc:
-                print(f"请求失败 {url}: {exc}")
-                return None
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise
+
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")
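The `_get_text` rewrite centralizes the 403-refresh and exponential-backoff behavior that the old `_get` spread across branches. A standalone sketch of that loop, with `fetch` and `refresh_session` as stand-in callables (they are not part of the repo):

```python
# Sketch of the retry loop: fetch(url) -> (status, text) and refresh_session()
# are hypothetical stand-ins for the RequestsClient calls used above.
import random
import time

def get_with_retry(fetch, refresh_session, url, max_retries=3):
    last_error = None
    for attempt in range(max_retries):
        try:
            status, text = fetch(url)
            if status == 403 and attempt < max_retries - 1:
                refresh_session()  # rotate the session before retrying
                time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                continue
            if status >= 400:
                raise RuntimeError(f"{status} Error: {url}")
            return text
        except Exception as exc:
            last_error = exc
            if attempt < max_retries - 1:
                time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
                continue
            raise
    raise last_error or RuntimeError(f"Unknown request error: {url}")
```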
+
+    def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
+        pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
+        match = re.search(pattern, script_text)
+        if not match:
+            return []
+        raw = match.group(1)
+        try:
+            rows = ast.literal_eval(raw)
+            return rows if isinstance(rows, list) else []
+        except Exception:
+            return []
+
+    def discover_cities(self) -> List[CityTarget]:
+        js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
+        provinces = self._parse_city_js_array(js_text, "iosProvinces")
+        cities = self._parse_city_js_array(js_text, "iosCitys")
+
+        province_map: Dict[str, Dict] = {}
+        for item in provinces:
+            pid = str(item.get("id") or "").strip()
+            if pid:
+                province_map[pid] = item
+
+        results: List[CityTarget] = []
+        seen_py: Set[str] = set()
+        for city in cities:
+            city_py = str(city.get("pinyin") or "").strip()
+            city_name = str(city.get("value") or "").strip()
+            city_id = str(city.get("id") or "").strip()
+            province_id = str(city.get("parentId") or "").strip()
+            if not city_py or not city_name or not city_id:
+                continue
+            if city_py in seen_py:
+                continue
+            seen_py.add(city_py)
+
+            province_row = province_map.get(province_id, {})
+            province_name = str(province_row.get("value") or city_name).strip()
+            province_py = str(province_row.get("pinyin") or city_py).strip()
+
+            results.append(
+                CityTarget(
+                    province_id=province_id,
+                    province_name=province_name,
+                    province_py=province_py,
+                    city_id=city_id,
+                    city_name=city_name,
+                    city_py=city_py,
+                )
+            )
+        return results
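`_parse_city_js_array` leans on the fact that the site's area bundle serializes JSON-compatible literals, so `ast.literal_eval` can parse what the regex captures. A toy run with an invented JS snippet:

```python
# The script content here is made up for illustration; the regex mirrors
# _parse_city_js_array above with var_name fixed to "iosCitys".
import ast
import re

script = 'var iosCitys = [{"id": "110100", "value": "北京", "pinyin": "beijing", "parentId": "110000"}];'

match = re.search(r"var\s+iosCitys\s*=\s*(\[[\s\S]*?\]);", script)
if match:
    rows = ast.literal_eval(match.group(1))  # JSON-like literals are valid Python
    print(rows[0]["pinyin"])  # -> beijing
```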
+
+    def _parse_list_payload(self, text: str) -> Dict:
+        cleaned = (text or "").strip().lstrip("\ufeff")
+        try:
+            return json.loads(cleaned)
+        except ValueError:
+            start = cleaned.find("{")
+            end = cleaned.rfind("}")
+            if start == -1 or end == -1:
+                return {}
+            return json.loads(cleaned[start:end + 1])
+
+    def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
+        list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
+        referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
+        text = self._get_text(list_url, referer=referer)
+        payload = self._parse_list_payload(text)
+        if payload.get("errcode") != 0:
+            return [], False, list_url
+
+        data = payload.get("data", {}) or {}
+        items = data.get("lawyer_list", []) or []
+        has_more = str(data.get("has_more", "0")) == "1"
+        return items, has_more, list_url
+
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        for page in range(1, self.max_pages + 1):
+            try:
+                items, has_more, list_url = self.fetch_list_page(target.city_py, page)
+            except Exception as exc:
+                print(f"[list] 失败 {target.city_py} p{page}: {exc}")
+                break
+
+            if not items:
+                break
+
+            for item in items:
+                detail_url = item.get("siteask_m") or item.get("site_url") or ""
+                detail_url = str(detail_url).strip()
+                if not detail_url.startswith("http"):
+                    detail_url = list_url
+
+                phone = normalize_phone(item.get("mobile", ""))
+                profile = {
+                    "uid": str(item.get("uid") or ""),
+                    "name": str(item.get("username") or "").strip(),
+                    "law_firm": str(item.get("lawyer_lawroom") or "").strip(),
+                    "phone": phone,
+                    "lawyer_year": item.get("lawyer_year"),
+                    "service_area": str(item.get("service_area") or "").strip(),
+                    "address": str(item.get("addr") or "").strip(),
+                    "specialties": item.get("professionArr") or [],
+                    "answer_count": item.get("ansnum"),
+                    "comment_count": item.get("askcommentnum"),
+                }
+
+                now = int(time.time())
+                uid = profile.get("uid", "")
+                record_key = uid or detail_url
+                record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()
+
+                area = item.get("areaInfo", {}) or {}
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "list_url": list_url,
+                        "detail_url": detail_url,
+                        "province": str(area.get("province") or target.province_name),
+                        "province_py": target.province_py,
+                        "city": str(area.get("city") or target.city_name),
+                        "city_py": target.city_py,
+                        "page": page,
+                    },
+                    "list_snapshot": {
+                        "uid": uid,
+                        "name": profile["name"],
+                        "law_firm": profile["law_firm"],
+                        "answer_count": profile["answer_count"],
+                        "comment_count": profile["comment_count"],
+                    },
+                    "profile": profile,
+                    "raw": item,
+                }
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)
+
+            if not has_more:
+                break
+
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
             return None
 
-    def _existing_phones(self, phones: List[str]) -> Set[str]:
-        if not phones:
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
+        }
+
+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
             return set()
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
         existing: Set[str] = set()
         cur = self.db.db.cursor()
         try:
             chunk_size = 500
-            for i in range(0, len(phones), chunk_size):
-                chunk = phones[i:i + chunk_size]
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
                 placeholders = ",".join(["%s"] * len(chunk))
                 sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
-                cur.execute(sql, [DOMAIN, *chunk])
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                 for row in cur.fetchall():
                     existing.add(row[0])
         finally:
             cur.close()
         return existing
 
-    def _load_cities(self):
-        condition = "domain='findlaw' AND level=2"
-        tables = ("area_new", "area2", "area")
-        last_error = None
-        for table in tables:
-            try:
-                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
-            except Exception as exc:
-                last_error = exc
-                continue
-            if rows:
-                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
-                print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
-                return rows
-        if last_error:
-            print(f"[找法网] 加载地区数据失败: {last_error}")
-        print("[找法网] 无城市数据(已尝试 area_new/area2/area)")
-        for table in tables:
-            try:
-                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
-                c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
-                print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+        return inserted, skipped
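`_existing_phones_in_db` batches the lookup so the `IN` list stays bounded at 500 entries per query. The same idea in isolation (any DB-API cursor works; table and column names match the code above):

```python
# Sketch: cur is any DB-API cursor; domain corresponds to LEGACY_DOMAIN above.
def existing_phones(cur, domain, phones, chunk_size=500):
    deduped = sorted({p for p in phones if p})
    found = set()
    for i in range(0, len(deduped), chunk_size):
        chunk = deduped[i:i + chunk_size]
        placeholders = ",".join(["%s"] * len(chunk))
        cur.execute(
            f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})",
            [domain, *chunk],
        )
        found.update(row[0] for row in cur.fetchall())
    return found
```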
-            except Exception:
-                pass
-        return []
 
+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现城市 {len(cities)} 个")
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
+            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断城市数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+        print(f"[resume] 已有记录 {len(seen_ids)} 条")
+
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0
+
-    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
-        text = self._get(url, referer, verify=True)
-        if not text:
-            return []
-        try:
-            # 某些返回体前会携带 BOM 或包装脚本,此处做兼容
-            text = text.strip().lstrip("\ufeff")
-            try:
-                data = json.loads(text)
-            except ValueError:
-                json_start = text.find('{')
-                json_end = text.rfind('}')
-                if json_start == -1 or json_end == -1:
-                    print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
-                    return []
-                cleaned = text[json_start:json_end + 1]
-                data = json.loads(cleaned)
-                if isinstance(data, str):
-                    try:
-                        data = json.loads(data)
-                    except ValueError:
-                        print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}")
-                        return []
-        except ValueError as exc:
-            print(f"解析JSON失败 {url}: {exc}")
-            return []
-
-        items = data.get("data", {}).get("lawyer_list", [])
-        parsed = []
-        for item in items:
-            phone = (item.get("mobile") or "").replace("-", "")
-            parsed.append({
-                "name": item.get("username", ""),
-                "law_firm": item.get("lawyer_lawroom", ""),
-                "province": item.get("areaInfo", {}).get("province", ""),
-                "city": item.get("areaInfo", {}).get("city", ""),
-                "phone": phone,
-                "url": url,
-                "domain": DOMAIN,
-                "create_time": int(time.time()),
-                "params": json.dumps(item, ensure_ascii=False)
-            })
-        return parsed
-
-    def run(self):
-        print("启动找法网采集...")
-        if not self.cities:
-            print("无城市数据")
-            return
-        for city in self.cities:
-            pinyin = city.get("pinyin")
-            province = city.get("province", "")
-            city_name = city.get("city", "")
-            if not pinyin:
-                continue
-            print(f"采集 {province}-{city_name}")
-            page = 1
-            while True:
-                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
-                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
-                print(f"  第 {page} 页: {url}")
-                items = self._fetch_page(url, referer)
-                if not items:
-                    break
-                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
-                existing = self._existing_phones(phones)
-                for entry in items:
-                    phone = entry.get("phone")
-                    if not phone:
-                        continue
-                    if phone in existing:
-                        print(f"  -- 已存在: {entry['name']} ({phone})")
-                        continue
-                    try:
-                        self.db.insert_data("lawyer", entry)
-                        print(f"  -> 新增: {entry['name']} ({phone})")
-                    except Exception as exc:
-                        print(f"  插入失败: {exc}")
-                page += 1
-        print("找法网采集完成")
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"({target.city_py})"
+                )
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)")
+    parser.add_argument(
+        "--output",
+        default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
+        help="输出 jsonl 文件路径",
+    )
+    parser.add_argument(
+        "--max-cities",
+        type=int,
+        default=0,
+        help="最多采集多少个城市,0 表示不限",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=9999,
+        help="每个城市最多采集多少页",
+    )
+    parser.add_argument(
+        "--city-filter",
+        default="",
+        help="按城市拼音或城市名过滤,如 beijing",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.1,
+        help="每条记录采集间隔秒数",
+    )
+    parser.add_argument(
+        "--direct",
+        action="store_true",
+        help="直连模式,不使用 proxy_settings.json 代理",
+    )
+    parser.add_argument(
+        "--no-db",
+        action="store_true",
+        help="只输出 JSONL,不写入数据库",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if args.no_db:
+        crawler = FindlawCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=None,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
         return
+
+    with Db() as db:
+        crawler = FindlawCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=db,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
 
 if __name__ == "__main__":
-    with Db() as db:
-        spider = FindlawSpider(db)
-        spider.run()
+    main()
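Restart safety in `crawl()` comes from two pieces: a deterministic `record_id` (md5 of a stable key) and a JSONL scan that rebuilds the seen-set before appending. A condensed sketch of both:

```python
# Sketch: the JSONL path is illustrative; record layout matches the yield above.
import hashlib
import json
import os

def load_seen(path):
    seen = set()
    if not os.path.exists(path):
        return seen
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except ValueError:
                continue
            rid = item.get("record_id") if isinstance(item, dict) else None
            if rid:
                seen.add(rid)
    return seen

# record_id stays stable across runs: md5 of the lawyer uid (or detail URL)
record_id = hashlib.md5("12345".encode("utf-8")).hexdigest()
print(record_id in load_seen("/tmp/findlaw_records_all.jsonl"))
```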
+609
-275
@@ -1,10 +1,18 @@
+import argparse
+import ast
+import hashlib
 import json
 import os
+import random
 import re
 import sys
 import time
-import random
-from typing import Dict, Optional
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+from urllib.parse import urljoin
+
+import urllib3
+from bs4 import BeautifulSoup
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -14,312 +22,638 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
     sys.path.append(project_root)
 
-from bs4 import BeautifulSoup
-from request.requests_client import RequestClientError, RequestsClient
-
 from Db import Db
-from config import HEADERS
+from request.requests_client import RequestClientError, RequestsClient
+from utils.rate_limiter import wait_for_request
 
-LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
-DOMAIN = "华律"
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+SITE_NAME = "hualv"
+LEGACY_DOMAIN = "华律"
+SITE_BASE = "https://m.66law.cn"
+CITY_DATA_URL = "https://cache.66law.cn/dist/main-v2.0.js"
+LIST_API_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
+
+PHONE_RE = re.compile(r"1[3-9]\d{9}")
+EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
+YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
+
+
-class HualvSpider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    province_id: int
+    province_name: str
+    city_id: int
+    city_name: str
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
+def strip_html_tags(text: str) -> str:
+    return re.sub(r"<[^>]+>", "", text or "").strip()
+
+
+class HualvCrawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.15,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.areas = self._load_areas()
-
-    def _build_session(self) -> RequestsClient:
-        custom_headers = HEADERS.copy()
-        custom_headers['User-Agent'] = (
-            'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
-            'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
-            'Mobile/15E148 Safari/604.1'
-        )
-        custom_headers["Connection"] = "close"
-        return RequestsClient(headers=custom_headers)
-
-    def _refresh_session(self) -> None:
-        self.client.refresh()
+        self.client = RequestsClient(
+            headers={
+                "User-Agent": (
+                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+                    "Mobile/15E148 Safari/604.1"
+                ),
+                "Accept": "application/json, text/javascript, */*; q=0.01",
+                "X-Requested-With": "XMLHttpRequest",
+                "Connection": "close",
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET", "POST"),
+        )
 
-    def _load_areas(self):
-        tables = ("area_new", "area2", "area")
-        last_error = None
-        for table in tables:
-            try:
-                provinces = self.db.select_data(
-                    table,
-                    "code, province, pinyin, id",
-                    "domain='66law' AND level=1"
-                ) or []
-                cities = self.db.select_data(
-                    table,
-                    "code, city, province, pid",
-                    "domain='66law' AND level=2"
-                ) or []
-            except Exception as exc:
-                last_error = exc
-                continue
-            if not cities:
-                continue
-            province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces}
-            city_map = {}
-            for city in cities:
-                province_info = province_map.get(city.get('pid'), {}) or {}
-                province_code = province_info.get('code')
-                city_map[city.get('code')] = {
-                    "name": city.get('city'),
-                    "province": city.get('province'),
-                    "province_code": province_code,
-                }
-            print(f"[华律] 城市来源表: {table}, 城市数: {len(cities)}")
-            return city_map
-        if last_error:
-            print(f"[华律] 加载地区数据失败: {last_error}")
-        print("[华律] 无城市数据(已尝试 area_new/area2/area)")
-        return {}
-
-    def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
-        for attempt in range(max_retries):
-            try:
-                resp = self.client.post_text(LIST_URL, data=data, timeout=20, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        self._refresh_session()
-                        time.sleep(wait_time)
-                        continue
-                    print("请求失败: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error")
-                try:
-                    return json.loads(text)
-                except ValueError as exc:
-                    print(f"解析JSON失败: {exc}")
-                    return None
-            except RequestClientError as exc:
-                print(f"请求失败: {exc}")
-                return None
-        return None
+    def _request_text(
+        self,
+        method: str,
+        url: str,
+        *,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+        data: Optional[Dict] = None,
+    ) -> str:
+        headers = {"Referer": referer}
+        last_error: Optional[Exception] = None
+
+        for attempt in range(max_retries):
+            wait_for_request()
+            try:
+                if method.upper() == "POST":
+                    resp = self.client.post_text(
+                        url,
+                        timeout=timeout,
+                        verify=False,
+                        headers=headers,
+                        data=data,
+                    )
+                else:
+                    resp = self.client.get_text(
+                        url,
+                        timeout=timeout,
+                        verify=False,
+                        headers=headers,
+                    )
+
+                code = resp.status_code
+                if code == 403:
+                    if attempt < max_retries - 1:
+                        self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                        continue
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise
+
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")
+
+    def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
+        return self._request_text(
+            "GET",
+            url,
+            timeout=timeout,
+            max_retries=max_retries,
+            referer=referer,
+        )
+
+    def _post_text(
+        self,
+        url: str,
+        *,
+        data: Dict,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
+        return self._request_text(
+            "POST",
+            url,
+            timeout=timeout,
+            max_retries=max_retries,
+            referer=referer,
+            data=data,
+        )
+
+    def _extract_spc_location(self, script_text: str) -> List:
+        # main-v2.js 内置了 sPCLocation=new Array(...),后面紧跟 cateinfo 数组
+        marker = "sPCLocation = new Array("
+        start = script_text.find(marker)
+        if start == -1:
+            marker = "sPCLocation=new Array("
+            start = script_text.find(marker)
+        if start == -1:
+            return []
+        start += len(marker)
+
+        next_marker = script_text.find("cateinfo = new Array(", start)
+        if next_marker == -1:
+            next_marker = script_text.find("cateinfo=new Array(", start)
+
+        if next_marker != -1:
+            end = script_text.rfind(");", start, next_marker)
+        else:
+            end = script_text.find(");", start)
+
+        if end == -1 or end <= start:
+            return []
+
+        raw = "[" + script_text[start:end] + "]"
+        try:
+            data = ast.literal_eval(raw)
+        except Exception:
+            return []
+        return data if isinstance(data, list) else []
+
+    def discover_cities(self) -> List[CityTarget]:
+        script_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyer/")
+        rows = self._extract_spc_location(script_text)
+
+        targets: List[CityTarget] = []
+        seen: Set[Tuple[int, int]] = set()
+
+        for province in rows:
+            if not isinstance(province, list) or len(province) < 3:
+                continue
+            try:
+                province_id = int(province[0])
+            except Exception:
+                continue
+            province_name = str(province[1] or "").strip()
+            city_rows = province[2] if isinstance(province[2], list) else []
+
+            for city in city_rows:
+                if not isinstance(city, list) or len(city) < 2:
+                    continue
+                try:
+                    city_id = int(city[0])
+                except Exception:
+                    continue
+                city_name = str(city[1] or "").strip()
+                if city_id <= 0 or not city_name:
+                    continue
+
+                key = (province_id, city_id)
+                if key in seen:
+                    continue
+                seen.add(key)
+
+                targets.append(
+                    CityTarget(
+                        province_id=province_id,
+                        province_name=province_name,
+                        city_id=city_id,
+                        city_name=city_name,
+                    )
+                )
+        return targets
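`_extract_spc_location` slices the province/city array straight out of the JS bundle and hands it to `ast.literal_eval`. The same slicing on an invented snippet:

```python
# The script content is made up; the markers match _extract_spc_location above.
import ast

script = "var sPCLocation = new Array([1, '北京', [[72, '北京']]], [2, '上海', [[2664, '上海']]]); var cateinfo = new Array();"

marker = "sPCLocation = new Array("
start = script.find(marker) + len(marker)
end = script.find(");", start)
rows = ast.literal_eval("[" + script[start:end] + "]")
for province_id, province_name, cities in rows:
    for city_id, city_name in cities:
        print(province_id, province_name, city_id, city_name)
```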
+
+    def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[Dict], int]:
+        payload = {
+            "pid": str(target.province_id),
+            "cid": str(target.city_id),
+            "page": str(page),
+        }
+        text = self._post_text(
+            LIST_API_URL,
+            data=payload,
+            referer=SITE_BASE + "/findlawyer/",
+        )
+        data = json.loads((text or "").strip().lstrip("\ufeff") or "{}")
+        items = data.get("lawyerList") or data.get("queryLawyerList") or []
+        if not isinstance(items, list):
+            items = []
+
+        page_count = 0
+        try:
+            page_count = int((data.get("lawyerItems") or {}).get("pageCount") or 0)
+        except Exception:
+            page_count = 0
+        return items, page_count
+
-    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
-        contact_url = f"{url}lawyer_contact.aspx"
-        print(f"  详情: {contact_url}")
-        existing = self.db.select_data(
-            "lawyer",
-            "id, avatar_url",
-            f"domain='{DOMAIN}' AND url='{contact_url}'"
-        )
-        existing_id = None
-        if existing:
-            existing_id = existing[0].get("id")
-            avatar = (existing[0].get("avatar_url") or "").strip()
-            if avatar:
-                print("  -- 已存在且头像已补全,跳过")
-                return None
-
-        html = self._get_detail(contact_url)
-        if not html:
-            return None
-        soup = BeautifulSoup(html, "html.parser")
-        info_list = soup.find("ul", class_="information-list")
-        if not info_list:
-            return None
-        phone = ""
-        law_firm = ""
-        for li in info_list.find_all("li"):
-            text = li.get_text(strip=True)
-            if "手机号" in text:
-                cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
-                match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
-                if match:
-                    phone = match.group(0)
-            if "执业单位" in text:
-                law_firm = text.replace("执业单位", "").strip()
-
-        name = ""
-        breadcrumb = soup.find("div", class_="weizhi")
-        if breadcrumb:
-            links = breadcrumb.find_all("a")
-            if len(links) > 2:
-                name = links[2].get_text(strip=True)
-
-        phone = phone.replace('-', '').strip()
-        if not phone or not re.fullmatch(r"1\d{10}", phone):
-            print("  无手机号,跳过")
-            return None
+    def parse_detail(self, detail_url: str) -> Dict:
+        contact_url = detail_url.rstrip("/") + "/lawyer_contact.aspx"
+        html = self._get_text(contact_url, referer=detail_url)
+        soup = BeautifulSoup(html, "html.parser")
+        full_text = soup.get_text(" ", strip=True)
+
+        name = ""
+        law_firm = ""
+        phone = ""
+        email = ""
+        address = ""
+        license_no = ""
+        practice_years: Optional[int] = None
+
+        name_tag = soup.select_one(".logo-box .title b")
+        if name_tag:
+            name = name_tag.get_text(strip=True).replace("律师", "").strip()
+        if not name and soup.title:
+            match = re.search(r"([^\s,,。_]+?)律师", soup.title.get_text(" ", strip=True))
+            if match:
+                name = match.group(1).strip()
+
+        phone_candidates = [
+            soup.select_one(".logo-box .r-bar .tel").get_text(" ", strip=True)
+            if soup.select_one(".logo-box .r-bar .tel")
+            else "",
+            soup.select_one(".lawyer-show ul.info").get_text(" ", strip=True)
+            if soup.select_one(".lawyer-show ul.info")
+            else "",
+            full_text,
+        ]
+        for candidate in phone_candidates:
+            phone = normalize_phone(candidate)
+            if phone:
+                break
+
+        for li in soup.select(".lawyer-show ul.info li"):
+            li_text = li.get_text(" ", strip=True)
+            if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
+                law_firm = li_text
+
+        if not law_firm:
+            match = re.search(r'"affiliation":\{"@type":"Organization","name":"([^"]+)"', html)
+            if match:
+                law_firm = match.group(1).strip()
+
+        match = re.search(r'"identifier":"([^"]+)"', html)
+        if match:
+            license_no = match.group(1).strip()
+
+        match = re.search(r'"streetAddress":"([^"]+)"', html)
+        if match:
+            address = match.group(1).strip()
+
+        email_match = EMAIL_RE.search(html)
+        if email_match:
+            email = email_match.group(0).strip()
+
+        year_match = YEAR_RE.search(full_text)
+        if year_match:
+            try:
+                practice_years = int(year_match.group(1))
+            except Exception:
+                practice_years = None
+
+        specialties = [node.get_text(strip=True) for node in soup.select(".tag-h38 span")]
+        specialties = [x for x in specialties if x]
+
+        return {
+            "name": name,
+            "law_firm": law_firm,
+            "phone": phone,
+            "email": email,
+            "address": address,
+            "license_no": license_no,
+            "practice_years": practice_years,
+            "specialties": specialties,
+            "detail_url": detail_url,
+            "contact_url": contact_url,
+        }
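`parse_detail` combines CSS selectors with regex fallbacks. A self-contained miniature against stand-in HTML (the selectors are copied from the code above; the markup itself is invented):

```python
# Sketch: the HTML is a fabricated fragment for illustration only.
import re
from bs4 import BeautifulSoup

PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")

html = (
    '<div class="logo-box"><div class="title"><b>张三律师</b></div></div>'
    '<p>执业 12 年,咨询电话 138-0000-0000</p>'
)
soup = BeautifulSoup(html, "html.parser")
full_text = soup.get_text(" ", strip=True)

name_tag = soup.select_one(".logo-box .title b")
name = name_tag.get_text(strip=True).replace("律师", "").strip() if name_tag else ""

match = PHONE_RE.search(re.sub(r"\D", "", full_text))  # digits only, then match
phone = match.group(0) if match else ""

year_match = YEAR_RE.search(full_text)
years = int(year_match.group(1)) if year_match else None

print(name, phone, years)  # -> 张三 13800000000 12
```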
 
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        seen_details: Set[str] = set()
+
+        for page in range(1, self.max_pages + 1):
+            try:
+                items, page_count = self.fetch_list_page(target, page)
+            except Exception as exc:
+                print(f"[list] 失败 pid={target.province_id} cid={target.city_id} p{page}: {exc}")
+                break
+
+            if not items:
+                break
+
+            for item in items:
+                detail_url = str(item.get("lawyerUrl") or "").strip()
+                if not detail_url:
+                    continue
+                if detail_url.startswith("//"):
+                    detail_url = "https:" + detail_url
+                if not detail_url.startswith("http"):
+                    detail_url = urljoin(SITE_BASE, detail_url)
+
+                if detail_url in seen_details:
+                    continue
+                seen_details.add(detail_url)
+
+                try:
+                    detail = self.parse_detail(detail_url)
+                except Exception as exc:
+                    print(f"[detail] 失败 {detail_url}: {exc}")
+                    continue
+
+                now = int(time.time())
+                uid = str(item.get("lawyerId") or item.get("globalUserId") or detail_url)
+                record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
+
+                list_name = str(item.get("name") or "").replace("律师", "").strip()
+                category_text = str(item.get("categoryNames") or "").strip()
+                category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()]
+
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "province_id": target.province_id,
+                        "province": target.province_name,
+                        "city_id": target.city_id,
+                        "city": target.city_name,
+                        "page": page,
+                        "detail_url": detail_url,
+                        "contact_url": detail.get("contact_url", ""),
+                    },
+                    "list_snapshot": {
+                        "lawyer_id": item.get("lawyerId"),
+                        "name": list_name,
+                        "category_names": category_arr,
+                        "help_count": strip_html_tags(str(item.get("helpCount") or "")),
+                        "comment_score": strip_html_tags(str(item.get("commentScore") or "")),
+                        "response_time": str(item.get("responseTime") or "").strip(),
+                        "year": item.get("year"),
+                        "is_adv": bool(item.get("isAdv")),
+                    },
+                    "profile": {
+                        "name": detail.get("name") or list_name,
+                        "law_firm": detail.get("law_firm") or "",
+                        "phone": detail.get("phone") or "",
+                        "email": detail.get("email") or "",
+                        "address": detail.get("address") or "",
+                        "license_no": detail.get("license_no") or "",
+                        "practice_years": detail.get("practice_years"),
+                        "specialties": detail.get("specialties") or category_arr,
+                    },
+                    "raw": item,
+                }
+
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)
+
+            if page_count > 0 and page >= page_count:
+                break
+
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
+            return None
+
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": (source.get("contact_url") or source.get("detail_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
+        }
+
-        avatar_url, site_time = self._extract_avatar_and_time(soup)
-        data = {
-            "phone": phone,
-            "province": province,
-            "city": city,
-            "law_firm": law_firm,
-            "url": contact_url,
-            "avatar_url": avatar_url,
-            "create_time": int(time.time()),
-            "site_time": site_time,
-            "domain": DOMAIN,
-            "name": name,
-            "params": json.dumps({"source": url}, ensure_ascii=False)
-        }
-        if existing_id:
-            update_data = {
-                "avatar_url": avatar_url,
-                "site_time": site_time,
-            }
-            if name:
-                update_data["name"] = name
-            if law_firm:
-                update_data["law_firm"] = law_firm
-            if province:
-                update_data["province"] = province
-            if city:
-                update_data["city"] = city
-            if phone:
-                update_data["phone"] = phone
-            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
-            try:
-                self.db.update_data("lawyer", update_data, f"id={existing_id}")
-                print("  -- 已存在,已补全头像/时间")
-            except Exception as exc:
-                print(f"  更新失败: {exc}")
-            return None
-        # 若手机号已存在,则更新头像/时间,不再插入新记录
-        existing_phone = self.db.select_data(
-            "lawyer",
-            "id, avatar_url, url",
-            f"domain='{DOMAIN}' AND phone='{phone}'"
-        )
-        if existing_phone:
-            existing_row = existing_phone[0]
-            avatar = (existing_row.get("avatar_url") or "").strip()
-            if avatar:
-                print("  -- 已存在手机号且头像已补全,跳过")
-                return None
-            update_data = {
-                "avatar_url": avatar_url,
-                "site_time": site_time,
-            }
-            if name:
-                update_data["name"] = name
-            if law_firm:
-                update_data["law_firm"] = law_firm
-            if province:
-                update_data["province"] = province
-            if city:
-                update_data["city"] = city
-            if phone:
-                update_data["phone"] = phone
-            if not existing_row.get("url"):
-                update_data["url"] = contact_url
-            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
-            try:
-                self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
-                print("  -- 已存在手机号,已补全头像/时间")
-            except Exception as exc:
-                print(f"  更新失败: {exc}")
-            return None
-        return data
-
-    def _extract_avatar_and_time(self, soup: BeautifulSoup) -> (str, Optional[int]):
-        avatar_url = ""
-        site_time = None
-        img_tag = soup.select_one(
-            "div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
-        )
-        if img_tag:
-            src = (img_tag.get("src") or "").strip()
-            if src:
-                if src.startswith("//"):
-                    avatar_url = f"https:{src}"
-                else:
-                    avatar_url = src
-                match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
-                if match:
-                    site_time = int(f"{match.group(1)}{match.group(2)}")
-                else:
-                    match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
-                    if match:
-                        site_time = int(f"{match.group(1)}{match.group(2)}")
-        return avatar_url, site_time
-
-    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
-        for attempt in range(max_retries):
-            try:
-                resp = self.client.get_text(url, timeout=15, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"  403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        self._refresh_session()
-                        time.sleep(wait_time)
-                        continue
-                    print("  请求失败: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error")
-                return text
-            except RequestClientError as exc:
-                print(f"  请求失败: {exc}")
-                return None
-        return None
+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
+            return set()
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            chunk_size = 500
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    existing.add(row[0])
+        finally:
+            cur.close()
+
+        return existing
+
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+
+        return inserted, skipped
 
-    def run(self):
-        print("启动华律网采集...")
-        if not self.areas:
-            print("无城市数据")
-            return
-        for city_code, city_info in self.areas.items():
-            province_code = city_info.get("province_code")
-            if not province_code:
-                continue
-            province_name = city_info.get("province", "")
-            city_name = city_info.get("name", "")
-            print(f"采集 {province_name}-{city_name}")
-            page = 1
-            while True:
-                payload = {"pid": province_code, "cid": city_code, "page": str(page)}
-                data = self._post(payload)
-                if not data or not data.get("lawyerList"):
-                    break
-                for item in data["lawyerList"]:
-                    result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
-                    if not result:
-                        continue
-                    try:
-                        self.db.insert_data("lawyer", result)
-                        print(f"  -> 新增: {result['name']} ({result['phone']})")
-                    except Exception as exc:
-                        print(f"  插入失败: {exc}")
-                    time.sleep(1)
-                page_count = data.get("lawyerItems", {}).get("pageCount", page)
-                if page >= page_count:
-                    break
-                page += 1
-                time.sleep(2)
-            time.sleep(1)
-        print("华律网采集完成")
+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现城市 {len(cities)} 个")
+
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [
+                c for c in cities
+                if key in c.city_name.lower() or key in str(c.city_id).lower()
+            ]
+            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
+
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断城市数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+        print(f"[resume] 已有记录 {len(seen_ids)} 条")
+
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0
+
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"(pid={target.province_id}, cid={target.city_id})"
+                )
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="华律网全新采集脚本(站点数据直采)")
+    parser.add_argument(
+        "--output",
+        default="/www/wwwroot/lawyers/data/hualv_records_all.jsonl",
+        help="输出 jsonl 文件路径",
+    )
+    parser.add_argument(
+        "--max-cities",
+        type=int,
+        default=0,
+        help="最多采集多少个城市,0 表示不限",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=9999,
+        help="每个城市最多采集多少页",
+    )
+    parser.add_argument(
+        "--city-filter",
+        default="",
+        help="按城市名称或城市编码过滤,如 beijing / 110100",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.15,
+        help="详情页请求间隔秒数",
+    )
+    parser.add_argument(
+        "--direct",
+        action="store_true",
+        help="直连模式,不使用 proxy_settings.json 代理",
+    )
+    parser.add_argument(
+        "--no-db",
+        action="store_true",
+        help="只输出 JSONL,不写入数据库",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if args.no_db:
+        crawler = HualvCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=None,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
         return
+
+    with Db() as db:
+        crawler = HualvCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=db,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
 
 if __name__ == "__main__":
-    with Db() as db:
-        spider = HualvSpider(db)
-        spider.run()
+    main()
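The hualv list API wraps counters in markup and joins specialty categories with mixed delimiters; `crawl_city` normalizes both before building `list_snapshot`. The pair in isolation:

```python
# strip_html_tags and the category split, exactly as used in crawl_city;
# the sample strings are invented.
import re

def strip_html_tags(text):
    return re.sub(r"<[^>]+>", "", text or "").strip()

category_text = "婚姻家庭、合同纠纷,劳动工伤"
category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()]

print(strip_html_tags("<em>1,024</em>人帮助"))  # -> 1,024人帮助
print(category_arr)                             # -> ['婚姻家庭', '合同纠纷', '劳动工伤']
```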
+585
-216
@@ -1,13 +1,16 @@
|
|||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import random
|
from dataclasses import dataclass, field
|
||||||
from typing import Dict, Optional, List, Set
|
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
||||||
from urllib.parse import urljoin
|
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
import urllib3
|
||||||
import threading
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
project_root = os.path.dirname(current_dir)
|
project_root = os.path.dirname(current_dir)
|
||||||
@@ -17,262 +20,628 @@ if request_dir not in sys.path:
|
|||||||
if project_root not in sys.path:
|
if project_root not in sys.path:
|
||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
import urllib3
|
from Db import Db
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from request.requests_client import RequestClientError, RequestsClient
|
from request.requests_client import RequestClientError, RequestsClient
|
||||||
|
from utils.rate_limiter import wait_for_request
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
from Db import Db
|
SITE_NAME = "lawtime"
|
||||||
from config import LAWTIME_CONFIG
|
LEGACY_DOMAIN = "法律快车"
|
||||||
|
SITE_BASE = "https://www.lawtime.cn"
|
||||||
|
PROVINCE_API = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode=0"
|
||||||
|
CITY_API_TEMPLATE = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode={province_id}"
|
||||||
|
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/lawyer"
|
||||||
|
|
||||||
LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
|
PHONE_RE = re.compile(r"1[3-9]\d{9}")
|
||||||
DETAIL_BASE = "https://m.lawtime.cn"
|
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
|
||||||
DOMAIN = "法律快车"
|
|
||||||
|
|
||||||
|
|
||||||
-class LawtimeSpider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    province_id: str
+    province_name: str
+    province_py: str
+    city_id: str
+    city_name: str
+    city_py: str
+
+
+@dataclass
+class ListCard:
+    detail_url: str
+    name: str
+    phone: str
+    address: str = ""
+    specialties: List[str] = field(default_factory=list)
+    metric_text: str = ""
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
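`normalize_phone` is the shared sanitiser the fresh scripts rely on: it strips every non-digit before matching, so separators and labels around the number don't matter. A quick standalone check (regex copied from the constant above):

```python
import re

PHONE_RE = re.compile(r"1[3-9]\d{9}")


def normalize_phone(text: str) -> str:
    # Drop every non-digit first, then look for a mainland mobile number.
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


assert normalize_phone("电话:138-0013-8000") == "13800138000"
assert normalize_phone("010-12345678") == ""  # landline: no match
assert normalize_phone(None) == ""
```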
+class LawtimeCrawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.1,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
-        self._tls = threading.local()
+        self.client = RequestsClient(
+            headers={
+                "User-Agent": (
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/122.0.0.0 Safari/537.36"
+                ),
+                "Accept": "text/html,application/json,*/*;q=0.8",
+                "Connection": "close",
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET",),
+        )

-    def _build_session(self) -> RequestsClient:
-        headers = LAWTIME_CONFIG.get("HEADERS", {})
-        custom_headers = dict(headers) if headers else {}
-        custom_headers.setdefault("Connection", "close")
-        return RequestsClient(headers=custom_headers)
-
-    def _refresh_session(self) -> None:
+    def _get_text(
+        self,
+        url: str,
+        *,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
+        headers = {"Referer": referer}
+        last_error: Optional[Exception] = None
+
+        for attempt in range(max_retries):
+            wait_for_request()
+            try:
+                resp = self.client.get_text(
+                    url,
+                    timeout=timeout,
+                    verify=False,
+                    headers=headers,
+                )
+                code = resp.status_code
+                if code == 403:
+                    if attempt < max_retries - 1:
                         self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                        continue
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise

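The retry loop above is plain exponential backoff with jitter: wait 2^attempt seconds plus a random fraction so parallel workers don't retry in lockstep. The same shape, pulled out as a generic helper — names here are illustrative, not from the codebase:

```python
import random
import time


def retry_with_backoff(fn, max_retries=3, base=2, jitter=(0.2, 0.8)):
    """Call fn(); on failure sleep base**attempt + U(jitter), then retry."""
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep((base ** attempt) + random.uniform(*jitter))


# Demo: fails twice, succeeds on the third call (sleeps ~3s total).
calls = {"n": 0}

def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient")
    return "ok"

print(retry_with_backoff(flaky))  # -> ok
```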
-    def _get_thread_session(self) -> RequestsClient:
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            return s
-        s = self.client.clone()
-        self._tls.session = s
-        return s
-
-    def _refresh_thread_session(self) -> None:
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            s.close()
-        self._tls.session = None
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")
+
+    def _get_json(self, url: str, *, referer: str) -> List[Dict]:
+        text = self._get_text(url, referer=referer)
+        cleaned = (text or "").strip().lstrip("\ufeff")
+        if not cleaned or cleaned.startswith("<"):
+            return []
+        try:
+            data = json.loads(cleaned)
+        except ValueError:
+            return []
+        return data if isinstance(data, list) else []

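`_get_json` strips a possible UTF-8 BOM before parsing because `json.loads` treats `\ufeff` as garbage, and bails out early when the endpoint returns an HTML error page instead of JSON. The failure mode is easy to reproduce:

```python
import json

raw = "\ufeff[{\"id\": 1}]"  # BOM-prefixed payload, as some endpoints return

try:
    json.loads(raw)
except json.JSONDecodeError as exc:
    print("unparsed:", exc)  # json.loads rejects the leading BOM

cleaned = raw.strip().lstrip("\ufeff")
print(json.loads(cleaned))   # -> [{'id': 1}]
```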
-    def _existing_phones(self, phones: List[str]) -> Set[str]:
-        if not phones:
+    def discover_cities(self) -> List[CityTarget]:
+        provinces = self._get_json(PROVINCE_API, referer=SITE_BASE)
+        if not provinces:
+            print("[discover] 地区接口未返回有效数据")
+            return []
+
+        results: List[CityTarget] = []
+        seen_py: Set[str] = set()
+
+        for province in provinces:
+            province_id = str(province.get("id") or "").strip()
+            province_name = str(province.get("province") or province.get("city") or "").strip()
+            province_py = str(province.get("pinyin") or "").strip()
+            if not province_id or not province_name:
+                continue
+
+            city_api = CITY_API_TEMPLATE.format(province_id=province_id)
+            try:
+                cities = self._get_json(city_api, referer=LIST_URL_TEMPLATE.format(city_py=province_py or ""))
+            except Exception as exc:
+                print(f"[city] 获取失败 province={province_id}: {exc}")
+                continue
+
+            if not cities:
+                cities = [
+                    {
+                        "id": province_id,
+                        "province": province_name,
+                        "city": province_name,
+                        "pinyin": province_py,
+                    }
+                ]
+
+            for city in cities:
+                city_id = str(city.get("id") or "").strip()
+                city_name = str(city.get("city") or city.get("province") or "").strip()
+                city_py = str(city.get("pinyin") or "").strip()
+                if not city_id or not city_name or not city_py:
+                    continue
+                if city_py in seen_py:
+                    continue
+                seen_py.add(city_py)
+
+                results.append(
+                    CityTarget(
+                        province_id=province_id,
+                        province_name=province_name,
+                        province_py=province_py,
+                        city_id=city_id,
+                        city_name=city_name,
+                        city_py=city_py,
+                    )
+                )
+
+        return results

+    def _build_list_url(self, city_py: str, page: int) -> str:
+        base = LIST_URL_TEMPLATE.format(city_py=city_py)
+        if page <= 1:
+            return base
+        return f"{base}?page={page}"
+
+    def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[ListCard], bool, str]:
+        list_url = self._build_list_url(target.city_py, page)
+        html = self._get_text(list_url, referer=SITE_BASE + "/")
+
+        cards = self.parse_list_cards(html)
+
+        soup = BeautifulSoup(html, "html.parser")
+        next_link = soup.select_one(f"div.page a[href*='page={page + 1}']")
+        has_next = next_link is not None
+
+        return cards, has_next, list_url

+    def parse_list_cards(self, html: str) -> List[ListCard]:
+        soup = BeautifulSoup(html, "html.parser")
+        cards: List[ListCard] = []
+        seen: Set[str] = set()
+
+        for item in soup.select("li.lawyer-item-card"):
+            link_tag = item.select_one("a.name[href]") or item.select_one("a.lawyer-img-box[href]")
+            if not link_tag:
+                continue
+            detail_url = (link_tag.get("href") or "").strip()
+            if not detail_url.startswith("http"):
+                continue
+            if detail_url in seen:
+                continue
+            seen.add(detail_url)
+
+            name = link_tag.get_text(strip=True)
+            phone = ""
+            phone_tag = item.select_one("div.phone")
+            if phone_tag:
+                phone = normalize_phone(phone_tag.get_text(" ", strip=True))
+
+            address = ""
+            addr_tag = item.select_one("div.location .txt")
+            if addr_tag:
+                address = addr_tag.get_text(" ", strip=True)
+
+            specialties: List[str] = []
+            prof_tag = item.select_one("div.prof .txt")
+            if prof_tag:
+                specialties = [
+                    x.strip() for x in re.split(r"[、,,]", prof_tag.get_text(" ", strip=True)) if x.strip()
+                ]
+
+            metric_text = ""
+            metric_tag = item.select_one("div.num-msg")
+            if metric_tag:
+                metric_text = metric_tag.get_text(" ", strip=True)
+
+            cards.append(
+                ListCard(
+                    detail_url=detail_url,
+                    name=name,
+                    phone=phone,
+                    address=address,
+                    specialties=specialties,
+                    metric_text=metric_text,
+                )
+            )
+
+        return cards

+    def parse_detail(self, detail_url: str) -> Dict:
+        html = self._get_text(detail_url, referer=SITE_BASE)
+        if "网站防火墙" in html or "访问被拒绝" in html or "/ipfilter/verify" in html:
+            raise RequestClientError(f"firewall blocked: {detail_url}")
+
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text(" ", strip=True)
+
+        name = ""
+        law_firm = ""
+        phone = ""
+        address = ""
+        practice_years: Optional[int] = None
+        specialties: List[str] = []
+
+        if soup.title:
+            title = soup.title.get_text(" ", strip=True)
+            match = re.search(r"([^\s_,,。]+?)律师", title)
+            if match:
+                name = match.group(1).strip()
+
+        phone_candidates = [
+            soup.select_one(".data-w .tel-b b").get_text(" ", strip=True)
+            if soup.select_one(".data-w .tel-b b")
+            else "",
+            soup.select_one(".law-info-b .item .two-r.b").get_text(" ", strip=True)
+            if soup.select_one(".law-info-b .item .two-r.b")
+            else "",
+            text,
+        ]
+        for candidate in phone_candidates:
+            phone = normalize_phone(candidate)
+            if phone:
+                break
+
+        law_firm_tag = soup.select_one(".law-info-b .item .two-nowrap")
+        if law_firm_tag:
+            law_firm = law_firm_tag.get_text(" ", strip=True)
+
+        for li in soup.select(".law-info-b .item"):
+            li_text = li.get_text(" ", strip=True)
+            if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
+                law_firm = li_text
+
+        addr_tag = soup.select_one(".law-info-b .item .two-r[title]")
+        if addr_tag:
+            addr_value = (addr_tag.get("title") or "").strip()
+            if len(addr_value) > 8:
+                address = addr_value
+
+        if not address:
+            addr_tag = soup.select_one(".law-info-b .item .two-r")
+            if addr_tag:
+                addr_value = addr_tag.get_text(" ", strip=True)
+                if len(addr_value) > 8 and "律师" not in addr_value:
+                    address = addr_value
+
+        year_match = YEAR_RE.search(text)
+        if year_match:
+            try:
+                practice_years = int(year_match.group(1))
+            except Exception:
+                practice_years = None
+
+        specialties = [x.get_text(strip=True) for x in soup.select(".profession-b .item") if x.get_text(strip=True)]
+
+        return {
+            "name": name,
+            "law_firm": law_firm,
+            "phone": phone,
+            "address": address,
+            "practice_years": practice_years,
+            "specialties": specialties,
+            "detail_url": detail_url,
+        }

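Phone extraction walks an ordered candidate list — two narrow CSS selectors first, the whole page text last — and keeps the first candidate the sanitiser accepts. The pattern in isolation (inputs are made up for the demo):

```python
import re

PHONE_RE = re.compile(r"1[3-9]\d{9}")


def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def first_phone(candidates):
    # Keep candidate order: most specific source first, full-page text last.
    for candidate in candidates:
        phone = normalize_phone(candidate)
        if phone:
            return phone
    return ""


# A narrow selector missed; a broader one carries a labelled number.
print(first_phone(["", "咨询热线 139 0000 0000", "page text ..."]))
# -> 13900000000
```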
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        seen_details: Set[str] = set()
+
+        for page in range(1, self.max_pages + 1):
+            try:
+                cards, has_next, list_url = self.fetch_list_page(target, page)
+            except Exception as exc:
+                print(f"[list] 失败 {target.city_py} p{page}: {exc}")
+                break
+
+            if not cards:
+                break
+
+            for card in cards:
+                if card.detail_url in seen_details:
+                    continue
+                seen_details.add(card.detail_url)
+
+                detail: Dict = {}
+                try:
+                    detail = self.parse_detail(card.detail_url)
+                except Exception as exc:
+                    print(f"[detail] 失败 {card.detail_url}: {exc}")
+
+                phone = normalize_phone(detail.get("phone") or card.phone)
+                profile_name = (detail.get("name") or card.name).replace("律师", "").strip()
+
+                now = int(time.time())
+                record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()
+
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "province_id": target.province_id,
+                        "province": target.province_name,
+                        "province_py": target.province_py,
+                        "city_id": target.city_id,
+                        "city": target.city_name,
+                        "city_py": target.city_py,
+                        "page": page,
+                        "list_url": list_url,
+                        "detail_url": card.detail_url,
+                    },
+                    "list_snapshot": {
+                        "name": card.name,
+                        "phone": card.phone,
+                        "address": card.address,
+                        "specialties": card.specialties,
+                        "metric_text": card.metric_text,
+                    },
+                    "profile": {
+                        "name": profile_name,
+                        "law_firm": (detail.get("law_firm") or "").strip(),
+                        "phone": phone,
+                        "address": (detail.get("address") or card.address or "").strip(),
+                        "practice_years": detail.get("practice_years"),
+                        "specialties": detail.get("specialties") or card.specialties,
+                    },
+                }
+
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)
+
+            if not has_next:
+                break

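Because `crawl_city` is a generator, callers decide what to do per record: stream to JSONL, batch into the DB, or both. A hypothetical consumer — the `record_id` key is real, the crawler construction is sketched:

```python
import json


def dump_city(crawler, target, out_path):
    """Stream one city's records to a JSONL file, deduplicating by record_id."""
    seen = set()
    with open(out_path, "a", encoding="utf-8") as out:
        for record in crawler.crawl_city(target):
            rid = record["record_id"]
            if rid in seen:
                continue
            seen.add(rid)
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
    return len(seen)
```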
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
+            return None
+
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
+        }

+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
             return set()
+
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
         existing: Set[str] = set()
         cur = self.db.db.cursor()
         try:
             chunk_size = 500
-            for i in range(0, len(phones), chunk_size):
-                chunk = phones[i:i + chunk_size]
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
                 placeholders = ",".join(["%s"] * len(chunk))
                 sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
-                cur.execute(sql, [DOMAIN, *chunk])
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                 for row in cur.fetchall():
                     existing.add(row[0])
         finally:
             cur.close()

         return existing

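The dedup query chunks phones into batches of 500 because one `IN (...)` with thousands of placeholders can exceed server limits. The chunking itself is trivial to verify without a database:

```python
def chunks(items, size=500):
    # Yield fixed-size slices; the last one may be shorter.
    for i in range(0, len(items), size):
        yield items[i:i + size]


phones = [f"139{n:08d}" for n in range(1201)]
sizes = [len(c) for c in chunks(phones)]
print(sizes)  # -> [500, 500, 201]
```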
-    def _load_areas(self):
-        condition = "level = 2 and domain='法律快车'"
-        tables = ("area_new", "area", "area2")
-        last_error = None
-        for table in tables:
-            try:
-                rows = self.db.select_data(table, "pinyin, province, city", condition) or []
-            except Exception as exc:
-                last_error = exc
-                continue
-            if rows:
-                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
-                print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
-                return rows
-
-        if last_error:
-            print(f"[法律快车] 加载地区数据失败: {last_error}")
-        print("[法律快车] 无城市数据(已尝试 area_new/area/area2)")
-        return []
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+
+        return inserted, skipped

-    def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
-        return self._get_with_session(self.client, url, max_retries=max_retries, is_thread=False)
-
-    def _get_with_session(self, session: RequestsClient, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
-        for attempt in range(max_retries):
-            try:
-                resp = session.get_text(url, timeout=15, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"请求失败 {url}: 403,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        if is_thread:
-                            self._refresh_thread_session()
-                            session = self._get_thread_session()
-                        else:
-                            self._refresh_session()
-                            session = self.client
-                        time.sleep(wait_time)
-                        continue
-                    print(f"请求失败 {url}: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error: {url}")
-                return text
-            except RequestClientError as exc:
-                print(f"请求失败 {url}: {exc}")
-                return None
-        return None
-
-    def _parse_list(self, html: str, province: str, city: str) -> int:
-        soup = BeautifulSoup(html, "html.parser")
-        links = [a.get("href", "") for a in soup.select("a.hide_link")]
-        links = [link.replace("lll", "int") for link in links if link]
-        if not links:
-            return 0
-
-        detail_urls = [urljoin(DETAIL_BASE, link) for link in links]
-
-        results: List[Dict[str, str]] = []
-        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
-            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
-            for fut in as_completed(futs):
-                try:
-                    data = fut.result()
-                except Exception as exc:
-                    print(f" 详情解析异常: {exc}")
-                    continue
-                if data and data.get("phone"):
-                    results.append(data)
-
-        if not results:
-            return len(detail_urls)
-
-        phones = [d["phone"] for d in results if d.get("phone")]
-        existing = self._existing_phones(phones)
-
-        for data in results:
-            phone = data.get("phone")
-            if not phone:
-                continue
-            if phone in existing:
-                print(f" -- 已存在: {data['name']} ({phone})")
-                continue
-            try:
-                self.db.insert_data("lawyer", data)
-                print(f" -> 新增: {data['name']} ({phone})")
-            except Exception as exc:
-                print(f" 插入失败 {data.get('url')}: {exc}")
-
-        return len(detail_urls)
-
-    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
-        html = None
-        sess = self._get_thread_session()
-        html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
-        if not html:
-            return None
-
-        soup = BeautifulSoup(html, "html.parser")
-        text = soup.get_text(" ")
-
-        name = ""
-        title_tag = soup.find("title")
-        if title_tag:
-            match = re.search(r"(\S+)律师", title_tag.get_text())
-            if match:
-                name = match.group(1)
-        if not name:
-            intl_div = soup.find("div", class_="intl")
-            if intl_div:
-                match = re.search(r"(\S+)律师", intl_div.get_text())
-                if match:
-                    name = match.group(1)
-
-        phone = ""
-        phone_pattern = r"1[3-9]\d{9}"
-        for item in soup.select("div.item.flex"):
-            label = item.find("div", class_="label")
-            desc = item.find("div", class_="desc")
-            if not label or not desc:
-                continue
-            label_text = label.get_text()
-            desc_text = desc.get_text().replace("-", "")
-            if "联系电话" in label_text or "电话" in label_text:
-                matches = re.findall(phone_pattern, desc_text)
-                if matches:
-                    phone = matches[0]
-                    break
-        if not phone:
-            matches = re.findall(phone_pattern, text.replace("-", ""))
-            if matches:
-                phone = matches[0]
-        if not phone:
-            print(f" 无手机号: {url}")
-            return None
-
-        law_firm = ""
-        for item in soup.select("div.item.flex"):
-            label = item.find("div", class_="label")
-            desc = item.find("div", class_="desc")
-            if not label or not desc:
-                continue
-            if "执业律所" in label.get_text() or "律所" in label.get_text():
-                law_firm = desc.get_text(strip=True).replace("已认证", "")
-                break
-
-        params = {
-            "list_url": url,
-            "province": province,
-            "city": city,
-        }
-
-        return {
-            "name": name or "",
-            "law_firm": law_firm,
-            "province": province,
-            "city": city,
-            "phone": phone,
-            "url": url,
-            "domain": DOMAIN,
-            "create_time": int(time.time()),
-            "params": json.dumps(params, ensure_ascii=False)
-        }
-
-    def run(self):
-        print("启动法律快车采集...")
-        areas = self._load_areas()
-        if not areas:
-            print("无地区数据")
+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现城市 {len(cities)} 个")
+
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [
+                c for c in cities
+                if key in c.city_py.lower() or key in c.city_name.lower()
+            ]
+            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
+
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断城市数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+            print(f"[resume] 已有记录 {len(seen_ids)} 条")
+
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0
+
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"({target.city_py})"
+                )
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )

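Restartability comes from the JSONL file itself: on startup the crawler re-reads its output, collects every `record_id`, and skips those when appending. The same idea reduced to a standalone helper (paths and data here are illustrative):

```python
import json
import os


def load_seen_ids(path):
    """Rebuild the dedup set from a JSONL file written by earlier runs."""
    seen = set()
    if not os.path.exists(path):
        return seen
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rid = json.loads(line).get("record_id")
            except Exception:
                continue  # tolerate a truncated last line from a killed run
            if rid:
                seen.add(rid)
    return seen


def append_record(path, record, seen):
    if record["record_id"] in seen:
        return False
    with open(path, "a", encoding="utf-8") as out:
        out.write(json.dumps(record, ensure_ascii=False) + "\n")
    seen.add(record["record_id"])
    return True
```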
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="法律快车全新采集脚本(站点数据直采)")
+    parser.add_argument(
+        "--output",
+        default="/www/wwwroot/lawyers/data/lawtime_records_all.jsonl",
+        help="输出 jsonl 文件路径",
+    )
+    parser.add_argument(
+        "--max-cities",
+        type=int,
+        default=0,
+        help="最多采集多少个城市,0 表示不限",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=9999,
+        help="每个城市最多采集多少页",
+    )
+    parser.add_argument(
+        "--city-filter",
+        default="",
+        help="按城市拼音或城市名过滤,如 beijing",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.1,
+        help="详情页请求间隔秒数",
+    )
+    parser.add_argument(
+        "--direct",
+        action="store_true",
+        help="直连模式,不使用 proxy_settings.json 代理",
+    )
+    parser.add_argument(
+        "--no-db",
+        action="store_true",
+        help="只输出 JSONL,不写入数据库",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if args.no_db:
+        crawler = LawtimeCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=None,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
         return

-        for area in areas:
-            pinyin = area.get("pinyin")
-            province = area.get("province", "")
-            city = area.get("city", "")
-            if not pinyin:
-                continue
-            page = 1
-            while True:
-                list_url = LIST_BASE.format(pinyin=pinyin, page=page)
-                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
-                html = self._get(list_url)
-                if not html:
-                    break
-                link_count = self._parse_list(html, province, city)
-                if link_count == 0:
-                    break
-                page += 1
-        print("法律快车采集完成")
+    with Db() as db:
+        crawler = LawtimeCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=db,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )


 if __name__ == "__main__":
-    with Db() as db:
-        spider = LawtimeSpider(db)
-        spider.run()
+    main()
+604 -240
@@ -1,11 +1,17 @@
+import argparse
+import hashlib
 import json
 import os
+import random
+import re
 import sys
 import time
-import random
-from typing import Dict, Optional, List, Set
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+from urllib.parse import urljoin
+
+import urllib3
+from bs4 import BeautifulSoup

 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -15,146 +21,237 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
     sys.path.append(project_root)

-import urllib3
-from bs4 import BeautifulSoup
+from Db import Db
 from request.requests_client import RequestClientError, RequestsClient
+from utils.rate_limiter import wait_for_request

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-from Db import Db
-
-DOMAIN = "律图"
-LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
+SITE_NAME = "64365"
+LEGACY_DOMAIN = "律图"
+SITE_BASE = "https://m.64365.com"
+AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js"
+LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
+
+PHONE_RE = re.compile(r"1[3-9]\d{9}")
+YEAR_RE = re.compile(r"(\d+)\s*年")

-class Six4365Spider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    area_id: str
+    province_id: str
+    province_name: str
+    province_py: str
+    city_name: str
+    city_py: str
+
+
+@dataclass
+class ListCard:
+    detail_url: str
+    name: str
+    specialties: List[str]
+    score_text: str
+    service_text: str
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
+class Six4365Crawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.1,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
         self.db = db_connection
-        self.client = self._build_session()
-        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
-        self._tls = threading.local()
-        self.cities = self._load_cities()
-
-    def _build_session(self) -> RequestsClient:
-        return RequestsClient(headers={
+        self.client = RequestsClient(
+            headers={
                 "User-Agent": (
                     "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                     "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                     "Mobile/15E148 Safari/604.1"
                 ),
+                "Accept": "text/html, */*; q=0.01",
                 "Connection": "close",
-        })
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET", "POST"),
+        )

-    def _refresh_session(self) -> None:
-        self.client.refresh()
-
-    def _get_thread_session(self) -> RequestsClient:
-        """每个线程使用独立请求客户端(共享相同 headers/代理配置)。"""
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            return s
-        s = self.client.clone()
-        self._tls.session = s
-        return s
-
-    def _refresh_thread_session(self) -> None:
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            s.close()
-        self._tls.session = None
-
-    def _existing_urls(self, urls: List[str]) -> Set[str]:
-        """批量查重,减少 N 次 is_data_exist"""
-        if not urls:
-            return set()
-        existing: Set[str] = set()
-        cur = self.db.db.cursor()
-        try:
-            # IN 参数过多会失败,分批
-            chunk_size = 500
-            for i in range(0, len(urls), chunk_size):
-                chunk = urls[i:i + chunk_size]
-                placeholders = ",".join(["%s"] * len(chunk))
-                sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
-                cur.execute(sql, chunk)
-                for row in cur.fetchall():
-                    # pymysql 默认返回 tuple
-                    existing.add(row[0])
-        finally:
-            cur.close()
-        return existing
-
-    def _load_cities(self):
-        tables = ("area_new", "area2", "area")
-        last_error = None
-        for table in tables:
-            try:
-                provinces = self.db.select_data(
-                    table,
-                    "id, code, province",
-                    "domain='64365' AND level=1"
-                ) or []
-                cities = self.db.select_data(
-                    table,
-                    "code, city, province, pid",
-                    "domain='64365' AND level=2"
-                ) or []
-            except Exception as exc:
-                last_error = exc
-                continue
-
-            if not cities:
-                continue
-
-            province_map = {row.get('id'): row for row in provinces}
-            data = {}
-            for city in cities:
-                province_row = province_map.get(city.get('pid'), {}) or {}
-                data[str(city.get('code'))] = {
-                    "name": city.get('city'),
-                    "province": city.get('province'),
-                    "province_name": province_row.get('province', city.get('province')),
-                }
-            print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
-            return data
-
-        if last_error:
-            print(f"[律图] 加载地区数据失败: {last_error}")
-        print("[律图] 无城市数据(已尝试 area_new/area2/area)")
-        return {}
-
-    def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
-        for attempt in range(max_retries):
-            try:
-                resp = self.client.post_text(LIST_URL, data=payload, timeout=10, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        self._refresh_session()
-                        time.sleep(wait_time)
-                        continue
-                    print("请求失败: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error")
-                return text
-            except RequestClientError as exc:
-                print(f"请求失败: {exc}")
-                return None
-        return None
-
-    def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
+    def _request_text(
+        self,
+        method: str,
+        url: str,
+        *,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+        data: Optional[Dict] = None,
+    ) -> str:
+        headers = {"Referer": referer}
+        last_error: Optional[Exception] = None
+
+        for attempt in range(max_retries):
+            wait_for_request()
+            try:
+                if method.upper() == "POST":
+                    resp = self.client.post_text(
+                        url,
+                        timeout=timeout,
+                        verify=False,
+                        headers=headers,
+                        data=data,
+                    )
+                else:
+                    resp = self.client.get_text(
+                        url,
+                        timeout=timeout,
+                        verify=False,
+                        headers=headers,
+                    )
+
+                code = resp.status_code
+                if code == 403:
+                    if attempt < max_retries - 1:
+                        self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                        continue
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise
+
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")

+    def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
+        return self._request_text(
+            "GET",
+            url,
+            timeout=timeout,
+            max_retries=max_retries,
+            referer=referer,
+        )
+
+    def _post_text(
+        self,
+        url: str,
+        *,
+        data: Dict,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
+        return self._request_text(
+            "POST",
+            url,
+            timeout=timeout,
+            max_retries=max_retries,
+            referer=referer,
+            data=data,
+        )
+
+    def _extract_area_data(self, text: str) -> List[Dict]:
+        match = re.search(
+            r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
+            text,
+            re.S,
+        )
+        if not match:
+            return []
+
+        raw = match.group(1)
+        try:
+            data = json.loads(raw)
+        except Exception:
+            return []
+        return data if isinstance(data, list) else []

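`_extract_area_data` pulls a JSON array straight out of a JavaScript asset by anchoring the regex between two known assignments. The technique, reduced to a standalone demo with a fake JS payload:

```python
import json
import re

# Fabricated stand-in for the real area-cate-data.js asset.
js = """
lvtuData.areaData = [{"id": "110000", "name": "北京", "py": "beijing"}];
lvtuData.categroyData = [];
"""

match = re.search(
    r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
    js,
    re.S,
)
data = json.loads(match.group(1)) if match else []
print(data[0]["py"])  # -> beijing
```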
+    def discover_cities(self) -> List[CityTarget]:
+        text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/")
+        provinces = self._extract_area_data(text)
+
+        targets: List[CityTarget] = []
+        seen_area: Set[str] = set()
+
+        for province in provinces:
+            province_id = str(province.get("id") or "").strip()
+            province_name = str(province.get("name") or "").strip()
+            province_py = str(province.get("py") or "").strip()
+            child_rows = province.get("child") or []
+
+            # 常规省份 child 是地级市;直辖市 child 是区县,此时使用省级 id 抓取
+            if child_rows and any((row.get("child") or []) for row in child_rows):
+                for city in child_rows:
+                    area_id = str(city.get("id") or "").strip()
+                    city_name = str(city.get("name") or "").strip()
+                    city_py = str(city.get("py") or "").strip()
+                    if not area_id or not city_name:
+                        continue
+                    if area_id in seen_area:
+                        continue
+                    seen_area.add(area_id)
+                    targets.append(
+                        CityTarget(
+                            area_id=area_id,
+                            province_id=province_id,
+                            province_name=province_name,
+                            province_py=province_py,
+                            city_name=city_name,
+                            city_py=city_py,
+                        )
+                    )
+            else:
+                if not province_id or not province_name:
+                    continue
+                if province_id in seen_area:
+                    continue
+                seen_area.add(province_id)
+                targets.append(
+                    CityTarget(
+                        area_id=province_id,
+                        province_id=province_id,
+                        province_name=province_name,
+                        province_py=province_py,
+                        city_name=province_name,
+                        city_py=province_py,
+                    )
+                )
+
+        return targets

+    def _build_payload(self, area_id: str, page: int) -> Dict[str, str]:
+        ua = self.client.headers.get("User-Agent", "")
         return {
             "AdCode": "",
-            "RegionId": str(city_code),
+            "RegionId": str(area_id),
             "CategoryId": "",
             "MaxNumber": "",
             "OnlyData": "true",
             "IgnoreButton": "",
-            "LawyerRecommendRequest[AreaId]": str(city_code),
+            "LawyerRecommendRequest[AreaId]": str(area_id),
             "LawyerRecommendRequest[LawCategoryIds]": "",
             "LawyerRecommendRequest[LawFirmPersonCount]": "",
             "LawyerRecommendRequest[LawFirmScale]": "",
@@ -171,162 +268,429 @@ class Six4365Spider:
             "LawyerRecommendRequest[RefferUrl]": "",
             "LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
             "LawyerRecommendRequest[resource_type_name]": "",
-            "LawyerRecommendRequest[UserAgent]": self.client.headers["User-Agent"],
+            "LawyerRecommendRequest[UserAgent]": ua,
             "LawyerRecommendRequest[AddLawyerWithNoData]": "false",
             "ShowCaseButton": "true",
         }

-    def _parse_list(self, html: str, province: str, city: str) -> int:
-        soup = BeautifulSoup(html, "html.parser")
-        lawyers = soup.find_all("a", class_="lawyer")
-        if not lawyers:
-            return 0
-
-        detail_urls: List[str] = []
-        for lawyer in lawyers:
-            href = lawyer.get("href")
-            if not href:
-                continue
-            detail_urls.append(f"{href.rstrip('/')}/info/")
-
-        if not detail_urls:
-            return 0
-
-        results: List[Dict[str, str]] = []
-        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
-            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
-            for fut in as_completed(futs):
-                try:
-                    data = fut.result()
-                except Exception as exc:
-                    print(f" 详情解析异常: {exc}")
-                    continue
-                if data:
-                    results.append(data)
-
-        if not results:
-            return len(detail_urls)
-
-        existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
-        for data in results:
-            if not data:
-                continue
-            url = data.get("url", "")
-            if not url:
-                continue
-            if url in existing:
-                print(f" -- 已存在URL: {url}")
-                continue
-            try:
-                self.db.insert_data("lawyer", data)
-                print(f" -> 新增: {data['name']} ({data['phone']})")
-            except Exception as exc:
-                print(f" 插入失败 {url}: {exc}")
-
-        return len(detail_urls)
-
-    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
-        html = self._get_detail(url)
-        if not html:
-            return None
+    def fetch_list_html(self, target: CityTarget, page: int) -> str:
+        payload = self._build_payload(target.area_id, page)
+        return self._post_text(
+            LIST_API_URL,
+            data=payload,
+            referer=SITE_BASE + "/findlawyer/",
+        )
+
+    def parse_list_cards(self, html: str) -> List[ListCard]:
+        soup = BeautifulSoup(html, "html.parser")
+        cards: List[ListCard] = []
+        seen: Set[str] = set()
+
+        for anchor in soup.select("a.lawyer[href]"):
+            href = (anchor.get("href") or "").strip()
+            if not href:
+                continue
+            detail_url = urljoin(SITE_BASE, href)
+            if detail_url in seen:
+                continue
+            seen.add(detail_url)
+
+            name = ""
+            name_tag = anchor.select_one("b.name")
+            if name_tag:
+                name = name_tag.get_text(strip=True)
+
+            specialties: List[str] = []
+            skill_tag = anchor.select_one("div.skill")
+            if skill_tag:
+                raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "")
+                specialties = [x.strip() for x in re.split(r"[、,,]", raw) if x.strip()]
+
+            score_text = ""
+            score_tag = anchor.select_one("div.info span[title='评分'] em")
+            if score_tag:
+                score_text = score_tag.get_text(strip=True)
+
+            service_text = ""
+            service_tag = anchor.select_one("div.info")
+            if service_tag:
+                service_text = service_tag.get_text(" ", strip=True)
+
+            cards.append(
+                ListCard(
+                    detail_url=detail_url,
+                    name=name,
+                    specialties=specialties,
+                    score_text=score_text,
+                    service_text=service_text,
+                )
+            )
+
+        return cards

+    def parse_detail(self, detail_url: str) -> Dict:
+        info_url = detail_url.rstrip("/") + "/info/"
+        html = self._get_text(info_url, referer=detail_url)
         soup = BeautifulSoup(html, "html.parser")
-        base_info = soup.find("ul", class_="intro-basic-bar")
-        if not base_info:
-            return None

         name = ""
         law_firm = ""
         phone = ""
+        practice_years: Optional[int] = None
+        office_area = ""
+        address = ""
+        specialties: List[str] = []

-        for li in base_info.find_all("li"):
-            label = li.find("span", class_="label")
-            txt = li.find("div", class_="txt")
-            if not label or not txt:
-                continue
-            label_text = label.get_text(strip=True)
-            if "姓名" in label_text:
-                name = txt.get_text(strip=True)
-            if "执业律所" in label_text:
-                law_firm = txt.get_text(strip=True)
-
-        more_section = soup.find("div", class_="more-intro-basic")
-        if more_section:
-            phone_ul = more_section.find("ul", class_="intro-basic-bar")
-            if phone_ul:
-                for li in phone_ul.find_all("li"):
-                    label = li.find("span", class_="label")
-                    txt = li.find("div", class_="txt")
-                    if label and txt and "联系电话" in label.get_text(strip=True):
-                        phone = txt.get_text(strip=True).replace(" ", "")
+        for li in soup.select("ul.intro-basic-bar li"):
+            label_tag = li.select_one("span.label")
+            value_tag = li.select_one("div.txt")
+            if not label_tag or not value_tag:
+                continue
+            label = label_tag.get_text(" ", strip=True).replace(":", "")
+            value = value_tag.get_text(" ", strip=True)
+
+            if "姓名" in label and not name:
+                name = value
+            elif "执业律所" in label and not law_firm:
+                law_firm = value
+            elif "联系电话" in label and not phone:
+                phone = normalize_phone(value)
+            elif "执业年限" in label and practice_years is None:
+                year_match = YEAR_RE.search(value)
+                if year_match:
+                    try:
+                        practice_years = int(year_match.group(1))
+                    except Exception:
+                        practice_years = None
+            elif "办公地区" in label and not office_area:
+                office_area = value
+            elif "办公地址" in label and not address:
+                address = value
+
+        text = soup.get_text(" ", strip=True)
+        if not phone:
+            phone = normalize_phone(text)
+
+        if not name and soup.title:
+            title = soup.title.get_text(" ", strip=True)
+            match = re.search(r"([^\s_,,。]+?)律师", title)
+            if match:
+                name = match.group(1).strip()
+
+        skill_match = re.search(r"擅长:([^\n]+)", text)
+        if skill_match:
+            specialties = [x.strip() for x in re.split(r"[、,,]", skill_match.group(1)) if x.strip()]
+
+        return {
+            "name": name,
+            "law_firm": law_firm,
+            "phone": phone,
+            "practice_years": practice_years,
+            "office_area": office_area,
+            "address": address,
+            "specialties": specialties,
+            "detail_url": detail_url,
+            "info_url": info_url,
+        }

-                        break
-
-        phone = phone.replace('-', '').strip()
-        if not name or not phone:
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        seen_detail_urls: Set[str] = set()
+        page_first_seen: Set[str] = set()
+
+        for page in range(1, self.max_pages + 1):
+            try:
+                html = self.fetch_list_html(target, page)
+            except Exception as exc:
+                print(f"[list] 失败 area={target.area_id} p{page}: {exc}")
+                break
+
+            cards = self.parse_list_cards(html)
+            if not cards:
+                break
+
+            first_url = cards[0].detail_url
+            if first_url in page_first_seen:
+                break
+            page_first_seen.add(first_url)
+
+            for card in cards:
+                if card.detail_url in seen_detail_urls:
+                    continue
+                seen_detail_urls.add(card.detail_url)
+
+                try:
+                    detail = self.parse_detail(card.detail_url)
+                except Exception as exc:
+                    print(f"[detail] 失败 {card.detail_url}: {exc}")
+                    continue
+
+                now = int(time.time())
+                uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url)
+                uid = uid_match.group(1) if uid_match else card.detail_url
+                record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
+
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "province_id": target.province_id,
+                        "province": target.province_name,
+                        "province_py": target.province_py,
+                        "area_id": target.area_id,
+                        "city": target.city_name,
+                        "city_py": target.city_py,
+                        "page": page,
+                        "detail_url": card.detail_url,
+                        "info_url": detail.get("info_url", ""),
+                    },
+                    "list_snapshot": {
+                        "name": card.name,
+                        "specialties": card.specialties,
+                        "score_text": card.score_text,
+                        "service_text": card.service_text,
+                    },
+                    "profile": {
+                        "name": detail.get("name") or card.name,
+                        "law_firm": detail.get("law_firm") or "",
+                        "phone": detail.get("phone") or "",
+                        "practice_years": detail.get("practice_years"),
+                        "office_area": detail.get("office_area") or "",
+                        "address": detail.get("address") or "",
+                        "specialties": detail.get("specialties") or card.specialties,
+                    },
+                }
+
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)

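`crawl_city` remembers each page's first detail URL and stops as soon as one repeats — presumably because the recommend endpoint keeps serving the final page when asked for one past the end. The guard in isolation (data and names are illustrative):

```python
def paginate(fetch_page, max_pages=9999):
    """Stop when a page repeats its first item (server clamps past the end)."""
    first_seen = set()
    for page in range(1, max_pages + 1):
        items = fetch_page(page)
        if not items:
            break
        if items[0] in first_seen:  # same page served again -> done
            break
        first_seen.add(items[0])
        yield from items


pages = {1: ["a", "b"], 2: ["c", "d"], 3: ["c", "d"]}  # page 3 repeats page 2
print(list(paginate(lambda p: pages.get(p, []))))      # -> ['a', 'b', 'c', 'd']
```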
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
             return None
+
-        data = {
-            "phone": phone,
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
             "province": province,
             "city": city,
-            "law_firm": law_firm,
-            "url": url,
-            "domain": DOMAIN,
-            "name": name,
-            "create_time": int(time.time()),
-            "params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
+            "phone": phone,
+            "url": (source.get("info_url") or source.get("detail_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
         }
-        return data
-
-    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
-        session = self._get_thread_session()
-        for attempt in range(max_retries):
-            try:
-                resp = session.get_text(url, timeout=10, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        self._refresh_thread_session()
-                        session = self._get_thread_session()
-                        time.sleep(wait_time)
-                        continue
-                    print(" 请求失败: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error")
-                return text
-            except RequestClientError as exc:
-                print(f" 请求失败: {exc}")
-                return None
-        return None
-
-    def run(self):
-        print("启动律图采集...")
-        if not self.cities:
-            print("无城市数据")
+
+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
+            return set()
+
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            chunk_size = 500
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    existing.add(row[0])
+        finally:
+            cur.close()
+
+        return existing
+
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
+                continue
+            try:
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+
+        return inserted, skipped

+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现地区 {len(cities)} 个")
+
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [
+                c for c in cities
+                if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id
+            ]
+            print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}")
+
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断地区数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+            print(f"[resume] 已有记录 {len(seen_ids)} 条")
+
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0
+
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"(area={target.area_id})"
+                )
+                city_records = list(self.crawl_city(target))
+
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1
+
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db
+
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )
+
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )

|
def parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)")
|
||||||
|
parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl",
|
||||||
|
help="输出 jsonl 文件路径",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-cities",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="最多采集多少个地区,0 表示不限",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-pages",
|
||||||
|
type=int,
|
||||||
|
default=9999,
|
||||||
|
help="每个地区最多采集多少页",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--city-filter",
|
||||||
|
default="",
|
||||||
|
help="按城市名称/拼音/编码过滤",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--sleep",
|
||||||
|
type=float,
|
||||||
|
default=0.1,
|
||||||
|
help="详情页请求间隔秒数",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--direct",
|
||||||
|
action="store_true",
|
||||||
|
help="直连模式,不使用 proxy_settings.json 代理",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-db",
|
||||||
|
action="store_true",
|
||||||
|
help="只输出 JSONL,不写入数据库",
|
||||||
|
)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
if args.no_db:
|
||||||
|
crawler = Six4365Crawler(
|
||||||
|
max_pages=args.max_pages,
|
||||||
|
sleep_seconds=args.sleep,
|
||||||
|
use_proxy=not args.direct,
|
||||||
|
db_connection=None,
|
||||||
|
)
|
||||||
|
crawler.crawl(
|
||||||
|
output_path=args.output,
|
||||||
|
max_cities=args.max_cities,
|
||||||
|
city_filter=args.city_filter or None,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
for city_code, info in self.cities.items():
|
with Db() as db:
|
||||||
province = info.get("province_name", "")
|
crawler = Six4365Crawler(
|
||||||
city = info.get("name", "")
|
max_pages=args.max_pages,
|
||||||
print(f"采集 {province}-{city}")
|
sleep_seconds=args.sleep,
|
||||||
page = 1
|
use_proxy=not args.direct,
|
||||||
while True:
|
db_connection=db,
|
||||||
payload = self._build_payload(city_code, page)
|
)
|
||||||
html = self._post(payload)
|
crawler.crawl(
|
||||||
if not html:
|
output_path=args.output,
|
||||||
break
|
max_cities=args.max_cities,
|
||||||
link_count = self._parse_list(html, province, city)
|
city_filter=args.city_filter or None,
|
||||||
if link_count == 0:
|
)
|
||||||
break
|
|
||||||
page += 1
|
|
||||||
print("律图采集完成")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
with Db() as db:
|
main()
|
||||||
spider = Six4365Spider(db)
|
|
||||||
spider.run()
|
|
||||||
|
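The `_existing_phones_in_db` helper is cut off above: only the final `cur.execute` / `fetchall` lines of its batching loop fall inside this hunk. A sketch of the surrounding chunked `IN (...)` query, where the chunk size, the cursor API on the `Db` wrapper, and the `LEGACY_DOMAIN` value are all assumptions:

```python
from typing import List, Set

LEGACY_DOMAIN = "64365"  # assumed site key; the real value is defined elsewhere in the file


def existing_phones_in_db(db, phones: List[str], chunk_size: int = 500) -> Set[str]:
    """Query already-stored phones in batches so the IN (...) list stays bounded."""
    existing: Set[str] = set()
    cur = db.cursor()  # assumes the Db wrapper exposes a DB-API cursor
    try:
        for i in range(0, len(phones), chunk_size):
            chunk = phones[i:i + chunk_size]
            placeholders = ", ".join(["%s"] * len(chunk))
            sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
            cur.execute(sql, [LEGACY_DOMAIN, *chunk])
            for row in cur.fetchall():
                existing.add(row[0])
    finally:
        cur.close()
    return existing
```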
common_sites/start.sh (+75 -8)
@@ -1,13 +1,80 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-# 切换到脚本所在目录,确保相对路径正确
-cd "$(dirname "$0")"
-
-echo "使用 request/proxy_settings.json 读取代理配置"
-
-nohup python3 dls.py > dls.log 2>&1 & # 大律师
-nohup python3 findlaw.py > findlaw.log 2>&1 & # 找法网
-nohup python3 lawtime.py > lawtime.log 2>&1 & # 法律快车
-nohup python3 six4365.py > six4365.log 2>&1 & # 律图
-nohup python3 hualv.py > hualv.log 2>&1 & # 华律
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+LOG_DIR="${PROJECT_ROOT}/logs"
+DATA_DIR="${PROJECT_ROOT}/data"
+
+mkdir -p "${LOG_DIR}" "${DATA_DIR}"
+
+if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
+    PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
+else
+    PYTHON_BIN="python3"
+fi
+
+RUN_MODE="${RUN_MODE:-parallel}" # parallel | sequential
+
+echo "[start] project=${PROJECT_ROOT}"
+echo "[start] python=${PYTHON_BIN}"
+echo "[start] mode=${RUN_MODE}"
+echo "[start] proxy=request/proxy_settings.json"
+
+# 大律师(新结构采集 + 写库)可通过环境变量控制
+DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
+DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
+DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
+DLS_SLEEP="${DLS_SLEEP:-0.2}"
+DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"
+DLS_EXTRA_ARGS=()
+
+if [[ "${DLS_MAX_CITIES}" != "0" ]]; then
+    DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
+fi
+if [[ "${DLS_MAX_PAGES}" != "0" ]]; then
+    DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
+fi
+if [[ -n "${DLS_CITY_FILTER}" ]]; then
+    DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
+fi
+DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")
+
+if [[ "${DLS_DIRECT:-0}" == "1" ]]; then
+    DLS_EXTRA_ARGS+=(--direct)
+fi
+if [[ "${DLS_NO_DB:-0}" == "1" ]]; then
+    DLS_EXTRA_ARGS+=(--no-db)
+fi
+
+run_bg() {
+    local name="$1"
+    shift
+    local logfile="${LOG_DIR}/${name}.log"
+    nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 &
+    echo "[start] ${name} pid=$! log=${logfile}"
+}
+
+run_fg() {
+    local name="$1"
+    shift
+    local logfile="${LOG_DIR}/${name}.log"
+    echo "[start] ${name} fg log=${logfile}"
+    env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1
+}
+
+if [[ "${RUN_MODE}" == "sequential" ]]; then
+    run_fg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
+    run_fg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
+    run_fg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
+    run_fg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
+    run_fg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
+    echo "[done] sequential completed"
+else
+    run_bg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
+    run_bg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
+    run_bg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
+    run_bg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
+    run_bg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
+    echo "[done] all crawlers started in background"
+fi
request/requests_client.py
@@ -51,6 +51,7 @@ class RequestsClient:
         self,
         headers: Optional[Mapping[str, str]] = None,
         *,
+        use_proxy: bool = True,
         retry_total: int = 0,
         retry_backoff_factor: float = 0.0,
         retry_status_forcelist: Optional[Iterable[int]] = None,
@@ -58,6 +59,7 @@ class RequestsClient:
         default_timeout: Optional[TimeoutType] = None,
     ) -> None:
         self._base_headers: Dict[str, str] = dict(headers or {})
+        self.use_proxy = bool(use_proxy)
         self.retry_total = int(retry_total)
         self.retry_backoff_factor = float(retry_backoff_factor)
         self.retry_status_forcelist = tuple(retry_status_forcelist or ())
@@ -67,8 +69,13 @@ class RequestsClient:
 
     def _build_session(self) -> requests.Session:
         session = requests.Session()
-        # 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
-        apply_proxy(session)
+        if self.use_proxy:
+            # 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
+            apply_proxy(session)
+        else:
+            # 强制直连:不读取环境代理,不走配置文件代理
+            session.trust_env = False
+            session.proxies.clear()
         if self.retry_total > 0:
             # 适配器级重试:主要处理连接波动与指定状态码的瞬时失败
             retries = Retry(
@@ -109,6 +116,7 @@ class RequestsClient:
         # 线程场景建议 clone:复用同配置,但使用独立连接池
         clone_client = RequestsClient(
             headers=dict(self.headers),
+            use_proxy=self.use_proxy,
             retry_total=self.retry_total,
             retry_backoff_factor=self.retry_backoff_factor,
             retry_status_forcelist=self.retry_status_forcelist,
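With the new `use_proxy` switch, callers choose between the proxied and the direct session at construction time. A minimal usage sketch (header values are placeholders):

```python
from request.requests_client import RequestsClient

# default: sessions are built with apply_proxy(), i.e. proxy_settings.json applies
proxied = RequestsClient(headers={"User-Agent": "Mozilla/5.0"})

# direct mode: trust_env=False plus an empty proxy map, so neither environment
# variables nor the config file can inject a proxy
direct = RequestsClient(headers={"User-Agent": "Mozilla/5.0"}, use_proxy=False)
```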
requirements.txt
@@ -3,3 +3,4 @@ requests>=2.28.0
 beautifulsoup4>=4.11.0
 urllib3>=1.26.0
 lxml>=4.9.0
+openpyxl>=3.1.0
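`openpyxl` is the Excel writer behind the new `export_lawyers_excel.py`. The core of producing an `.xlsx` from rows is short; a minimal sketch with made-up column names and values:

```python
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(["phone", "name", "law_firm", "province", "city"])  # header row (example fields)
ws.append(["13800000000", "张三", "某某律师事务所", "北京", "朝阳区"])  # one data row (fabricated)
wb.save("lawyers_demo.xlsx")
```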