feat: enhance project configuration and improve data export functionality

- Updated `.gitignore` to streamline the ignore rules and exclude log files for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to use a session-based approach for HTTP requests, improving error handling and proxy management (a simplified sketch follows this list).
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports (see the second sketch below).
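
The session-based pattern adopted across the refactored spiders looks roughly like the sketch below. This is a minimal illustration, not the exact implementation: `build_session` and `post_with_retry` are illustrative names, and the real scripts pull proxies from `request.proxy_config.get_proxies` and keep one session per worker thread.

```python
import random
import time

import requests


def build_session(proxies=None):
    """Create a requests.Session with shared headers and optional proxies."""
    session = requests.Session()
    session.trust_env = False  # ignore proxy settings inherited from the environment
    if proxies:
        session.proxies.update(proxies)
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X)",
        "Connection": "close",
    })
    return session


def post_with_retry(session, url, data, max_retries=3, timeout=10):
    """POST with exponential backoff; rebuild the session when a 403 suggests the exit IP is blocked."""
    for attempt in range(max_retries):
        try:
            resp = session.post(url, data=data, timeout=timeout)
            if resp.status_code == 403 and attempt < max_retries - 1:
                time.sleep(2 ** attempt + random.uniform(0.3, 1.0))
                session = build_session(dict(session.proxies) or None)  # refresh the session before retrying
                continue
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.RequestException as exc:
            print(f"request failed: {exc}")
            return None
    return None
```

Closing and rebuilding the session on a 403 gives each retry a fresh connection and, when a proxy pool is configured, a chance to rotate the exit IP.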
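The export default plausibly amounts to a timestamped output name so repeated runs do not overwrite each other. This is an assumption for illustration only; the actual change in `export_lawyers_excel.py` may attach the timestamp differently.

```python
from datetime import datetime


def default_export_path(prefix: str = "lawyers") -> str:
    """Return a default .xlsx filename carrying the current timestamp (illustrative helper)."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{prefix}_{stamp}.xlsx"


if __name__ == "__main__":
    print(default_export_path())  # e.g. lawyers_20260318_100225.xlsx
```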
This commit is contained in:
hello-dd-code
2026-03-18 10:02:25 +08:00
parent c2b77975c1
commit 38e7c284e8
14 changed files with 1665 additions and 3004 deletions
+264 -608
@@ -1,17 +1,11 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin
import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional, List, Set
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -21,237 +15,165 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
import requests
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "64365"
LEGACY_DOMAIN = "律图"
SITE_BASE = "https://m.64365.com"
AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js"
LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
from Db import Db
PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"(\d+)\s*年")
DOMAIN = "律图"
LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
@dataclass
class CityTarget:
area_id: str
province_id: str
province_name: str
province_py: str
city_name: str
city_py: str
@dataclass
class ListCard:
detail_url: str
name: str
specialties: List[str]
score_text: str
service_text: str
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
class Six4365Crawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class Six4365Spider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "text/html, */*; q=0.01",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET", "POST"),
)
self.session = self._build_session()
self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
self._tls = threading.local()
self.cities = self._load_cities()
def _request_text(
self,
method: str,
url: str,
*,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
data: Optional[Dict] = None,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
session.headers.update({
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Connection": "close",
})
return session
for attempt in range(max_retries):
wait_for_request()
def _refresh_session(self) -> None:
try:
self.session.close()
except Exception:
pass
self.session = self._build_session()
def _get_thread_session(self) -> requests.Session:
"""requests.Session 不是严格线程安全:每个线程用独立 session(但共享同样代理/headers"""
s = getattr(self._tls, "session", None)
if s is not None:
return s
s = self._build_session()
s.headers.update(dict(self.session.headers))
self._tls.session = s
return s
def _refresh_thread_session(self) -> None:
s = getattr(self._tls, "session", None)
if s is not None:
try:
if method.upper() == "POST":
resp = self.client.post_text(
url,
timeout=timeout,
verify=False,
headers=headers,
data=data,
)
else:
resp = self.client.get_text(
url,
timeout=timeout,
verify=False,
headers=headers,
)
s.close()
except Exception:
pass
self._tls.session = None
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
def _existing_urls(self, urls: List[str]) -> Set[str]:
"""批量查重,减少 N 次 is_data_exist"""
if not urls:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
# Too many IN parameters can make the query fail, so process in chunks
chunk_size = 500
for i in range(0, len(urls), chunk_size):
chunk = urls[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
cur.execute(sql, chunk)
for row in cur.fetchall():
# pymysql returns rows as tuples by default
existing.add(row[0])
finally:
cur.close()
return existing
def _load_cities(self):
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
provinces = self.db.select_data(
table,
"id, code, province",
"domain='64365' AND level=1"
) or []
cities = self.db.select_data(
table,
"code, city, province, pid",
"domain='64365' AND level=2"
) or []
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
continue
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
if not cities:
continue
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
return self._request_text(
"GET",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
)
province_map = {row.get('id'): row for row in provinces}
data = {}
for city in cities:
province_row = province_map.get(city.get('pid'), {}) or {}
data[str(city.get('code'))] = {
"name": city.get('city'),
"province": city.get('province'),
"province_name": province_row.get('province', city.get('province')),
}
print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
return data
def _post_text(
self,
url: str,
*,
data: Dict,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
return self._request_text(
"POST",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
data=data,
)
if last_error:
print(f"[律图] 加载地区数据失败: {last_error}")
print("[律图] 无城市数据(已尝试 area_new/area2/area")
return {}
def _extract_area_data(self, text: str) -> List[Dict]:
match = re.search(
r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
text,
re.S,
)
if not match:
return []
raw = match.group(1)
try:
data = json.loads(raw)
except Exception:
return []
return data if isinstance(data, list) else []
def discover_cities(self) -> List[CityTarget]:
text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/")
provinces = self._extract_area_data(text)
targets: List[CityTarget] = []
seen_area: Set[str] = set()
for province in provinces:
province_id = str(province.get("id") or "").strip()
province_name = str(province.get("name") or "").strip()
province_py = str(province.get("py") or "").strip()
child_rows = province.get("child") or []
# For regular provinces, child entries are prefecture-level cities; for municipalities they are districts, so the province-level id is used for crawling
if child_rows and any((row.get("child") or []) for row in child_rows):
for city in child_rows:
area_id = str(city.get("id") or "").strip()
city_name = str(city.get("name") or "").strip()
city_py = str(city.get("py") or "").strip()
if not area_id or not city_name:
def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
for attempt in range(max_retries):
try:
resp = self.session.post(LIST_URL, data=payload, timeout=10, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
self._refresh_session()
time.sleep(wait_time)
continue
if area_id in seen_area:
continue
seen_area.add(area_id)
targets.append(
CityTarget(
area_id=area_id,
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_name=city_name,
city_py=city_py,
)
)
else:
if not province_id or not province_name:
continue
if province_id in seen_area:
continue
seen_area.add(province_id)
targets.append(
CityTarget(
area_id=province_id,
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_name=province_name,
city_py=province_py,
)
)
print("请求失败: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error")
return text
except requests.exceptions.RequestException as exc:
print(f"请求失败: {exc}")
return None
return None
return targets
def _build_payload(self, area_id: str, page: int) -> Dict[str, str]:
ua = self.client.headers.get("User-Agent", "")
def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
return {
"AdCode": "",
"RegionId": str(area_id),
"RegionId": str(city_code),
"CategoryId": "",
"MaxNumber": "",
"OnlyData": "true",
"IgnoreButton": "",
"LawyerRecommendRequest[AreaId]": str(area_id),
"LawyerRecommendRequest[AreaId]": str(city_code),
"LawyerRecommendRequest[LawCategoryIds]": "",
"LawyerRecommendRequest[LawFirmPersonCount]": "",
"LawyerRecommendRequest[LawFirmScale]": "",
@@ -268,429 +190,163 @@ class Six4365Crawler:
"LawyerRecommendRequest[RefferUrl]": "",
"LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
"LawyerRecommendRequest[resource_type_name]": "",
"LawyerRecommendRequest[UserAgent]": ua,
"LawyerRecommendRequest[UserAgent]": self.session.headers["User-Agent"],
"LawyerRecommendRequest[AddLawyerWithNoData]": "false",
"ShowCaseButton": "true",
}
def fetch_list_html(self, target: CityTarget, page: int) -> str:
payload = self._build_payload(target.area_id, page)
return self._post_text(
LIST_API_URL,
data=payload,
referer=SITE_BASE + "/findlawyer/",
)
def parse_list_cards(self, html: str) -> List[ListCard]:
def _parse_list(self, html: str, province: str, city: str) -> int:
soup = BeautifulSoup(html, "html.parser")
cards: List[ListCard] = []
seen: Set[str] = set()
lawyers = soup.find_all("a", class_="lawyer")
if not lawyers:
return 0
for anchor in soup.select("a.lawyer[href]"):
href = (anchor.get("href") or "").strip()
detail_urls: List[str] = []
for lawyer in lawyers:
href = lawyer.get("href")
if not href:
continue
detail_url = urljoin(SITE_BASE, href)
if detail_url in seen:
detail_urls.append(f"{href.rstrip('/')}/info/")
if not detail_urls:
return 0
results: List[Dict[str, str]] = []
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
for fut in as_completed(futs):
try:
data = fut.result()
except Exception as exc:
print(f" 详情解析异常: {exc}")
continue
if data:
results.append(data)
if not results:
return len(detail_urls)
existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
for data in results:
if not data:
continue
seen.add(detail_url)
url = data.get("url", "")
if not url:
continue
if url in existing:
print(f" -- 已存在URL: {url}")
continue
try:
self.db.insert_data("lawyer", data)
print(f" -> 新增: {data['name']} ({data['phone']})")
except Exception as exc:
print(f" 插入失败 {url}: {exc}")
name = ""
name_tag = anchor.select_one("b.name")
if name_tag:
name = name_tag.get_text(strip=True)
return len(detail_urls)
specialties: List[str] = []
skill_tag = anchor.select_one("div.skill")
if skill_tag:
raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "")
specialties = [x.strip() for x in re.split(r"[、,]", raw) if x.strip()]
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
html = self._get_detail(url)
if not html:
return None
score_text = ""
score_tag = anchor.select_one("div.info span[title='评分'] em")
if score_tag:
score_text = score_tag.get_text(strip=True)
service_text = ""
service_tag = anchor.select_one("div.info")
if service_tag:
service_text = service_tag.get_text(" ", strip=True)
cards.append(
ListCard(
detail_url=detail_url,
name=name,
specialties=specialties,
score_text=score_text,
service_text=service_text,
)
)
return cards
def parse_detail(self, detail_url: str) -> Dict:
info_url = detail_url.rstrip("/") + "/info/"
html = self._get_text(info_url, referer=detail_url)
soup = BeautifulSoup(html, "html.parser")
base_info = soup.find("ul", class_="intro-basic-bar")
if not base_info:
return None
name = ""
law_firm = ""
phone = ""
practice_years: Optional[int] = None
office_area = ""
address = ""
specialties: List[str] = []
for li in soup.select("ul.intro-basic-bar li"):
label_tag = li.select_one("span.label")
value_tag = li.select_one("div.txt")
if not label_tag or not value_tag:
for li in base_info.find_all("li"):
label = li.find("span", class_="label")
txt = li.find("div", class_="txt")
if not label or not txt:
continue
label_text = label.get_text(strip=True)
if "姓名" in label_text:
name = txt.get_text(strip=True)
if "执业律所" in label_text:
law_firm = txt.get_text(strip=True)
label = label_tag.get_text(" ", strip=True).replace("：", "")
value = value_tag.get_text(" ", strip=True)
more_section = soup.find("div", class_="more-intro-basic")
if more_section:
phone_ul = more_section.find("ul", class_="intro-basic-bar")
if phone_ul:
for li in phone_ul.find_all("li"):
label = li.find("span", class_="label")
txt = li.find("div", class_="txt")
if label and txt and "联系电话" in label.get_text(strip=True):
phone = txt.get_text(strip=True).replace(" ", "")
break
if "姓名" in label and not name:
name = value
elif "执业律所" in label and not law_firm:
law_firm = value
elif "联系电话" in label and not phone:
phone = normalize_phone(value)
elif "执业年限" in label and practice_years is None:
year_match = YEAR_RE.search(value)
if year_match:
try:
practice_years = int(year_match.group(1))
except Exception:
practice_years = None
elif "办公地区" in label and not office_area:
office_area = value
elif "办公地址" in label and not address:
address = value
text = soup.get_text(" ", strip=True)
if not phone:
phone = normalize_phone(text)
if not name and soup.title:
title = soup.title.get_text(" ", strip=True)
match = re.search(r"([^\s_,。]+?)律师", title)
if match:
name = match.group(1).strip()
skill_match = re.search(r"擅长:([^\n]+)", text)
if skill_match:
specialties = [x.strip() for x in re.split(r"[、,]", skill_match.group(1)) if x.strip()]
return {
"name": name,
"law_firm": law_firm,
"phone": phone,
"practice_years": practice_years,
"office_area": office_area,
"address": address,
"specialties": specialties,
"detail_url": detail_url,
"info_url": info_url,
}
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
seen_detail_urls: Set[str] = set()
page_first_seen: Set[str] = set()
for page in range(1, self.max_pages + 1):
try:
html = self.fetch_list_html(target, page)
except Exception as exc:
print(f"[list] 失败 area={target.area_id} p{page}: {exc}")
break
cards = self.parse_list_cards(html)
if not cards:
break
first_url = cards[0].detail_url
if first_url in page_first_seen:
break
page_first_seen.add(first_url)
for card in cards:
if card.detail_url in seen_detail_urls:
continue
seen_detail_urls.add(card.detail_url)
try:
detail = self.parse_detail(card.detail_url)
except Exception as exc:
print(f"[detail] 失败 {card.detail_url}: {exc}")
continue
now = int(time.time())
uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url)
uid = uid_match.group(1) if uid_match else card.detail_url
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"province_id": target.province_id,
"province": target.province_name,
"province_py": target.province_py,
"area_id": target.area_id,
"city": target.city_name,
"city_py": target.city_py,
"page": page,
"detail_url": card.detail_url,
"info_url": detail.get("info_url", ""),
},
"list_snapshot": {
"name": card.name,
"specialties": card.specialties,
"score_text": card.score_text,
"service_text": card.service_text,
},
"profile": {
"name": detail.get("name") or card.name,
"law_firm": detail.get("law_firm") or "",
"phone": detail.get("phone") or "",
"practice_years": detail.get("practice_years"),
"office_area": detail.get("office_area") or "",
"address": detail.get("address") or "",
"specialties": detail.get("specialties") or card.specialties,
},
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
phone = phone.replace('-', '').strip()
if not name or not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
data = {
"phone": phone,
"province": province,
"city": city,
"phone": phone,
"url": (source.get("info_url") or source.get("detail_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
"law_firm": law_firm,
"url": url,
"domain": DOMAIN,
"name": name,
"create_time": int(time.time()),
"params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
}
return data
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
continue
def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
session = self._get_thread_session()
for attempt in range(max_retries):
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
return inserted, skipped
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现地区 {len(cities)}")
if city_filter:
key = city_filter.strip().lower()
cities = [
c for c in cities
if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id
]
print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断地区数 {len(cities)}")
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
resp = session.get(url, timeout=10, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
self._refresh_thread_session()
session = self._get_thread_session()
time.sleep(wait_time)
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
print(" 请求失败: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error")
return text
except requests.exceptions.RequestException as exc:
print(f" 请求失败: {exc}")
return None
return None
total_new_json = 0
total_new_db = 0
total_skip_db = 0
def run(self):
print("启动律图采集...")
if not self.cities:
print("无城市数据")
return
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"(area={target.area_id})"
)
city_records = list(self.crawl_city(target))
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个地区,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个地区最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市名称/拼音/编码过滤",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="详情页请求间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = Six4365Crawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = Six4365Crawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
for city_code, info in self.cities.items():
province = info.get("province_name", "")
city = info.get("name", "")
print(f"采集 {province}-{city}")
page = 1
while True:
payload = self._build_payload(city_code, page)
html = self._post(payload)
if not html:
break
link_count = self._parse_list(html, province, city)
if link_count == 0:
break
page += 1
print("律图采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = Six4365Spider(db)
spider.run()