重构采集脚本并新增按时间导出Excel

- 统一五个站点采集逻辑与启动脚本
- 新增 dls_fresh 采集流程与日志优化
- 新增 export_lawyers_excel 按时间条件导出
- 默认导出近7天并支持扩展字段解析
- 整理 .gitignore，忽略 data/logs 本地产物
2026-03-02 11:46:05 +08:00
parent 03847a4b8e
commit 19cf9ce901
12 changed files with 3545 additions and 1059 deletions
@@ -1,13 +1,16 @@
+import argparse
+import hashlib
 import json
 import os
+import random
 import re
 import sys
 import time
-import random
-from typing import Dict, Optional, List, Set
-from urllib.parse import urljoin
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
+from dataclasses import dataclass, field
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+
+import urllib3
+from bs4 import BeautifulSoup

 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)
@@ -17,262 +20,628 @@ if request_dir not in sys.path:
 if project_root not in sys.path:
    sys.path.append(project_root)

-import urllib3
-from bs4 import BeautifulSoup
+from Db import Db
 from request.requests_client import RequestClientError, RequestsClient
+from utils.rate_limiter import wait_for_request

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-from Db import Db
-from config import LAWTIME_CONFIG
+SITE_NAME = "lawtime"
+LEGACY_DOMAIN = "法律快车"
+SITE_BASE = "https://www.lawtime.cn"
+PROVINCE_API = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode=0"
+CITY_API_TEMPLATE = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode={province_id}"
+LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/lawyer"

-LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
-DETAIL_BASE = "https://m.lawtime.cn"
-DOMAIN = "法律快车"
+PHONE_RE = re.compile(r"1[3-9]\d{9}")
+YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")


-class LawtimeSpider:
-    def __init__(self, db_connection):
+@dataclass
+class CityTarget:
+    province_id: str
+    province_name: str
+    province_py: str
+    city_id: str
+    city_name: str
+    city_py: str
+
+
+@dataclass
+class ListCard:
+    detail_url: str
+    name: str
+    phone: str
+    address: str = ""
+    specialties: List[str] = field(default_factory=list)
+    metric_text: str = ""
+
+
+def normalize_phone(text: str) -> str:
+    compact = re.sub(r"\D", "", text or "")
+    match = PHONE_RE.search(compact)
+    return match.group(0) if match else ""
+
+
+class LawtimeCrawler:
+    def __init__(
+        self,
+        max_pages: int = 9999,
+        sleep_seconds: float = 0.1,
+        use_proxy: bool = True,
+        db_connection=None,
+    ):
+        self.max_pages = max_pages
+        self.sleep_seconds = max(0.0, sleep_seconds)
        self.db = db_connection
-        self.client = self._build_session()
-        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
-        self._tls = threading.local()
+        self.client = RequestsClient(
+            headers={
+                "User-Agent": (
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/122.0.0.0 Safari/537.36"
+                ),
+                "Accept": "text/html,application/json,*/*;q=0.8",
+                "Connection": "close",
+            },
+            use_proxy=use_proxy,
+            retry_total=2,
+            retry_backoff_factor=1,
+            retry_status_forcelist=(429, 500, 502, 503, 504),
+            retry_allowed_methods=("GET",),
+        )

-    def _build_session(self) -> RequestsClient:
-        headers = LAWTIME_CONFIG.get("HEADERS", {})
-        custom_headers = dict(headers) if headers else {}
-        custom_headers.setdefault("Connection", "close")
-        return RequestsClient(headers=custom_headers)
+    def _get_text(
+        self,
+        url: str,
+        *,
+        timeout: int = 20,
+        max_retries: int = 3,
+        referer: str = SITE_BASE,
+    ) -> str:
+        headers = {"Referer": referer}
+        last_error: Optional[Exception] = None

-    def _refresh_session(self) -> None:
-        self.client.refresh()
+        for attempt in range(max_retries):
+            wait_for_request()
+            try:
+                resp = self.client.get_text(
+                    url,
+                    timeout=timeout,
+                    verify=False,
+                    headers=headers,
+                )
+                code = resp.status_code
+                if code == 403:
+                    if attempt < max_retries - 1:
+                        self.client.refresh()
+                        time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                        continue
+                    raise RequestClientError(f"{code} Error: {url}")
+                if code >= 500 and attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                if code >= 400:
+                    raise RequestClientError(f"{code} Error: {url}")
+                return resp.text
+            except Exception as exc:
+                last_error = exc
+                if attempt < max_retries - 1:
+                    time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
+                    continue
+                raise

-    def _get_thread_session(self) -> RequestsClient:
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            return s
-        s = self.client.clone()
-        self._tls.session = s
-        return s
+        if last_error is not None:
+            raise last_error
+        raise RequestClientError(f"Unknown request error: {url}")

-    def _refresh_thread_session(self) -> None:
-        s = getattr(self._tls, "session", None)
-        if s is not None:
-            s.close()
-        self._tls.session = None
+    def _get_json(self, url: str, *, referer: str) -> List[Dict]:
+        text = self._get_text(url, referer=referer)
+        cleaned = (text or "").strip().lstrip("\ufeff")
+        if not cleaned or cleaned.startswith("<"):
+            return []
+        try:
+            data = json.loads(cleaned)
+        except ValueError:
+            return []
+        return data if isinstance(data, list) else []

-    def _existing_phones(self, phones: List[str]) -> Set[str]:
-        if not phones:
+    def discover_cities(self) -> List[CityTarget]:
+        provinces = self._get_json(PROVINCE_API, referer=SITE_BASE)
+        if not provinces:
+            print("[discover] 地区接口未返回有效数据")
+            return []
+
+        results: List[CityTarget] = []
+        seen_py: Set[str] = set()
+
+        for province in provinces:
+            province_id = str(province.get("id") or "").strip()
+            province_name = str(province.get("province") or province.get("city") or "").strip()
+            province_py = str(province.get("pinyin") or "").strip()
+            if not province_id or not province_name:
+                continue
+
+            city_api = CITY_API_TEMPLATE.format(province_id=province_id)
+            try:
+                cities = self._get_json(city_api, referer=LIST_URL_TEMPLATE.format(city_py=province_py or ""))
+            except Exception as exc:
+                print(f"[city] 获取失败 province={province_id}: {exc}")
+                continue
+
+            if not cities:
+                cities = [
+                    {
+                        "id": province_id,
+                        "province": province_name,
+                        "city": province_name,
+                        "pinyin": province_py,
+                    }
+                ]
+
+            for city in cities:
+                city_id = str(city.get("id") or "").strip()
+                city_name = str(city.get("city") or city.get("province") or "").strip()
+                city_py = str(city.get("pinyin") or "").strip()
+                if not city_id or not city_name or not city_py:
+                    continue
+                if city_py in seen_py:
+                    continue
+                seen_py.add(city_py)
+
+                results.append(
+                    CityTarget(
+                        province_id=province_id,
+                        province_name=province_name,
+                        province_py=province_py,
+                        city_id=city_id,
+                        city_name=city_name,
+                        city_py=city_py,
+                    )
+                )
+
+        return results
+
+    def _build_list_url(self, city_py: str, page: int) -> str:
+        base = LIST_URL_TEMPLATE.format(city_py=city_py)
+        if page <= 1:
+            return base
+        return f"{base}?page={page}"
+
+    def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[ListCard], bool, str]:
+        list_url = self._build_list_url(target.city_py, page)
+        html = self._get_text(list_url, referer=SITE_BASE + "/")
+
+        cards = self.parse_list_cards(html)
+
+        soup = BeautifulSoup(html, "html.parser")
+        next_link = soup.select_one(f"div.page a[href*='page={page + 1}']")
+        has_next = next_link is not None
+
+        return cards, has_next, list_url
+
+    def parse_list_cards(self, html: str) -> List[ListCard]:
+        soup = BeautifulSoup(html, "html.parser")
+        cards: List[ListCard] = []
+        seen: Set[str] = set()
+
+        for item in soup.select("li.lawyer-item-card"):
+            link_tag = item.select_one("a.name[href]") or item.select_one("a.lawyer-img-box[href]")
+            if not link_tag:
+                continue
+            detail_url = (link_tag.get("href") or "").strip()
+            if not detail_url.startswith("http"):
+                continue
+            if detail_url in seen:
+                continue
+            seen.add(detail_url)
+
+            name = link_tag.get_text(strip=True)
+            phone = ""
+            phone_tag = item.select_one("div.phone")
+            if phone_tag:
+                phone = normalize_phone(phone_tag.get_text(" ", strip=True))
+
+            address = ""
+            addr_tag = item.select_one("div.location .txt")
+            if addr_tag:
+                address = addr_tag.get_text(" ", strip=True)
+
+            specialties: List[str] = []
+            prof_tag = item.select_one("div.prof .txt")
+            if prof_tag:
+                specialties = [
+                    x.strip() for x in re.split(r"[、,，]", prof_tag.get_text(" ", strip=True)) if x.strip()
+                ]
+
+            metric_text = ""
+            metric_tag = item.select_one("div.num-msg")
+            if metric_tag:
+                metric_text = metric_tag.get_text(" ", strip=True)
+
+            cards.append(
+                ListCard(
+                    detail_url=detail_url,
+                    name=name,
+                    phone=phone,
+                    address=address,
+                    specialties=specialties,
+                    metric_text=metric_text,
+                )
+            )
+
+        return cards
+
+    def parse_detail(self, detail_url: str) -> Dict:
+        html = self._get_text(detail_url, referer=SITE_BASE)
+        if "网站防火墙" in html or "访问被拒绝" in html or "/ipfilter/verify" in html:
+            raise RequestClientError(f"firewall blocked: {detail_url}")
+
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text(" ", strip=True)
+
+        name = ""
+        law_firm = ""
+        phone = ""
+        address = ""
+        practice_years: Optional[int] = None
+        specialties: List[str] = []
+
+        if soup.title:
+            title = soup.title.get_text(" ", strip=True)
+            match = re.search(r"([^\s_，,。]+?)律师", title)
+            if match:
+                name = match.group(1).strip()
+
+        phone_candidates = [
+            soup.select_one(".data-w .tel-b b").get_text(" ", strip=True)
+            if soup.select_one(".data-w .tel-b b")
+            else "",
+            soup.select_one(".law-info-b .item .two-r.b").get_text(" ", strip=True)
+            if soup.select_one(".law-info-b .item .two-r.b")
+            else "",
+            text,
+        ]
+        for candidate in phone_candidates:
+            phone = normalize_phone(candidate)
+            if phone:
+                break
+
+        law_firm_tag = soup.select_one(".law-info-b .item .two-nowrap")
+        if law_firm_tag:
+            law_firm = law_firm_tag.get_text(" ", strip=True)
+
+        for li in soup.select(".law-info-b .item"):
+            li_text = li.get_text(" ", strip=True)
+            if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
+                law_firm = li_text
+
+        addr_tag = soup.select_one(".law-info-b .item .two-r[title]")
+        if addr_tag:
+            addr_value = (addr_tag.get("title") or "").strip()
+            if len(addr_value) > 8:
+                address = addr_value
+
+        if not address:
+            addr_tag = soup.select_one(".law-info-b .item .two-r")
+            if addr_tag:
+                addr_value = addr_tag.get_text(" ", strip=True)
+                if len(addr_value) > 8 and "律师" not in addr_value:
+                    address = addr_value
+
+        year_match = YEAR_RE.search(text)
+        if year_match:
+            try:
+                practice_years = int(year_match.group(1))
+            except Exception:
+                practice_years = None
+
+        specialties = [x.get_text(strip=True) for x in soup.select(".profession-b .item") if x.get_text(strip=True)]
+
+        return {
+            "name": name,
+            "law_firm": law_firm,
+            "phone": phone,
+            "address": address,
+            "practice_years": practice_years,
+            "specialties": specialties,
+            "detail_url": detail_url,
+        }
+
+    def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
+        seen_details: Set[str] = set()
+
+        for page in range(1, self.max_pages + 1):
+            try:
+                cards, has_next, list_url = self.fetch_list_page(target, page)
+            except Exception as exc:
+                print(f"[list] 失败 {target.city_py} p{page}: {exc}")
+                break
+
+            if not cards:
+                break
+
+            for card in cards:
+                if card.detail_url in seen_details:
+                    continue
+                seen_details.add(card.detail_url)
+
+                detail: Dict = {}
+                try:
+                    detail = self.parse_detail(card.detail_url)
+                except Exception as exc:
+                    print(f"[detail] 失败 {card.detail_url}: {exc}")
+
+                phone = normalize_phone(detail.get("phone") or card.phone)
+                profile_name = (detail.get("name") or card.name).replace("律师", "").strip()
+
+                now = int(time.time())
+                record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()
+
+                yield {
+                    "record_id": record_id,
+                    "collected_at": now,
+                    "source": {
+                        "site": SITE_NAME,
+                        "province_id": target.province_id,
+                        "province": target.province_name,
+                        "province_py": target.province_py,
+                        "city_id": target.city_id,
+                        "city": target.city_name,
+                        "city_py": target.city_py,
+                        "page": page,
+                        "list_url": list_url,
+                        "detail_url": card.detail_url,
+                    },
+                    "list_snapshot": {
+                        "name": card.name,
+                        "phone": card.phone,
+                        "address": card.address,
+                        "specialties": card.specialties,
+                        "metric_text": card.metric_text,
+                    },
+                    "profile": {
+                        "name": profile_name,
+                        "law_firm": (detail.get("law_firm") or "").strip(),
+                        "phone": phone,
+                        "address": (detail.get("address") or card.address or "").strip(),
+                        "practice_years": detail.get("practice_years"),
+                        "specialties": detail.get("specialties") or card.specialties,
+                    },
+                }
+
+                if self.sleep_seconds:
+                    time.sleep(self.sleep_seconds)
+
+            if not has_next:
+                break
+
+    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
+        source = record.get("source", {}) or {}
+        profile = record.get("profile", {}) or {}
+
+        phone = normalize_phone(profile.get("phone", ""))
+        if not phone:
+            return None
+
+        province = (source.get("province") or "").strip()
+        city = (source.get("city") or province).strip()
+        return {
+            "name": (profile.get("name") or "").strip(),
+            "law_firm": (profile.get("law_firm") or "").strip(),
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
+            "domain": LEGACY_DOMAIN,
+            "create_time": int(record.get("collected_at") or time.time()),
+            "params": json.dumps(record, ensure_ascii=False),
+        }
+
+    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
+        if not self.db or not phones:
            return set()
+
+        deduped = sorted({p for p in phones if p})
+        if not deduped:
+            return set()
+
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
-            for i in range(0, len(phones), chunk_size):
-                chunk = phones[i:i + chunk_size]
+            for i in range(0, len(deduped), chunk_size):
+                chunk = deduped[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
-                cur.execute(sql, [DOMAIN, *chunk])
+                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
+
        return existing

-    def _load_areas(self):
-        condition = "level = 2 and domain='法律快车'"
-        tables = ("area_new", "area", "area2")
-        last_error = None
-        for table in tables:
-            try:
-                rows = self.db.select_data(table, "pinyin, province, city", condition) or []
-            except Exception as exc:
-                last_error = exc
+    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
+        if not self.db:
+            return 0, 0
+
+        rows: List[Dict[str, str]] = []
+        for record in records:
+            row = self._to_legacy_lawyer_row(record)
+            if row:
+                rows.append(row)
+        if not rows:
+            return 0, 0
+
+        existing = self._existing_phones_in_db([row["phone"] for row in rows])
+        inserted = 0
+        skipped = 0
+
+        for row in rows:
+            phone = row.get("phone", "")
+            if not phone or phone in existing:
+                skipped += 1
                continue
-            if rows:
-                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
-                print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
-                return rows
-
-        if last_error:
-            print(f"[法律快车] 加载地区数据失败: {last_error}")
-        print("[法律快车] 无城市数据（已尝试 area_new/area/area2）")
-        return []
-
-    def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
-        return self._get_with_session(self.client, url, max_retries=max_retries, is_thread=False)
-
-    def _get_with_session(self, session: RequestsClient, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
-        for attempt in range(max_retries):
            try:
-                resp = session.get_text(url, timeout=15, verify=False)
-                status_code = resp.status_code
-                text = resp.text
-                if status_code == 403:
-                    if attempt < max_retries - 1:
-                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
-                        print(f"请求失败 {url}: 403，{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
-                        if is_thread:
-                            self._refresh_thread_session()
-                            session = self._get_thread_session()
-                        else:
-                            self._refresh_session()
-                            session = self.client
-                        time.sleep(wait_time)
+                self.db.insert_data("lawyer", row)
+                existing.add(phone)
+                inserted += 1
+            except Exception as exc:
+                skipped += 1
+                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
+
+        return inserted, skipped
+
+    def crawl(
+        self,
+        output_path: str,
+        max_cities: int = 0,
+        city_filter: Optional[str] = None,
+    ) -> None:
+        cities = self.discover_cities()
+        print(f"[discover] 共发现城市 {len(cities)} 个")
+
+        if city_filter:
+            key = city_filter.strip().lower()
+            cities = [
+                c for c in cities
+                if key in c.city_py.lower() or key in c.city_name.lower()
+            ]
+            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
+
+        if max_cities > 0:
+            cities = cities[:max_cities]
+            print(f"[discover] 截断城市数 {len(cities)}")
+
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+        seen_ids: Set[str] = set()
+        if os.path.exists(output_path):
+            with open(output_path, "r", encoding="utf-8") as old_file:
+                for line in old_file:
+                    line = line.strip()
+                    if not line:
                        continue
-                    print(f"请求失败 {url}: 403 Forbidden")
-                    return None
-                if status_code >= 400:
-                    raise RequestClientError(f"{status_code} Error: {url}")
-                return text
-            except RequestClientError as exc:
-                print(f"请求失败 {url}: {exc}")
-                return None
-        return None
+                    try:
+                        item = json.loads(line)
+                    except Exception:
+                        continue
+                    rid = item.get("record_id")
+                    if rid:
+                        seen_ids.add(rid)
+            print(f"[resume] 已有记录 {len(seen_ids)} 条")

-    def _parse_list(self, html: str, province: str, city: str) -> int:
-        soup = BeautifulSoup(html, "html.parser")
-        links = [a.get("href", "") for a in soup.select("a.hide_link")]
-        links = [link.replace("lll", "int") for link in links if link]
-        if not links:
-            return 0
+        total_new_json = 0
+        total_new_db = 0
+        total_skip_db = 0

-        detail_urls = [urljoin(DETAIL_BASE, link) for link in links]
+        with open(output_path, "a", encoding="utf-8") as out:
+            for idx, target in enumerate(cities, start=1):
+                print(
+                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
+                    f"({target.city_py})"
+                )
+                city_records = list(self.crawl_city(target))

-        results: List[Dict[str, str]] = []
-        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
-            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
-            for fut in as_completed(futs):
-                try:
-                    data = fut.result()
-                except Exception as exc:
-                    print(f"  详情解析异常: {exc}")
-                    continue
-                if data and data.get("phone"):
-                    results.append(data)
+                city_new_json = 0
+                for record in city_records:
+                    rid = record["record_id"]
+                    if rid in seen_ids:
+                        continue
+                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    seen_ids.add(rid)
+                    city_new_json += 1
+                    total_new_json += 1

-        if not results:
-            return len(detail_urls)
+                city_new_db, city_skip_db = self._write_records_to_db(city_records)
+                total_new_db += city_new_db
+                total_skip_db += city_skip_db

-        phones = [d["phone"] for d in results if d.get("phone")]
-        existing = self._existing_phones(phones)
+                print(
+                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
+                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
+                )

-        for data in results:
-            phone = data.get("phone")
-            if not phone:
-                continue
-            if phone in existing:
-                print(f"  -- 已存在: {data['name']} ({phone})")
-                continue
-            try:
-                self.db.insert_data("lawyer", data)
-                print(f"  -> 新增: {data['name']} ({phone})")
-            except Exception as exc:
-                print(f"  插入失败 {data.get('url')}: {exc}")
+        print(
+            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
+            f"DB跳过{total_skip_db}条, 输出: {output_path}"
+        )

-        return len(detail_urls)

-    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
-        html = None
-        sess = self._get_thread_session()
-        html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
-        if not html:
-            return None
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="法律快车全新采集脚本（站点数据直采）")
+    parser.add_argument(
+        "--output",
+        default="/www/wwwroot/lawyers/data/lawtime_records_all.jsonl",
+        help="输出 jsonl 文件路径",
+    )
+    parser.add_argument(
+        "--max-cities",
+        type=int,
+        default=0,
+        help="最多采集多少个城市，0 表示不限",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=9999,
+        help="每个城市最多采集多少页",
+    )
+    parser.add_argument(
+        "--city-filter",
+        default="",
+        help="按城市拼音或城市名过滤，如 beijing",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=float,
+        default=0.1,
+        help="详情页请求间隔秒数",
+    )
+    parser.add_argument(
+        "--direct",
+        action="store_true",
+        help="直连模式，不使用 proxy_settings.json 代理",
+    )
+    parser.add_argument(
+        "--no-db",
+        action="store_true",
+        help="只输出 JSONL，不写入数据库",
+    )
+    return parser.parse_args()

-        soup = BeautifulSoup(html, "html.parser")
-        text = soup.get_text(" ")

-        name = ""
-        title_tag = soup.find("title")
-        if title_tag:
-            match = re.search(r"(\S+)律师", title_tag.get_text())
-            if match:
-                name = match.group(1)
-        if not name:
-            intl_div = soup.find("div", class_="intl")
-            if intl_div:
-                match = re.search(r"(\S+)律师", intl_div.get_text())
-                if match:
-                    name = match.group(1)
+def main():
+    args = parse_args()

-        phone = ""
-        phone_pattern = r"1[3-9]\d{9}"
-        for item in soup.select("div.item.flex"):
-            label = item.find("div", class_="label")
-            desc = item.find("div", class_="desc")
-            if not label or not desc:
-                continue
-            label_text = label.get_text()
-            desc_text = desc.get_text().replace("-", "")
-            if "联系电话" in label_text or "电话" in label_text:
-                matches = re.findall(phone_pattern, desc_text)
-                if matches:
-                    phone = matches[0]
-                    break
-        if not phone:
-            matches = re.findall(phone_pattern, text.replace("-", ""))
-            if matches:
-                phone = matches[0]
-        if not phone:
-            print(f"  无手机号: {url}")
-            return None
+    if args.no_db:
+        crawler = LawtimeCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=None,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )
+        return

-        law_firm = ""
-        for item in soup.select("div.item.flex"):
-            label = item.find("div", class_="label")
-            desc = item.find("div", class_="desc")
-            if not label or not desc:
-                continue
-            if "执业律所" in label.get_text() or "律所" in label.get_text():
-                law_firm = desc.get_text(strip=True).replace("已认证", "")
-                break
-
-        params = {
-            "list_url": url,
-            "province": province,
-            "city": city,
-        }
-
-        return {
-            "name": name or "",
-            "law_firm": law_firm,
-            "province": province,
-            "city": city,
-            "phone": phone,
-            "url": url,
-            "domain": DOMAIN,
-            "create_time": int(time.time()),
-            "params": json.dumps(params, ensure_ascii=False)
-        }
-
-    def run(self):
-        print("启动法律快车采集...")
-        areas = self._load_areas()
-        if not areas:
-            print("无地区数据")
-            return
-
-        for area in areas:
-            pinyin = area.get("pinyin")
-            province = area.get("province", "")
-            city = area.get("city", "")
-            if not pinyin:
-                continue
-            page = 1
-            while True:
-                list_url = LIST_BASE.format(pinyin=pinyin, page=page)
-                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
-                html = self._get(list_url)
-                if not html:
-                    break
-                link_count = self._parse_list(html, province, city)
-                if link_count == 0:
-                    break
-                page += 1
-        print("法律快车采集完成")
+    with Db() as db:
+        crawler = LawtimeCrawler(
+            max_pages=args.max_pages,
+            sleep_seconds=args.sleep,
+            use_proxy=not args.direct,
+            db_connection=db,
+        )
+        crawler.crawl(
+            output_path=args.output,
+            max_cities=args.max_cities,
+            city_filter=args.city_filter or None,
+        )


 if __name__ == "__main__":
-    with Db() as db:
-        spider = LawtimeSpider(db)
-        spider.run()
+    main()