重构采集脚本并新增按时间导出Excel
- 统一五个站点采集逻辑与启动脚本
- 新增 dls_fresh 采集流程与日志优化
- 新增 export_lawyers_excel 按时间条件导出
- 默认导出近7天并支持扩展字段解析
- 整理 .gitignore,忽略 data/logs 本地产物
This commit is contained in:
+606
-272
@@ -1,10 +1,18 @@
|
||||
import argparse
|
||||
import ast
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import random
|
||||
from typing import Dict, Optional
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(current_dir)
|
||||
@@ -14,312 +22,638 @@ if request_dir not in sys.path:
|
||||
if project_root not in sys.path:
|
||||
sys.path.append(project_root)
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from request.requests_client import RequestClientError, RequestsClient
|
||||
|
||||
from Db import Db
|
||||
from config import HEADERS
|
||||
from request.requests_client import RequestClientError, RequestsClient
|
||||
from utils.rate_limiter import wait_for_request
|
||||
|
||||
LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
|
||||
DOMAIN = "华律"
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
SITE_NAME = "hualv"
LEGACY_DOMAIN = "华律"  # value stored in the legacy lawyer.domain column
SITE_BASE = "https://m.66law.cn"
# main-v2.0.js embeds the province/city tables parsed by discover_cities().
CITY_DATA_URL = "https://cache.66law.cn/dist/main-v2.0.js"
LIST_API_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"

# Mainland-China mobile number: 1 + [3-9] + nine more digits.
PHONE_RE = re.compile(r"1[3-9]\d{9}")
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Matches "执业 N 年" ("N years in practice") in profile text.
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")


@dataclass
class CityTarget:
    """One province/city pair to crawl, discovered from the site's JS data."""

    province_id: int    # site-internal province id (pid in the RPC payload)
    province_name: str
    city_id: int        # site-internal city id (cid in the RPC payload)
    city_name: str


def normalize_phone(text: str) -> str:
    """Strip all non-digits from *text* and return the first mobile number, or ''."""
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def strip_html_tags(text: str) -> str:
    """Remove anything that looks like an HTML tag and trim surrounding whitespace."""
    return re.sub(r"<[^>]+>", "", text or "").strip()
|
||||
class HualvCrawler:
|
||||
def __init__(
|
||||
self,
|
||||
max_pages: int = 9999,
|
||||
sleep_seconds: float = 0.15,
|
||||
use_proxy: bool = True,
|
||||
db_connection=None,
|
||||
):
|
||||
self.max_pages = max_pages
|
||||
self.sleep_seconds = max(0.0, sleep_seconds)
|
||||
self.db = db_connection
|
||||
self.client = self._build_session()
|
||||
self.areas = self._load_areas()
|
||||
|
||||
def _build_session(self) -> RequestsClient:
|
||||
custom_headers = HEADERS.copy()
|
||||
custom_headers['User-Agent'] = (
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
|
||||
'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
|
||||
'Mobile/15E148 Safari/604.1'
|
||||
self.client = RequestsClient(
|
||||
headers={
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
||||
"Mobile/15E148 Safari/604.1"
|
||||
),
|
||||
"Accept": "application/json, text/javascript, */*; q=0.01",
|
||||
"X-Requested-With": "XMLHttpRequest",
|
||||
"Connection": "close",
|
||||
},
|
||||
use_proxy=use_proxy,
|
||||
retry_total=2,
|
||||
retry_backoff_factor=1,
|
||||
retry_status_forcelist=(429, 500, 502, 503, 504),
|
||||
retry_allowed_methods=("GET", "POST"),
|
||||
)
|
||||
custom_headers["Connection"] = "close"
|
||||
return RequestsClient(headers=custom_headers)
|
||||
|
||||
def _refresh_session(self) -> None:
|
||||
self.client.refresh()
|
||||
def _request_text(
|
||||
self,
|
||||
method: str,
|
||||
url: str,
|
||||
*,
|
||||
timeout: int = 20,
|
||||
max_retries: int = 3,
|
||||
referer: str = SITE_BASE,
|
||||
data: Optional[Dict] = None,
|
||||
) -> str:
|
||||
headers = {"Referer": referer}
|
||||
last_error: Optional[Exception] = None
|
||||
|
||||
def _load_areas(self):
|
||||
tables = ("area_new", "area2", "area")
|
||||
last_error = None
|
||||
for table in tables:
|
||||
for attempt in range(max_retries):
|
||||
wait_for_request()
|
||||
try:
|
||||
provinces = self.db.select_data(
|
||||
table,
|
||||
"code, province, pinyin, id",
|
||||
"domain='66law' AND level=1"
|
||||
) or []
|
||||
cities = self.db.select_data(
|
||||
table,
|
||||
"code, city, province, pid",
|
||||
"domain='66law' AND level=2"
|
||||
) or []
|
||||
if method.upper() == "POST":
|
||||
resp = self.client.post_text(
|
||||
url,
|
||||
timeout=timeout,
|
||||
verify=False,
|
||||
headers=headers,
|
||||
data=data,
|
||||
)
|
||||
else:
|
||||
resp = self.client.get_text(
|
||||
url,
|
||||
timeout=timeout,
|
||||
verify=False,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
code = resp.status_code
|
||||
if code == 403:
|
||||
if attempt < max_retries - 1:
|
||||
self.client.refresh()
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
raise RequestClientError(f"{code} Error: {url}")
|
||||
if code >= 500 and attempt < max_retries - 1:
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
if code >= 400:
|
||||
raise RequestClientError(f"{code} Error: {url}")
|
||||
return resp.text
|
||||
except Exception as exc:
|
||||
last_error = exc
|
||||
continue
|
||||
if attempt < max_retries - 1:
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
raise
|
||||
|
||||
if not cities:
|
||||
continue
|
||||
if last_error is not None:
|
||||
raise last_error
|
||||
raise RequestClientError(f"Unknown request error: {url}")
|
||||
|
||||
province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces}
|
||||
city_map = {}
|
||||
for city in cities:
|
||||
province_info = province_map.get(city.get('pid'), {}) or {}
|
||||
province_code = province_info.get('code')
|
||||
city_map[city.get('code')] = {
|
||||
"name": city.get('city'),
|
||||
"province": city.get('province'),
|
||||
"province_code": province_code,
|
||||
}
|
||||
print(f"[华律] 城市来源表: {table}, 城市数: {len(cities)}")
|
||||
return city_map
|
||||
|
||||
if last_error:
|
||||
print(f"[华律] 加载地区数据失败: {last_error}")
|
||||
print("[华律] 无城市数据(已尝试 area_new/area2/area)")
|
||||
return {}
|
||||
|
||||
def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
resp = self.client.post_text(LIST_URL, data=data, timeout=20, verify=False)
|
||||
status_code = resp.status_code
|
||||
text = resp.text
|
||||
if status_code == 403:
|
||||
if attempt < max_retries - 1:
|
||||
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||
self._refresh_session()
|
||||
time.sleep(wait_time)
|
||||
continue
|
||||
print("请求失败: 403 Forbidden")
|
||||
return None
|
||||
if status_code >= 400:
|
||||
raise RequestClientError(f"{status_code} Error")
|
||||
try:
|
||||
return json.loads(text)
|
||||
except ValueError as exc:
|
||||
print(f"解析JSON失败: {exc}")
|
||||
return None
|
||||
except RequestClientError as exc:
|
||||
print(f"请求失败: {exc}")
|
||||
return None
|
||||
return None
|
||||
|
||||
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
|
||||
contact_url = f"{url}lawyer_contact.aspx"
|
||||
print(f" 详情: {contact_url}")
|
||||
existing = self.db.select_data(
|
||||
"lawyer",
|
||||
"id, avatar_url",
|
||||
f"domain='{DOMAIN}' AND url='{contact_url}'"
|
||||
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
|
||||
return self._request_text(
|
||||
"GET",
|
||||
url,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
referer=referer,
|
||||
)
|
||||
existing_id = None
|
||||
if existing:
|
||||
existing_id = existing[0].get("id")
|
||||
avatar = (existing[0].get("avatar_url") or "").strip()
|
||||
if avatar:
|
||||
print(" -- 已存在且头像已补全,跳过")
|
||||
return None
|
||||
|
||||
html = self._get_detail(contact_url)
|
||||
if not html:
|
||||
return None
|
||||
def _post_text(
|
||||
self,
|
||||
url: str,
|
||||
*,
|
||||
data: Dict,
|
||||
timeout: int = 20,
|
||||
max_retries: int = 3,
|
||||
referer: str = SITE_BASE,
|
||||
) -> str:
|
||||
return self._request_text(
|
||||
"POST",
|
||||
url,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
referer=referer,
|
||||
data=data,
|
||||
)
|
||||
|
||||
def _extract_spc_location(self, script_text: str) -> List:
|
||||
# main-v2.js 内置了 sPCLocation=new Array(...),后面紧跟 cateinfo 数组
|
||||
marker = "sPCLocation = new Array("
|
||||
start = script_text.find(marker)
|
||||
if start == -1:
|
||||
marker = "sPCLocation=new Array("
|
||||
start = script_text.find(marker)
|
||||
if start == -1:
|
||||
return []
|
||||
start += len(marker)
|
||||
|
||||
next_marker = script_text.find("cateinfo = new Array(", start)
|
||||
if next_marker == -1:
|
||||
next_marker = script_text.find("cateinfo=new Array(", start)
|
||||
|
||||
if next_marker != -1:
|
||||
end = script_text.rfind(");", start, next_marker)
|
||||
else:
|
||||
end = script_text.find(");", start)
|
||||
|
||||
if end == -1 or end <= start:
|
||||
return []
|
||||
|
||||
raw = "[" + script_text[start:end] + "]"
|
||||
try:
|
||||
data = ast.literal_eval(raw)
|
||||
except Exception:
|
||||
return []
|
||||
return data if isinstance(data, list) else []
|
||||
|
||||
def discover_cities(self) -> List[CityTarget]:
|
||||
script_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyer/")
|
||||
rows = self._extract_spc_location(script_text)
|
||||
|
||||
targets: List[CityTarget] = []
|
||||
seen: Set[Tuple[int, int]] = set()
|
||||
|
||||
for province in rows:
|
||||
if not isinstance(province, list) or len(province) < 3:
|
||||
continue
|
||||
try:
|
||||
province_id = int(province[0])
|
||||
except Exception:
|
||||
continue
|
||||
province_name = str(province[1] or "").strip()
|
||||
city_rows = province[2] if isinstance(province[2], list) else []
|
||||
|
||||
for city in city_rows:
|
||||
if not isinstance(city, list) or len(city) < 2:
|
||||
continue
|
||||
try:
|
||||
city_id = int(city[0])
|
||||
except Exception:
|
||||
continue
|
||||
city_name = str(city[1] or "").strip()
|
||||
if city_id <= 0 or not city_name:
|
||||
continue
|
||||
|
||||
key = (province_id, city_id)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
|
||||
targets.append(
|
||||
CityTarget(
|
||||
province_id=province_id,
|
||||
province_name=province_name,
|
||||
city_id=city_id,
|
||||
city_name=city_name,
|
||||
)
|
||||
)
|
||||
return targets
|
||||
|
||||
def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[Dict], int]:
|
||||
payload = {
|
||||
"pid": str(target.province_id),
|
||||
"cid": str(target.city_id),
|
||||
"page": str(page),
|
||||
}
|
||||
text = self._post_text(
|
||||
LIST_API_URL,
|
||||
data=payload,
|
||||
referer=SITE_BASE + "/findlawyer/",
|
||||
)
|
||||
data = json.loads((text or "").strip().lstrip("\ufeff") or "{}")
|
||||
items = data.get("lawyerList") or data.get("queryLawyerList") or []
|
||||
if not isinstance(items, list):
|
||||
items = []
|
||||
|
||||
page_count = 0
|
||||
try:
|
||||
page_count = int((data.get("lawyerItems") or {}).get("pageCount") or 0)
|
||||
except Exception:
|
||||
page_count = 0
|
||||
return items, page_count
|
||||
|
||||
def parse_detail(self, detail_url: str) -> Dict:
|
||||
contact_url = detail_url.rstrip("/") + "/lawyer_contact.aspx"
|
||||
html = self._get_text(contact_url, referer=detail_url)
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
info_list = soup.find("ul", class_="information-list")
|
||||
if not info_list:
|
||||
return None
|
||||
|
||||
phone = ""
|
||||
law_firm = ""
|
||||
for li in info_list.find_all("li"):
|
||||
text = li.get_text(strip=True)
|
||||
if "手机号" in text:
|
||||
cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
|
||||
match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
|
||||
if match:
|
||||
phone = match.group(0)
|
||||
if "执业单位" in text:
|
||||
law_firm = text.replace("执业单位", "").strip()
|
||||
full_text = soup.get_text(" ", strip=True)
|
||||
|
||||
name = ""
|
||||
breadcrumb = soup.find("div", class_="weizhi")
|
||||
if breadcrumb:
|
||||
links = breadcrumb.find_all("a")
|
||||
if len(links) > 2:
|
||||
name = links[2].get_text(strip=True)
|
||||
law_firm = ""
|
||||
phone = ""
|
||||
email = ""
|
||||
address = ""
|
||||
license_no = ""
|
||||
practice_years: Optional[int] = None
|
||||
|
||||
phone = phone.replace('-', '').strip()
|
||||
if not phone or not re.fullmatch(r"1\d{10}", phone):
|
||||
print(" 无手机号,跳过")
|
||||
name_tag = soup.select_one(".logo-box .title b")
|
||||
if name_tag:
|
||||
name = name_tag.get_text(strip=True).replace("律师", "").strip()
|
||||
if not name and soup.title:
|
||||
match = re.search(r"([^\s,,。_]+?)律师", soup.title.get_text(" ", strip=True))
|
||||
if match:
|
||||
name = match.group(1).strip()
|
||||
|
||||
phone_candidates = [
|
||||
soup.select_one(".logo-box .r-bar .tel").get_text(" ", strip=True)
|
||||
if soup.select_one(".logo-box .r-bar .tel")
|
||||
else "",
|
||||
soup.select_one(".lawyer-show ul.info").get_text(" ", strip=True)
|
||||
if soup.select_one(".lawyer-show ul.info")
|
||||
else "",
|
||||
full_text,
|
||||
]
|
||||
for candidate in phone_candidates:
|
||||
phone = normalize_phone(candidate)
|
||||
if phone:
|
||||
break
|
||||
|
||||
for li in soup.select(".lawyer-show ul.info li"):
|
||||
li_text = li.get_text(" ", strip=True)
|
||||
if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
|
||||
law_firm = li_text
|
||||
|
||||
if not law_firm:
|
||||
match = re.search(r'"affiliation":\{"@type":"Organization","name":"([^"]+)"', html)
|
||||
if match:
|
||||
law_firm = match.group(1).strip()
|
||||
|
||||
match = re.search(r'"identifier":"([^"]+)"', html)
|
||||
if match:
|
||||
license_no = match.group(1).strip()
|
||||
|
||||
match = re.search(r'"streetAddress":"([^"]+)"', html)
|
||||
if match:
|
||||
address = match.group(1).strip()
|
||||
|
||||
email_match = EMAIL_RE.search(html)
|
||||
if email_match:
|
||||
email = email_match.group(0).strip()
|
||||
|
||||
year_match = YEAR_RE.search(full_text)
|
||||
if year_match:
|
||||
try:
|
||||
practice_years = int(year_match.group(1))
|
||||
except Exception:
|
||||
practice_years = None
|
||||
|
||||
specialties = [node.get_text(strip=True) for node in soup.select(".tag-h38 span")]
|
||||
specialties = [x for x in specialties if x]
|
||||
|
||||
return {
|
||||
"name": name,
|
||||
"law_firm": law_firm,
|
||||
"phone": phone,
|
||||
"email": email,
|
||||
"address": address,
|
||||
"license_no": license_no,
|
||||
"practice_years": practice_years,
|
||||
"specialties": specialties,
|
||||
"detail_url": detail_url,
|
||||
"contact_url": contact_url,
|
||||
}
|
||||
|
||||
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
|
||||
seen_details: Set[str] = set()
|
||||
|
||||
for page in range(1, self.max_pages + 1):
|
||||
try:
|
||||
items, page_count = self.fetch_list_page(target, page)
|
||||
except Exception as exc:
|
||||
print(f"[list] 失败 pid={target.province_id} cid={target.city_id} p{page}: {exc}")
|
||||
break
|
||||
|
||||
if not items:
|
||||
break
|
||||
|
||||
for item in items:
|
||||
detail_url = str(item.get("lawyerUrl") or "").strip()
|
||||
if not detail_url:
|
||||
continue
|
||||
if detail_url.startswith("//"):
|
||||
detail_url = "https:" + detail_url
|
||||
if not detail_url.startswith("http"):
|
||||
detail_url = urljoin(SITE_BASE, detail_url)
|
||||
|
||||
if detail_url in seen_details:
|
||||
continue
|
||||
seen_details.add(detail_url)
|
||||
|
||||
try:
|
||||
detail = self.parse_detail(detail_url)
|
||||
except Exception as exc:
|
||||
print(f"[detail] 失败 {detail_url}: {exc}")
|
||||
continue
|
||||
|
||||
now = int(time.time())
|
||||
uid = str(item.get("lawyerId") or item.get("globalUserId") or detail_url)
|
||||
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
|
||||
|
||||
list_name = str(item.get("name") or "").replace("律师", "").strip()
|
||||
category_text = str(item.get("categoryNames") or "").strip()
|
||||
category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()]
|
||||
|
||||
yield {
|
||||
"record_id": record_id,
|
||||
"collected_at": now,
|
||||
"source": {
|
||||
"site": SITE_NAME,
|
||||
"province_id": target.province_id,
|
||||
"province": target.province_name,
|
||||
"city_id": target.city_id,
|
||||
"city": target.city_name,
|
||||
"page": page,
|
||||
"detail_url": detail_url,
|
||||
"contact_url": detail.get("contact_url", ""),
|
||||
},
|
||||
"list_snapshot": {
|
||||
"lawyer_id": item.get("lawyerId"),
|
||||
"name": list_name,
|
||||
"category_names": category_arr,
|
||||
"help_count": strip_html_tags(str(item.get("helpCount") or "")),
|
||||
"comment_score": strip_html_tags(str(item.get("commentScore") or "")),
|
||||
"response_time": str(item.get("responseTime") or "").strip(),
|
||||
"year": item.get("year"),
|
||||
"is_adv": bool(item.get("isAdv")),
|
||||
},
|
||||
"profile": {
|
||||
"name": detail.get("name") or list_name,
|
||||
"law_firm": detail.get("law_firm") or "",
|
||||
"phone": detail.get("phone") or "",
|
||||
"email": detail.get("email") or "",
|
||||
"address": detail.get("address") or "",
|
||||
"license_no": detail.get("license_no") or "",
|
||||
"practice_years": detail.get("practice_years"),
|
||||
"specialties": detail.get("specialties") or category_arr,
|
||||
},
|
||||
"raw": item,
|
||||
}
|
||||
|
||||
if self.sleep_seconds:
|
||||
time.sleep(self.sleep_seconds)
|
||||
|
||||
if page_count > 0 and page >= page_count:
|
||||
break
|
||||
|
||||
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
|
||||
source = record.get("source", {}) or {}
|
||||
profile = record.get("profile", {}) or {}
|
||||
|
||||
phone = normalize_phone(profile.get("phone", ""))
|
||||
if not phone:
|
||||
return None
|
||||
|
||||
avatar_url, site_time = self._extract_avatar_and_time(soup)
|
||||
data = {
|
||||
"phone": phone,
|
||||
province = (source.get("province") or "").strip()
|
||||
city = (source.get("city") or province).strip()
|
||||
return {
|
||||
"name": (profile.get("name") or "").strip(),
|
||||
"law_firm": (profile.get("law_firm") or "").strip(),
|
||||
"province": province,
|
||||
"city": city,
|
||||
"law_firm": law_firm,
|
||||
"url": contact_url,
|
||||
"avatar_url": avatar_url,
|
||||
"create_time": int(time.time()),
|
||||
"site_time": site_time,
|
||||
"domain": DOMAIN,
|
||||
"name": name,
|
||||
"params": json.dumps({"source": url}, ensure_ascii=False)
|
||||
"phone": phone,
|
||||
"url": (source.get("contact_url") or source.get("detail_url") or "").strip(),
|
||||
"domain": LEGACY_DOMAIN,
|
||||
"create_time": int(record.get("collected_at") or time.time()),
|
||||
"params": json.dumps(record, ensure_ascii=False),
|
||||
}
|
||||
if existing_id:
|
||||
update_data = {
|
||||
"avatar_url": avatar_url,
|
||||
"site_time": site_time,
|
||||
}
|
||||
if name:
|
||||
update_data["name"] = name
|
||||
if law_firm:
|
||||
update_data["law_firm"] = law_firm
|
||||
if province:
|
||||
update_data["province"] = province
|
||||
if city:
|
||||
update_data["city"] = city
|
||||
if phone:
|
||||
update_data["phone"] = phone
|
||||
update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
|
||||
try:
|
||||
self.db.update_data("lawyer", update_data, f"id={existing_id}")
|
||||
print(" -- 已存在,已补全头像/时间")
|
||||
except Exception as exc:
|
||||
print(f" 更新失败: {exc}")
|
||||
return None
|
||||
# 若手机号已存在,则更新头像/时间,不再插入新记录
|
||||
existing_phone = self.db.select_data(
|
||||
"lawyer",
|
||||
"id, avatar_url, url",
|
||||
f"domain='{DOMAIN}' AND phone='{phone}'"
|
||||
)
|
||||
if existing_phone:
|
||||
existing_row = existing_phone[0]
|
||||
avatar = (existing_row.get("avatar_url") or "").strip()
|
||||
if avatar:
|
||||
print(" -- 已存在手机号且头像已补全,跳过")
|
||||
return None
|
||||
update_data = {
|
||||
"avatar_url": avatar_url,
|
||||
"site_time": site_time,
|
||||
}
|
||||
if name:
|
||||
update_data["name"] = name
|
||||
if law_firm:
|
||||
update_data["law_firm"] = law_firm
|
||||
if province:
|
||||
update_data["province"] = province
|
||||
if city:
|
||||
update_data["city"] = city
|
||||
if phone:
|
||||
update_data["phone"] = phone
|
||||
if not existing_row.get("url"):
|
||||
update_data["url"] = contact_url
|
||||
update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
|
||||
try:
|
||||
self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
|
||||
print(" -- 已存在手机号,已补全头像/时间")
|
||||
except Exception as exc:
|
||||
print(f" 更新失败: {exc}")
|
||||
return None
|
||||
return data
|
||||
|
||||
def _extract_avatar_and_time(self, soup: BeautifulSoup) -> (str, Optional[int]):
|
||||
avatar_url = ""
|
||||
site_time = None
|
||||
img_tag = soup.select_one(
|
||||
"div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
|
||||
)
|
||||
if img_tag:
|
||||
src = (img_tag.get("src") or "").strip()
|
||||
if src:
|
||||
if src.startswith("//"):
|
||||
avatar_url = f"https:{src}"
|
||||
else:
|
||||
avatar_url = src
|
||||
match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
|
||||
if match:
|
||||
site_time = int(f"{match.group(1)}{match.group(2)}")
|
||||
else:
|
||||
match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
|
||||
if match:
|
||||
site_time = int(f"{match.group(1)}{match.group(2)}")
|
||||
return avatar_url, site_time
|
||||
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
|
||||
if not self.db or not phones:
|
||||
return set()
|
||||
|
||||
def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
resp = self.client.get_text(url, timeout=15, verify=False)
|
||||
status_code = resp.status_code
|
||||
text = resp.text
|
||||
if status_code == 403:
|
||||
if attempt < max_retries - 1:
|
||||
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||
print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||
self._refresh_session()
|
||||
time.sleep(wait_time)
|
||||
continue
|
||||
print(" 请求失败: 403 Forbidden")
|
||||
return None
|
||||
if status_code >= 400:
|
||||
raise RequestClientError(f"{status_code} Error")
|
||||
return text
|
||||
except RequestClientError as exc:
|
||||
print(f" 请求失败: {exc}")
|
||||
return None
|
||||
return None
|
||||
deduped = sorted({p for p in phones if p})
|
||||
if not deduped:
|
||||
return set()
|
||||
|
||||
def run(self):
|
||||
print("启动华律网采集...")
|
||||
if not self.areas:
|
||||
print("无城市数据")
|
||||
return
|
||||
existing: Set[str] = set()
|
||||
cur = self.db.db.cursor()
|
||||
try:
|
||||
chunk_size = 500
|
||||
for i in range(0, len(deduped), chunk_size):
|
||||
chunk = deduped[i:i + chunk_size]
|
||||
placeholders = ",".join(["%s"] * len(chunk))
|
||||
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
||||
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
|
||||
for row in cur.fetchall():
|
||||
existing.add(row[0])
|
||||
finally:
|
||||
cur.close()
|
||||
|
||||
for city_code, city_info in self.areas.items():
|
||||
province_code = city_info.get("province_code")
|
||||
if not province_code:
|
||||
return existing
|
||||
|
||||
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
|
||||
if not self.db:
|
||||
return 0, 0
|
||||
|
||||
rows: List[Dict[str, str]] = []
|
||||
for record in records:
|
||||
row = self._to_legacy_lawyer_row(record)
|
||||
if row:
|
||||
rows.append(row)
|
||||
if not rows:
|
||||
return 0, 0
|
||||
|
||||
existing = self._existing_phones_in_db([row["phone"] for row in rows])
|
||||
inserted = 0
|
||||
skipped = 0
|
||||
|
||||
for row in rows:
|
||||
phone = row.get("phone", "")
|
||||
if not phone or phone in existing:
|
||||
skipped += 1
|
||||
continue
|
||||
province_name = city_info.get("province", "")
|
||||
city_name = city_info.get("name", "")
|
||||
print(f"采集 {province_name}-{city_name}")
|
||||
try:
|
||||
self.db.insert_data("lawyer", row)
|
||||
existing.add(phone)
|
||||
inserted += 1
|
||||
except Exception as exc:
|
||||
skipped += 1
|
||||
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
|
||||
|
||||
page = 1
|
||||
while True:
|
||||
payload = {"pid": province_code, "cid": city_code, "page": str(page)}
|
||||
data = self._post(payload)
|
||||
if not data or not data.get("lawyerList"):
|
||||
break
|
||||
return inserted, skipped
|
||||
|
||||
for item in data["lawyerList"]:
|
||||
result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
|
||||
if not result:
|
||||
def crawl(
|
||||
self,
|
||||
output_path: str,
|
||||
max_cities: int = 0,
|
||||
city_filter: Optional[str] = None,
|
||||
) -> None:
|
||||
cities = self.discover_cities()
|
||||
print(f"[discover] 共发现城市 {len(cities)} 个")
|
||||
|
||||
if city_filter:
|
||||
key = city_filter.strip().lower()
|
||||
cities = [
|
||||
c for c in cities
|
||||
if key in c.city_name.lower() or key in str(c.city_id).lower()
|
||||
]
|
||||
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
|
||||
|
||||
if max_cities > 0:
|
||||
cities = cities[:max_cities]
|
||||
print(f"[discover] 截断城市数 {len(cities)}")
|
||||
|
||||
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
|
||||
|
||||
seen_ids: Set[str] = set()
|
||||
if os.path.exists(output_path):
|
||||
with open(output_path, "r", encoding="utf-8") as old_file:
|
||||
for line in old_file:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
self.db.insert_data("lawyer", result)
|
||||
print(f" -> 新增: {result['name']} ({result['phone']})")
|
||||
except Exception as exc:
|
||||
print(f" 插入失败: {exc}")
|
||||
time.sleep(1)
|
||||
item = json.loads(line)
|
||||
except Exception:
|
||||
continue
|
||||
rid = item.get("record_id")
|
||||
if rid:
|
||||
seen_ids.add(rid)
|
||||
print(f"[resume] 已有记录 {len(seen_ids)} 条")
|
||||
|
||||
page_count = data.get("lawyerItems", {}).get("pageCount", page)
|
||||
if page >= page_count:
|
||||
break
|
||||
page += 1
|
||||
time.sleep(2)
|
||||
total_new_json = 0
|
||||
total_new_db = 0
|
||||
total_skip_db = 0
|
||||
|
||||
time.sleep(1)
|
||||
print("华律网采集完成")
|
||||
with open(output_path, "a", encoding="utf-8") as out:
|
||||
for idx, target in enumerate(cities, start=1):
|
||||
print(
|
||||
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
|
||||
f"(pid={target.province_id}, cid={target.city_id})"
|
||||
)
|
||||
city_records = list(self.crawl_city(target))
|
||||
|
||||
city_new_json = 0
|
||||
for record in city_records:
|
||||
rid = record["record_id"]
|
||||
if rid in seen_ids:
|
||||
continue
|
||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
||||
seen_ids.add(rid)
|
||||
city_new_json += 1
|
||||
total_new_json += 1
|
||||
|
||||
city_new_db, city_skip_db = self._write_records_to_db(city_records)
|
||||
total_new_db += city_new_db
|
||||
total_skip_db += city_skip_db
|
||||
|
||||
print(
|
||||
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
|
||||
f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
|
||||
)
|
||||
|
||||
print(
|
||||
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
|
||||
f"DB跳过{total_skip_db}条, 输出: {output_path}"
|
||||
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build and parse the CLI options for the Hualv crawler."""
    parser = argparse.ArgumentParser(description="华律网全新采集脚本(站点数据直采)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/hualv_records_all.jsonl",
        help="输出 jsonl 文件路径",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="最多采集多少个城市,0 表示不限",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="每个城市最多采集多少页",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="按城市名称或城市编码过滤,如 beijing / 110100",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.15,
        help="详情页请求间隔秒数",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="直连模式,不使用 proxy_settings.json 代理",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="只输出 JSONL,不写入数据库",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """CLI entry point: build a crawler (with or without a DB) and run it."""
    args = parse_args()

    if args.no_db:
        # JSONL-only mode: no Db handle, records go to the output file only.
        crawler = HualvCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=None,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        return

    with Db() as db:
        crawler = HualvCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user