chore: initialize lawyers crawler project

2026-03-02 00:19:48 +08:00
commit 03847a4b8e
17 changed files with 1928 additions and 0 deletions
@@ -0,0 +1,268 @@
+import json
+import os
+import sys
+import time
+import random
+from typing import Dict, Optional
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(current_dir)
+request_dir = os.path.join(project_root, "request")
+if request_dir not in sys.path:
+    sys.path.insert(0, request_dir)
+if project_root not in sys.path:
+    sys.path.append(project_root)
+
+import urllib3
+from bs4 import BeautifulSoup
+from request.requests_client import (
+    RequestClientError,
+    RequestConnectTimeout,
+    RequestConnectionError,
+    RequestTimeout,
+    RequestsClient,
+)
+
+# 禁用 SSL 警告
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+from Db import Db
+from utils.rate_limiter import wait_for_request
+
+# Value written to the `domain` column of lawyer rows and used in the duplicate check.
+DOMAIN = "大律师"
+# Listing-page URL pattern, filled with an area's pinyin value and a page number.
+LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
+# Module-level flag set by DlsSpider._proxy_test so the proxy check runs only once.
+_PROXY_TESTED = False
+
+
+class DlsSpider:
+    def __init__(self, db_connection):
+        """Store the database handle, build the HTTP client and load the areas to crawl.
+
+        Args:
+            db_connection: Database wrapper; this class calls its ``select_data``,
+                ``is_data_exist`` and ``insert_data`` methods.
+        """
+        self.db = db_connection
+        self.client = self._build_session()
+        self.areas = self._load_areas()
+
+    def _build_session(self) -> RequestsClient:
+        """Create the HTTP client shared by all requests of this spider.
+
+        The client carries iPhone-Safari style default headers and is given
+        retry settings (total 3, backoff factor 1, status codes
+        429/500/502/503/504, methods GET and POST). Once built, the optional
+        proxy self-test is invoked with the client's proxy mapping.
+
+        Returns:
+            The configured ``RequestsClient`` instance.
+        """
+        default_headers = {
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
+            "Host": "m.maxlaw.cn",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "close",
+        }
+        retry_options = {
+            "retry_total": 3,
+            "retry_backoff_factor": 1,
+            "retry_status_forcelist": (429, 500, 502, 503, 504),
+            "retry_allowed_methods": ("GET", "POST"),
+        }
+        session = RequestsClient(headers=default_headers, **retry_options)
+        self._proxy_test(session, session.proxies or None)
+        return session
+
+    def _refresh_session(self) -> None:
+        """Refresh the shared client, then give the proxy self-test a chance to run."""
+        session = self.client
+        session.refresh()
+        self._proxy_test(session, session.proxies or None)
+
+    def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None:
+        """Run a one-off, best-effort connectivity check through the configured proxy.
+
+        The check only runs when the ``PROXY_TEST`` environment variable is set
+        and is attempted at most once per process (guarded by the module-level
+        ``_PROXY_TESTED`` flag). Failures are printed, never raised.
+
+        Args:
+            client: Client used to fetch the test URL.
+            proxies: Proxy mapping of the client; a falsy value skips the request.
+        """
+        global _PROXY_TESTED
+        if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
+            return
+        _PROXY_TESTED = True
+        if not proxies:
+            print("[proxy] test skipped: no proxy configured")
+            return
+        test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
+        raw_timeout = os.getenv("PROXY_TEST_TIMEOUT", "10")
+        try:
+            timeout = float(raw_timeout)
+        except ValueError:
+            # A malformed override must not abort spider construction: the
+            # whole check is best-effort, so fall back to the default.
+            print(f"[proxy] invalid PROXY_TEST_TIMEOUT {raw_timeout!r}, using 10s")
+            timeout = 10.0
+        try:
+            resp = client.get_text(
+                test_url,
+                timeout=timeout,
+                headers={"Connection": "close"},
+            )
+            print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
+        except Exception as exc:
+            print(f"[proxy] test failed: {exc}")
+
+    def _load_areas(self):
+        """Fetch the areas to crawl from the ``area_new`` table.
+
+        Returns:
+            The rows (province, city, pinyin) whose domain is ``maxlaw``, or an
+            empty list when the query returns nothing or raises.
+        """
+        try:
+            rows = self.db.select_data(
+                "area_new",
+                "province, city, pinyin",
+                "domain='maxlaw'",
+            )
+        except Exception as exc:
+            print(f"加载地区失败: {exc}")
+            return []
+        return rows or []
+
+    def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
+        """Send a GET request with application-level retries.
+
+        ``wait_for_request()`` is called once before the first attempt. A 403
+        answer refreshes the session and retries after ``2 ** attempt`` seconds
+        plus a random 0.3-1.0s; connect timeouts, timeouts and connection errors
+        retry after ``2 ** attempt`` seconds (1s, 2s, 4s, ...). Any other status
+        of 400 or above, or any other ``RequestClientError``, gives up at once.
+
+        Args:
+            url: Address to fetch.
+            max_retries: Maximum number of attempts.
+            headers: Extra headers for this request only.
+
+        Returns:
+            The response text, or ``None`` when the request ultimately failed.
+        """
+        wait_for_request()
+
+        final_attempt = max_retries - 1
+        for attempt in range(max_retries):
+            retry_left = attempt < final_attempt
+            try:
+                # Separate connect/read timeouts: (connect_timeout, read_timeout).
+                resp = self.client.get_text(
+                    url,
+                    timeout=(10, 30),
+                    verify=False,
+                    headers=headers,
+                )
+                status_code = resp.status_code
+                content = resp.text
+                if status_code == 403:
+                    if not retry_left:
+                        print(f"请求失败 {url}: 403 Forbidden")
+                        return None
+                    wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                    print(f"403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    self._refresh_session()
+                    time.sleep(wait_time)
+                    continue
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error: {url}")
+                return content
+            except (RequestConnectTimeout, RequestTimeout, RequestConnectionError) as exc:
+                # Pick the log label in the same precedence the separate
+                # handlers would have matched.
+                if isinstance(exc, RequestConnectTimeout):
+                    label = "连接超时"
+                elif isinstance(exc, RequestTimeout):
+                    label = "请求超时"
+                else:
+                    label = "连接错误"
+                if not retry_left:
+                    print(f"{label}，已达到最大重试次数 {url}: {exc}")
+                    return None
+                wait_time = 2 ** attempt
+                print(f"{label}，{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                time.sleep(wait_time)
+            except RequestClientError as exc:
+                print(f"请求失败 {url}: {exc}")
+                return None
+
+        return None
+
+    def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
+        """Parse one listing page and store every lawyer not yet in the database.
+
+        Each ``div.lstx`` card is followed to its detail page; a record is
+        inserted when no ``lawyer`` row with the same phone and domain exists.
+
+        Args:
+            html: Listing-page markup.
+            province: Province name of the area being crawled.
+            city: City name of the area being crawled.
+            list_url: URL of the listing page, forwarded as the detail referer.
+
+        Returns:
+            Number of rows inserted from this page (0 when there are no cards).
+        """
+        soup = BeautifulSoup(html, "html.parser")
+        cards = soup.find_all("div", class_="lstx")
+        if not cards:
+            return 0
+
+        inserted = 0
+        for card in cards:
+            link = card.find("a")
+            if not link or not link.get("href"):
+                continue
+            detail = self._parse_detail(link["href"], province, city, list_url)
+            if not detail:
+                continue
+            phone = detail.get("phone")
+            if not phone:
+                continue
+            # The phone text is scraped from remote HTML and interpolated into
+            # the SQL condition below, so refuse anything that could break out
+            # of the quoted literal.
+            if "'" in phone or "\\" in phone:
+                print(f"  号码包含非法字符，跳过: {phone!r}")
+                continue
+            condition = f"phone='{phone}' and domain='{DOMAIN}'"
+            if self.db.is_data_exist("lawyer", condition):
+                print(f"  -- 已存在: {detail['name']} ({phone})")
+                time.sleep(0.3)
+                continue
+            try:
+                self.db.insert_data("lawyer", detail)
+                inserted += 1
+                print(f"  -> 新增: {detail['name']} ({phone})")
+            except Exception as exc:
+                print(f"  插入失败: {exc}")
+            # Pause after every insert attempt (1s + 0.3s as before).
+            time.sleep(1)
+            time.sleep(0.3)
+        # Extra pause once the page is done, to lower the risk of being blocked.
+        time.sleep(0.6)
+        return inserted
+
+    def _detail_headers(self, referer: str) -> Dict[str, str]:
+        """Build the per-request headers sent when fetching a detail page.
+
+        Args:
+            referer: Value of the ``Referer`` header.
+
+        Returns:
+            A new header mapping: ``Referer`` followed by fixed accept,
+            no-cache and upgrade headers.
+        """
+        extra: Dict[str, str] = {"Referer": referer}
+        extra.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Pragma": "no-cache",
+            "Upgrade-Insecure-Requests": "1",
+        })
+        return extra
+
+    def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
+        """Fetch one lawyer detail page and build the row to insert.
+
+        Args:
+            path: ``href`` of the lawyer card on the listing page.
+            province: Province name of the area being crawled.
+            city: City name of the area; the province is used when it is empty.
+            list_url: Listing-page URL, sent as the ``Referer`` of the request.
+
+        Returns:
+            The lawyer record, or ``None`` when the link is unusable, the page
+            cannot be fetched, or the name or phone number is missing.
+        """
+        # Function-scope import: urllib.parse is not imported at module level.
+        from urllib.parse import urljoin
+
+        # Same result as plain concatenation for root-relative hrefs, but
+        # absolute and protocol-relative hrefs no longer yield a broken URL.
+        url = urljoin("https://m.maxlaw.cn/", path)
+        if not url.startswith(("http://", "https://")):
+            print(f"  跳过无效链接: {path}")
+            return None
+        print(f"  详情: {url}")
+        html = self._get(url, headers=self._detail_headers(list_url))
+        if not html:
+            return None
+
+        soup = BeautifulSoup(html, "html.parser")
+        name_tag = soup.find("h2", class_="lawyerName")
+        law_firm_tag = soup.find("p", class_="law-firm")
+        contact_list = soup.find("ul", class_="contact-content")
+
+        name = name_tag.get_text(strip=True) if name_tag else ""
+        law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
+        phone = ""
+
+        if contact_list:
+            items = contact_list.find_all("li")
+            # The phone is read from the <p> inside the third <li>.
+            phone_tag = items[2].find("p") if len(items) > 2 else None
+            if phone_tag:
+                phone = phone_tag.get_text(strip=True)
+                # Keep only the text before the site's trailing notice.
+                phone = phone.split("咨询请说明来自大律师网")[0].strip()
+
+        phone = phone.replace('-', '').strip()
+        if not name or not phone:
+            print("    信息不完整，跳过")
+            return None
+
+        safe_city = city if city else province
+        return {
+            "name": name,
+            "law_firm": law_firm,
+            "province": province,
+            "city": safe_city,
+            "phone": phone,
+            "url": url,
+            "domain": DOMAIN,
+            "create_time": int(time.time()),
+            "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
+        }
+
+    def run(self):
+        """Crawl every loaded area page by page.
+
+        For each area with a pinyin value, listing pages are requested starting
+        at page 1 until a page cannot be fetched or yields no inserted rows.
+        """
+        print("启动大律师采集...")
+        if not self.areas:
+            print("无地区数据")
+            return
+
+        for area in self.areas:
+            pinyin = area.get("pinyin")
+            if not pinyin:
+                continue
+            province, city = area.get("province", ""), area.get("city", "")
+            page = 0
+            while True:
+                page += 1
+                list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
+                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
+                html = self._get(list_url)
+                if not html:
+                    break
+                # NOTE(review): a page whose lawyers are all already stored also
+                # returns 0 and ends this area - confirm that is intended.
+                if self._parse_list(html, province, city, list_url) == 0:
+                    break
+        print("大律师采集完成")
+
+
+if __name__ == "__main__":
+    # Script entry point: open the database and run the crawl inside its context.
+    with Db() as connection:
+        DlsSpider(connection).run()
@@ -0,0 +1,209 @@
+import json
+import os
+import sys
+import time
+import random
+from typing import Dict, List, Set, Optional
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(current_dir)
+request_dir = os.path.join(project_root, "request")
+if request_dir not in sys.path:
+    sys.path.insert(0, request_dir)
+if project_root not in sys.path:
+    sys.path.append(project_root)
+
+from request.requests_client import RequestClientError, RequestSSLError, RequestsClient
+from Db import Db
+
+# Value written to the `domain` column of lawyer rows and used when looking up stored phones.
+DOMAIN = "找法网"
+# List endpoint URL pattern, filled with a city's pinyin value and a page number.
+LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
+
+
+class FindlawSpider:
+    def __init__(self, db_connection):
+        """Store the database handle, build the HTTP client and load the cities to crawl.
+
+        Args:
+            db_connection: Database wrapper; this class calls its ``select_data``
+                and ``insert_data`` methods and uses its ``db`` attribute to
+                open a cursor.
+        """
+        self.db = db_connection
+        self.client = self._build_session()
+        self.cities = self._load_cities()
+
+    def _build_session(self) -> RequestsClient:
+        """Create the HTTP client with mobile-browser, AJAX-style default headers.
+
+        Returns:
+            A ``RequestsClient`` built with those headers.
+        """
+        mobile_user_agent = (
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+            "Mobile/15E148 Safari/604.1"
+        )
+        default_headers = {
+            "User-Agent": mobile_user_agent,
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "X-Requested-With": "XMLHttpRequest",
+            "Connection": "close",
+        }
+        return RequestsClient(headers=default_headers)
+
+    def _refresh_session(self) -> None:
+        """Ask the shared client to refresh itself (called after a 403 answer)."""
+        session = self.client
+        session.refresh()
+
+    def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
+        """Send a GET request, retrying on 403 and falling back to unverified TLS.
+
+        A 403 answer refreshes the session and retries after ``2 ** attempt``
+        seconds plus a random 0.3-1.0s. An SSL error while ``verify`` is true
+        restarts the request with certificate verification disabled.
+
+        Args:
+            url: Address to fetch.
+            referer: Value of the ``Referer`` header.
+            verify: Whether TLS certificates are verified.
+            max_retries: Maximum number of attempts.
+
+        Returns:
+            The response text, or ``None`` when the request failed.
+        """
+        request_headers = {"Referer": referer}
+        final_attempt = max_retries - 1
+        for attempt in range(max_retries):
+            try:
+                resp = self.client.get_text(url, timeout=15, verify=verify, headers=request_headers)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt >= final_attempt:
+                        print(f"请求失败 {url}: 403 Forbidden")
+                        return None
+                    wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                    print(f"403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
+                    self._refresh_session()
+                    time.sleep(wait_time)
+                    continue
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error: {url}")
+                return text
+            except RequestSSLError:
+                if not verify:
+                    print(f"SSL错误 {url}")
+                    return None
+                # Retry from scratch without certificate verification.
+                return self._get(url, referer, verify=False, max_retries=max_retries)
+            except RequestClientError as exc:
+                print(f"请求失败 {url}: {exc}")
+                return None
+        return None
+
+    def _existing_phones(self, phones: List[str]) -> Set[str]:
+        """Return the subset of ``phones`` already stored for this site.
+
+        The lookup runs in chunks of 500 values using a parameterized
+        ``IN (...)`` query on the ``lawyer`` table.
+
+        Args:
+            phones: Phone numbers to look up.
+
+        Returns:
+            The phone numbers found in the table for ``DOMAIN``.
+        """
+        if not phones:
+            return set()
+        # Drop repeated values so each number is sent to the database once.
+        unique_phones = list(dict.fromkeys(phones))
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            chunk_size = 500
+            for i in range(0, len(unique_phones), chunk_size):
+                chunk = unique_phones[i:i + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    # Rows are sequences with a plain cursor but mappings when
+                    # the connection is configured with a dict-style cursor.
+                    existing.add(row["phone"] if isinstance(row, dict) else row[0])
+        finally:
+            cur.close()
+        return existing
+
+    def _load_cities(self):
+        """Load the level-2 ``findlaw`` areas from the first table that has any.
+
+        The tables ``area_new``, ``area2`` and ``area`` are tried in that order.
+        When none yields rows, diagnostics (last error and per-table row
+        counts) are printed.
+
+        Returns:
+            The rows (city, province, pinyin) of the first non-empty table, or
+            an empty list.
+        """
+        condition = "domain='findlaw' AND level=2"
+        tables = ("area_new", "area2", "area")
+        last_error = None
+        for table in tables:
+            try:
+                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
+            except Exception as exc:
+                last_error = exc
+                continue
+            if not rows:
+                continue
+            missing_pinyin = len([r for r in rows if not (r.get("pinyin") or "").strip()])
+            print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
+            return rows
+
+        if last_error:
+            print(f"[找法网] 加载地区数据失败: {last_error}")
+        print("[找法网] 无城市数据（已尝试 area_new/area2/area）")
+        # Best-effort diagnostics: report how many rows match in each table.
+        for table in tables:
+            try:
+                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
+                if isinstance(cnt, (list, tuple)) and cnt:
+                    c = cnt[0].get("cnt")
+                else:
+                    c = 0
+                print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
+            except Exception:
+                pass
+        return []
+
+    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
+        """Download one list page and convert its JSON payload into lawyer records.
+
+        The body may be plain JSON, JSON surrounded by other text, or a JSON
+        string that itself contains JSON; all three forms are decoded. Payloads
+        with an unexpected structure produce an empty result instead of raising.
+
+        Args:
+            url: List endpoint URL.
+            referer: Value of the ``Referer`` header.
+
+        Returns:
+            One record per entry of ``data.lawyer_list``; empty on any failure.
+        """
+        text = self._get(url, referer, verify=True)
+        if not text:
+            return []
+
+        # Some responses carry a BOM or a wrapper script around the JSON body.
+        text = text.strip().lstrip("\ufeff")
+        try:
+            try:
+                data = json.loads(text)
+            except ValueError:
+                json_start = text.find('{')
+                json_end = text.rfind('}')
+                # Also covers a closing brace that precedes the opening one.
+                if json_start == -1 or json_end < json_start:
+                    print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
+                    return []
+                data = json.loads(text[json_start:json_end + 1])
+            if isinstance(data, str):
+                try:
+                    data = json.loads(data)
+                except ValueError:
+                    print(f"解析JSON失败 {url}: 二次解析仍为字符串，开头: {str(data)[:80]!r}")
+                    return []
+        except ValueError as exc:
+            print(f"解析JSON失败 {url}: {exc}")
+            return []
+
+        # Valid JSON is not necessarily an object: guard every level before
+        # calling .get on it.
+        if not isinstance(data, dict):
+            print(f"解析JSON失败 {url}: 返回结构不是对象")
+            return []
+        payload = data.get("data")
+        items = payload.get("lawyer_list") if isinstance(payload, dict) else None
+        if not isinstance(items, list):
+            return []
+
+        parsed = []
+        for item in items:
+            if not isinstance(item, dict):
+                continue
+            area_info = item.get("areaInfo")
+            if not isinstance(area_info, dict):
+                area_info = {}
+            phone = (item.get("mobile") or "").replace("-", "")
+            parsed.append({
+                "name": item.get("username", ""),
+                "law_firm": item.get("lawyer_lawroom", ""),
+                "province": area_info.get("province", ""),
+                "city": area_info.get("city", ""),
+                "phone": phone,
+                "url": url,
+                "domain": DOMAIN,
+                "create_time": int(time.time()),
+                "params": json.dumps(item, ensure_ascii=False)
+            })
+        return parsed
+
+    def run(self):
+        """Crawl every loaded city page by page and insert unknown lawyers.
+
+        For each city with a pinyin value, list pages are fetched starting at
+        page 1 until a page yields no records. Phones already stored, or
+        already inserted earlier on the same page, are skipped.
+        """
+        print("启动找法网采集...")
+        if not self.cities:
+            print("无城市数据")
+            return
+
+        for city in self.cities:
+            pinyin = city.get("pinyin")
+            province = city.get("province", "")
+            city_name = city.get("city", "")
+            if not pinyin:
+                continue
+            print(f"采集 {province}-{city_name}")
+            page = 1
+            while True:
+                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
+                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
+                print(f"  第 {page} 页: {url}")
+                items = self._fetch_page(url, referer)
+                if not items:
+                    break
+
+                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
+                existing = self._existing_phones(phones)
+
+                for entry in items:
+                    phone = entry.get("phone")
+                    if not phone:
+                        continue
+                    if phone in existing:
+                        print(f"    -- 已存在: {entry['name']} ({phone})")
+                        continue
+                    try:
+                        self.db.insert_data("lawyer", entry)
+                        # Remember the phone so a repeat on this page is not
+                        # inserted a second time.
+                        existing.add(phone)
+                        print(f"    -> 新增: {entry['name']} ({phone})")
+                    except Exception as exc:
+                        print(f"    插入失败: {exc}")
+
+                page += 1
+
+        print("找法网采集完成")
+
+
+if __name__ == "__main__":
+    # Script entry point: open the database and run the crawl inside its context.
+    with Db() as connection:
+        FindlawSpider(connection).run()
@@ -0,0 +1,325 @@
+import json
+import os
+import re
+import sys
+import time
+import random
+from typing import Dict, Optional
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(current_dir)
+request_dir = os.path.join(project_root, "request")
+if request_dir not in sys.path:
+    sys.path.insert(0, request_dir)
+if project_root not in sys.path:
+    sys.path.append(project_root)
+
+from bs4 import BeautifulSoup
+from request.requests_client import RequestClientError, RequestsClient
+
+from Db import Db
+from config import HEADERS
+
+# Endpoint that HualvSpider._post sends the pid/cid/page form data to.
+LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
+# Value written to the `domain` column of lawyer rows and used in lookups.
+DOMAIN = "华律"
+
+
+class HualvSpider:
+    def __init__(self, db_connection):
+        """Store the database handle, build the HTTP client and load the city map.
+
+        Args:
+            db_connection: Database wrapper; this class calls its ``select_data``,
+                ``insert_data`` and ``update_data`` methods.
+        """
+        self.db = db_connection
+        self.client = self._build_session()
+        self.areas = self._load_areas()
+
+    def _build_session(self) -> RequestsClient:
+        """Create the HTTP client from the shared ``HEADERS`` plus mobile overrides.
+
+        Returns:
+            A ``RequestsClient`` whose headers are a copy of ``HEADERS`` with the
+            ``User-Agent`` and ``Connection`` entries replaced.
+        """
+        mobile_user_agent = (
+            'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
+            'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
+            'Mobile/15E148 Safari/604.1'
+        )
+        session_headers = HEADERS.copy()
+        session_headers.update({"User-Agent": mobile_user_agent, "Connection": "close"})
+        return RequestsClient(headers=session_headers)
+
+    def _refresh_session(self) -> None:
+        """Ask the shared client to refresh itself (called after a 403 answer)."""
+        session = self.client
+        session.refresh()
+
+    def _load_areas(self):
+        """Build the city map from the first area table that contains cities.
+
+        The tables ``area_new``, ``area2`` and ``area`` are tried in order. For
+        each, level-1 rows (provinces) and level-2 rows (cities) of domain
+        ``66law`` are read; a city is linked to its province through ``pid``.
+
+        Returns:
+            A dict keyed by city code whose values hold ``name``, ``province``
+            and ``province_code``; empty when no table provides cities.
+        """
+        tables = ("area_new", "area2", "area")
+        last_error = None
+        for table in tables:
+            try:
+                provinces = self.db.select_data(
+                    table,
+                    "code, province, pinyin, id",
+                    "domain='66law' AND level=1"
+                ) or []
+                cities = self.db.select_data(
+                    table,
+                    "code, city, province, pid",
+                    "domain='66law' AND level=2"
+                ) or []
+            except Exception as exc:
+                last_error = exc
+                continue
+
+            if not cities:
+                continue
+
+            province_map = {}
+            for row in provinces:
+                province_map[row.get('id')] = {"code": row.get('code'), "name": row.get('province')}
+            city_map = {
+                row.get('code'): {
+                    "name": row.get('city'),
+                    "province": row.get('province'),
+                    # Stays None when the city's pid matches no province row.
+                    "province_code": (province_map.get(row.get('pid'), {}) or {}).get('code'),
+                }
+                for row in cities
+            }
+            print(f"[华律] 城市来源表: {table}, 城市数: {len(cities)}")
+            return city_map
+
+        if last_error:
+            print(f"[华律] 加载地区数据失败: {last_error}")
+        print("[华律] 无城市数据（已尝试 area_new/area2/area）")
+        return {}
+
+    def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
+        """POST form data to ``LIST_URL`` and decode the JSON answer.
+
+        A 403 answer refreshes the session and retries after ``2 ** attempt``
+        seconds plus a random 0.3-1.0s; every other failure gives up at once.
+
+        Args:
+            data: Form fields to send.
+            max_retries: Maximum number of attempts.
+
+        Returns:
+            The decoded JSON value, or ``None`` on request or decoding failure.
+        """
+        final_attempt = max_retries - 1
+        for attempt in range(max_retries):
+            try:
+                resp = self.client.post_text(LIST_URL, data=data, timeout=20, verify=False)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt >= final_attempt:
+                        print("请求失败: 403 Forbidden")
+                        return None
+                    wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                    print(f"403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
+                    self._refresh_session()
+                    time.sleep(wait_time)
+                    continue
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error")
+                try:
+                    decoded = json.loads(text)
+                except ValueError as exc:
+                    print(f"解析JSON失败: {exc}")
+                    return None
+                return decoded
+            except RequestClientError as exc:
+                print(f"请求失败: {exc}")
+                return None
+        return None
+
+    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
+        """Scrape a lawyer's contact page and reconcile it with the ``lawyer`` table.
+
+        Existing rows (matched by contact URL first, then by phone) are updated
+        in place when their avatar is still missing; only lawyers unknown to the
+        table are returned for insertion.
+
+        Args:
+            url: Lawyer URL taken from the list response; the contact page is
+                ``url`` followed by ``lawyer_contact.aspx``.
+            province: Province name stored with the record.
+            city: City name stored with the record.
+
+        Returns:
+            The record to insert, or ``None`` when the link is unusable, the page
+            is skipped or cannot be parsed, no mobile number is found, or an
+            existing row was handled here.
+        """
+        # The URL comes from a remote payload and is interpolated into SQL
+        # below: reject empty values and anything able to escape the quoted
+        # literal.
+        if not url or "'" in url or "\\" in url:
+            print(f"    无效的律师链接，跳过: {url!r}")
+            return None
+
+        contact_url = f"{url}lawyer_contact.aspx"
+        print(f"  详情: {contact_url}")
+        existing = self.db.select_data(
+            "lawyer",
+            "id, avatar_url",
+            f"domain='{DOMAIN}' AND url='{contact_url}'"
+        )
+        existing_id = None
+        if existing:
+            existing_id = existing[0].get("id")
+            if (existing[0].get("avatar_url") or "").strip():
+                print("    -- 已存在且头像已补全，跳过")
+                return None
+
+        html = self._get_detail(contact_url)
+        if not html:
+            return None
+
+        soup = BeautifulSoup(html, "html.parser")
+        info_list = soup.find("ul", class_="information-list")
+        if not info_list:
+            return None
+
+        phone = ""
+        law_firm = ""
+        for li in info_list.find_all("li"):
+            text = li.get_text(strip=True)
+            if "手机号" in text:
+                cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
+                match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
+                if match:
+                    phone = match.group(0)
+            if "执业单位" in text:
+                law_firm = text.replace("执业单位", "").strip()
+
+        name = ""
+        breadcrumb = soup.find("div", class_="weizhi")
+        if breadcrumb:
+            links = breadcrumb.find_all("a")
+            # The name is read from the third breadcrumb link.
+            if len(links) > 2:
+                name = links[2].get_text(strip=True)
+
+        # Also guarantees the value is safe to embed in the phone lookup below.
+        phone = phone.replace('-', '').strip()
+        if not phone or not re.fullmatch(r"1\d{10}", phone):
+            print("    无手机号，跳过")
+            return None
+
+        avatar_url, site_time = self._extract_avatar_and_time(soup)
+        data = {
+            "phone": phone,
+            "province": province,
+            "city": city,
+            "law_firm": law_firm,
+            "url": contact_url,
+            "avatar_url": avatar_url,
+            "create_time": int(time.time()),
+            "site_time": site_time,
+            "domain": DOMAIN,
+            "name": name,
+            "params": json.dumps({"source": url}, ensure_ascii=False)
+        }
+        if existing_id:
+            # Row already known by URL: fill in avatar/time instead of inserting.
+            try:
+                self.db.update_data("lawyer", self._build_update_data(data), f"id={existing_id}")
+                print("    -- 已存在，已补全头像/时间")
+            except Exception as exc:
+                print(f"    更新失败: {exc}")
+            return None
+
+        # A row with the same phone is updated rather than duplicated.
+        existing_phone = self.db.select_data(
+            "lawyer",
+            "id, avatar_url, url",
+            f"domain='{DOMAIN}' AND phone='{phone}'"
+        )
+        if existing_phone:
+            existing_row = existing_phone[0]
+            if (existing_row.get("avatar_url") or "").strip():
+                print("    -- 已存在手机号且头像已补全，跳过")
+                return None
+            update_data = self._build_update_data(data, include_url=not existing_row.get("url"))
+            try:
+                self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
+                print("    -- 已存在手机号，已补全头像/时间")
+            except Exception as exc:
+                print(f"    更新失败: {exc}")
+            return None
+        return data
+
+    def _build_update_data(self, data: Dict, include_url: bool = False) -> Dict:
+        """Select the scraped fields that should overwrite an existing row.
+
+        ``avatar_url``, ``site_time`` and ``params`` are always included; the
+        descriptive fields only when non-empty; ``url`` only on request.
+        """
+        update_data = {
+            "avatar_url": data["avatar_url"],
+            "site_time": data["site_time"],
+        }
+        for field in ("name", "law_firm", "province", "city", "phone"):
+            if data[field]:
+                update_data[field] = data[field]
+        if include_url:
+            update_data["url"] = data["url"]
+        update_data["params"] = data["params"]
+        return update_data
+
+    def _extract_avatar_and_time(self, soup: BeautifulSoup) -> (str, Optional[int]):
+        """Read the avatar URL from the page and derive a year-month number from it.
+
+        The image is looked up in the fixed bottom contact bar. A
+        protocol-relative ``src`` is prefixed with ``https:``. The year-month is
+        taken from a ``/YYYYMM/`` path segment or, failing that, from the first
+        ``YYYYMMDD``-like digit run.
+
+        Args:
+            soup: Parsed contact page.
+
+        Returns:
+            ``(avatar_url, site_time)``; ``("", None)`` when no image source is
+            found, and ``site_time`` is ``None`` when no date pattern matches.
+        """
+        img_tag = soup.select_one(
+            "div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
+        )
+        if not img_tag:
+            return "", None
+        src = (img_tag.get("src") or "").strip()
+        if not src:
+            return "", None
+
+        avatar_url = f"https:{src}" if src.startswith("//") else src
+        match = (
+            re.search(r"/(20\d{2})(\d{2})/", avatar_url)
+            or re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
+        )
+        site_time = int(f"{match.group(1)}{match.group(2)}") if match else None
+        return avatar_url, site_time
+
+    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
+        """GET a detail page without certificate verification, retrying on 403.
+
+        A 403 answer refreshes the session and retries after ``2 ** attempt``
+        seconds plus a random 0.3-1.0s; every other failure gives up at once.
+
+        Args:
+            url: Address to fetch.
+            max_retries: Maximum number of attempts.
+
+        Returns:
+            The response text, or ``None`` when the request failed.
+        """
+        final_attempt = max_retries - 1
+        for attempt in range(max_retries):
+            try:
+                resp = self.client.get_text(url, timeout=15, verify=False)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt >= final_attempt:
+                        print("    请求失败: 403 Forbidden")
+                        return None
+                    wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                    print(f"    403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
+                    self._refresh_session()
+                    time.sleep(wait_time)
+                    continue
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error")
+                return text
+            except RequestClientError as exc:
+                print(f"    请求失败: {exc}")
+                return None
+        return None
+
+    def run(self):
+        """Crawl every city of the area map page by page and insert new lawyers.
+
+        Cities without a province code are skipped. Paging for a city stops
+        when the list response is empty or unusable, or when the reported page
+        count is reached.
+        """
+        print("启动华律网采集...")
+        if not self.areas:
+            print("无城市数据")
+            return
+
+        for city_code, city_info in self.areas.items():
+            province_code = city_info.get("province_code")
+            if not province_code:
+                continue
+            province_name = city_info.get("province", "")
+            city_name = city_info.get("name", "")
+            print(f"采集 {province_name}-{city_name}")
+
+            page = 1
+            while True:
+                payload = {"pid": province_code, "cid": city_code, "page": str(page)}
+                data = self._post(payload)
+                # The decoded JSON may be any value; only a dict is usable here.
+                if not isinstance(data, dict) or not data.get("lawyerList"):
+                    break
+
+                for item in data["lawyerList"]:
+                    lawyer_url = item.get("lawyerUrl") if isinstance(item, dict) else None
+                    if not lawyer_url:
+                        # Without a URL no contact page can be addressed.
+                        continue
+                    result = self._parse_detail(lawyer_url, province_name, city_name)
+                    if not result:
+                        continue
+                    try:
+                        self.db.insert_data("lawyer", result)
+                        print(f"  -> 新增: {result['name']} ({result['phone']})")
+                    except Exception as exc:
+                        print(f"  插入失败: {exc}")
+                    time.sleep(1)
+
+                # A missing or malformed page count is treated as "this is the
+                # last page", matching the default used when the key is absent.
+                page_info = data.get("lawyerItems")
+                raw_count = page_info.get("pageCount", page) if isinstance(page_info, dict) else page
+                try:
+                    page_count = int(raw_count)
+                except (TypeError, ValueError):
+                    page_count = page
+                if page >= page_count:
+                    break
+                page += 1
+                time.sleep(2)
+
+            time.sleep(1)
+        print("华律网采集完成")
+
+
+if __name__ == "__main__":
+    # Script entry point: open the database and run the crawl inside its context.
+    with Db() as connection:
+        HualvSpider(connection).run()
@@ -0,0 +1,278 @@
+import json
+import os
+import re
+import sys
+import time
+import random
+from typing import Dict, Optional, List, Set
+from urllib.parse import urljoin
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+# Make the parent directory of this file's folder (project_root) and its
+# "request" sub-directory importable, so the imports below resolve when this
+# module is executed directly as a script.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(current_dir)
+request_dir = os.path.join(project_root, "request")
+if request_dir not in sys.path:
+    sys.path.insert(0, request_dir)
+if project_root not in sys.path:
+    sys.path.append(project_root)
+
+import urllib3
+from bs4 import BeautifulSoup
+from request.requests_client import RequestClientError, RequestsClient
+
+# Requests in this module are sent with verify=False; silence the resulting InsecureRequestWarning.
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+from Db import Db
+from config import LAWTIME_CONFIG
+
+# List-page URL template, filled with an area's pinyin value and a page number starting at 1.
+LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
+# Base against which the hrefs found on list pages are resolved.
+DETAIL_BASE = "https://m.lawtime.cn"
+# Value written to the "domain" column of every record produced by this spider.
+DOMAIN = "法律快车"
+
+
+class LawtimeSpider:
+    """Crawler that collects lawyer contact records from m.lawtime.cn.
+
+    For every level-2 area loaded from the database it walks the paginated
+    lawyer list, fetches the linked detail pages in a thread pool, and inserts
+    each lawyer whose phone number is not yet stored under ``DOMAIN`` into the
+    ``lawyer`` table. All database access happens on the calling thread; worker
+    threads only download and parse pages.
+    """
+
+    # 11-digit sequence: "1", a digit 3-9, then nine digits (treated as a mobile number).
+    _PHONE_RE = re.compile(r"1[3-9]\d{9}")
+    # Captures the run of non-space characters that directly precedes "律师".
+    _NAME_RE = re.compile(r"(\S+)律师")
+
+    def __init__(self, db_connection):
+        """Store the database wrapper and prepare the HTTP client.
+
+        Args:
+            db_connection: Object exposing ``select_data``, ``insert_data`` and
+                the raw connection as ``.db`` (the members used by this class).
+        """
+        self.db = db_connection
+        self.client = self._build_session()
+        # ThreadPoolExecutor raises ValueError for max_workers <= 0, so clamp.
+        self.max_workers = max(1, int(os.getenv("SPIDER_WORKERS", "8")))
+        # Holds one cloned client per worker thread (see _get_thread_session).
+        self._tls = threading.local()
+
+    def _build_session(self) -> RequestsClient:
+        """Create the base client from ``LAWTIME_CONFIG["HEADERS"]``.
+
+        The configured headers are copied and ``Connection: close`` is added
+        unless the configuration already sets a ``Connection`` header.
+        """
+        custom_headers = dict(LAWTIME_CONFIG.get("HEADERS") or {})
+        custom_headers.setdefault("Connection", "close")
+        return RequestsClient(headers=custom_headers)
+
+    def _refresh_session(self) -> None:
+        """Ask the shared client to refresh itself (used after a 403)."""
+        self.client.refresh()
+
+    def _get_thread_session(self) -> RequestsClient:
+        """Return the calling thread's client, cloning ``self.client`` on first use."""
+        session = getattr(self._tls, "session", None)
+        if session is None:
+            session = self.client.clone()
+            self._tls.session = session
+        return session
+
+    def _refresh_thread_session(self) -> None:
+        """Close and forget the calling thread's client so the next call clones a new one."""
+        session = getattr(self._tls, "session", None)
+        if session is not None:
+            session.close()
+        self._tls.session = None
+
+    def _existing_phones(self, phones: List[str]) -> Set[str]:
+        """Return the subset of ``phones`` already stored for ``DOMAIN``.
+
+        The lookup is issued in chunks of 500 values per ``IN (...)`` clause,
+        using parameter placeholders for every value.
+
+        Args:
+            phones: Phone numbers to check; duplicates are allowed.
+
+        Returns:
+            Set of phone numbers found in the ``lawyer`` table.
+        """
+        if not phones:
+            return set()
+        # Drop duplicates (order kept) so they do not inflate the IN lists.
+        unique_phones = list(dict.fromkeys(phones))
+        existing: Set[str] = set()
+        chunk_size = 500
+        cur = self.db.db.cursor()
+        try:
+            for start in range(0, len(unique_phones), chunk_size):
+                chunk = unique_phones[start:start + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
+                cur.execute(sql, [DOMAIN, *chunk])
+                for row in cur.fetchall():
+                    # Rows are tuples with a plain cursor but mappings with a
+                    # dict-style cursor; accept both.
+                    existing.add(row["phone"] if isinstance(row, dict) else row[0])
+        finally:
+            cur.close()
+        return existing
+
+    def _load_areas(self):
+        """Load level-2 areas for this domain from the first table that has any.
+
+        Tables are tried in the order ``area_new``, ``area``, ``area2``; a table
+        whose query raises is skipped and the last error is reported only if no
+        table yields rows.
+
+        Returns:
+            The rows of the first non-empty table (each read with ``.get`` for
+            ``pinyin``/``province``/``city``), or ``[]`` when none has data.
+        """
+        condition = f"level = 2 and domain='{DOMAIN}'"
+        last_error = None
+        for table in ("area_new", "area", "area2"):
+            try:
+                rows = self.db.select_data(table, "pinyin, province, city", condition) or []
+            except Exception as exc:
+                last_error = exc
+                continue
+            if rows:
+                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
+                print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
+                return rows
+
+        if last_error:
+            print(f"[法律快车] 加载地区数据失败: {last_error}")
+        print("[法律快车] 无城市数据（已尝试 area_new/area/area2）")
+        return []
+
+    def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
+        """Fetch ``url`` with the shared (non thread-local) client."""
+        return self._get_with_session(self.client, url, max_retries=max_retries, is_thread=False)
+
+    def _get_with_session(self, session: RequestsClient, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
+        """GET ``url`` and return the body text, retrying only on HTTP 403.
+
+        Args:
+            session: Client used for the first attempt.
+            url: Address to fetch.
+            max_retries: Maximum number of attempts when the server answers 403.
+            is_thread: When true, a 403 replaces the calling thread's client;
+                otherwise the shared client is refreshed.
+
+        Returns:
+            The response text, or ``None`` after a final 403, any other status
+            >= 400, or a ``RequestClientError`` (those are not retried).
+        """
+        for attempt in range(max_retries):
+            try:
+                resp = session.get_text(url, timeout=15, verify=False)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt < max_retries - 1:
+                        # Exponential back-off (1s, 2s, ...) plus a little jitter.
+                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                        print(f"请求失败 {url}: 403，{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
+                        if is_thread:
+                            self._refresh_thread_session()
+                            session = self._get_thread_session()
+                        else:
+                            self._refresh_session()
+                            session = self.client
+                        time.sleep(wait_time)
+                        continue
+                    print(f"请求失败 {url}: 403 Forbidden")
+                    return None
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error: {url}")
+                return text
+            except RequestClientError as exc:
+                print(f"请求失败 {url}: {exc}")
+                return None
+        return None
+
+    def _parse_list(self, html: str, province: str, city: str) -> int:
+        """Process one list page: fetch its detail pages and insert new lawyers.
+
+        Args:
+            html: Markup of the list page.
+            province: Province name stored with every record.
+            city: City name stored with every record.
+
+        Returns:
+            Number of detail links found on the page; ``0`` tells the caller
+            that paging for this area is finished.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+        hrefs = (a.get("href", "") for a in soup.select("a.hide_link"))
+        # NOTE(review): list hrefs carry "lll" where the detail URL uses "int";
+        # presumably a site-side obfuscation - confirm.
+        detail_urls = [urljoin(DETAIL_BASE, href.replace("lll", "int")) for href in hrefs if href]
+        if not detail_urls:
+            return 0
+
+        # A pool is created per page; detail pages are downloaded concurrently.
+        results: List[Dict[str, str]] = []
+        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
+            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
+            for fut in as_completed(futs):
+                try:
+                    data = fut.result()
+                except Exception as exc:
+                    print(f"  详情解析异常: {exc}")
+                    continue
+                if data and data.get("phone"):
+                    results.append(data)
+
+        if not results:
+            return len(detail_urls)
+
+        existing = self._existing_phones([d["phone"] for d in results])
+
+        for data in results:
+            phone = data["phone"]
+            if phone in existing:
+                print(f"  -- 已存在: {data['name']} ({phone})")
+                continue
+            try:
+                self.db.insert_data("lawyer", data)
+            except Exception as exc:
+                print(f"  插入失败 {data.get('url')}: {exc}")
+                continue
+            # Remember the phone so a second card for the same lawyer on this
+            # page is not inserted again.
+            existing.add(phone)
+            print(f"  -> 新增: {data['name']} ({phone})")
+
+        return len(detail_urls)
+
+    @staticmethod
+    def _iter_fields(soup):
+        """Yield ``(label_text, desc_tag)`` for each ``div.item.flex`` row that has both parts."""
+        for item in soup.select("div.item.flex"):
+            label = item.find("div", class_="label")
+            desc = item.find("div", class_="desc")
+            if label and desc:
+                yield label.get_text(), desc
+
+    def _extract_name(self, soup) -> str:
+        """Return the text before "律师" from ``<title>``, else from ``div.intl``, else ``""``."""
+        for tag in (soup.find("title"), soup.find("div", class_="intl")):
+            if tag:
+                match = self._NAME_RE.search(tag.get_text())
+                if match:
+                    return match.group(1)
+        return ""
+
+    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
+        """Download a lawyer detail page and extract one record from it.
+
+        Runs on worker threads, using the calling thread's own client.
+
+        Args:
+            url: Detail page address.
+            province: Province name copied into the record.
+            city: City name copied into the record.
+
+        Returns:
+            A dict ready for insertion into ``lawyer``, or ``None`` when the
+            page cannot be fetched or contains no phone number.
+        """
+        html = self._get_with_session(self._get_thread_session(), url, max_retries=3, is_thread=True)
+        if not html:
+            return None
+
+        soup = BeautifulSoup(html, "html.parser")
+        name = self._extract_name(soup)
+
+        # One pass over the labelled rows: the first "电话" row containing a
+        # number gives the phone, the first "律所" row gives the law firm.
+        phone = ""
+        law_firm = ""
+        firm_seen = False
+        for label_text, desc in self._iter_fields(soup):
+            if not phone and "电话" in label_text:
+                match = self._PHONE_RE.search(desc.get_text().replace("-", ""))
+                if match:
+                    phone = match.group(0)
+            if not firm_seen and "律所" in label_text:
+                law_firm = desc.get_text(strip=True).replace("已认证", "")
+                firm_seen = True
+
+        if not phone:
+            # Fall back to the first number anywhere in the page text.
+            match = self._PHONE_RE.search(soup.get_text(" ").replace("-", ""))
+            if match:
+                phone = match.group(0)
+        if not phone:
+            print(f"  无手机号: {url}")
+            return None
+
+        # The key is called "list_url" but it stores the detail URL passed in.
+        params = {
+            "list_url": url,
+            "province": province,
+            "city": city,
+        }
+
+        return {
+            "name": name or "",
+            "law_firm": law_firm,
+            "province": province,
+            "city": city,
+            "phone": phone,
+            "url": url,
+            "domain": DOMAIN,
+            "create_time": int(time.time()),
+            "params": json.dumps(params, ensure_ascii=False)
+        }
+
+    def run(self):
+        """Crawl all areas: page through each list until a page fails or has no links."""
+        print("启动法律快车采集...")
+        areas = self._load_areas()
+        if not areas:
+            print("无地区数据")
+            return
+
+        for area in areas:
+            pinyin = area.get("pinyin")
+            province = area.get("province", "")
+            city = area.get("city", "")
+            if not pinyin:
+                # The list URL cannot be built without the pinyin value.
+                continue
+            page = 1
+            while True:
+                list_url = LIST_BASE.format(pinyin=pinyin, page=page)
+                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
+                html = self._get(list_url)
+                if not html:
+                    break
+                link_count = self._parse_list(html, province, city)
+                if link_count == 0:
+                    break
+                page += 1
+        print("法律快车采集完成")
+
+
+# Script entry point: open the database through the Db context manager and run the spider.
+if __name__ == "__main__":
+    with Db() as db:
+        spider = LawtimeSpider(db)
+        spider.run()
@@ -0,0 +1,332 @@
+import json
+import os
+import sys
+import time
+import random
+from typing import Dict, Optional, List, Set
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+# Make the parent directory of this file's folder (project_root) and its
+# "request" sub-directory importable, so the imports below resolve when this
+# module is executed directly as a script.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(current_dir)
+request_dir = os.path.join(project_root, "request")
+if request_dir not in sys.path:
+    sys.path.insert(0, request_dir)
+if project_root not in sys.path:
+    sys.path.append(project_root)
+
+import urllib3
+from bs4 import BeautifulSoup
+from request.requests_client import RequestClientError, RequestsClient
+
+# Requests in this module are sent with verify=False; silence the resulting InsecureRequestWarning.
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+from Db import Db
+
+# Value written to the "domain" column of every record produced by this spider.
+DOMAIN = "律图"
+# Endpoint that receives the form-encoded list request built by _build_payload.
+LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
+
+
+class Six4365Spider:
+    """Crawler that collects lawyer contact records from m.64365.com.
+
+    For every city loaded from the database it posts paginated list requests to
+    ``LIST_URL``, fetches each lawyer's ``/info/`` page in a thread pool, and
+    inserts records whose URL is not yet present in the ``lawyer`` table. All
+    database access happens on the calling thread; worker threads only
+    download and parse pages.
+    """
+
+    def __init__(self, db_connection):
+        """Store the database wrapper, prepare the HTTP client and load the cities.
+
+        Args:
+            db_connection: Object exposing ``select_data``, ``insert_data`` and
+                the raw connection as ``.db`` (the members used by this class).
+        """
+        self.db = db_connection
+        self.client = self._build_session()
+        # ThreadPoolExecutor raises ValueError for max_workers <= 0, so clamp.
+        self.max_workers = max(1, int(os.getenv("SPIDER_WORKERS", "8")))
+        # Holds one cloned client per worker thread (see _get_thread_session).
+        self._tls = threading.local()
+        self.cities = self._load_cities()
+
+    def _build_session(self) -> RequestsClient:
+        """Create the base client with a mobile Safari User-Agent and ``Connection: close``."""
+        return RequestsClient(headers={
+            "User-Agent": (
+                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+                "Mobile/15E148 Safari/604.1"
+            ),
+            "Connection": "close",
+        })
+
+    def _refresh_session(self) -> None:
+        """Ask the shared client to refresh itself (used after a 403)."""
+        self.client.refresh()
+
+    def _get_thread_session(self) -> RequestsClient:
+        """Return the calling thread's client, cloning ``self.client`` on first use."""
+        session = getattr(self._tls, "session", None)
+        if session is None:
+            session = self.client.clone()
+            self._tls.session = session
+        return session
+
+    def _refresh_thread_session(self) -> None:
+        """Close and forget the calling thread's client so the next call clones a new one."""
+        session = getattr(self._tls, "session", None)
+        if session is not None:
+            session.close()
+        self._tls.session = None
+
+    def _existing_urls(self, urls: List[str]) -> Set[str]:
+        """Return the subset of ``urls`` already stored in the ``lawyer`` table.
+
+        One batched query replaces a per-record existence check. Values are
+        sent in chunks of 500 per ``IN (...)`` clause, using parameter
+        placeholders for every value.
+
+        Args:
+            urls: Detail URLs to check; duplicates are allowed.
+
+        Returns:
+            Set of URLs found in the table.
+        """
+        if not urls:
+            return set()
+        # Drop duplicates (order kept) so they do not inflate the IN lists.
+        unique_urls = list(dict.fromkeys(urls))
+        existing: Set[str] = set()
+        chunk_size = 500
+        cur = self.db.db.cursor()
+        try:
+            for start in range(0, len(unique_urls), chunk_size):
+                chunk = unique_urls[start:start + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
+                cur.execute(sql, chunk)
+                for row in cur.fetchall():
+                    # Rows are tuples with a plain cursor but mappings with a
+                    # dict-style cursor; accept both.
+                    existing.add(row["url"] if isinstance(row, dict) else row[0])
+        finally:
+            cur.close()
+        return existing
+
+    def _load_cities(self):
+        """Load the city map for this site from the first table that has cities.
+
+        Tables are tried in the order ``area_new``, ``area2``, ``area``; a table
+        whose query raises is skipped and the last error is reported only if no
+        table yields cities.
+
+        Returns:
+            Dict keyed by the city code (as ``str``) with the entries ``name``,
+            ``province`` and ``province_name``; ``province_name`` comes from
+            the level-1 row matched through ``pid`` and falls back to the
+            city's own ``province`` value. ``{}`` when no table has cities.
+        """
+        last_error = None
+        for table in ("area_new", "area2", "area"):
+            try:
+                provinces = self.db.select_data(
+                    table,
+                    "id, code, province",
+                    "domain='64365' AND level=1"
+                ) or []
+                cities = self.db.select_data(
+                    table,
+                    "code, city, province, pid",
+                    "domain='64365' AND level=2"
+                ) or []
+            except Exception as exc:
+                last_error = exc
+                continue
+
+            if not cities:
+                continue
+
+            province_map = {row.get('id'): row for row in provinces}
+            data = {}
+            for city in cities:
+                province_row = province_map.get(city.get('pid')) or {}
+                data[str(city.get('code'))] = {
+                    "name": city.get('city'),
+                    "province": city.get('province'),
+                    "province_name": province_row.get('province', city.get('province')),
+                }
+            print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
+            return data
+
+        if last_error:
+            print(f"[律图] 加载地区数据失败: {last_error}")
+        print("[律图] 无城市数据（已尝试 area_new/area2/area）")
+        return {}
+
+    def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
+        """POST ``payload`` to ``LIST_URL`` and return the body text, retrying only on 403.
+
+        Args:
+            payload: Form fields, normally produced by ``_build_payload``.
+            max_retries: Maximum number of attempts when the server answers 403.
+
+        Returns:
+            The response text, or ``None`` after a final 403, any other status
+            >= 400, or a ``RequestClientError`` (those are not retried).
+        """
+        for attempt in range(max_retries):
+            try:
+                resp = self.client.post_text(LIST_URL, data=payload, timeout=10, verify=False)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt < max_retries - 1:
+                        # Exponential back-off (1s, 2s, ...) plus a little jitter.
+                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                        print(f"403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
+                        self._refresh_session()
+                        time.sleep(wait_time)
+                        continue
+                    print("请求失败: 403 Forbidden")
+                    return None
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error")
+                return text
+            except RequestClientError as exc:
+                print(f"请求失败: {exc}")
+                return None
+        return None
+
+    def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
+        """Build the form fields for one list request.
+
+        Args:
+            city_code: Sent as both ``RegionId`` and ``AreaId``.
+            page: Sent as ``PageIndex``; the page size is fixed at 10.
+
+        Returns:
+            Dict of string form fields, including the client's own User-Agent.
+        """
+        return {
+            "AdCode": "",
+            "RegionId": str(city_code),
+            "CategoryId": "",
+            "MaxNumber": "",
+            "OnlyData": "true",
+            "IgnoreButton": "",
+            "LawyerRecommendRequest[AreaId]": str(city_code),
+            "LawyerRecommendRequest[LawCategoryIds]": "",
+            "LawyerRecommendRequest[LawFirmPersonCount]": "",
+            "LawyerRecommendRequest[LawFirmScale]": "",
+            "LawyerRecommendRequest[OrderType]": "0",
+            "LawyerRecommendRequest[PageIndex]": str(page),
+            "LawyerRecommendRequest[PageSize]": "10",
+            "LawyerRecommendRequest[TagId]": "",
+            "LawyerRecommendRequest[Type]": "1",
+            "LawyerRecommendRequest[AccountType]": "",
+            "LawyerRecommendRequest[AddLawyer]": "true",
+            "LawyerRecommendRequest[Content]": "",
+            "LawyerRecommendRequest[Duty]": "",
+            "LawyerRecommendRequest[ExcludeLawyerIds][]": "",
+            "LawyerRecommendRequest[RefferUrl]": "",
+            "LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
+            "LawyerRecommendRequest[resource_type_name]": "",
+            "LawyerRecommendRequest[UserAgent]": self.client.headers["User-Agent"],
+            "LawyerRecommendRequest[AddLawyerWithNoData]": "false",
+            "ShowCaseButton": "true",
+        }
+
+    def _parse_list(self, html: str, province: str, city: str) -> int:
+        """Process one list response: fetch its detail pages and insert new lawyers.
+
+        Args:
+            html: Markup returned by the list endpoint.
+            province: Province name stored with every record.
+            city: City name stored with every record.
+
+        Returns:
+            Number of detail links found in the response; ``0`` tells the
+            caller that paging for this city is finished.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+        # Each lawyer card links to a profile; its "/info/" sub-page is the one parsed.
+        detail_urls: List[str] = [
+            f"{href.rstrip('/')}/info/"
+            for href in (card.get("href") for card in soup.find_all("a", class_="lawyer"))
+            if href
+        ]
+        if not detail_urls:
+            return 0
+
+        # A pool is created per page; detail pages are downloaded concurrently.
+        results: List[Dict[str, str]] = []
+        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
+            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
+            for fut in as_completed(futs):
+                try:
+                    data = fut.result()
+                except Exception as exc:
+                    print(f"    详情解析异常: {exc}")
+                    continue
+                if data:
+                    results.append(data)
+
+        if not results:
+            return len(detail_urls)
+
+        existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
+        for data in results:
+            url = data.get("url", "")
+            if not url:
+                continue
+            if url in existing:
+                print(f"  -- 已存在URL: {url}")
+                continue
+            try:
+                self.db.insert_data("lawyer", data)
+            except Exception as exc:
+                print(f"  插入失败 {url}: {exc}")
+                continue
+            # Remember the URL so a second card for the same lawyer in this
+            # response is not inserted again.
+            existing.add(url)
+            print(f"  -> 新增: {data['name']} ({data['phone']})")
+
+        return len(detail_urls)
+
+    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
+        """Download a lawyer ``/info/`` page and extract one record from it.
+
+        Runs on worker threads. Name and law firm are read from the first
+        ``ul.intro-basic-bar``; the phone is read from the list inside
+        ``div.more-intro-basic``.
+
+        Args:
+            url: Detail page address.
+            province: Province name copied into the record.
+            city: City name copied into the record.
+
+        Returns:
+            A dict ready for insertion into ``lawyer``, or ``None`` when the
+            page cannot be fetched, lacks the basic-info list, or has no name
+            or no phone.
+        """
+        html = self._get_detail(url)
+        if not html:
+            return None
+
+        soup = BeautifulSoup(html, "html.parser")
+        base_info = soup.find("ul", class_="intro-basic-bar")
+        if not base_info:
+            return None
+
+        name = ""
+        law_firm = ""
+        phone = ""
+
+        for li in base_info.find_all("li"):
+            label = li.find("span", class_="label")
+            txt = li.find("div", class_="txt")
+            if not label or not txt:
+                continue
+            label_text = label.get_text(strip=True)
+            if "姓名" in label_text:
+                name = txt.get_text(strip=True)
+            if "执业律所" in label_text:
+                law_firm = txt.get_text(strip=True)
+
+        more_section = soup.find("div", class_="more-intro-basic")
+        phone_ul = more_section.find("ul", class_="intro-basic-bar") if more_section else None
+        if phone_ul:
+            for li in phone_ul.find_all("li"):
+                label = li.find("span", class_="label")
+                txt = li.find("div", class_="txt")
+                if label and txt and "联系电话" in label.get_text(strip=True):
+                    phone = txt.get_text(strip=True).replace(" ", "")
+                    break
+
+        # Normalise e.g. "138-0000-0000" to plain digits.
+        phone = phone.replace('-', '').strip()
+        if not name or not phone:
+            return None
+
+        return {
+            "phone": phone,
+            "province": province,
+            "city": city,
+            "law_firm": law_firm,
+            "url": url,
+            "domain": DOMAIN,
+            "name": name,
+            "create_time": int(time.time()),
+            "params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
+        }
+
+    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
+        """GET ``url`` with the calling thread's client, retrying only on HTTP 403.
+
+        Args:
+            url: Address to fetch.
+            max_retries: Maximum number of attempts when the server answers 403.
+
+        Returns:
+            The response text, or ``None`` after a final 403, any other status
+            >= 400, or a ``RequestClientError`` (those are not retried).
+        """
+        session = self._get_thread_session()
+        for attempt in range(max_retries):
+            try:
+                resp = session.get_text(url, timeout=10, verify=False)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt < max_retries - 1:
+                        # Exponential back-off (1s, 2s, ...) plus a little jitter.
+                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                        print(f"    403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
+                        # Replace this thread's client before the next attempt.
+                        self._refresh_thread_session()
+                        session = self._get_thread_session()
+                        time.sleep(wait_time)
+                        continue
+                    print("    请求失败: 403 Forbidden")
+                    return None
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error")
+                return text
+            except RequestClientError as exc:
+                print(f"    请求失败: {exc}")
+                return None
+        return None
+
+    def run(self):
+        """Crawl all cities: page through each list until a request fails or returns no links."""
+        print("启动律图采集...")
+        if not self.cities:
+            print("无城市数据")
+            return
+
+        for city_code, info in self.cities.items():
+            province = info.get("province_name", "")
+            city = info.get("name", "")
+            print(f"采集 {province}-{city}")
+            page = 1
+            while True:
+                payload = self._build_payload(city_code, page)
+                html = self._post(payload)
+                if not html:
+                    break
+                link_count = self._parse_list(html, province, city)
+                if link_count == 0:
+                    break
+                page += 1
+        print("律图采集完成")
+
+
+# Script entry point: open the database through the Db context manager and run the spider.
+if __name__ == "__main__":
+    with Db() as db:
+        spider = Six4365Spider(db)
+        spider.run()
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Launch every site spider as a detached background job, one log file per spider.
+set -euo pipefail
+
+# Run from the script's own directory so the relative script and log paths resolve.
+cd "$(dirname "$0")"
+
+echo "使用 request/proxy_settings.json 读取代理配置"
+
+# Spiders: dls (Dalvshi), findlaw (Zhaofa), lawtime (Falv Kuaiche),
+# six4365 (Lvtu), hualv (Hualv). Output of each goes to <name>.log.
+for spider in dls findlaw lawtime six4365 hualv; do
+  nohup python3 "${spider}.py" > "${spider}.log" 2>&1 &
+done