chore: initialize lawyers crawler project

Author: hello-dd-code
Date: 2026-03-02 00:19:48 +08:00
Commit: 03847a4b8e
17 changed files with 1928 additions and 0 deletions
@@ -0,0 +1,268 @@
import json
import os
import sys
import time
import random
from typing import Dict, Optional
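# Ensure the sibling "request" package and the project root are importable when run as a script.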
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
sys.path.insert(0, request_dir)
if project_root not in sys.path:
sys.path.append(project_root)
import urllib3
from bs4 import BeautifulSoup
from request.requests_client import (
RequestClientError,
RequestConnectTimeout,
RequestConnectionError,
RequestTimeout,
RequestsClient,
)
# Suppress InsecureRequestWarning; requests below are sent with verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from Db import Db
from utils.rate_limiter import wait_for_request
DOMAIN = "大律师"  # site label stored in the DB (the maxlaw.cn lawyer network)
LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
_PROXY_TESTED = False
class DlsSpider:
def __init__(self, db_connection):
self.db = db_connection
self.client = self._build_session()
self.areas = self._load_areas()
def _build_session(self) -> RequestsClient:
"""构建带重试机制的 session"""
client = RequestsClient(
headers={
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
"Host": "m.maxlaw.cn",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "close",
},
            retry_total=3,  # retry up to 3 times in total
            retry_backoff_factor=1,  # backoff intervals: 1s, 2s, 4s
            retry_status_forcelist=(429, 500, 502, 503, 504),  # retry on these status codes
retry_allowed_methods=("GET", "POST"),
)
self._proxy_test(client, client.proxies or None)
return client
def _refresh_session(self) -> None:
self.client.refresh()
self._proxy_test(self.client, self.client.proxies or None)
def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None:
global _PROXY_TESTED
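        # Run the proxy self-test at most once per process, and only when PROXY_TEST is set.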
if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
return
_PROXY_TESTED = True
if not proxies:
print("[proxy] test skipped: no proxy configured")
return
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
try:
resp = client.get_text(
test_url,
timeout=timeout,
headers={"Connection": "close"},
)
print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
except Exception as exc:
print(f"[proxy] test failed: {exc}")
def _load_areas(self):
try:
return self.db.select_data(
"area_new",
"province, city, pinyin",
"domain='maxlaw'"
) or []
except Exception as exc:
print(f"加载地区失败: {exc}")
return []
def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
"""发送 GET 请求,带重试机制"""
wait_for_request()
for attempt in range(max_retries):
try:
                # Use a longer timeout, with separate connect and read limits
resp = self.client.get_text(
url,
timeout=(10, 30), # (connect_timeout, read_timeout)
verify=False,
headers=headers,
)
status_code = resp.status_code
content = resp.text
if status_code == 403:
if attempt < max_retries - 1:
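                        # Jittered exponential backoff: attempt 0 sleeps ~1.3-2.0s, attempt 1 ~2.3-3.0s.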
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
self._refresh_session()
time.sleep(wait_time)
continue
print(f"请求失败 {url}: 403 Forbidden")
return None
if status_code >= 400:
raise RequestClientError(f"{status_code} Error: {url}")
return content
            except (RequestConnectTimeout, RequestTimeout, RequestConnectionError) as exc:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s
                    print(f"{exc.__class__.__name__}, retrying in {wait_time}s ({attempt + 1}/{max_retries}): {url}")
                    time.sleep(wait_time)
                else:
                    print(f"{exc.__class__.__name__}, max retries reached {url}: {exc}")
                    return None
            except RequestClientError as exc:
                print(f"Request failed {url}: {exc}")
                return None
return None
def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
soup = BeautifulSoup(html, "html.parser")
cards = soup.find_all("div", class_="lstx")
if not cards:
return 0
inserted = 0
for card in cards:
link = card.find("a")
if not link or not link.get("href"):
continue
detail = self._parse_detail(link['href'], province, city, list_url)
if not detail:
continue
phone = detail.get("phone")
if not phone:
continue
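            # NOTE: phone is scraped text interpolated straight into the SQL condition;
            # if Db does not escape it, a value containing quotes breaks (or injects into) the query.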
condition = f"phone='{phone}' and domain='{DOMAIN}'"
if self.db.is_data_exist("lawyer", condition):
print(f" -- 已存在: {detail['name']} ({phone})")
time.sleep(0.3)
continue
try:
self.db.insert_data("lawyer", detail)
inserted += 1
print(f" -> 新增: {detail['name']} ({phone})")
except Exception as exc:
print(f" 插入失败: {exc}")
time.sleep(1)
time.sleep(0.3)
        # Ease off after each list page to reduce the risk of being blocked
time.sleep(0.6)
return inserted
def _detail_headers(self, referer: str) -> Dict[str, str]:
return {
"Referer": referer,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Cache-Control": "no-cache",
"Pragma": "no-cache",
"Upgrade-Insecure-Requests": "1",
}
def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
url = f"https://m.maxlaw.cn{path}"
print(f" 详情: {url}")
html = self._get(url, headers=self._detail_headers(list_url))
if not html:
return None
soup = BeautifulSoup(html, "html.parser")
name_tag = soup.find("h2", class_="lawyerName")
law_firm_tag = soup.find("p", class_="law-firm")
contact_list = soup.find("ul", class_="contact-content")
name = name_tag.get_text(strip=True) if name_tag else ""
law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
phone = ""
if contact_list:
items = contact_list.find_all("li")
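            # The phone number is expected in the third <li> of the contact list.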
if len(items) > 2:
phone_tag = items[2].find("p")
if phone_tag:
phone = phone_tag.get_text(strip=True)
phone = phone.split("咨询请说明来自大律师网")[0].strip()
phone = phone.replace('-', '').strip()
if not name or not phone:
print(" 信息不完整,跳过")
return None
safe_city = city if city else province
return {
"name": name,
"law_firm": law_firm,
"province": province,
"city": safe_city,
"phone": phone,
"url": url,
"domain": DOMAIN,
"create_time": int(time.time()),
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
}
def run(self):
print("启动大律师采集...")
if not self.areas:
print("无地区数据")
return
for area in self.areas:
pinyin = area.get("pinyin")
province = area.get("province", "")
city = area.get("city", "")
if not pinyin:
continue
page = 1
while True:
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
print(f"采集 {province}-{city}{page} 页: {list_url}")
html = self._get(list_url)
if not html:
break
inserted = self._parse_list(html, province, city, list_url)
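                # Stop heuristic: a page yielding no new rows (empty page or all duplicates) ends this area.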
if inserted == 0:
break
page += 1
print("大律师采集完成")
if __name__ == "__main__":
with Db() as db:
spider = DlsSpider(db)
spider.run()