chore: initialize lawyers crawler project

2026-03-02 00:19:48 +08:00
commit 03847a4b8e
17 changed files with 1928 additions and 0 deletions
@@ -0,0 +1,332 @@
+import json
+import os
+import sys
+import time
+import random
+from typing import Dict, Optional, List, Set
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+current_dir = os.path.dirname(os.path.abspath(__file__))  # directory that contains this file
+project_root = os.path.dirname(current_dir)  # parent of that directory, used as the project root
+request_dir = os.path.join(project_root, "request")  # <project_root>/request
+if request_dir not in sys.path:
+    sys.path.insert(0, request_dir)  # placed first on sys.path, so it is searched before other entries
+if project_root not in sys.path:
+    sys.path.append(project_root)  # placed last; presumably what lets the `request.*` and `Db` imports below resolve
+
+import urllib3
+from bs4 import BeautifulSoup
+from request.requests_client import RequestClientError, RequestsClient
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # requests in this file are sent with verify=False
+
+from Db import Db
+
+DOMAIN = "律图"  # stored in the "domain" field of every lawyer record built by _parse_detail
+LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"  # endpoint that _post sends the list form to
+
+
+class Six4365Spider:
+    def __init__(self, db_connection):
+        self.db = db_connection
+        self.client = self._build_session()
+        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
+        self._tls = threading.local()
+        self.cities = self._load_cities()
+
+    def _build_session(self) -> RequestsClient:
+        return RequestsClient(headers={
+            "User-Agent": (
+                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
+                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
+                "Mobile/15E148 Safari/604.1"
+            ),
+            "Connection": "close",
+        })
+
+    def _refresh_session(self) -> None:
+        self.client.refresh()
+
+    def _get_thread_session(self) -> RequestsClient:
+        """每个线程使用独立请求客户端（共享相同 headers/代理配置）。"""
+        s = getattr(self._tls, "session", None)
+        if s is not None:
+            return s
+        s = self.client.clone()
+        self._tls.session = s
+        return s
+
+    def _refresh_thread_session(self) -> None:
+        s = getattr(self._tls, "session", None)
+        if s is not None:
+            s.close()
+        self._tls.session = None
+
+    def _existing_urls(self, urls: List[str]) -> Set[str]:
+        """批量查重，减少 N 次 is_data_exist"""
+        if not urls:
+            return set()
+        existing: Set[str] = set()
+        cur = self.db.db.cursor()
+        try:
+            # IN 参数过多会失败，分批
+            chunk_size = 500
+            for i in range(0, len(urls), chunk_size):
+                chunk = urls[i:i + chunk_size]
+                placeholders = ",".join(["%s"] * len(chunk))
+                sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
+                cur.execute(sql, chunk)
+                for row in cur.fetchall():
+                    # pymysql 默认返回 tuple
+                    existing.add(row[0])
+        finally:
+            cur.close()
+        return existing
+
+    def _load_cities(self):
+        tables = ("area_new", "area2", "area")
+        last_error = None
+        for table in tables:
+            try:
+                provinces = self.db.select_data(
+                    table,
+                    "id, code, province",
+                    "domain='64365' AND level=1"
+                ) or []
+                cities = self.db.select_data(
+                    table,
+                    "code, city, province, pid",
+                    "domain='64365' AND level=2"
+                ) or []
+            except Exception as exc:
+                last_error = exc
+                continue
+
+            if not cities:
+                continue
+
+            province_map = {row.get('id'): row for row in provinces}
+            data = {}
+            for city in cities:
+                province_row = province_map.get(city.get('pid'), {}) or {}
+                data[str(city.get('code'))] = {
+                    "name": city.get('city'),
+                    "province": city.get('province'),
+                    "province_name": province_row.get('province', city.get('province')),
+                }
+            print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
+            return data
+
+        if last_error:
+            print(f"[律图] 加载地区数据失败: {last_error}")
+        print("[律图] 无城市数据（已尝试 area_new/area2/area）")
+        return {}
+
+    def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
+        for attempt in range(max_retries):
+            try:
+                resp = self.client.post_text(LIST_URL, data=payload, timeout=10, verify=False)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt < max_retries - 1:
+                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                        print(f"403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
+                        self._refresh_session()
+                        time.sleep(wait_time)
+                        continue
+                    print("请求失败: 403 Forbidden")
+                    return None
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error")
+                return text
+            except RequestClientError as exc:
+                print(f"请求失败: {exc}")
+                return None
+        return None
+
+    def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
+        return {
+            "AdCode": "",
+            "RegionId": str(city_code),
+            "CategoryId": "",
+            "MaxNumber": "",
+            "OnlyData": "true",
+            "IgnoreButton": "",
+            "LawyerRecommendRequest[AreaId]": str(city_code),
+            "LawyerRecommendRequest[LawCategoryIds]": "",
+            "LawyerRecommendRequest[LawFirmPersonCount]": "",
+            "LawyerRecommendRequest[LawFirmScale]": "",
+            "LawyerRecommendRequest[OrderType]": "0",
+            "LawyerRecommendRequest[PageIndex]": str(page),
+            "LawyerRecommendRequest[PageSize]": "10",
+            "LawyerRecommendRequest[TagId]": "",
+            "LawyerRecommendRequest[Type]": "1",
+            "LawyerRecommendRequest[AccountType]": "",
+            "LawyerRecommendRequest[AddLawyer]": "true",
+            "LawyerRecommendRequest[Content]": "",
+            "LawyerRecommendRequest[Duty]": "",
+            "LawyerRecommendRequest[ExcludeLawyerIds][]": "",
+            "LawyerRecommendRequest[RefferUrl]": "",
+            "LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
+            "LawyerRecommendRequest[resource_type_name]": "",
+            "LawyerRecommendRequest[UserAgent]": self.client.headers["User-Agent"],
+            "LawyerRecommendRequest[AddLawyerWithNoData]": "false",
+            "ShowCaseButton": "true",
+        }
+
+    def _parse_list(self, html: str, province: str, city: str) -> int:
+        soup = BeautifulSoup(html, "html.parser")
+        lawyers = soup.find_all("a", class_="lawyer")
+        if not lawyers:
+            return 0
+
+        detail_urls: List[str] = []
+        for lawyer in lawyers:
+            href = lawyer.get("href")
+            if not href:
+                continue
+            detail_urls.append(f"{href.rstrip('/')}/info/")
+
+        if not detail_urls:
+            return 0
+
+        results: List[Dict[str, str]] = []
+        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
+            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
+            for fut in as_completed(futs):
+                try:
+                    data = fut.result()
+                except Exception as exc:
+                    print(f"    详情解析异常: {exc}")
+                    continue
+                if data:
+                    results.append(data)
+
+        if not results:
+            return len(detail_urls)
+
+        existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
+        for data in results:
+            if not data:
+                continue
+            url = data.get("url", "")
+            if not url:
+                continue
+            if url in existing:
+                print(f"  -- 已存在URL: {url}")
+                continue
+            try:
+                self.db.insert_data("lawyer", data)
+                print(f"  -> 新增: {data['name']} ({data['phone']})")
+            except Exception as exc:
+                print(f"  插入失败 {url}: {exc}")
+
+        return len(detail_urls)
+
+    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
+        html = self._get_detail(url)
+        if not html:
+            return None
+
+        soup = BeautifulSoup(html, "html.parser")
+        base_info = soup.find("ul", class_="intro-basic-bar")
+        if not base_info:
+            return None
+
+        name = ""
+        law_firm = ""
+        phone = ""
+
+        for li in base_info.find_all("li"):
+            label = li.find("span", class_="label")
+            txt = li.find("div", class_="txt")
+            if not label or not txt:
+                continue
+            label_text = label.get_text(strip=True)
+            if "姓名" in label_text:
+                name = txt.get_text(strip=True)
+            if "执业律所" in label_text:
+                law_firm = txt.get_text(strip=True)
+
+        more_section = soup.find("div", class_="more-intro-basic")
+        if more_section:
+            phone_ul = more_section.find("ul", class_="intro-basic-bar")
+            if phone_ul:
+                for li in phone_ul.find_all("li"):
+                    label = li.find("span", class_="label")
+                    txt = li.find("div", class_="txt")
+                    if label and txt and "联系电话" in label.get_text(strip=True):
+                        phone = txt.get_text(strip=True).replace(" ", "")
+                        break
+
+        phone = phone.replace('-', '').strip()
+        if not name or not phone:
+            return None
+
+        data = {
+            "phone": phone,
+            "province": province,
+            "city": city,
+            "law_firm": law_firm,
+            "url": url,
+            "domain": DOMAIN,
+            "name": name,
+            "create_time": int(time.time()),
+            "params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
+        }
+        return data
+
+    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
+        session = self._get_thread_session()
+        for attempt in range(max_retries):
+            try:
+                resp = session.get_text(url, timeout=10, verify=False)
+                status_code = resp.status_code
+                text = resp.text
+                if status_code == 403:
+                    if attempt < max_retries - 1:
+                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
+                        print(f"    403被拦截，{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
+                        self._refresh_thread_session()
+                        session = self._get_thread_session()
+                        time.sleep(wait_time)
+                        continue
+                    print("    请求失败: 403 Forbidden")
+                    return None
+                if status_code >= 400:
+                    raise RequestClientError(f"{status_code} Error")
+                return text
+            except RequestClientError as exc:
+                print(f"    请求失败: {exc}")
+                return None
+        return None
+
+    def run(self):
+        print("启动律图采集...")
+        if not self.cities:
+            print("无城市数据")
+            return
+
+        for city_code, info in self.cities.items():
+            province = info.get("province_name", "")
+            city = info.get("name", "")
+            print(f"采集 {province}-{city}")
+            page = 1
+            while True:
+                payload = self._build_payload(city_code, page)
+                html = self._post(payload)
+                if not html:
+                    break
+                link_count = self._parse_list(html, province, city)
+                if link_count == 0:
+                    break
+                page += 1
+        print("律图采集完成")
+
+
+if __name__ == "__main__":
+    with Db() as db:
+        spider = Six4365Spider(db)
+        spider.run()