chore: initialize lawyers crawler project

hello-dd-code
2026-03-02 00:19:48 +08:00
commit 03847a4b8e
17 changed files with 1928 additions and 0 deletions
@@ -0,0 +1,278 @@
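"""Lawtime (法律快车, m.lawtime.cn) lawyer crawler.

Walks each city's paginated lawyer list, fans detail-page fetches out to a
thread pool, extracts name / law firm / mobile phone, and inserts new rows
into the `lawyer` table, deduplicating by phone number per domain.

RequestsClient, Db, and LAWTIME_CONFIG are project-local modules, not shown
in this file.
"""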
import json
import os
import re
import sys
import time
import random
import threading
from typing import Dict, Optional, List, Set
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

# Make the project root and its request/ package importable when this file
# is run directly as a script.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import urllib3
from bs4 import BeautifulSoup

from request.requests_client import RequestClientError, RequestsClient

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from Db import Db
from config import LAWTIME_CONFIG

LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
DETAIL_BASE = "https://m.lawtime.cn"
DOMAIN = "法律快车"

class LawtimeSpider:
    def __init__(self, db_connection):
        self.db = db_connection
        self.client = self._build_session()
        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
        self._tls = threading.local()
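
    # Session plumbing: one primary RequestsClient on the main thread plus a
    # thread-local clone per worker, so concurrent detail fetches never share
    # a session. refresh() and clone() are methods of this project's
    # RequestsClient wrapper (request/requests_client.py), not of a raw
    # requests.Session.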
    def _build_session(self) -> RequestsClient:
        headers = LAWTIME_CONFIG.get("HEADERS", {})
        custom_headers = dict(headers) if headers else {}
        custom_headers.setdefault("Connection", "close")
        return RequestsClient(headers=custom_headers)

    def _refresh_session(self) -> None:
        self.client.refresh()

    def _get_thread_session(self) -> RequestsClient:
        s = getattr(self._tls, "session", None)
        if s is not None:
            return s
        s = self.client.clone()
        self._tls.session = s
        return s

    def _refresh_thread_session(self) -> None:
        s = getattr(self._tls, "session", None)
        if s is not None:
            s.close()
        self._tls.session = None
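
    # Dedup lookup: check which scraped phone numbers already exist for this
    # domain, chunking the IN (...) clause at 500 values so the statement
    # stays bounded on large batches.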
    def _existing_phones(self, phones: List[str]) -> Set[str]:
        if not phones:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(phones), chunk_size):
                chunk = phones[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing
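
    # City list: level-2 areas for this domain, read from the first of
    # area_new / area / area2 that exists and returns rows.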
    def _load_areas(self):
        condition = "level = 2 and domain='法律快车'"
        tables = ("area_new", "area", "area2")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "pinyin, province, city", condition) or []
            except Exception as exc:
                last_error = exc
                continue
            if rows:
                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
                print(f"[法律快车] city source table: {table}, cities: {len(rows)}, missing pinyin: {missing_pinyin}")
                return rows
        if last_error:
            print(f"[法律快车] failed to load area data: {last_error}")
        print("[法律快车] no city data (tried area_new/area/area2)")
        return []
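
    # HTTP layer: GET with up to max_retries attempts. A 403 gets exponential
    # backoff with jitter (2**attempt + U(0.3, 1.0) seconds) and a rebuilt
    # session before the retry; other >=400 statuses raise and abort.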
    def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
        return self._get_with_session(self.client, url, max_retries=max_retries, is_thread=False)

    def _get_with_session(self, session: RequestsClient, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
        for attempt in range(max_retries):
            try:
                resp = session.get_text(url, timeout=15, verify=False)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"Request failed {url}: 403, retrying in {wait_time:.1f}s ({attempt + 1}/{max_retries})")
                        # A 403 usually means the session is burned; rebuild
                        # the appropriate session before retrying.
                        if is_thread:
                            self._refresh_thread_session()
                            session = self._get_thread_session()
                        else:
                            self._refresh_session()
                            session = self.client
                        time.sleep(wait_time)
                        continue
                    print(f"Request failed {url}: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error: {url}")
                return text
            except RequestClientError as exc:
                print(f"Request failed {url}: {exc}")
                return None
        return None
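
    # List page: collect detail links, fetch and parse them concurrently on
    # the thread pool, then insert records whose phone is not already stored.
    # Returns the number of detail links found so run() can stop paging when
    # a city's list runs out.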
    def _parse_list(self, html: str, province: str, city: str) -> int:
        soup = BeautifulSoup(html, "html.parser")
        links = [a.get("href", "") for a in soup.select("a.hide_link")]
        # Detail hrefs carry "lll" where the real path has "int"; swap it
        # back before building absolute URLs.
        links = [link.replace("lll", "int") for link in links if link]
        if not links:
            return 0
        detail_urls = [urljoin(DETAIL_BASE, link) for link in links]
        results: List[Dict[str, str]] = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
            for fut in as_completed(futs):
                try:
                    data = fut.result()
                except Exception as exc:
                    print(f"  detail parse error: {exc}")
                    continue
                if data and data.get("phone"):
                    results.append(data)
        if not results:
            return len(detail_urls)
        phones = [d["phone"] for d in results if d.get("phone")]
        existing = self._existing_phones(phones)
        for data in results:
            phone = data.get("phone")
            if not phone:
                continue
            if phone in existing:
                print(f"  -- exists: {data['name']} ({phone})")
                continue
            try:
                self.db.insert_data("lawyer", data)
                print(f"  -> new: {data['name']} ({phone})")
            except Exception as exc:
                print(f"  insert failed {data.get('url')}: {exc}")
        return len(detail_urls)
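
    # Detail page: name comes from <title> (falling back to div.intl), the
    # phone from the labelled contact row (falling back to a whole-page scan
    # with 1[3-9]\d{9}, the mainland-China mobile format), and the law firm
    # from its labelled row. Records without a phone are dropped.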
    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
        sess = self._get_thread_session()
        html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
        if not html:
            return None
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(" ")
        name = ""
        title_tag = soup.find("title")
        if title_tag:
            match = re.search(r"(\S+)律师", title_tag.get_text())
            if match:
                name = match.group(1)
        if not name:
            intl_div = soup.find("div", class_="intl")
            if intl_div:
                match = re.search(r"(\S+)律师", intl_div.get_text())
                if match:
                    name = match.group(1)
        phone = ""
        phone_pattern = r"1[3-9]\d{9}"
        for item in soup.select("div.item.flex"):
            label = item.find("div", class_="label")
            desc = item.find("div", class_="desc")
            if not label or not desc:
                continue
            label_text = label.get_text()
            desc_text = desc.get_text().replace("-", "")
            if "联系电话" in label_text or "电话" in label_text:
                matches = re.findall(phone_pattern, desc_text)
                if matches:
                    phone = matches[0]
                    break
        if not phone:
            matches = re.findall(phone_pattern, text.replace("-", ""))
            if matches:
                phone = matches[0]
        if not phone:
            print(f"  no mobile number: {url}")
            return None
        law_firm = ""
        for item in soup.select("div.item.flex"):
            label = item.find("div", class_="label")
            desc = item.find("div", class_="desc")
            if not label or not desc:
                continue
            if "执业律所" in label.get_text() or "律所" in label.get_text():
                law_firm = desc.get_text(strip=True).replace("已认证", "")
                break
        params = {
            "list_url": url,
            "province": province,
            "city": city,
        }
        return {
            "name": name or "",
            "law_firm": law_firm,
            "province": province,
            "city": city,
            "phone": phone,
            "url": url,
            "domain": DOMAIN,
            "create_time": int(time.time()),
            "params": json.dumps(params, ensure_ascii=False),
        }
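
    # Driver: for each city, walk ?page=1,2,... until a page fails to fetch
    # or yields no detail links.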
    def run(self):
        print("Starting 法律快车 (Lawtime) crawl...")
        areas = self._load_areas()
        if not areas:
            print("No area data")
            return
        for area in areas:
            pinyin = area.get("pinyin")
            province = area.get("province", "")
            city = area.get("city", "")
            if not pinyin:
                continue
            page = 1
            while True:
                list_url = LIST_BASE.format(pinyin=pinyin, page=page)
                print(f"Crawling {province}-{city} page {page}: {list_url}")
                html = self._get(list_url)
                if not html:
                    break
                link_count = self._parse_list(html, province, city)
                if link_count == 0:
                    break
                page += 1
        print("法律快车 crawl finished")
if __name__ == "__main__":
    with Db() as db:
        spider = LawtimeSpider(db)
        spider.run()