chore: initialize lawyers crawler project

hello-dd-code
2026-03-02 00:19:48 +08:00
commit 03847a4b8e
17 changed files with 1928 additions and 0 deletions
@@ -0,0 +1,325 @@
import json
import os
import re
import sys
import time
import random
from typing import Dict, Optional, Tuple
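# Make the sibling "request" package and the project root importable
# when this file is run directly as a script.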
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)
from bs4 import BeautifulSoup
from request.requests_client import RequestClientError, RequestsClient
from Db import Db
from config import HEADERS
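# LIST_URL: mobile lawyer-list endpoint of 66law.cn (paginated JSON).
# DOMAIN: source tag ("华律", i.e. Hualv) stored with every DB row.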
LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
DOMAIN = "华律"
class HualvSpider:
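    """Crawler for lawyer contact pages on m.66law.cn (华律网)."""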
    def __init__(self, db_connection):
        self.db = db_connection
        self.client = self._build_session()
        self.areas = self._load_areas()
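    # A mobile Safari UA makes requests look like a phone browser;
    # "Connection: close" avoids reusing stale keep-alive sockets.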
    def _build_session(self) -> RequestsClient:
        custom_headers = HEADERS.copy()
        custom_headers['User-Agent'] = (
            'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
            'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
            'Mobile/15E148 Safari/604.1'
        )
        custom_headers["Connection"] = "close"
        return RequestsClient(headers=custom_headers)
    def _refresh_session(self) -> None:
        self.client.refresh()
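    # Load province/city codes scraped for 66law, trying each candidate
    # area table in turn until one yields city rows.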
    def _load_areas(self):
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                provinces = self.db.select_data(
                    table,
                    "code, province, pinyin, id",
                    "domain='66law' AND level=1"
                ) or []
                cities = self.db.select_data(
                    table,
                    "code, city, province, pid",
                    "domain='66law' AND level=2"
                ) or []
            except Exception as exc:
                last_error = exc
                continue
            if not cities:
                continue
            province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces}
            city_map = {}
            for city in cities:
                province_info = province_map.get(city.get('pid'), {}) or {}
                province_code = province_info.get('code')
                city_map[city.get('code')] = {
                    "name": city.get('city'),
                    "province": city.get('province'),
                    "province_code": province_code,
                }
            print(f"[华律] city source table: {table}, cities: {len(cities)}")
            return city_map
        if last_error:
            print(f"[华律] failed to load area data: {last_error}")
        print("[华律] no city data (tried area_new/area2/area)")
        return {}
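    # POST to the list API with exponential backoff on 403, refreshing the
    # session between attempts; returns the decoded JSON or None.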
    def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
        for attempt in range(max_retries):
            try:
                resp = self.client.post_text(LIST_URL, data=data, timeout=20, verify=False)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"403 blocked, retrying in {wait_time:.1f}s ({attempt + 1}/{max_retries})")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print("Request failed: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error")
                try:
                    return json.loads(text)
                except ValueError as exc:
                    print(f"Failed to parse JSON: {exc}")
                    return None
            except RequestClientError as exc:
                print(f"Request failed: {exc}")
                return None
        return None
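    # Fetch a lawyer's contact page, extract name/phone/firm, and dedupe
    # against the DB first by URL and then by phone number.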
    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
        contact_url = f"{url}lawyer_contact.aspx"
        print(f" detail: {contact_url}")
        existing = self.db.select_data(
            "lawyer",
            "id, avatar_url",
            f"domain='{DOMAIN}' AND url='{contact_url}'"
        )
        existing_id = None
        if existing:
            existing_id = existing[0].get("id")
            avatar = (existing[0].get("avatar_url") or "").strip()
            if avatar:
                print(" -- already stored with avatar, skipping")
                return None
        html = self._get_detail(contact_url)
        if not html:
            return None
        soup = BeautifulSoup(html, "html.parser")
        info_list = soup.find("ul", class_="information-list")
        if not info_list:
            return None
        phone = ""
        law_firm = ""
        for li in info_list.find_all("li"):
            text = li.get_text(strip=True)
            if "手机号" in text:
                cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
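                # Mainland CN mobile numbers are 11 digits starting with 1.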
                match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
                if match:
                    phone = match.group(0)
            if "执业单位" in text:
                law_firm = text.replace("执业单位", "").strip()
        name = ""
        breadcrumb = soup.find("div", class_="weizhi")
        if breadcrumb:
            links = breadcrumb.find_all("a")
            if len(links) > 2:
                name = links[2].get_text(strip=True)
        phone = phone.replace('-', '').strip()
        if not phone or not re.fullmatch(r"1\d{10}", phone):
            print(" no mobile number, skipping")
            return None
        avatar_url, site_time = self._extract_avatar_and_time(soup)
        data = {
            "phone": phone,
            "province": province,
            "city": city,
            "law_firm": law_firm,
            "url": contact_url,
            "avatar_url": avatar_url,
            "create_time": int(time.time()),
            "site_time": site_time,
            "domain": DOMAIN,
            "name": name,
            "params": json.dumps({"source": url}, ensure_ascii=False)
        }
        if existing_id:
            update_data = {
                "avatar_url": avatar_url,
                "site_time": site_time,
            }
            if name:
                update_data["name"] = name
            if law_firm:
                update_data["law_firm"] = law_firm
            if province:
                update_data["province"] = province
            if city:
                update_data["city"] = city
            if phone:
                update_data["phone"] = phone
            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
            try:
                self.db.update_data("lawyer", update_data, f"id={existing_id}")
                print(" -- already stored, filled in avatar/time")
            except Exception as exc:
                print(f" update failed: {exc}")
            return None
        # If the phone number already exists, update avatar/time instead of
        # inserting a new record.
        existing_phone = self.db.select_data(
            "lawyer",
            "id, avatar_url, url",
            f"domain='{DOMAIN}' AND phone='{phone}'"
        )
        if existing_phone:
            existing_row = existing_phone[0]
            avatar = (existing_row.get("avatar_url") or "").strip()
            if avatar:
                print(" -- phone already stored with avatar, skipping")
                return None
            update_data = {
                "avatar_url": avatar_url,
                "site_time": site_time,
            }
            if name:
                update_data["name"] = name
            if law_firm:
                update_data["law_firm"] = law_firm
            if province:
                update_data["province"] = province
            if city:
                update_data["city"] = city
            if phone:
                update_data["phone"] = phone
            if not existing_row.get("url"):
                update_data["url"] = contact_url
            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
            try:
                self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
                print(" -- phone already stored, filled in avatar/time")
            except Exception as exc:
                print(f" update failed: {exc}")
            return None
        return data
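    # The avatar URL path embeds an upload date; recover it as a rough
    # YYYYMM integer for site_time.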
    def _extract_avatar_and_time(self, soup: BeautifulSoup) -> Tuple[str, Optional[int]]:
        avatar_url = ""
        site_time = None
        img_tag = soup.select_one(
            "div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
        )
        if img_tag:
            src = (img_tag.get("src") or "").strip()
            if src:
                if src.startswith("//"):
                    avatar_url = f"https:{src}"
                else:
                    avatar_url = src
                match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
                if match:
                    site_time = int(f"{match.group(1)}{match.group(2)}")
                else:
                    match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
                    if match:
                        site_time = int(f"{match.group(1)}{match.group(2)}")
        return avatar_url, site_time
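    # GET a detail page, with the same 403 backoff strategy as _post.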
    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
        for attempt in range(max_retries):
            try:
                resp = self.client.get_text(url, timeout=15, verify=False)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f" 403 blocked, retrying in {wait_time:.1f}s ({attempt + 1}/{max_retries})")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(" request failed: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error")
                return text
            except RequestClientError as exc:
                print(f" request failed: {exc}")
                return None
        return None
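    # Walk every known city, page through the list API, and insert lawyers
    # that survive the dedupe checks in _parse_detail.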
    def run(self):
        print("Starting 华律网 crawl...")
        if not self.areas:
            print("No city data")
            return
        for city_code, city_info in self.areas.items():
            province_code = city_info.get("province_code")
            if not province_code:
                continue
            province_name = city_info.get("province", "")
            city_name = city_info.get("name", "")
            print(f"Crawling {province_name}-{city_name}")
            page = 1
            while True:
                payload = {"pid": province_code, "cid": city_code, "page": str(page)}
                data = self._post(payload)
                if not data or not data.get("lawyerList"):
                    break
                for item in data["lawyerList"]:
                    result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
                    if not result:
                        continue
                    try:
                        self.db.insert_data("lawyer", result)
                        print(f" -> inserted: {result['name']} ({result['phone']})")
                    except Exception as exc:
                        print(f" insert failed: {exc}")
                    time.sleep(1)
                page_count = (data.get("lawyerItems") or {}).get("pageCount", page)
                if page >= page_count:
                    break
                page += 1
                time.sleep(2)
            time.sleep(1)
        print("华律网 crawl finished")
if __name__ == "__main__":
    with Db() as db:
        spider = HualvSpider(db)
        spider.run()