import json
import os
import random
import sys
import time
from typing import Dict, List, Optional, Set

# Make the sibling "request" package and the project root importable when this
# file is executed directly as a script.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from request.requests_client import RequestClientError, RequestSSLError, RequestsClient
from Db import Db

DOMAIN = "找法网"
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"


class FindlawSpider:
    """Scrape lawyer listings from m.findlaw.cn city by city and insert new
    records into the `lawyer` table, deduplicating by phone number."""

    def __init__(self, db_connection):
        # db_connection is a project `Db` instance; presumably exposes
        # select_data / insert_data and a raw `.db` connection — see usages below.
        self.db = db_connection
        self.client = self._build_session()
        self.cities = self._load_cities()

    def _build_session(self) -> RequestsClient:
        """Build an HTTP client that mimics a mobile-Safari AJAX request."""
        return RequestsClient(headers={
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "Connection": "close",
        })

    def _refresh_session(self) -> None:
        # Reset session state (cookies/connections) after being blocked (403).
        self.client.refresh()

    def _get(self, url: str, referer: str, verify: bool = True,
             max_retries: int = 3) -> Optional[str]:
        """GET `url` and return the body text, or None on failure.

        A 403 triggers a session refresh plus exponential-backoff retry; an SSL
        error triggers a single fallback pass with certificate checks disabled.
        Other HTTP errors (>= 400) are reported once and abort immediately.
        """
        headers = {"Referer": referer}
        for attempt in range(max_retries):
            try:
                resp = self.client.get_text(url, timeout=15, verify=verify, headers=headers)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        # Jittered exponential backoff before retrying with a
                        # fresh session.
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(f"请求失败 {url}: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error: {url}")
                return text
            except RequestSSLError:
                if verify:
                    # One recursive retry of the whole sequence without
                    # certificate verification.
                    return self._get(url, referer, verify=False, max_retries=max_retries)
                print(f"SSL错误 {url}")
                return None
            except RequestClientError as exc:
                print(f"请求失败 {url}: {exc}")
                return None
        return None

    def _existing_phones(self, phones: List[str]) -> Set[str]:
        """Return the subset of `phones` already stored for this domain.

        Queries in chunks of 500 so the IN(...) placeholder list stays within
        reasonable statement-size limits.
        """
        if not phones:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(phones), chunk_size):
                chunk = phones[i:i + chunk_size]
                # Parameterized IN clause — values are never interpolated
                # into the SQL string itself.
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

    def _load_cities(self):
        """Load the findlaw city list (city, province, pinyin) from whichever
        area table exists, trying area_new, then area2, then area.

        Returns a list of row dicts, or [] when nothing matches (after
        printing per-table diagnostics).
        """
        condition = "domain='findlaw' AND level=2"
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
            except Exception as exc:
                # Table may not exist in this deployment; remember the error
                # and try the next candidate.
                last_error = exc
                continue
            if rows:
                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
                print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
                return rows
        if last_error:
            print(f"[找法网] 加载地区数据失败: {last_error}")
        print("[找法网] 无城市数据(已尝试 area_new/area2/area)")
        # Best-effort diagnostics: show how many rows each table would match.
        for table in tables:
            try:
                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
                c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
                print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
            except Exception:
                pass
        return []

    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
        """Fetch one listing page and return normalized lawyer dicts.

        Returns [] on any request or parse failure so `run` stops paging.
        """
        text = self._get(url, referer, verify=True)
        if not text:
            return []
        try:
            # Some response bodies carry a BOM or wrapper script before the
            # JSON payload; tolerate both.
            text = text.strip().lstrip("\ufeff")
            try:
                data = json.loads(text)
            except ValueError:
                json_start = text.find('{')
                json_end = text.rfind('}')
                if json_start == -1 or json_end == -1:
                    print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
                    return []
                cleaned = text[json_start:json_end + 1]
                data = json.loads(cleaned)
            if isinstance(data, str):
                # Double-encoded JSON: decode once more.
                try:
                    data = json.loads(data)
                except ValueError:
                    print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}")
                    return []
        except ValueError as exc:
            print(f"解析JSON失败 {url}: {exc}")
            return []
        # FIX: `data["data"]` (and `item["areaInfo"]`) can be null in the
        # response; `.get(key, {})` does not protect against an explicit null,
        # so use `or {}` / `or []` to avoid AttributeError on None.
        items = (data.get("data") or {}).get("lawyer_list") or []
        parsed = []
        for item in items:
            # FIX: strip whitespace so dedup against the DB is consistent with
            # the truthiness filter applied in run().
            phone = (item.get("mobile") or "").replace("-", "").strip()
            area_info = item.get("areaInfo") or {}
            parsed.append({
                "name": item.get("username", ""),
                "law_firm": item.get("lawyer_lawroom", ""),
                "province": area_info.get("province", ""),
                "city": area_info.get("city", ""),
                "phone": phone,
                "url": url,
                "domain": DOMAIN,
                "create_time": int(time.time()),
                "params": json.dumps(item, ensure_ascii=False)
            })
        return parsed

    def run(self):
        """Crawl every loaded city, paging until an empty page, inserting
        lawyers whose phone is not yet stored."""
        print("启动找法网采集...")
        if not self.cities:
            print("无城市数据")
            return
        for city in self.cities:
            pinyin = city.get("pinyin")
            province = city.get("province", "")
            city_name = city.get("city", "")
            if not pinyin:
                continue
            print(f"采集 {province}-{city_name}")
            page = 1
            while True:
                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
                print(f" 第 {page} 页: {url}")
                items = self._fetch_page(url, referer)
                if not items:
                    # Empty page or request/parse failure: done with this city.
                    break
                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
                existing = self._existing_phones(phones)
                for entry in items:
                    phone = entry.get("phone")
                    if not phone:
                        continue
                    if phone in existing:
                        print(f" -- 已存在: {entry['name']} ({phone})")
                        continue
                    try:
                        self.db.insert_data("lawyer", entry)
                        # FIX: remember the phone so a duplicate later in the
                        # same page is not inserted twice.
                        existing.add(phone)
                        print(f" -> 新增: {entry['name']} ({phone})")
                    except Exception as exc:
                        print(f" 插入失败: {exc}")
                page += 1
        print("找法网采集完成")


if __name__ == "__main__":
    with Db() as db:
        spider = FindlawSpider(db)
        spider.run()