import json
import os
import random
import sys
import time
from typing import Dict, Optional

# Make the sibling "request" package and the project root importable when the
# script is run directly (not via the package's own entry point).
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import urllib3
from bs4 import BeautifulSoup

from request.proxy_config import get_proxies, report_proxy_status

# Disable SSL warnings: every request below is issued with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from Db import Db
from utils.rate_limiter import wait_for_request

DOMAIN = "大律师"
LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"

# Process-wide flag so the optional proxy self-test runs at most once,
# regardless of how many sessions are (re)built.
_PROXY_TESTED = False


class DlsSpider:
    """Spider for the m.maxlaw.cn mobile lawyer directory.

    For every area loaded from the ``area_new`` table it pages through the
    listing URL, fetches each lawyer's detail page, and inserts new records
    into the ``lawyer`` table, deduplicated by (phone, domain).
    """

    def __init__(self, db_connection):
        # db_connection is the project's Db wrapper; assumed to provide
        # select_data / is_data_exist / insert_data — TODO confirm interface.
        self.db = db_connection
        self.session = self._build_session()
        self.areas = self._load_areas()

    def _build_session(self) -> requests.Session:
        """Build a requests session with proxy, retry policy, and mobile headers."""
        report_proxy_status()
        s = requests.Session()
        # Ignore environment proxy variables; only use our configured proxy.
        s.trust_env = False
        proxies = get_proxies()
        if proxies:
            s.proxies.update(proxies)
        else:
            s.proxies.clear()
        self._proxy_test(s, proxies)

        # Transport-level retry policy (in addition to the manual retry loop
        # in _get, which handles 403 blocks and timeouts).
        retries = Retry(
            total=3,                                      # retry up to 3 times
            backoff_factor=1,                             # waits: 1s, 2s, 4s
            status_forcelist=(429, 500, 502, 503, 504),   # retryable statuses
            allowed_methods=frozenset(["GET", "POST"]),
            raise_on_status=False,                        # let caller inspect status
        )
        adapter = HTTPAdapter(max_retries=retries)
        s.mount("https://", adapter)
        s.mount("http://", adapter)
        s.headers.update({
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
            "Host": "m.maxlaw.cn",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
        })
        return s

    def _refresh_session(self) -> None:
        """Discard the current session and build a fresh one (used after 403s)."""
        try:
            self.session.close()
        except Exception:
            # Best effort: a close failure must not prevent the rebuild.
            pass
        self.session = self._build_session()

    def _proxy_test(self, session: requests.Session, proxies: Optional[Dict[str, str]]) -> None:
        """One-shot diagnostic request through the proxy, gated by $PROXY_TEST.

        Runs at most once per process (module-level _PROXY_TESTED flag) and
        only when the PROXY_TEST environment variable is set.
        """
        global _PROXY_TESTED
        if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
            return
        _PROXY_TESTED = True
        if not proxies:
            print("[proxy] test skipped: no proxy configured")
            return
        test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
        timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
        try:
            resp = session.get(
                test_url,
                timeout=timeout,
                headers={"Connection": "close"},
            )
            print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
        except Exception as exc:
            print(f"[proxy] test failed: {exc}")

    def _load_areas(self):
        """Load (province, city, pinyin) rows for this domain; [] on failure."""
        try:
            return self.db.select_data(
                "area_new",
                "province, city, pinyin",
                "domain='maxlaw'"
            ) or []
        except Exception as exc:
            print(f"加载地区失败: {exc}")
            return []

    def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
        """GET *url* with rate limiting and a manual retry loop.

        Returns the response body as text, or None when all retries fail.
        403 responses trigger a session rebuild plus exponential backoff with
        jitter; timeouts and connection errors back off exponentially.
        """
        wait_for_request()
        for attempt in range(max_retries):
            try:
                # Separate connect/read timeouts: reads are allowed longer.
                resp = self.session.get(
                    url,
                    timeout=(10, 30),  # (connect_timeout, read_timeout)
                    verify=False,
                    headers=headers,
                )
                status_code = resp.status_code
                content = resp.text
                resp.close()

                if status_code == 403:
                    if attempt < max_retries - 1:
                        # Jittered exponential backoff; a fresh session gets a
                        # new proxy/headers, which often clears the block.
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(f"请求失败 {url}: 403 Forbidden")
                    return None

                if status_code >= 400:
                    # Caught by the RequestException handler below -> None.
                    raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
                return content
            except requests.exceptions.ConnectTimeout as exc:
                # Must precede Timeout/ConnectionError: ConnectTimeout subclasses both.
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s
                    print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
                    time.sleep(wait_time)
                else:
                    print(f"连接超时,已达到最大重试次数 {url}: {exc}")
                    return None
            except requests.exceptions.Timeout as exc:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    print(f"请求超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
                    time.sleep(wait_time)
                else:
                    print(f"请求超时,已达到最大重试次数 {url}: {exc}")
                    return None
            except requests.exceptions.ConnectionError as exc:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    print(f"连接错误,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
                    time.sleep(wait_time)
                else:
                    print(f"连接错误,已达到最大重试次数 {url}: {exc}")
                    return None
            except requests.exceptions.RequestException as exc:
                # Non-retryable request failure (includes the HTTPError above).
                print(f"请求失败 {url}: {exc}")
                return None
        return None

    def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
        """Parse one listing page and insert new lawyers.

        Returns the number of rows actually inserted (existing records and
        parse failures are skipped).
        """
        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("div", class_="lstx")
        if not cards:
            return 0
        inserted = 0
        for card in cards:
            link = card.find("a")
            if not link or not link.get("href"):
                continue
            detail = self._parse_detail(link['href'], province, city, list_url)
            if not detail:
                continue
            phone = detail.get("phone")
            if not phone:
                continue
            # phone is scraped (untrusted) text interpolated into a raw SQL
            # condition string; escape single quotes to prevent injection /
            # broken queries. Normal numeric phones are unaffected.
            safe_phone = phone.replace("'", "''")
            condition = f"phone='{safe_phone}' and domain='{DOMAIN}'"
            if self.db.is_data_exist("lawyer", condition):
                print(f" -- 已存在: {detail['name']} ({phone})")
                time.sleep(0.3)
                continue
            try:
                self.db.insert_data("lawyer", detail)
                inserted += 1
                print(f" -> 新增: {detail['name']} ({phone})")
            except Exception as exc:
                print(f" 插入失败: {exc}")
                time.sleep(1)
            time.sleep(0.3)
        # Pause after finishing a listing page to reduce anti-bot pressure.
        time.sleep(0.6)
        return inserted

    def _detail_headers(self, referer: str) -> Dict[str, str]:
        """Headers for detail-page requests; Referer points at the listing page."""
        return {
            "Referer": referer,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
        }

    def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
        """Fetch and parse a lawyer detail page.

        Returns a dict ready for insertion into the ``lawyer`` table, or None
        when the fetch fails or the page lacks a name/phone.
        """
        url = f"https://m.maxlaw.cn{path}"
        print(f" 详情: {url}")
        html = self._get(url, headers=self._detail_headers(list_url))
        if not html:
            return None
        soup = BeautifulSoup(html, "html.parser")
        name_tag = soup.find("h2", class_="lawyerName")
        law_firm_tag = soup.find("p", class_="law-firm")
        contact_list = soup.find("ul", class_="contact-content")
        name = name_tag.get_text(strip=True) if name_tag else ""
        law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
        phone = ""
        if contact_list:
            items = contact_list.find_all("li")
            # The phone number sits in the third <li>; presumably the first
            # two hold other contact channels — verify against the live page.
            if len(items) > 2:
                phone_tag = items[2].find("p")
                if phone_tag:
                    phone = phone_tag.get_text(strip=True)
                    # Drop the site's boilerplate suffix and dashes.
                    phone = phone.split("咨询请说明来自大律师网")[0].strip()
                    phone = phone.replace('-', '').strip()
        if not name or not phone:
            print(" 信息不完整,跳过")
            return None
        # Province-level areas may have no city; fall back to the province.
        safe_city = city if city else province
        return {
            "name": name,
            "law_firm": law_firm,
            "province": province,
            "city": safe_city,
            "phone": phone,
            "url": url,
            "domain": DOMAIN,
            "create_time": int(time.time()),
            "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
        }

    def run(self):
        """Crawl every loaded area, paging until a page yields no new inserts."""
        print("启动大律师采集...")
        if not self.areas:
            print("无地区数据")
            return
        for area in self.areas:
            pinyin = area.get("pinyin")
            province = area.get("province", "")
            city = area.get("city", "")
            if not pinyin:
                continue
            page = 1
            while True:
                list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
                html = self._get(list_url)
                if not html:
                    break
                inserted = self._parse_list(html, province, city, list_url)
                # NOTE(review): stops when a page adds nothing new — this also
                # halts on a page of all-duplicates, which appears intentional
                # (incremental re-runs stop at previously seen data).
                if inserted == 0:
                    break
                page += 1
        print("大律师采集完成")


if __name__ == "__main__":
    with Db() as db:
        spider = DlsSpider(db)
        spider.run()