282 lines
10 KiB
Python
282 lines
10 KiB
Python
"""Spider for maxlaw.cn ("大律师") mobile lawyer listings."""

import json
import os
import random
import sys
import time
from typing import Dict, Optional

# Make the sibling "request" package and the project root importable
# BEFORE the project-local imports below — this ordering is load-bearing.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
import urllib3
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from request.proxy_config import get_proxies, report_proxy_status

# Requests are made with verify=False, so silence the resulting SSL warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from Db import Db
from utils.rate_limiter import request_slot

# Source label stored in the "domain" column of every crawled row.
DOMAIN = "大律师"
# Mobile list page: one city (pinyin slug) per URL, paginated via ?page=N.
LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
# Process-wide flag so the optional proxy self-test runs at most once.
_PROXY_TESTED = False
|
class DlsSpider:
    """Crawls lawyer contact records from m.maxlaw.cn into the database."""

    def __init__(self, db_connection):
        """Store the DB handle, build an HTTP session, and preload area rows.

        db_connection: an open Db instance used for lookups and inserts.
        """
        self.db = db_connection
        self.session = self._build_session()
        self.areas = self._load_areas()
|
def _build_session(self) -> requests.Session:
|
|
"""构建带重试机制的 session"""
|
|
report_proxy_status()
|
|
s = requests.Session()
|
|
s.trust_env = False
|
|
proxies = get_proxies()
|
|
if proxies:
|
|
s.proxies.update(proxies)
|
|
else:
|
|
s.proxies.clear()
|
|
self._proxy_test(s, proxies)
|
|
# 配置重试策略
|
|
retries = Retry(
|
|
total=3, # 总共重试3次
|
|
backoff_factor=1, # 重试间隔:1s, 2s, 4s
|
|
status_forcelist=(429, 500, 502, 503, 504), # 对这些状态码进行重试
|
|
allowed_methods=frozenset(["GET", "POST"]),
|
|
raise_on_status=False # 不立即抛出异常,让代码处理
|
|
)
|
|
adapter = HTTPAdapter(max_retries=retries)
|
|
s.mount("https://", adapter)
|
|
s.mount("http://", adapter)
|
|
s.headers.update({
|
|
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
|
|
"Host": "m.maxlaw.cn",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
|
"Connection": "close",
|
|
})
|
|
return s
|
|
|
|
def _refresh_session(self) -> None:
|
|
try:
|
|
self.session.close()
|
|
except Exception:
|
|
pass
|
|
self.session = self._build_session()
|
|
|
|
def _proxy_test(self, session: requests.Session, proxies: Optional[Dict[str, str]]) -> None:
|
|
global _PROXY_TESTED
|
|
if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
|
|
return
|
|
_PROXY_TESTED = True
|
|
if not proxies:
|
|
print("[proxy] test skipped: no proxy configured")
|
|
return
|
|
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
|
|
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
|
|
try:
|
|
resp = session.get(
|
|
test_url,
|
|
timeout=timeout,
|
|
headers={"Connection": "close"},
|
|
)
|
|
print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
|
|
except Exception as exc:
|
|
print(f"[proxy] test failed: {exc}")
|
|
|
|
def _load_areas(self):
|
|
try:
|
|
return self.db.select_data(
|
|
"area_new",
|
|
"province, city, pinyin",
|
|
"domain='maxlaw'"
|
|
) or []
|
|
except Exception as exc:
|
|
print(f"加载地区失败: {exc}")
|
|
return []
|
|
|
|
def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
|
|
"""发送 GET 请求,带重试机制"""
|
|
for attempt in range(max_retries):
|
|
try:
|
|
# 使用更长的超时时间,分别设置连接和读取超时
|
|
with request_slot():
|
|
resp = self.session.get(
|
|
url,
|
|
timeout=(10, 30), # (connect_timeout, read_timeout)
|
|
verify=False,
|
|
headers=headers,
|
|
)
|
|
status_code = resp.status_code
|
|
content = resp.text
|
|
resp.close()
|
|
if status_code == 403:
|
|
if attempt < max_retries - 1:
|
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
|
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
self._refresh_session()
|
|
time.sleep(wait_time)
|
|
continue
|
|
print(f"请求失败 {url}: 403 Forbidden")
|
|
return None
|
|
if status_code >= 400:
|
|
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
|
|
return content
|
|
except requests.exceptions.ConnectTimeout as exc:
|
|
if attempt < max_retries - 1:
|
|
wait_time = 2 ** attempt # 指数退避:2s, 4s, 8s
|
|
print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
time.sleep(wait_time)
|
|
else:
|
|
print(f"连接超时,已达到最大重试次数 {url}: {exc}")
|
|
return None
|
|
except requests.exceptions.Timeout as exc:
|
|
if attempt < max_retries - 1:
|
|
wait_time = 2 ** attempt
|
|
print(f"请求超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
time.sleep(wait_time)
|
|
else:
|
|
print(f"请求超时,已达到最大重试次数 {url}: {exc}")
|
|
return None
|
|
except requests.exceptions.ConnectionError as exc:
|
|
if attempt < max_retries - 1:
|
|
wait_time = 2 ** attempt
|
|
print(f"连接错误,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
time.sleep(wait_time)
|
|
else:
|
|
print(f"连接错误,已达到最大重试次数 {url}: {exc}")
|
|
return None
|
|
except requests.exceptions.RequestException as exc:
|
|
print(f"请求失败 {url}: {exc}")
|
|
return None
|
|
|
|
return None
|
|
|
|
def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
cards = soup.find_all("div", class_="lstx")
|
|
if not cards:
|
|
return 0
|
|
|
|
inserted = 0
|
|
for card in cards:
|
|
link = card.find("a")
|
|
if not link or not link.get("href"):
|
|
continue
|
|
detail = self._parse_detail(link['href'], province, city, list_url)
|
|
if not detail:
|
|
continue
|
|
phone = detail.get("phone")
|
|
if not phone:
|
|
continue
|
|
condition = f"phone='{phone}' and domain='{DOMAIN}'"
|
|
if self.db.is_data_exist("lawyer", condition):
|
|
print(f" -- 已存在: {detail['name']} ({phone})")
|
|
time.sleep(0.3)
|
|
continue
|
|
try:
|
|
self.db.insert_data("lawyer", detail)
|
|
inserted += 1
|
|
print(f" -> 新增: {detail['name']} ({phone})")
|
|
except Exception as exc:
|
|
print(f" 插入失败: {exc}")
|
|
time.sleep(1)
|
|
time.sleep(0.3)
|
|
# 列表页结束后再缓一缓,降低风控
|
|
time.sleep(0.6)
|
|
return inserted
|
|
|
|
def _detail_headers(self, referer: str) -> Dict[str, str]:
|
|
return {
|
|
"Referer": referer,
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
|
"Cache-Control": "no-cache",
|
|
"Pragma": "no-cache",
|
|
"Upgrade-Insecure-Requests": "1",
|
|
}
|
|
|
|
def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
|
|
url = f"https://m.maxlaw.cn{path}"
|
|
print(f" 详情: {url}")
|
|
html = self._get(url, headers=self._detail_headers(list_url))
|
|
if not html:
|
|
return None
|
|
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
name_tag = soup.find("h2", class_="lawyerName")
|
|
law_firm_tag = soup.find("p", class_="law-firm")
|
|
contact_list = soup.find("ul", class_="contact-content")
|
|
|
|
name = name_tag.get_text(strip=True) if name_tag else ""
|
|
law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
|
|
phone = ""
|
|
|
|
if contact_list:
|
|
items = contact_list.find_all("li")
|
|
if len(items) > 2:
|
|
phone_tag = items[2].find("p")
|
|
if phone_tag:
|
|
phone = phone_tag.get_text(strip=True)
|
|
phone = phone.split("咨询请说明来自大律师网")[0].strip()
|
|
|
|
phone = phone.replace('-', '').strip()
|
|
if not name or not phone:
|
|
print(" 信息不完整,跳过")
|
|
return None
|
|
|
|
safe_city = city if city else province
|
|
return {
|
|
"name": name,
|
|
"law_firm": law_firm,
|
|
"province": province,
|
|
"city": safe_city,
|
|
"phone": phone,
|
|
"url": url,
|
|
"domain": DOMAIN,
|
|
"create_time": int(time.time()),
|
|
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
|
|
}
|
|
|
|
def run(self):
|
|
print("启动大律师采集...")
|
|
if not self.areas:
|
|
print("无地区数据")
|
|
return
|
|
|
|
for area in self.areas:
|
|
pinyin = area.get("pinyin")
|
|
province = area.get("province", "")
|
|
city = area.get("city", "")
|
|
if not pinyin:
|
|
continue
|
|
page = 1
|
|
while True:
|
|
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
|
|
print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
|
|
html = self._get(list_url)
|
|
if not html:
|
|
break
|
|
inserted = self._parse_list(html, province, city, list_url)
|
|
if inserted == 0:
|
|
break
|
|
page += 1
|
|
print("大律师采集完成")
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry point: run the spider with a context-managed DB connection.
    with Db() as db:
        DlsSpider(db).run()
|