From 38e7c284e81f5848eda4d229d4c135db303d981c Mon Sep 17 00:00:00 2001
From: hello-dd-code
Date: Wed, 18 Mar 2026 10:02:25 +0800
Subject: [PATCH] feat: enhance project configuration and improve data export
 functionality

- Updated `.gitignore` to streamline the ignore rules and exclude common-site logs and local runtime data.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Rewrote `README.md` as a concise guide to the extracted project's layout, quick start, and usage notes.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to use a session-based approach for HTTP requests, improving error handling and proxy management.
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
---
 .gitignore                           |   40 +-
 =3.1.0                               |    0
 README.md                            |  175 +----
 common_sites/dls.py                  |  471 +++++------
 common_sites/export_lawyers_excel.py |   17 +-
 common_sites/findlaw.py              |  603 +++++----------
 common_sites/hualv.py                | 1076 +++++++-------------------
 common_sites/lawtime.py              |  822 ++++++--------------
 common_sites/six4365.py              |  872 +++++++--------------
 common_sites/start.sh                |   83 +-
 config.py                            |  119 ++-
 request/__init__.py                  |   20 +-
 requirements.txt                     |   16 +-
 weixin.py                            |  355 +++++++++
 14 files changed, 1665 insertions(+), 3004 deletions(-)
 create mode 100644 =3.1.0
 create mode 100644 weixin.py

diff --git a/.gitignore b/.gitignore
index cdd6454..cf267f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,36 +1,6 @@
-# Python
-__pycache__/
-*.py[cod]
-*$py.class
-
-# Build / packaging
-build/
-dist/
-*.egg-info/
-.eggs/
-
-# Virtual environments
 .venv/
-venv/
-env/
-
-# Test / type caches
-.pytest_cache/
-.mypy_cache/
-.ruff_cache/
-
-# IDE
-.vscode/
-.idea/
-
-# OS
-.DS_Store
-Thumbs.db
-
-# Local runtime files
-*.log
-logs/
-data/
-
-# accidental local files
-=*
+__pycache__/
+*.pyc
+common_sites/*.log
+logs/*
+data/*
diff --git a/=3.1.0 b/=3.1.0
new file mode 100644
index 0000000..e69de29
diff --git a/README.md b/README.md
index ea020f0..aaeae77 100644
--- a/README.md
+++ b/README.md
@@ -1,170 +1,27 @@
-# lawyers
+# lawyers-common-sites

-`common_sites` 独立采集项目。
+从 `/www/wwwroot/lawyer` 中抽离出的 `common_sites` 独立项目。

 ## 目录

-- `common_sites/`:大律师、找法网、法律快车、律图、华律 5 个采集脚本
-- `one_off_sites/`:一次性/临时站点采集脚本(不纳入常用站点批量启动)
-- `request/proxy_config.py`:代理配置加载逻辑
-- `request/proxy_settings.json`:代理配置文件
-- `Db.py`:数据库连接与基础操作
-- `config.py`:数据库与请求头配置
+- `common_sites/`: 站点采集脚本
+- `request/`: 代理配置
+- `utils/`: 公共工具
+- `Db.py`: 数据库封装
+- `config.py`: 项目配置

-## 运行
+## 快速启动

 ```bash
 cd /www/wwwroot/lawyers
-python3 -m venv .venv
-.venv/bin/pip install -r requirements.txt
-./common_sites/start.sh
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+bash common_sites/start.sh
 ```

-## 地区同步服务(Python)
+## 说明

-新增服务脚本:`services/area_sync_service.py`
-
-用途:
-
-- 替代原 `nas.nepiedg.site:9002` 的核心接口
-- `GET /api/layer/get_area`:从数据库 `area_new` 读取地区列表并返回给 `js/douyin.js`
-- `POST /api/layer/index`:接收脚本回传搜索数据,先保存原始 JSON 到本地,再按参数决定是否入库
-- `GET/POST /api/layer/progress`:多设备共享采集断点(自动建表 `layer_progress`)
-
-`/api/layer/index` 当前入库规则(基于 `payload.data.user_list[].user_info`):
-
-- 主要从 `signature`(简介)里正则提取手机号
-- 若简介未命中,再从微信相关标记(`微信/wx/vx/v`)和 `unique_id/versatile_display` 提取手机号
-- 必须命中关键词(默认:`律师,律所`)才允许入库,可通过 `DOUYIN_LAWYER_KEYWORDS` 调整
-- `url` 固定写为 `https://www.douyin.com/user/{sec_uid}`(`sec_uid` 为空则跳过不入库)
-
-启动:
-
-```bash
-cd /www/wwwroot/lawyers
-./.venv/bin/python ./services/area_sync_service.py
-```
-
-常用环境变量:
-
-```bash
-AREA_SERVICE_HOST=0.0.0.0 -AREA_SERVICE_PORT=9002 -AREA_TARGET_TABLE=area_new -AREA_DOMAIN=maxlaw -DOUYIN_DOMAIN=抖音 -DOUYIN_RAW_DIR=/www/wwwroot/lawyers/data/douyin_raw -DOUYIN_SAVE_ONLY=1 -DOUYIN_LAWYER_KEYWORDS=律师,律所 -LAYER_PROGRESS_TABLE=layer_progress -LAYER_PROGRESS_DEFAULT_KEY=douyin_batch_default -``` - -接口示例: - -```bash -# 健康检查 -curl 'http://127.0.0.1:9002/health' - -# 读取数据库中的地区(默认直接返回数组,兼容 js/douyin.js) -curl 'http://127.0.0.1:9002/api/layer/get_area?server=1' - -# 如果需要带统计信息 -curl 'http://127.0.0.1:9002/api/layer/get_area?table=area_new&domain=maxlaw&meta=1' - -# 接收 douyin.js 回传结果并入库(默认写 lawyer.domain=抖音) -curl -X POST 'http://127.0.0.1:9002/api/layer/index?server=1&save_only=0' \ - -H 'Content-Type: application/json' \ - -d '{"source":"xhr","url":"https://www.douyin.com/aweme/v1/web/discover/search/","ts":1772811111,"cityIndex":0,"data":{"desc":"联系方式 13812345678"}}' - -# 可选:指定写入域名(用于测试) -curl -X POST 'http://127.0.0.1:9002/api/layer/index?save_domain=codex_test_douyin' \ - -H 'Content-Type: application/json' \ - -d '{"source":"xhr","url":"https://www.douyin.com/aweme/v1/web/discover/search/","ts":1772811111,"cityIndex":0,"data":{"desc":"联系方式 13812345678"}}' - -# 仅保存原始回传(不入库) -curl -X POST 'http://127.0.0.1:9002/api/layer/index?save_only=1' \ - -H 'Content-Type: application/json' \ - -d '{"source":"xhr","url":"https://www.douyin.com/aweme/v1/web/discover/search/","ts":1772811111,"cityIndex":0,"data":{"desc":"联系方式 13812345678"}}' - -# 原始数据落盘目录(按天分文件) -# /www/wwwroot/lawyers/data/douyin_raw/douyin_index_YYYYMMDD.jsonl - -# 读取共享断点(多设备) -curl 'http://127.0.0.1:9002/api/layer/progress?server=1&progress_key=douyin_batch_default' - -# 更新共享断点 -curl -X POST 'http://127.0.0.1:9002/api/layer/progress?server=1' \ - -H 'Content-Type: application/json' \ - -d '{"progress_key":"douyin_batch_default","device_id":"device-a","next_city_index":120,"area_signature":"xxxx","area_total":551,"current_city":"北京","reason":"city_done","status":"running"}' - -# 清空共享断点 -curl -X POST 'http://127.0.0.1:9002/api/layer/progress?server=1' \ - -H 'Content-Type: application/json' \ - -d '{"action":"clear","progress_key":"douyin_batch_default"}' -``` - -如果 9002 端口已有旧进程占用,可先执行: - -```bash -lsof -iTCP:9002 -sTCP:LISTEN -t -kill -``` - -## 启动参数 - -`start.sh` 默认并行启动 5 个站点采集(大律师使用 `dls_fresh.py`)。 - -- 日志目录:`/www/wwwroot/lawyers/logs` -- 大律师 JSON 输出:`/www/wwwroot/lawyers/data/dls_records.jsonl` - -常用环境变量: - -```bash -# 顺序执行(默认 parallel) -RUN_MODE=sequential ./common_sites/start.sh - -# 大律师限制采集范围 -DLS_CITY_FILTER=beijing DLS_MAX_CITIES=1 DLS_MAX_PAGES=1 ./common_sites/start.sh - -# 大律师直连(不走代理)/ 仅导出JSON不写库 -DLS_DIRECT=1 DLS_NO_DB=1 ./common_sites/start.sh -``` - -## 导出 Excel - -新增导出脚本:`common_sites/export_lawyers_excel.py` - -```bash -# 无参数:默认导出最近7天数据(含手机号/姓名/律所/省份/市区/站点名称) -# 并默认解析 params 扩展信息(邮箱/地址/执业证号/执业年限/擅长领域等) -./.venv/bin/python ./common_sites/export_lawyers_excel.py - -# 按 create_time 时间戳范围导出 -./.venv/bin/python ./common_sites/export_lawyers_excel.py \ - --start-ts 1772380000 --end-ts 1772429999 \ - --output ./data/lawyers_20260302.xlsx - -# 只导出某站点,并带技术字段(url/域名/时间等) -./.venv/bin/python ./common_sites/export_lawyers_excel.py \ - --domain 大律师 --include-extra - -# 如果不需要解析 params 扩展信息 -./.venv/bin/python ./common_sites/export_lawyers_excel.py --no-parse-params - -# 导出抖音采集数据(domain=抖音),并附带 sec_uid/抖音号/简介/API来源等字段 -./.venv/bin/python ./common_sites/export_lawyers_excel.py \ - --douyin-only --start-ts 0 --output ./data/douyin_lawyers_export.xlsx -``` - -## 一次性站点(众法利) - -脚本:`one_off_sites/zhongfali_single.py` - -```bash 
-# 仅采集写 JSON(默认输出到 data/one_off_sites/) -./.venv/bin/python ./one_off_sites/zhongfali_single.py --direct --no-db - -# 采集并写入 lawyer 表(domain=众法利单页) -./.venv/bin/python ./one_off_sites/zhongfali_single.py --direct -``` +- 当前项目直接复用原项目数据库配置和代理配置。 +- 采集依赖原库中的 `lawyer`、`area_new`、`area`、`area2` 等表。 +- 日志默认输出到 `common_sites/*.log`。 diff --git a/common_sites/dls.py b/common_sites/dls.py index 2ab61a9..06d4a01 100644 --- a/common_sites/dls.py +++ b/common_sites/dls.py @@ -1,14 +1,9 @@ import json import os -import random -import re import sys import time -from typing import Dict, List, Optional, Set, Tuple -from urllib.parse import urljoin - -import urllib3 -from bs4 import BeautifulSoup +import random +from typing import Dict, Optional current_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.dirname(current_dir) @@ -18,144 +13,191 @@ if request_dir not in sys.path: if project_root not in sys.path: sys.path.append(project_root) -from Db import Db -from request.requests_client import ( - RequestClientError, - RequestConnectTimeout, - RequestConnectionError, - RequestTimeout, - RequestsClient, -) -from utils.rate_limiter import wait_for_request +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +import urllib3 +from bs4 import BeautifulSoup +from request.proxy_config import get_proxies, report_proxy_status +# 禁用 SSL 警告 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +from Db import Db +from utils.rate_limiter import wait_for_request + DOMAIN = "大律师" -SITE_BASE = "https://m.maxlaw.cn" -LIST_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}" -PHONE_PATTERN = re.compile(r"1[3-9]\d{9}") -MAX_PAGES_PER_CITY = int(os.getenv("MAXLAW_MAX_PAGES", "0")) -PROXY_TESTED = False +LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}" +_PROXY_TESTED = False class DlsSpider: def __init__(self, db_connection): self.db = db_connection - self.client = self._build_client() + self.session = self._build_session() self.areas = self._load_areas() - def _build_client(self) -> RequestsClient: - client = RequestsClient( - headers={ - "User-Agent": ( - "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 " - "Mobile/15E148 Safari/604.1" - ), - "Host": "m.maxlaw.cn", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", - "Connection": "close", - }, - retry_total=3, - retry_backoff_factor=1, - retry_status_forcelist=(429, 500, 502, 503, 504), - retry_allowed_methods=("GET", "POST"), + def _build_session(self) -> requests.Session: + """构建带重试机制的 session""" + report_proxy_status() + s = requests.Session() + s.trust_env = False + proxies = get_proxies() + if proxies: + s.proxies.update(proxies) + else: + s.proxies.clear() + self._proxy_test(s, proxies) + # 配置重试策略 + retries = Retry( + total=3, # 总共重试3次 + backoff_factor=1, # 重试间隔:1s, 2s, 4s + status_forcelist=(429, 500, 502, 503, 504), # 对这些状态码进行重试 + allowed_methods=frozenset(["GET", "POST"]), + raise_on_status=False # 不立即抛出异常,让代码处理 ) - self._proxy_test(client, client.proxies or None) - return client + adapter = HTTPAdapter(max_retries=retries) + s.mount("https://", adapter) + s.mount("http://", adapter) + s.headers.update({ + "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1", + "Host": "m.maxlaw.cn", + "Accept": 
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", + "Connection": "close", + }) + return s - def _refresh_client(self) -> None: - self.client.refresh() - self._proxy_test(self.client, self.client.proxies or None) + def _refresh_session(self) -> None: + try: + self.session.close() + except Exception: + pass + self.session = self._build_session() - def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None: - global PROXY_TESTED - if PROXY_TESTED or not os.getenv("PROXY_TEST"): + def _proxy_test(self, session: requests.Session, proxies: Optional[Dict[str, str]]) -> None: + global _PROXY_TESTED + if _PROXY_TESTED or not os.getenv("PROXY_TEST"): return - PROXY_TESTED = True + _PROXY_TESTED = True if not proxies: print("[proxy] test skipped: no proxy configured") return test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy") timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10")) try: - resp = client.get_text(test_url, timeout=timeout, headers={"Connection": "close"}) + resp = session.get( + test_url, + timeout=timeout, + headers={"Connection": "close"}, + ) print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}") except Exception as exc: print(f"[proxy] test failed: {exc}") - def _load_areas(self) -> List[Dict[str, str]]: - tables = ("area_new", "area2", "area") - last_error = None - for table in tables: - try: - rows = self.db.select_data(table, "province, city, pinyin", "domain='maxlaw'") or [] - except Exception as exc: - last_error = exc - continue - if rows: - missing_pinyin = sum(1 for row in rows if not (row.get("pinyin") or "").strip()) - print(f"[大律师] 地区来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}") - return rows - if last_error: - print(f"[大律师] 加载地区失败: {last_error}") - print("[大律师] 无地区数据(已尝试 area_new/area2/area)") - return [] + def _load_areas(self): + try: + return self.db.select_data( + "area_new", + "province, city, pinyin", + "domain='maxlaw'" + ) or [] + except Exception as exc: + print(f"加载地区失败: {exc}") + return [] - def _get( - self, - url: str, - *, - headers: Optional[Dict[str, str]] = None, - max_retries: int = 3, - timeout: Tuple[int, int] = (10, 30), - ) -> Optional[str]: + def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]: + """发送 GET 请求,带重试机制""" wait_for_request() + for attempt in range(max_retries): try: - resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers) - if resp.status_code == 403: + # 使用更长的超时时间,分别设置连接和读取超时 + resp = self.session.get( + url, + timeout=(10, 30), # (connect_timeout, read_timeout) + verify=False, + headers=headers, + ) + status_code = resp.status_code + content = resp.text + resp.close() + if status_code == 403: if attempt < max_retries - 1: - wait_time = (2 ** attempt) + random.uniform(0.3, 1.0) - print(f"请求403,{wait_time:.2f}s后重试 ({attempt + 1}/{max_retries}): {url}") - self._refresh_client() + wait_time = 2 ** attempt + random.uniform(0.3, 1.0) + print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}") + self._refresh_session() time.sleep(wait_time) continue print(f"请求失败 {url}: 403 Forbidden") return None - if resp.status_code >= 400: - raise RequestClientError(f"{resp.status_code} Error: {url}") - return resp.text - except RequestConnectTimeout as exc: + if status_code >= 400: + raise requests.exceptions.HTTPError(f"{status_code} Error: {url}") + return content + except requests.exceptions.ConnectTimeout as exc: + 
if attempt < max_retries - 1: + wait_time = 2 ** attempt # 指数退避:2s, 4s, 8s + print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}") + time.sleep(wait_time) + else: + print(f"连接超时,已达到最大重试次数 {url}: {exc}") + return None + except requests.exceptions.Timeout as exc: if attempt < max_retries - 1: wait_time = 2 ** attempt - print(f"连接超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}") + print(f"请求超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}") time.sleep(wait_time) - continue - print(f"连接超时,已达到最大重试次数 {url}: {exc}") - return None - except RequestTimeout as exc: + else: + print(f"请求超时,已达到最大重试次数 {url}: {exc}") + return None + except requests.exceptions.ConnectionError as exc: if attempt < max_retries - 1: wait_time = 2 ** attempt - print(f"请求超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}") + print(f"连接错误,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}") time.sleep(wait_time) - continue - print(f"请求超时,已达到最大重试次数 {url}: {exc}") - return None - except RequestConnectionError as exc: - if attempt < max_retries - 1: - wait_time = 2 ** attempt - print(f"连接错误,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}") - time.sleep(wait_time) - continue - print(f"连接错误,已达到最大重试次数 {url}: {exc}") - return None - except RequestClientError as exc: + else: + print(f"连接错误,已达到最大重试次数 {url}: {exc}") + return None + except requests.exceptions.RequestException as exc: print(f"请求失败 {url}: {exc}") return None + return None + def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int: + soup = BeautifulSoup(html, "html.parser") + cards = soup.find_all("div", class_="lstx") + if not cards: + return 0 + + inserted = 0 + for card in cards: + link = card.find("a") + if not link or not link.get("href"): + continue + detail = self._parse_detail(link['href'], province, city, list_url) + if not detail: + continue + phone = detail.get("phone") + if not phone: + continue + condition = f"phone='{phone}' and domain='{DOMAIN}'" + if self.db.is_data_exist("lawyer", condition): + print(f" -- 已存在: {detail['name']} ({phone})") + time.sleep(0.3) + continue + try: + self.db.insert_data("lawyer", detail) + inserted += 1 + print(f" -> 新增: {detail['name']} ({phone})") + except Exception as exc: + print(f" 插入失败: {exc}") + time.sleep(1) + time.sleep(0.3) + # 列表页结束后再缓一缓,降低风控 + time.sleep(0.6) + return inserted + def _detail_headers(self, referer: str) -> Dict[str, str]: return { "Referer": referer, @@ -166,215 +208,72 @@ class DlsSpider: "Upgrade-Insecure-Requests": "1", } - def _extract_detail_urls(self, html: str) -> List[str]: - soup = BeautifulSoup(html, "html.parser") - urls: List[str] = [] - seen: Set[str] = set() - - # 主选择器:当前站点列表卡片 - for a_tag in soup.select("div.lstx a[href]"): - href = (a_tag.get("href") or "").strip() - if not href: - continue - url = urljoin(SITE_BASE, href) - if url in seen: - continue - seen.add(url) - urls.append(url) - - # 回退选择器:页面结构轻微变化时尽量保活 - if not urls: - for a_tag in soup.select("a[href]"): - href = (a_tag.get("href") or "").strip() - if "/lawyer/" not in href: - continue - url = urljoin(SITE_BASE, href) - if url in seen: - continue - seen.add(url) - urls.append(url) - return urls - - def _extract_name(self, soup: BeautifulSoup) -> str: - for selector in ("h2.lawyerName", "h1.lawyerName", "h1", "h2"): - tag = soup.select_one(selector) - if tag: - name = tag.get_text(strip=True) - if name: - return name - title = soup.title.get_text(strip=True) if soup.title else "" - match = re.search(r"(\S+律师)", title) - return match.group(1) if match else "" - - def 
_extract_law_firm(self, soup: BeautifulSoup) -> str: - for selector in ("p.law-firm", "div.law-firm", "p[class*=firm]"): - tag = soup.select_one(selector) - if tag: - text = tag.get_text(strip=True) - if text: - return text - page_text = soup.get_text(" ", strip=True) - match = re.search(r"(执业机构|律所)\s*[::]?\s*([^\s,。,;;]{2,40})", page_text) - if match: - return match.group(2).strip() - return "" - - def _normalize_phone(self, text: str) -> str: - compact = re.sub(r"\D", "", text or "") - match = PHONE_PATTERN.search(compact) - return match.group(0) if match else "" - - def _extract_phone(self, soup: BeautifulSoup) -> str: - contact = soup.select_one("ul.contact-content") - if contact: - phone = self._normalize_phone(contact.get_text(" ", strip=True)) - if phone: - return phone - for selector in ("a[href^='tel:']", "span.phone", "p.phone"): - tag = soup.select_one(selector) - if tag: - phone = self._normalize_phone(tag.get_text(" ", strip=True)) - if phone: - return phone - return self._normalize_phone(soup.get_text(" ", strip=True)) - - def _parse_detail(self, detail_url: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]: - print(f" 详情: {detail_url}") - html = self._get(detail_url, headers=self._detail_headers(list_url)) + def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]: + url = f"https://m.maxlaw.cn{path}" + print(f" 详情: {url}") + html = self._get(url, headers=self._detail_headers(list_url)) if not html: return None soup = BeautifulSoup(html, "html.parser") - name = self._extract_name(soup) - phone = self._extract_phone(soup) + name_tag = soup.find("h2", class_="lawyerName") + law_firm_tag = soup.find("p", class_="law-firm") + contact_list = soup.find("ul", class_="contact-content") + + name = name_tag.get_text(strip=True) if name_tag else "" + law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else "" + phone = "" + + if contact_list: + items = contact_list.find_all("li") + if len(items) > 2: + phone_tag = items[2].find("p") + if phone_tag: + phone = phone_tag.get_text(strip=True) + phone = phone.split("咨询请说明来自大律师网")[0].strip() + + phone = phone.replace('-', '').strip() if not name or not phone: print(" 信息不完整,跳过") return None - safe_city = city or province + safe_city = city if city else province return { "name": name, - "law_firm": self._extract_law_firm(soup), + "law_firm": law_firm, "province": province, "city": safe_city, "phone": phone, - "url": detail_url, + "url": url, "domain": DOMAIN, "create_time": int(time.time()), - "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False), + "params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False) } - def _existing_phones(self, phones: List[str]) -> Set[str]: - if not phones: - return set() - existing: Set[str] = set() - cur = self.db.db.cursor() - try: - chunk_size = 500 - for idx in range(0, len(phones), chunk_size): - chunk = phones[idx:idx + chunk_size] - placeholders = ",".join(["%s"] * len(chunk)) - sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})" - cur.execute(sql, [DOMAIN, *chunk]) - for row in cur.fetchall(): - existing.add(row[0]) - finally: - cur.close() - return existing - - def _save_lawyers(self, lawyers: List[Dict[str, str]]) -> Tuple[int, int]: - if not lawyers: - return 0, 0 - phones = [row["phone"] for row in lawyers if row.get("phone")] - existing = self._existing_phones(phones) - inserted = 0 - skipped = 0 - - for row in lawyers: - phone = 
row.get("phone", "") - if not phone: - skipped += 1 - continue - if phone in existing: - skipped += 1 - print(f" -- 已存在: {row.get('name', '')} ({phone})") - continue - try: - self.db.insert_data("lawyer", row) - existing.add(phone) - inserted += 1 - print(f" -> 新增: {row.get('name', '')} ({phone})") - except Exception as exc: - skipped += 1 - print(f" 插入失败 {row.get('url', '')}: {exc}") - return inserted, skipped - - def _crawl_city(self, area: Dict[str, str]) -> Tuple[int, int]: - pinyin = (area.get("pinyin") or "").strip() - province = area.get("province", "") - city = area.get("city", "") - if not pinyin: - return 0, 0 - - total_inserted = 0 - total_parsed = 0 - page = 1 - prev_fingerprint = "" - - while True: - if MAX_PAGES_PER_CITY > 0 and page > MAX_PAGES_PER_CITY: - print(f"达到分页上限({MAX_PAGES_PER_CITY}),停止 {province}-{city}") - break - - list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page) - print(f"采集 {province}-{city} 第 {page} 页: {list_url}") - html = self._get(list_url) - if not html: - break - - detail_urls = self._extract_detail_urls(html) - if not detail_urls: - print(" 列表为空,结束当前城市") - break - - fingerprint = "|".join(detail_urls[:8]) - if fingerprint and fingerprint == prev_fingerprint: - print(" 列表页重复,提前停止当前城市") - break - prev_fingerprint = fingerprint - - lawyers: List[Dict[str, str]] = [] - for detail_url in detail_urls: - row = self._parse_detail(detail_url, province, city, list_url) - if row: - lawyers.append(row) - time.sleep(0.25) - - inserted, skipped = self._save_lawyers(lawyers) - total_inserted += inserted - total_parsed += len(lawyers) - print( - f" 第 {page} 页完成: 列表{len(detail_urls)}条, " - f"解析{len(lawyers)}条, 新增{inserted}条, 跳过{skipped}条" - ) - - page += 1 - time.sleep(0.5) - return total_inserted, total_parsed - def run(self): print("启动大律师采集...") if not self.areas: print("无地区数据") return - all_inserted = 0 - all_parsed = 0 for area in self.areas: - inserted, parsed = self._crawl_city(area) - all_inserted += inserted - all_parsed += parsed - print(f"大律师采集完成: 解析{all_parsed}条, 新增{all_inserted}条") + pinyin = area.get("pinyin") + province = area.get("province", "") + city = area.get("city", "") + if not pinyin: + continue + page = 1 + while True: + list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page) + print(f"采集 {province}-{city} 第 {page} 页: {list_url}") + html = self._get(list_url) + if not html: + break + inserted = self._parse_list(html, province, city, list_url) + if inserted == 0: + break + page += 1 + print("大律师采集完成") if __name__ == "__main__": diff --git a/common_sites/export_lawyers_excel.py b/common_sites/export_lawyers_excel.py index 898996a..46e9d11 100644 --- a/common_sites/export_lawyers_excel.py +++ b/common_sites/export_lawyers_excel.py @@ -19,6 +19,9 @@ if project_root not in sys.path: from Db import Db +DEFAULT_EXPORT_START_TS = 1772932103 + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="导出律师数据到 Excel") parser.add_argument( @@ -30,7 +33,10 @@ def parse_args() -> argparse.Namespace: "--start-ts", type=int, default=None, - help="create_time 起始时间戳(含),不传时默认取最近7天", + help=( + "create_time 起始时间戳(含)," + f"不传时默认取 {DEFAULT_EXPORT_START_TS} 之后的数据" + ), ) parser.add_argument( "--end-ts", @@ -83,9 +89,9 @@ def parse_args() -> argparse.Namespace: def apply_default_time_filter(args: argparse.Namespace) -> None: - # 未显式传时间范围时,默认导出最近7天的数据 + # 未显式传时间范围时,默认导出指定时间戳之后的数据 if args.start_ts is None and args.end_ts is None: - args.start_ts = int(time.time()) - 7 * 24 * 3600 + args.start_ts = DEFAULT_EXPORT_START_TS args.end_ts 
= 0 return if args.start_ts is None: @@ -211,11 +217,10 @@ def export_to_excel( ws = wb.active ws.title = "lawyers" - headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"] + headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain", "URL"] if include_extra: headers.extend( [ - "URL", "站点", "create_time", "create_time_text", @@ -270,12 +275,12 @@ def export_to_excel( row.get("city", "") or "", site_name, row.get("domain", "") or "", + row.get("url", "") or "", ] if include_extra: line.extend( [ - row.get("url", "") or "", row.get("domain", "") or "", row.get("create_time", "") or "", ts_to_text(row.get("create_time")), diff --git a/common_sites/findlaw.py b/common_sites/findlaw.py index b972893..2496037 100644 --- a/common_sites/findlaw.py +++ b/common_sites/findlaw.py @@ -1,16 +1,9 @@ -import argparse -import ast -import hashlib import json import os -import random -import re import sys import time -from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Set, Tuple - -import urllib3 +import random +from typing import Dict, List, Set, Optional current_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.dirname(current_dir) @@ -20,460 +13,212 @@ if request_dir not in sys.path: if project_root not in sys.path: sys.path.append(project_root) +import requests +from request.proxy_config import get_proxies, report_proxy_status from Db import Db -from request.requests_client import RequestClientError, RequestsClient -from utils.rate_limiter import wait_for_request -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -SITE_NAME = "findlaw" -LEGACY_DOMAIN = "找法网" -SITE_BASE = "https://m.findlaw.cn" -CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js" -LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1" - -PHONE_RE = re.compile(r"1[3-9]\d{9}") +DOMAIN = "找法网" +LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1" -@dataclass -class CityTarget: - province_id: str - province_name: str - province_py: str - city_id: str - city_name: str - city_py: str - - -def normalize_phone(text: str) -> str: - compact = re.sub(r"\D", "", text or "") - match = PHONE_RE.search(compact) - return match.group(0) if match else "" - - -class FindlawCrawler: - def __init__( - self, - max_pages: int = 9999, - sleep_seconds: float = 0.1, - use_proxy: bool = True, - db_connection=None, - ): - self.max_pages = max_pages - self.sleep_seconds = max(0.0, sleep_seconds) +class FindlawSpider: + def __init__(self, db_connection): self.db = db_connection - self.client = RequestsClient( - headers={ - "User-Agent": ( - "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 " - "Mobile/15E148 Safari/604.1" - ), - "Accept": "application/json, text/javascript, */*; q=0.01", - "X-Requested-With": "XMLHttpRequest", - "Connection": "close", - }, - use_proxy=use_proxy, - retry_total=2, - retry_backoff_factor=1, - retry_status_forcelist=(429, 500, 502, 503, 504), - retry_allowed_methods=("GET",), - ) + self.session = self._build_session() + self.cities = self._load_cities() - def _get_text( - self, - url: str, - timeout: int = 20, - max_retries: int = 3, - referer: str = SITE_BASE, - ) -> str: - headers = {"Referer": referer} - last_error: Optional[Exception] = None + def _build_session(self) -> requests.Session: + report_proxy_status() + session = requests.Session() + session.trust_env = False + proxies = 
get_proxies() + if proxies: + session.proxies.update(proxies) + else: + session.proxies.clear() + session.headers.update({ + "User-Agent": ( + "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 " + "Mobile/15E148 Safari/604.1" + ), + "Accept": "application/json, text/javascript, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + "Connection": "close", + }) + return session - for attempt in range(max_retries): - wait_for_request() - try: - resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers) - code = resp.status_code - if code == 403: - if attempt < max_retries - 1: - self.client.refresh() - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - raise RequestClientError(f"{code} Error: {url}") - if code >= 500 and attempt < max_retries - 1: - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - if code >= 400: - raise RequestClientError(f"{code} Error: {url}") - return resp.text - except Exception as exc: - last_error = exc - if attempt < max_retries - 1: - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - raise - - if last_error is not None: - raise last_error - raise RequestClientError(f"Unknown request error: {url}") - - def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]: - pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);" - match = re.search(pattern, script_text) - if not match: - return [] - raw = match.group(1) + def _refresh_session(self) -> None: try: - rows = ast.literal_eval(raw) - return rows if isinstance(rows, list) else [] + self.session.close() except Exception: - return [] + pass + self.session = self._build_session() - def discover_cities(self) -> List[CityTarget]: - js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/") - provinces = self._parse_city_js_array(js_text, "iosProvinces") - cities = self._parse_city_js_array(js_text, "iosCitys") - - province_map: Dict[str, Dict] = {} - for item in provinces: - pid = str(item.get("id") or "").strip() - if pid: - province_map[pid] = item - - results: List[CityTarget] = [] - seen_py: Set[str] = set() - for city in cities: - city_py = str(city.get("pinyin") or "").strip() - city_name = str(city.get("value") or "").strip() - city_id = str(city.get("id") or "").strip() - province_id = str(city.get("parentId") or "").strip() - if not city_py or not city_name or not city_id: - continue - if city_py in seen_py: - continue - seen_py.add(city_py) - - province_row = province_map.get(province_id, {}) - province_name = str(province_row.get("value") or city_name).strip() - province_py = str(province_row.get("pinyin") or city_py).strip() - - results.append( - CityTarget( - province_id=province_id, - province_name=province_name, - province_py=province_py, - city_id=city_id, - city_name=city_name, - city_py=city_py, - ) - ) - return results - - def _parse_list_payload(self, text: str) -> Dict: - cleaned = (text or "").strip().lstrip("\ufeff") - try: - return json.loads(cleaned) - except ValueError: - start = cleaned.find("{") - end = cleaned.rfind("}") - if start == -1 or end == -1: - return {} - return json.loads(cleaned[start:end + 1]) - - def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]: - list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page) - referer = f"{SITE_BASE}/{city_py}/q_lawyer/" - text = self._get_text(list_url, referer=referer) - payload = self._parse_list_payload(text) - if 
payload.get("errcode") != 0: - return [], False, list_url - - data = payload.get("data", {}) or {} - items = data.get("lawyer_list", []) or [] - has_more = str(data.get("has_more", "0")) == "1" - return items, has_more, list_url - - def crawl_city(self, target: CityTarget) -> Iterable[Dict]: - for page in range(1, self.max_pages + 1): + def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]: + headers = {"Referer": referer} + for attempt in range(max_retries): try: - items, has_more, list_url = self.fetch_list_page(target.city_py, page) - except Exception as exc: - print(f"[list] 失败 {target.city_py} p{page}: {exc}") - break + resp = self.session.get(url, timeout=15, verify=verify, headers=headers) + status_code = resp.status_code + text = resp.text + resp.close() + if status_code == 403: + if attempt < max_retries - 1: + wait_time = 2 ** attempt + random.uniform(0.3, 1.0) + print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}") + self._refresh_session() + time.sleep(wait_time) + continue + print(f"请求失败 {url}: 403 Forbidden") + return None + if status_code >= 400: + raise requests.exceptions.HTTPError(f"{status_code} Error: {url}") + return text + except requests.exceptions.SSLError: + if verify: + return self._get(url, referer, verify=False, max_retries=max_retries) + print(f"SSL错误 {url}") + return None + except requests.exceptions.RequestException as exc: + print(f"请求失败 {url}: {exc}") + return None + return None - if not items: - break - - for item in items: - detail_url = item.get("siteask_m") or item.get("site_url") or "" - detail_url = str(detail_url).strip() - if not detail_url.startswith("http"): - detail_url = list_url - - phone = normalize_phone(item.get("mobile", "")) - profile = { - "uid": str(item.get("uid") or ""), - "name": str(item.get("username") or "").strip(), - "law_firm": str(item.get("lawyer_lawroom") or "").strip(), - "phone": phone, - "lawyer_year": item.get("lawyer_year"), - "service_area": str(item.get("service_area") or "").strip(), - "address": str(item.get("addr") or "").strip(), - "specialties": item.get("professionArr") or [], - "answer_count": item.get("ansnum"), - "comment_count": item.get("askcommentnum"), - } - - now = int(time.time()) - uid = profile.get("uid", "") - record_key = uid or detail_url - record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest() - - area = item.get("areaInfo", {}) or {} - yield { - "record_id": record_id, - "collected_at": now, - "source": { - "site": SITE_NAME, - "list_url": list_url, - "detail_url": detail_url, - "province": str(area.get("province") or target.province_name), - "province_py": target.province_py, - "city": str(area.get("city") or target.city_name), - "city_py": target.city_py, - "page": page, - }, - "list_snapshot": { - "uid": uid, - "name": profile["name"], - "law_firm": profile["law_firm"], - "answer_count": profile["answer_count"], - "comment_count": profile["comment_count"], - }, - "profile": profile, - "raw": item, - } - if self.sleep_seconds: - time.sleep(self.sleep_seconds) - - if not has_more: - break - - def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]: - source = record.get("source", {}) or {} - profile = record.get("profile", {}) or {} - phone = normalize_phone(profile.get("phone", "")) - if not phone: - return None - - province = (source.get("province") or "").strip() - city = (source.get("city") or province).strip() - return { - "name": (profile.get("name") or "").strip(), - "law_firm": (profile.get("law_firm") 
or "").strip(), - "province": province, - "city": city, - "phone": phone, - "url": (source.get("detail_url") or source.get("list_url") or "").strip(), - "domain": LEGACY_DOMAIN, - "create_time": int(record.get("collected_at") or time.time()), - "params": json.dumps(record, ensure_ascii=False), - } - - def _existing_phones_in_db(self, phones: List[str]) -> Set[str]: - if not self.db or not phones: + def _existing_phones(self, phones: List[str]) -> Set[str]: + if not phones: return set() - deduped = sorted({p for p in phones if p}) - if not deduped: - return set() - existing: Set[str] = set() cur = self.db.db.cursor() try: chunk_size = 500 - for i in range(0, len(deduped), chunk_size): - chunk = deduped[i:i + chunk_size] + for i in range(0, len(phones), chunk_size): + chunk = phones[i:i + chunk_size] placeholders = ",".join(["%s"] * len(chunk)) sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})" - cur.execute(sql, [LEGACY_DOMAIN, *chunk]) + cur.execute(sql, [DOMAIN, *chunk]) for row in cur.fetchall(): existing.add(row[0]) finally: cur.close() return existing - def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]: - if not self.db: - return 0, 0 - - rows: List[Dict[str, str]] = [] - for record in records: - row = self._to_legacy_lawyer_row(record) - if row: - rows.append(row) - if not rows: - return 0, 0 - - existing = self._existing_phones_in_db([row["phone"] for row in rows]) - inserted = 0 - skipped = 0 - for row in rows: - phone = row.get("phone", "") - if not phone or phone in existing: - skipped += 1 - continue + def _load_cities(self): + condition = "domain='findlaw' AND level=2" + tables = ("area_new", "area2", "area") + last_error = None + for table in tables: try: - self.db.insert_data("lawyer", row) - existing.add(phone) - inserted += 1 + rows = self.db.select_data(table, "city, province, pinyin", condition) or [] except Exception as exc: - skipped += 1 - print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}") - return inserted, skipped + last_error = exc + continue + if rows: + missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip()) + print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}") + return rows - def crawl( - self, - output_path: str, - max_cities: int = 0, - city_filter: Optional[str] = None, - ) -> None: - cities = self.discover_cities() - print(f"[discover] 共发现城市 {len(cities)} 个") - if city_filter: - key = city_filter.strip().lower() - cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()] - print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}") - if max_cities > 0: - cities = cities[:max_cities] - print(f"[discover] 截断城市数 {len(cities)}") + if last_error: + print(f"[找法网] 加载地区数据失败: {last_error}") + print("[找法网] 无城市数据(已尝试 area_new/area2/area)") + for table in tables: + try: + cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition) + c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0 + print(f"[找法网] 校验: {table} 满足条件记录数: {c}") + except Exception: + pass + return [] - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + def _fetch_page(self, url: str, referer: str) -> List[Dict]: + text = self._get(url, referer, verify=True) + if not text: + return [] - seen_ids: Set[str] = set() - if os.path.exists(output_path): - with open(output_path, "r", encoding="utf-8") as old_file: - for line in old_file: - line = line.strip() - if not line: + try: + # 某些返回体前会携带 BOM 或包装脚本,此处做兼容 + text = 
text.strip().lstrip("\ufeff") + try: + data = json.loads(text) + except ValueError: + json_start = text.find('{') + json_end = text.rfind('}') + if json_start == -1 or json_end == -1: + print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}") + return [] + cleaned = text[json_start:json_end + 1] + data = json.loads(cleaned) + if isinstance(data, str): + try: + data = json.loads(data) + except ValueError: + print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}") + return [] + except ValueError as exc: + print(f"解析JSON失败 {url}: {exc}") + return [] + + items = data.get("data", {}).get("lawyer_list", []) + parsed = [] + for item in items: + phone = (item.get("mobile") or "").replace("-", "") + parsed.append({ + "name": item.get("username", ""), + "law_firm": item.get("lawyer_lawroom", ""), + "province": item.get("areaInfo", {}).get("province", ""), + "city": item.get("areaInfo", {}).get("city", ""), + "phone": phone, + "url": url, + "domain": DOMAIN, + "create_time": int(time.time()), + "params": json.dumps(item, ensure_ascii=False) + }) + return parsed + + def run(self): + print("启动找法网采集...") + if not self.cities: + print("无城市数据") + return + + for city in self.cities: + pinyin = city.get("pinyin") + province = city.get("province", "") + city_name = city.get("city", "") + if not pinyin: + continue + print(f"采集 {province}-{city_name}") + page = 1 + while True: + url = LIST_TEMPLATE.format(pinyin=pinyin, page=page) + referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/" + print(f" 第 {page} 页: {url}") + items = self._fetch_page(url, referer) + if not items: + break + + phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()] + existing = self._existing_phones(phones) + + for entry in items: + phone = entry.get("phone") + if not phone: + continue + if phone in existing: + print(f" -- 已存在: {entry['name']} ({phone})") continue try: - item = json.loads(line) - except Exception: - continue - rid = item.get("record_id") - if rid: - seen_ids.add(rid) - print(f"[resume] 已有记录 {len(seen_ids)} 条") + self.db.insert_data("lawyer", entry) + print(f" -> 新增: {entry['name']} ({phone})") + except Exception as exc: + print(f" 插入失败: {exc}") - total_new_json = 0 - total_new_db = 0 - total_skip_db = 0 + page += 1 - with open(output_path, "a", encoding="utf-8") as out: - for idx, target in enumerate(cities, start=1): - print( - f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} " - f"({target.city_py})" - ) - city_records = list(self.crawl_city(target)) - - city_new_json = 0 - for record in city_records: - rid = record["record_id"] - if rid in seen_ids: - continue - out.write(json.dumps(record, ensure_ascii=False) + "\n") - seen_ids.add(rid) - city_new_json += 1 - total_new_json += 1 - - city_new_db, city_skip_db = self._write_records_to_db(city_records) - total_new_db += city_new_db - total_skip_db += city_skip_db - print( - f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, " - f"DB新增{city_new_db}条, DB跳过{city_skip_db}条" - ) - - print( - f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, " - f"DB跳过{total_skip_db}条, 输出: {output_path}" - ) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)") - parser.add_argument( - "--output", - default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl", - help="输出 jsonl 文件路径", - ) - parser.add_argument( - "--max-cities", - type=int, - default=0, - help="最多采集多少个城市,0 表示不限", - ) - parser.add_argument( - "--max-pages", - type=int, - default=9999, - help="每个城市最多采集多少页", - ) - 
parser.add_argument( - "--city-filter", - default="", - help="按城市拼音或城市名过滤,如 beijing", - ) - parser.add_argument( - "--sleep", - type=float, - default=0.1, - help="每条记录采集间隔秒数", - ) - parser.add_argument( - "--direct", - action="store_true", - help="直连模式,不使用 proxy_settings.json 代理", - ) - parser.add_argument( - "--no-db", - action="store_true", - help="只输出 JSONL,不写入数据库", - ) - return parser.parse_args() - - -def main(): - args = parse_args() - if args.no_db: - crawler = FindlawCrawler( - max_pages=args.max_pages, - sleep_seconds=args.sleep, - use_proxy=not args.direct, - db_connection=None, - ) - crawler.crawl( - output_path=args.output, - max_cities=args.max_cities, - city_filter=args.city_filter or None, - ) - return - - with Db() as db: - crawler = FindlawCrawler( - max_pages=args.max_pages, - sleep_seconds=args.sleep, - use_proxy=not args.direct, - db_connection=db, - ) - crawler.crawl( - output_path=args.output, - max_cities=args.max_cities, - city_filter=args.city_filter or None, - ) + print("找法网采集完成") if __name__ == "__main__": - main() + with Db() as db: + spider = FindlawSpider(db) + spider.run() diff --git a/common_sites/hualv.py b/common_sites/hualv.py index 56c7b78..8eaa30d 100644 --- a/common_sites/hualv.py +++ b/common_sites/hualv.py @@ -1,19 +1,10 @@ -import argparse -import ast -import hashlib import json import os -import pymysql -import random import re import sys import time -from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Set, Tuple -from urllib.parse import urljoin - -import urllib3 -from bs4 import BeautifulSoup +import random +from typing import Dict, Optional current_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.dirname(current_dir) @@ -23,819 +14,328 @@ if request_dir not in sys.path: if project_root not in sys.path: sys.path.append(project_root) +import requests +from bs4 import BeautifulSoup +from request.proxy_config import get_proxies, report_proxy_status + from Db import Db -from request.requests_client import RequestClientError, RequestsClient -from utils.rate_limiter import wait_for_request +from config import HEADERS -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -SITE_NAME = "hualv" -LEGACY_DOMAIN = "华律" -SITE_BASE = "https://m.66law.cn" -CITY_DATA_URL = "https://cache.66law.cn/dist/main-v2.0.js" -LIST_API_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/" - -EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}") -PHONE_CANDIDATE_RE = re.compile(r"(? 
str: - if not text: - return "" - - # 避免把邮箱前缀中的数字误识别为手机号 - sanitized = EMAIL_RE.sub(" ", str(text)) - for match in PHONE_CANDIDATE_RE.finditer(sanitized): - candidate = match.group(0) - compact = re.sub(r"\D", "", candidate) - if compact.startswith("86") and len(compact) == 13: - compact = compact[2:] - if len(compact) == 11 and compact.startswith("1") and compact[1] in "3456789": - return compact - - return "" - - -def strip_html_tags(text: str) -> str: - return re.sub(r"<[^>]+>", "", text or "").strip() - - -class HualvCrawler: - def __init__( - self, - max_pages: int = 9999, - sleep_seconds: float = 0.15, - use_proxy: bool = True, - db_connection=None, - ): - self.max_pages = max_pages - self.sleep_seconds = max(0.0, sleep_seconds) +class HualvSpider: + def __init__(self, db_connection): self.db = db_connection - self.client = RequestsClient( - headers={ - "User-Agent": ( - "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 " - "Mobile/15E148 Safari/604.1" - ), - "Accept": "application/json, text/javascript, */*; q=0.01", - "X-Requested-With": "XMLHttpRequest", - "Connection": "close", - }, - use_proxy=use_proxy, - retry_total=2, - retry_backoff_factor=1, - retry_status_forcelist=(429, 500, 502, 503, 504), - retry_allowed_methods=("GET", "POST"), + self.session = self._build_session() + self.areas = self._load_areas() + + def _build_session(self) -> requests.Session: + report_proxy_status() + session = requests.Session() + session.trust_env = False + proxies = get_proxies() + if proxies: + session.proxies.update(proxies) + else: + session.proxies.clear() + custom_headers = HEADERS.copy() + custom_headers['User-Agent'] = ( + 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) ' + 'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 ' + 'Mobile/15E148 Safari/604.1' ) + custom_headers["Connection"] = "close" + session.headers.update(custom_headers) + return session - def _request_text( - self, - method: str, - url: str, - *, - timeout: int = 20, - max_retries: int = 3, - referer: str = SITE_BASE, - data: Optional[Dict] = None, - ) -> str: - headers = {"Referer": referer} - last_error: Optional[Exception] = None + def _refresh_session(self) -> None: + try: + self.session.close() + except Exception: + pass + self.session = self._build_session() - for attempt in range(max_retries): - wait_for_request() + def _load_areas(self): + tables = ("area_new", "area2", "area") + last_error = None + for table in tables: try: - if method.upper() == "POST": - resp = self.client.post_text( - url, - timeout=timeout, - verify=False, - headers=headers, - data=data, - ) - else: - resp = self.client.get_text( - url, - timeout=timeout, - verify=False, - headers=headers, - ) - - code = resp.status_code - if code == 403: - if attempt < max_retries - 1: - self.client.refresh() - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - raise RequestClientError(f"{code} Error: {url}") - if code >= 500 and attempt < max_retries - 1: - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - if code >= 400: - raise RequestClientError(f"{code} Error: {url}") - return resp.text + provinces = self.db.select_data( + table, + "code, province, pinyin, id", + "domain='66law' AND level=1" + ) or [] + cities = self.db.select_data( + table, + "code, city, province, pid", + "domain='66law' AND level=2" + ) or [] except Exception as exc: last_error = exc - if attempt < max_retries - 1: - time.sleep((2 ** attempt) + random.uniform(0.2, 
0.8)) - continue - raise - - if last_error is not None: - raise last_error - raise RequestClientError(f"Unknown request error: {url}") - - def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str: - return self._request_text( - "GET", - url, - timeout=timeout, - max_retries=max_retries, - referer=referer, - ) - - def _post_text( - self, - url: str, - *, - data: Dict, - timeout: int = 20, - max_retries: int = 3, - referer: str = SITE_BASE, - ) -> str: - return self._request_text( - "POST", - url, - timeout=timeout, - max_retries=max_retries, - referer=referer, - data=data, - ) - - def _extract_spc_location(self, script_text: str) -> List: - # main-v2.js 内置了 sPCLocation=new Array(...),后面紧跟 cateinfo 数组 - marker = "sPCLocation = new Array(" - start = script_text.find(marker) - if start == -1: - marker = "sPCLocation=new Array(" - start = script_text.find(marker) - if start == -1: - return [] - start += len(marker) - - next_marker = script_text.find("cateinfo = new Array(", start) - if next_marker == -1: - next_marker = script_text.find("cateinfo=new Array(", start) - - if next_marker != -1: - end = script_text.rfind(");", start, next_marker) - else: - end = script_text.find(");", start) - - if end == -1 or end <= start: - return [] - - raw = "[" + script_text[start:end] + "]" - try: - data = ast.literal_eval(raw) - except Exception: - return [] - return data if isinstance(data, list) else [] - - def discover_cities(self) -> List[CityTarget]: - script_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyer/") - rows = self._extract_spc_location(script_text) - - targets: List[CityTarget] = [] - seen: Set[Tuple[int, int]] = set() - - for province in rows: - if not isinstance(province, list) or len(province) < 3: continue - try: - province_id = int(province[0]) - except Exception: + + if not cities: continue - province_name = str(province[1] or "").strip() - city_rows = province[2] if isinstance(province[2], list) else [] - for city in city_rows: - if not isinstance(city, list) or len(city) < 2: - continue - try: - city_id = int(city[0]) - except Exception: - continue - city_name = str(city[1] or "").strip() - if city_id <= 0 or not city_name: - continue - - key = (province_id, city_id) - if key in seen: - continue - seen.add(key) - - targets.append( - CityTarget( - province_id=province_id, - province_name=province_name, - city_id=city_id, - city_name=city_name, - ) - ) - return targets - - def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[Dict], int]: - payload = { - "pid": str(target.province_id), - "cid": str(target.city_id), - "page": str(page), - } - text = self._post_text( - LIST_API_URL, - data=payload, - referer=SITE_BASE + "/findlawyer/", - ) - data = json.loads((text or "").strip().lstrip("\ufeff") or "{}") - items = data.get("lawyerList") or data.get("queryLawyerList") or [] - if not isinstance(items, list): - items = [] - - page_count = 0 - try: - page_count = int((data.get("lawyerItems") or {}).get("pageCount") or 0) - except Exception: - page_count = 0 - return items, page_count - - def parse_detail(self, detail_url: str) -> Dict: - contact_url = detail_url.rstrip("/") + "/lawyer_contact.aspx" - html = self._get_text(contact_url, referer=detail_url) - soup = BeautifulSoup(html, "html.parser") - full_text = soup.get_text(" ", strip=True) - - name = "" - law_firm = "" - phone = "" - email = "" - address = "" - license_no = "" - practice_years: Optional[int] = None - - name_tag = 
soup.select_one(".logo-box .title b") - if name_tag: - name = name_tag.get_text(strip=True).replace("律师", "").strip() - if not name and soup.title: - match = re.search(r"([^\s,,。_]+?)律师", soup.title.get_text(" ", strip=True)) - if match: - name = match.group(1).strip() - - phone_candidates = [ - soup.select_one(".logo-box .r-bar .tel").get_text(" ", strip=True) - if soup.select_one(".logo-box .r-bar .tel") - else "", - soup.select_one(".lawyer-show ul.info").get_text(" ", strip=True) - if soup.select_one(".lawyer-show ul.info") - else "", - full_text, - ] - for candidate in phone_candidates: - phone = normalize_phone(candidate) - if phone: - break - - for li in soup.select(".lawyer-show ul.info li"): - li_text = li.get_text(" ", strip=True) - if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm: - law_firm = li_text - - if not law_firm: - match = re.search(r'"affiliation":\{"@type":"Organization","name":"([^"]+)"', html) - if match: - law_firm = match.group(1).strip() - - match = re.search(r'"identifier":"([^"]+)"', html) - if match: - license_no = match.group(1).strip() - - match = re.search(r'"streetAddress":"([^"]+)"', html) - if match: - address = match.group(1).strip() - - email_match = EMAIL_RE.search(html) - if email_match: - email = email_match.group(0).strip() - - year_match = YEAR_RE.search(full_text) - if year_match: - try: - practice_years = int(year_match.group(1)) - except Exception: - practice_years = None - - specialties = [node.get_text(strip=True) for node in soup.select(".tag-h38 span")] - specialties = [x for x in specialties if x] - - return { - "name": name, - "law_firm": law_firm, - "phone": phone, - "email": email, - "address": address, - "license_no": license_no, - "practice_years": practice_years, - "specialties": specialties, - "detail_url": detail_url, - "contact_url": contact_url, - } - - def crawl_city(self, target: CityTarget) -> Iterable[Dict]: - seen_details: Set[str] = set() - - for page in range(1, self.max_pages + 1): - try: - items, page_count = self.fetch_list_page(target, page) - except Exception as exc: - print(f"[list] 失败 pid={target.province_id} cid={target.city_id} p{page}: {exc}") - break - - if not items: - break - - for item in items: - detail_url = str(item.get("lawyerUrl") or "").strip() - if not detail_url: - continue - if detail_url.startswith("//"): - detail_url = "https:" + detail_url - if not detail_url.startswith("http"): - detail_url = urljoin(SITE_BASE, detail_url) - - if detail_url in seen_details: - continue - seen_details.add(detail_url) - - try: - detail = self.parse_detail(detail_url) - except Exception as exc: - print(f"[detail] 失败 {detail_url}: {exc}") - continue - - now = int(time.time()) - uid = str(item.get("lawyerId") or item.get("globalUserId") or detail_url) - record_id = hashlib.md5(uid.encode("utf-8")).hexdigest() - - list_name = str(item.get("name") or "").replace("律师", "").strip() - category_text = str(item.get("categoryNames") or "").strip() - category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()] - - yield { - "record_id": record_id, - "collected_at": now, - "source": { - "site": SITE_NAME, - "province_id": target.province_id, - "province": target.province_name, - "city_id": target.city_id, - "city": target.city_name, - "page": page, - "detail_url": detail_url, - "contact_url": detail.get("contact_url", ""), - }, - "list_snapshot": { - "lawyer_id": item.get("lawyerId"), - "name": list_name, - "category_names": category_arr, - "help_count": strip_html_tags(str(item.get("helpCount") 
or "")), - "comment_score": strip_html_tags(str(item.get("commentScore") or "")), - "response_time": str(item.get("responseTime") or "").strip(), - "year": item.get("year"), - "is_adv": bool(item.get("isAdv")), - }, - "profile": { - "name": detail.get("name") or list_name, - "law_firm": detail.get("law_firm") or "", - "phone": detail.get("phone") or "", - "email": detail.get("email") or "", - "address": detail.get("address") or "", - "license_no": detail.get("license_no") or "", - "practice_years": detail.get("practice_years"), - "specialties": detail.get("specialties") or category_arr, - }, - "raw": item, + province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces} + city_map = {} + for city in cities: + province_info = province_map.get(city.get('pid'), {}) or {} + province_code = province_info.get('code') + city_map[city.get('code')] = { + "name": city.get('city'), + "province": city.get('province'), + "province_code": province_code, } + print(f"[华律] 城市来源表: {table}, 城市数: {len(cities)}") + return city_map - if self.sleep_seconds: - time.sleep(self.sleep_seconds) + if last_error: + print(f"[华律] 加载地区数据失败: {last_error}") + print("[华律] 无城市数据(已尝试 area_new/area2/area)") + return {} - if page_count > 0 and page >= page_count: - break + def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]: + for attempt in range(max_retries): + try: + resp = self.session.post(LIST_URL, data=data, timeout=20, verify=False) + status_code = resp.status_code + text = resp.text + resp.close() + if status_code == 403: + if attempt < max_retries - 1: + wait_time = 2 ** attempt + random.uniform(0.3, 1.0) + print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})") + self._refresh_session() + time.sleep(wait_time) + continue + print("请求失败: 403 Forbidden") + return None + if status_code >= 400: + raise requests.exceptions.HTTPError(f"{status_code} Error") + try: + return json.loads(text) + except ValueError as exc: + print(f"解析JSON失败: {exc}") + return None + except requests.exceptions.RequestException as exc: + print(f"请求失败: {exc}") + return None + return None - def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]: - source = record.get("source", {}) or {} - profile = record.get("profile", {}) or {} + def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]: + contact_url = f"{url}lawyer_contact.aspx" + print(f" 详情: {contact_url}") + existing = self.db.select_data( + "lawyer", + "id, avatar_url", + f"domain='{DOMAIN}' AND url='{contact_url}'" + ) + existing_id = None + if existing: + existing_id = existing[0].get("id") + avatar = (existing[0].get("avatar_url") or "").strip() + if avatar: + print(" -- 已存在且头像已补全,跳过") + return None - phone = normalize_phone(profile.get("phone", "")) - if not phone: + html = self._get_detail(contact_url) + if not html: return None - province = (source.get("province") or "").strip() - city = (source.get("city") or province).strip() - return { - "name": (profile.get("name") or "").strip(), - "law_firm": (profile.get("law_firm") or "").strip(), + soup = BeautifulSoup(html, "html.parser") + info_list = soup.find("ul", class_="information-list") + if not info_list: + return None + + phone = "" + law_firm = "" + for li in info_list.find_all("li"): + text = li.get_text(strip=True) + if "手机号" in text: + cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip() + match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', '')) + if match: + phone = 
match.group(0) + if "执业单位" in text: + law_firm = text.replace("执业单位", "").strip() + + name = "" + breadcrumb = soup.find("div", class_="weizhi") + if breadcrumb: + links = breadcrumb.find_all("a") + if len(links) > 2: + name = links[2].get_text(strip=True) + + phone = phone.replace('-', '').strip() + if not phone or not re.fullmatch(r"1\d{10}", phone): + print(" 无手机号,跳过") + return None + + avatar_url, site_time = self._extract_avatar_and_time(soup) + data = { + "phone": phone, "province": province, "city": city, - "phone": phone, - "url": (source.get("contact_url") or source.get("detail_url") or "").strip(), - "domain": LEGACY_DOMAIN, - "create_time": int(record.get("collected_at") or time.time()), - "params": json.dumps(record, ensure_ascii=False), + "law_firm": law_firm, + "url": contact_url, + "avatar_url": avatar_url, + "create_time": int(time.time()), + "site_time": site_time, + "domain": DOMAIN, + "name": name, + "params": json.dumps({"source": url}, ensure_ascii=False) } - - def _existing_phones_in_db(self, phones: List[str]) -> Set[str]: - if not self.db or not phones: - return set() - - deduped = sorted({p for p in phones if p}) - if not deduped: - return set() - - existing: Set[str] = set() - cur = self.db.db.cursor() - try: - chunk_size = 500 - for i in range(0, len(deduped), chunk_size): - chunk = deduped[i:i + chunk_size] - placeholders = ",".join(["%s"] * len(chunk)) - sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})" - cur.execute(sql, [LEGACY_DOMAIN, *chunk]) - for row in cur.fetchall(): - existing.add(row[0]) - finally: - cur.close() - - return existing - - def _extract_email_from_params_text(self, params_text: str) -> str: - if not params_text: - return "" - try: - data = json.loads(params_text) - except Exception: - return "" - if not isinstance(data, dict): - return "" - profile = data.get("profile") or {} - if not isinstance(profile, dict): - return "" - return str(profile.get("email") or "").strip() - - def _is_phone_from_email_prefix(self, phone: str, email: str) -> bool: - phone_text = str(phone or "").strip() - email_text = str(email or "").strip() - if not phone_text or not email_text or "@" not in email_text: - return False - prefix = email_text.split("@", 1)[0] - prefix_phone = normalize_phone(prefix) - return bool(prefix_phone) and prefix_phone == phone_text - - def _existing_rows_by_urls(self, urls: List[str]) -> Dict[str, List[Dict[str, str]]]: - if not self.db or not urls: - return {} - - deduped = sorted({u for u in urls if u}) - if not deduped: - return {} - - result: Dict[str, List[Dict[str, str]]] = {} - cur = self.db.db.cursor(pymysql.cursors.DictCursor) - try: - chunk_size = 200 - for i in range(0, len(deduped), chunk_size): - chunk = deduped[i:i + chunk_size] - placeholders = ",".join(["%s"] * len(chunk)) - sql = ( - "SELECT id, phone, url, params FROM lawyer " - f"WHERE domain=%s AND url IN ({placeholders})" - ) - cur.execute(sql, [LEGACY_DOMAIN, *chunk]) - for row in cur.fetchall() or []: - key = str(row.get("url") or "") - if not key: - continue - result.setdefault(key, []).append(row) - finally: - cur.close() - - return result - - def _cleanup_dirty_duplicates_for_urls(self, urls: List[str]) -> int: - if not self.db: - return 0 - - by_url = self._existing_rows_by_urls(urls) - if not by_url: - return 0 - - delete_ids: List[int] = [] - for _, rows in by_url.items(): - if len(rows) <= 1: - continue - - dirty_ids: List[int] = [] - has_clean = False - for row in rows: - row_id = int(row.get("id") or 0) - row_phone = 
str(row.get("phone") or "").strip() - row_email = self._extract_email_from_params_text(str(row.get("params") or "")) - if row_id <= 0: - continue - if self._is_phone_from_email_prefix(row_phone, row_email): - dirty_ids.append(row_id) - else: - has_clean = True - - if has_clean and dirty_ids: - delete_ids.extend(dirty_ids) - - if not delete_ids: - return 0 - - removed = 0 - cur = self.db.db.cursor() - try: - chunk_size = 300 - for i in range(0, len(delete_ids), chunk_size): - chunk = delete_ids[i:i + chunk_size] - placeholders = ",".join(["%s"] * len(chunk)) - sql = f"DELETE FROM lawyer WHERE id IN ({placeholders})" - cur.execute(sql, chunk) - removed += cur.rowcount - self.db.db.commit() - finally: - cur.close() - - return removed - - def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int, int, int]: - if not self.db: - return 0, 0, 0, 0 - - rows: List[Dict[str, str]] = [] - for record in records: - row = self._to_legacy_lawyer_row(record) - if row: - rows.append(row) - if not rows: - return 0, 0, 0, 0 - - existing = self._existing_phones_in_db([row["phone"] for row in rows]) - existing_by_url = self._existing_rows_by_urls([str(row.get("url") or "") for row in rows]) - inserted = 0 - skipped = 0 - repaired = 0 - - cur = self.db.db.cursor() - update_sql = ( - "UPDATE lawyer SET name=%s, phone=%s, law_firm=%s, province=%s, city=%s, " - "url=%s, domain=%s, create_time=%s, params=%s WHERE id=%s" - ) - - for row in rows: - phone = str(row.get("phone") or "").strip() - url = str(row.get("url") or "").strip() - if not phone: - skipped += 1 - continue - - same_url_rows = existing_by_url.get(url, []) if url else [] - if same_url_rows: - if any(str(item.get("phone") or "").strip() == phone for item in same_url_rows): - skipped += 1 - continue - - row_email = self._extract_email_from_params_text(str(row.get("params") or "")) - new_is_dirty = self._is_phone_from_email_prefix(phone, row_email) - - repair_target = None - for item in same_url_rows: - old_phone = str(item.get("phone") or "").strip() - old_email = self._extract_email_from_params_text(str(item.get("params") or "")) - if self._is_phone_from_email_prefix(old_phone, old_email): - repair_target = item - break - - if repair_target and not new_is_dirty: - try: - cur.execute( - update_sql, - ( - row.get("name") or "", - phone, - row.get("law_firm") or "", - row.get("province") or "", - row.get("city") or "", - row.get("url") or "", - row.get("domain") or LEGACY_DOMAIN, - int(row.get("create_time") or time.time()), - row.get("params") or "{}", - int(repair_target.get("id") or 0), - ), - ) - self.db.db.commit() - repaired += 1 - existing.add(phone) - old_phone = str(repair_target.get("phone") or "").strip() - if old_phone: - existing.discard(old_phone) - repair_target["phone"] = phone - repair_target["params"] = row.get("params") or "{}" - continue - except Exception as exc: - print(f"[db] 修复失败 phone={phone} url={url}: {exc}") - - if not phone or phone in existing: - skipped += 1 - continue + if existing_id: + update_data = { + "avatar_url": avatar_url, + "site_time": site_time, + } + if name: + update_data["name"] = name + if law_firm: + update_data["law_firm"] = law_firm + if province: + update_data["province"] = province + if city: + update_data["city"] = city + if phone: + update_data["phone"] = phone + update_data["params"] = json.dumps({"source": url}, ensure_ascii=False) try: - self.db.insert_data("lawyer", row) - existing.add(phone) - inserted += 1 + self.db.update_data("lawyer", update_data, f"id={existing_id}") + print(" -- 
已存在,已补全头像/时间") except Exception as exc: - skipped += 1 - print(f"[db] 插入失败 phone={phone} url={url}: {exc}") + print(f" 更新失败: {exc}") + return None + # 若手机号已存在,则更新头像/时间,不再插入新记录 + existing_phone = self.db.select_data( + "lawyer", + "id, avatar_url, url", + f"domain='{DOMAIN}' AND phone='{phone}'" + ) + if existing_phone: + existing_row = existing_phone[0] + avatar = (existing_row.get("avatar_url") or "").strip() + if avatar: + print(" -- 已存在手机号且头像已补全,跳过") + return None + update_data = { + "avatar_url": avatar_url, + "site_time": site_time, + } + if name: + update_data["name"] = name + if law_firm: + update_data["law_firm"] = law_firm + if province: + update_data["province"] = province + if city: + update_data["city"] = city + if phone: + update_data["phone"] = phone + if not existing_row.get("url"): + update_data["url"] = contact_url + update_data["params"] = json.dumps({"source": url}, ensure_ascii=False) + try: + self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}") + print(" -- 已存在手机号,已补全头像/时间") + except Exception as exc: + print(f" 更新失败: {exc}") + return None + return data - cur.close() + def _extract_avatar_and_time(self, soup: BeautifulSoup) -> (str, Optional[int]): + avatar_url = "" + site_time = None + img_tag = soup.select_one( + "div.fixed-bottom-bar div.contact-lawye a.lr-photo img" + ) + if img_tag: + src = (img_tag.get("src") or "").strip() + if src: + if src.startswith("//"): + avatar_url = f"https:{src}" + else: + avatar_url = src + match = re.search(r"/(20\d{2})(\d{2})/", avatar_url) + if match: + site_time = int(f"{match.group(1)}{match.group(2)}") + else: + match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url) + if match: + site_time = int(f"{match.group(1)}{match.group(2)}") + return avatar_url, site_time - cleaned = self._cleanup_dirty_duplicates_for_urls([str(row.get("url") or "") for row in rows]) + def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]: + for attempt in range(max_retries): + try: + resp = self.session.get(url, timeout=15, verify=False) + status_code = resp.status_code + text = resp.text + resp.close() + if status_code == 403: + if attempt < max_retries - 1: + wait_time = 2 ** attempt + random.uniform(0.3, 1.0) + print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})") + self._refresh_session() + time.sleep(wait_time) + continue + print(" 请求失败: 403 Forbidden") + return None + if status_code >= 400: + raise requests.exceptions.HTTPError(f"{status_code} Error") + return text + except requests.exceptions.RequestException as exc: + print(f" 请求失败: {exc}") + return None + return None - return inserted, skipped, repaired, cleaned + def run(self): + print("启动华律网采集...") + if not self.areas: + print("无城市数据") + return - def crawl( - self, - output_path: str, - max_cities: int = 0, - city_filter: Optional[str] = None, - ) -> None: - cities = self.discover_cities() - print(f"[discover] 共发现城市 {len(cities)} 个") + for city_code, city_info in self.areas.items(): + province_code = city_info.get("province_code") + if not province_code: + continue + province_name = city_info.get("province", "") + city_name = city_info.get("name", "") + print(f"采集 {province_name}-{city_name}") - if city_filter: - key = city_filter.strip().lower() - cities = [ - c for c in cities - if key in c.city_name.lower() or key in str(c.city_id).lower() - ] - print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}") + page = 1 + while True: + payload = {"pid": province_code, "cid": city_code, "page": str(page)} + data = self._post(payload) + if not 
data or not data.get("lawyerList"): + break - if max_cities > 0: - cities = cities[:max_cities] - print(f"[discover] 截断城市数 {len(cities)}") - - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) - - seen_ids: Set[str] = set() - if os.path.exists(output_path): - with open(output_path, "r", encoding="utf-8") as old_file: - for line in old_file: - line = line.strip() - if not line: + for item in data["lawyerList"]: + result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name) + if not result: continue try: - item = json.loads(line) - except Exception: - continue - rid = item.get("record_id") - if rid: - seen_ids.add(rid) - print(f"[resume] 已有记录 {len(seen_ids)} 条") + self.db.insert_data("lawyer", result) + print(f" -> 新增: {result['name']} ({result['phone']})") + except Exception as exc: + print(f" 插入失败: {exc}") + time.sleep(1) - total_new_json = 0 - total_new_db = 0 - total_skip_db = 0 - total_repair_db = 0 - total_clean_db = 0 + page_count = data.get("lawyerItems", {}).get("pageCount", page) + if page >= page_count: + break + page += 1 + time.sleep(2) - with open(output_path, "a", encoding="utf-8") as out: - for idx, target in enumerate(cities, start=1): - print( - f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} " - f"(pid={target.province_id}, cid={target.city_id})" - ) - city_records = list(self.crawl_city(target)) - - city_new_json = 0 - for record in city_records: - rid = record["record_id"] - if rid in seen_ids: - continue - out.write(json.dumps(record, ensure_ascii=False) + "\n") - seen_ids.add(rid) - city_new_json += 1 - total_new_json += 1 - - city_new_db, city_skip_db, city_repair_db, city_clean_db = self._write_records_to_db(city_records) - total_new_db += city_new_db - total_skip_db += city_skip_db - total_repair_db += city_repair_db - total_clean_db += city_clean_db - - print( - f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, " - f"DB新增{city_new_db}条, DB修复{city_repair_db}条, " - f"DB清理{city_clean_db}条, DB跳过{city_skip_db}条" - ) - - print( - f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, " - f"DB修复{total_repair_db}条, DB清理{total_clean_db}条, " - f"DB跳过{total_skip_db}条, 输出: {output_path}" - ) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="华律网全新采集脚本(站点数据直采)") - parser.add_argument( - "--output", - default="/www/wwwroot/lawyers/data/hualv_records_all.jsonl", - help="输出 jsonl 文件路径", - ) - parser.add_argument( - "--max-cities", - type=int, - default=0, - help="最多采集多少个城市,0 表示不限", - ) - parser.add_argument( - "--max-pages", - type=int, - default=9999, - help="每个城市最多采集多少页", - ) - parser.add_argument( - "--city-filter", - default="", - help="按城市名称或城市编码过滤,如 beijing / 110100", - ) - parser.add_argument( - "--sleep", - type=float, - default=0.15, - help="详情页请求间隔秒数", - ) - parser.add_argument( - "--direct", - action="store_true", - help="直连模式,不使用 proxy_settings.json 代理", - ) - parser.add_argument( - "--no-db", - action="store_true", - help="只输出 JSONL,不写入数据库", - ) - return parser.parse_args() - - -def main(): - args = parse_args() - - if args.no_db: - crawler = HualvCrawler( - max_pages=args.max_pages, - sleep_seconds=args.sleep, - use_proxy=not args.direct, - db_connection=None, - ) - crawler.crawl( - output_path=args.output, - max_cities=args.max_cities, - city_filter=args.city_filter or None, - ) - return - - with Db() as db: - crawler = HualvCrawler( - max_pages=args.max_pages, - sleep_seconds=args.sleep, - use_proxy=not args.direct, - db_connection=db, - ) - crawler.crawl( - 
output_path=args.output, - max_cities=args.max_cities, - city_filter=args.city_filter or None, - ) + time.sleep(1) + print("华律网采集完成") if __name__ == "__main__": - main() + with Db() as db: + spider = HualvSpider(db) + spider.run() diff --git a/common_sites/lawtime.py b/common_sites/lawtime.py index a3c0285..2ce89aa 100644 --- a/common_sites/lawtime.py +++ b/common_sites/lawtime.py @@ -1,16 +1,13 @@ -import argparse -import hashlib import json import os -import random import re import sys import time -from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Set, Tuple - -import urllib3 -from bs4 import BeautifulSoup +import random +from typing import Dict, Optional, List, Set +from urllib.parse import urljoin +from concurrent.futures import ThreadPoolExecutor, as_completed +import threading current_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.dirname(current_dir) @@ -20,628 +17,281 @@ if request_dir not in sys.path: if project_root not in sys.path: sys.path.append(project_root) -from Db import Db -from request.requests_client import RequestClientError, RequestsClient -from utils.rate_limiter import wait_for_request +import requests +import urllib3 +from bs4 import BeautifulSoup +from request.proxy_config import get_proxies, report_proxy_status urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) -SITE_NAME = "lawtime" -LEGACY_DOMAIN = "法律快车" -SITE_BASE = "https://www.lawtime.cn" -PROVINCE_API = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode=0" -CITY_API_TEMPLATE = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode={province_id}" -LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/lawyer" +from Db import Db +from config import LAWTIME_CONFIG -PHONE_RE = re.compile(r"1[3-9]\d{9}") -YEAR_RE = re.compile(r"执业\s*(\d+)\s*年") +LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}" +DETAIL_BASE = "https://m.lawtime.cn" +DOMAIN = "法律快车" -@dataclass -class CityTarget: - province_id: str - province_name: str - province_py: str - city_id: str - city_name: str - city_py: str - - -@dataclass -class ListCard: - detail_url: str - name: str - phone: str - address: str = "" - specialties: List[str] = field(default_factory=list) - metric_text: str = "" - - -def normalize_phone(text: str) -> str: - compact = re.sub(r"\D", "", text or "") - match = PHONE_RE.search(compact) - return match.group(0) if match else "" - - -class LawtimeCrawler: - def __init__( - self, - max_pages: int = 9999, - sleep_seconds: float = 0.1, - use_proxy: bool = True, - db_connection=None, - ): - self.max_pages = max_pages - self.sleep_seconds = max(0.0, sleep_seconds) +class LawtimeSpider: + def __init__(self, db_connection): self.db = db_connection - self.client = RequestsClient( - headers={ - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/122.0.0.0 Safari/537.36" - ), - "Accept": "text/html,application/json,*/*;q=0.8", - "Connection": "close", - }, - use_proxy=use_proxy, - retry_total=2, - retry_backoff_factor=1, - retry_status_forcelist=(429, 500, 502, 503, 504), - retry_allowed_methods=("GET",), - ) + self.session = self._build_session() + self.max_workers = int(os.getenv("SPIDER_WORKERS", "8")) + self._tls = threading.local() - def _get_text( - self, - url: str, - *, - timeout: int = 20, - max_retries: int = 3, - referer: str = SITE_BASE, - ) -> str: - headers = {"Referer": referer} - last_error: Optional[Exception] = None + def _build_session(self) -> 
requests.Session: + report_proxy_status() + session = requests.Session() + session.trust_env = False + proxies = get_proxies() + if proxies: + session.proxies.update(proxies) + else: + session.proxies.clear() + headers = LAWTIME_CONFIG.get("HEADERS", {}) + if headers: + session.headers.update(headers) + session.headers.setdefault("Connection", "close") + return session - for attempt in range(max_retries): - wait_for_request() - try: - resp = self.client.get_text( - url, - timeout=timeout, - verify=False, - headers=headers, - ) - code = resp.status_code - if code == 403: - if attempt < max_retries - 1: - self.client.refresh() - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - raise RequestClientError(f"{code} Error: {url}") - if code >= 500 and attempt < max_retries - 1: - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - if code >= 400: - raise RequestClientError(f"{code} Error: {url}") - return resp.text - except Exception as exc: - last_error = exc - if attempt < max_retries - 1: - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - raise - - if last_error is not None: - raise last_error - raise RequestClientError(f"Unknown request error: {url}") - - def _get_json(self, url: str, *, referer: str) -> List[Dict]: - text = self._get_text(url, referer=referer) - cleaned = (text or "").strip().lstrip("\ufeff") - if not cleaned or cleaned.startswith("<"): - return [] + def _refresh_session(self) -> None: try: - data = json.loads(cleaned) - except ValueError: - return [] - return data if isinstance(data, list) else [] + self.session.close() + except Exception: + pass + self.session = self._build_session() - def discover_cities(self) -> List[CityTarget]: - provinces = self._get_json(PROVINCE_API, referer=SITE_BASE) - if not provinces: - print("[discover] 地区接口未返回有效数据") - return [] + def _get_thread_session(self) -> requests.Session: + s = getattr(self._tls, "session", None) + if s is not None: + return s + s = self._build_session() + s.headers.update(dict(self.session.headers)) + self._tls.session = s + return s - results: List[CityTarget] = [] - seen_py: Set[str] = set() - - for province in provinces: - province_id = str(province.get("id") or "").strip() - province_name = str(province.get("province") or province.get("city") or "").strip() - province_py = str(province.get("pinyin") or "").strip() - if not province_id or not province_name: - continue - - city_api = CITY_API_TEMPLATE.format(province_id=province_id) + def _refresh_thread_session(self) -> None: + s = getattr(self._tls, "session", None) + if s is not None: try: - cities = self._get_json(city_api, referer=LIST_URL_TEMPLATE.format(city_py=province_py or "")) - except Exception as exc: - print(f"[city] 获取失败 province={province_id}: {exc}") - continue - - if not cities: - cities = [ - { - "id": province_id, - "province": province_name, - "city": province_name, - "pinyin": province_py, - } - ] - - for city in cities: - city_id = str(city.get("id") or "").strip() - city_name = str(city.get("city") or city.get("province") or "").strip() - city_py = str(city.get("pinyin") or "").strip() - if not city_id or not city_name or not city_py: - continue - if city_py in seen_py: - continue - seen_py.add(city_py) - - results.append( - CityTarget( - province_id=province_id, - province_name=province_name, - province_py=province_py, - city_id=city_id, - city_name=city_name, - city_py=city_py, - ) - ) - - return results - - def _build_list_url(self, city_py: str, page: int) -> str: - base = 
LIST_URL_TEMPLATE.format(city_py=city_py) - if page <= 1: - return base - return f"{base}?page={page}" - - def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[ListCard], bool, str]: - list_url = self._build_list_url(target.city_py, page) - html = self._get_text(list_url, referer=SITE_BASE + "/") - - cards = self.parse_list_cards(html) - - soup = BeautifulSoup(html, "html.parser") - next_link = soup.select_one(f"div.page a[href*='page={page + 1}']") - has_next = next_link is not None - - return cards, has_next, list_url - - def parse_list_cards(self, html: str) -> List[ListCard]: - soup = BeautifulSoup(html, "html.parser") - cards: List[ListCard] = [] - seen: Set[str] = set() - - for item in soup.select("li.lawyer-item-card"): - link_tag = item.select_one("a.name[href]") or item.select_one("a.lawyer-img-box[href]") - if not link_tag: - continue - detail_url = (link_tag.get("href") or "").strip() - if not detail_url.startswith("http"): - continue - if detail_url in seen: - continue - seen.add(detail_url) - - name = link_tag.get_text(strip=True) - phone = "" - phone_tag = item.select_one("div.phone") - if phone_tag: - phone = normalize_phone(phone_tag.get_text(" ", strip=True)) - - address = "" - addr_tag = item.select_one("div.location .txt") - if addr_tag: - address = addr_tag.get_text(" ", strip=True) - - specialties: List[str] = [] - prof_tag = item.select_one("div.prof .txt") - if prof_tag: - specialties = [ - x.strip() for x in re.split(r"[、,,]", prof_tag.get_text(" ", strip=True)) if x.strip() - ] - - metric_text = "" - metric_tag = item.select_one("div.num-msg") - if metric_tag: - metric_text = metric_tag.get_text(" ", strip=True) - - cards.append( - ListCard( - detail_url=detail_url, - name=name, - phone=phone, - address=address, - specialties=specialties, - metric_text=metric_text, - ) - ) - - return cards - - def parse_detail(self, detail_url: str) -> Dict: - html = self._get_text(detail_url, referer=SITE_BASE) - if "网站防火墙" in html or "访问被拒绝" in html or "/ipfilter/verify" in html: - raise RequestClientError(f"firewall blocked: {detail_url}") - - soup = BeautifulSoup(html, "html.parser") - text = soup.get_text(" ", strip=True) - - name = "" - law_firm = "" - phone = "" - address = "" - practice_years: Optional[int] = None - specialties: List[str] = [] - - if soup.title: - title = soup.title.get_text(" ", strip=True) - match = re.search(r"([^\s_,,。]+?)律师", title) - if match: - name = match.group(1).strip() - - phone_candidates = [ - soup.select_one(".data-w .tel-b b").get_text(" ", strip=True) - if soup.select_one(".data-w .tel-b b") - else "", - soup.select_one(".law-info-b .item .two-r.b").get_text(" ", strip=True) - if soup.select_one(".law-info-b .item .two-r.b") - else "", - text, - ] - for candidate in phone_candidates: - phone = normalize_phone(candidate) - if phone: - break - - law_firm_tag = soup.select_one(".law-info-b .item .two-nowrap") - if law_firm_tag: - law_firm = law_firm_tag.get_text(" ", strip=True) - - for li in soup.select(".law-info-b .item"): - li_text = li.get_text(" ", strip=True) - if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm: - law_firm = li_text - - addr_tag = soup.select_one(".law-info-b .item .two-r[title]") - if addr_tag: - addr_value = (addr_tag.get("title") or "").strip() - if len(addr_value) > 8: - address = addr_value - - if not address: - addr_tag = soup.select_one(".law-info-b .item .two-r") - if addr_tag: - addr_value = addr_tag.get_text(" ", strip=True) - if len(addr_value) > 8 and "律师" not in addr_value: - 
address = addr_value - - year_match = YEAR_RE.search(text) - if year_match: - try: - practice_years = int(year_match.group(1)) + s.close() except Exception: - practice_years = None + pass + self._tls.session = None - specialties = [x.get_text(strip=True) for x in soup.select(".profession-b .item") if x.get_text(strip=True)] - - return { - "name": name, - "law_firm": law_firm, - "phone": phone, - "address": address, - "practice_years": practice_years, - "specialties": specialties, - "detail_url": detail_url, - } - - def crawl_city(self, target: CityTarget) -> Iterable[Dict]: - seen_details: Set[str] = set() - - for page in range(1, self.max_pages + 1): - try: - cards, has_next, list_url = self.fetch_list_page(target, page) - except Exception as exc: - print(f"[list] 失败 {target.city_py} p{page}: {exc}") - break - - if not cards: - break - - for card in cards: - if card.detail_url in seen_details: - continue - seen_details.add(card.detail_url) - - detail: Dict = {} - try: - detail = self.parse_detail(card.detail_url) - except Exception as exc: - print(f"[detail] 失败 {card.detail_url}: {exc}") - - phone = normalize_phone(detail.get("phone") or card.phone) - profile_name = (detail.get("name") or card.name).replace("律师", "").strip() - - now = int(time.time()) - record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest() - - yield { - "record_id": record_id, - "collected_at": now, - "source": { - "site": SITE_NAME, - "province_id": target.province_id, - "province": target.province_name, - "province_py": target.province_py, - "city_id": target.city_id, - "city": target.city_name, - "city_py": target.city_py, - "page": page, - "list_url": list_url, - "detail_url": card.detail_url, - }, - "list_snapshot": { - "name": card.name, - "phone": card.phone, - "address": card.address, - "specialties": card.specialties, - "metric_text": card.metric_text, - }, - "profile": { - "name": profile_name, - "law_firm": (detail.get("law_firm") or "").strip(), - "phone": phone, - "address": (detail.get("address") or card.address or "").strip(), - "practice_years": detail.get("practice_years"), - "specialties": detail.get("specialties") or card.specialties, - }, - } - - if self.sleep_seconds: - time.sleep(self.sleep_seconds) - - if not has_next: - break - - def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]: - source = record.get("source", {}) or {} - profile = record.get("profile", {}) or {} - - phone = normalize_phone(profile.get("phone", "")) - if not phone: - return None - - province = (source.get("province") or "").strip() - city = (source.get("city") or province).strip() - return { - "name": (profile.get("name") or "").strip(), - "law_firm": (profile.get("law_firm") or "").strip(), - "province": province, - "city": city, - "phone": phone, - "url": (source.get("detail_url") or source.get("list_url") or "").strip(), - "domain": LEGACY_DOMAIN, - "create_time": int(record.get("collected_at") or time.time()), - "params": json.dumps(record, ensure_ascii=False), - } - - def _existing_phones_in_db(self, phones: List[str]) -> Set[str]: - if not self.db or not phones: + def _existing_phones(self, phones: List[str]) -> Set[str]: + if not phones: return set() - - deduped = sorted({p for p in phones if p}) - if not deduped: - return set() - existing: Set[str] = set() cur = self.db.db.cursor() try: chunk_size = 500 - for i in range(0, len(deduped), chunk_size): - chunk = deduped[i:i + chunk_size] + for i in range(0, len(phones), chunk_size): + chunk = phones[i:i + chunk_size] placeholders = 
",".join(["%s"] * len(chunk)) sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})" - cur.execute(sql, [LEGACY_DOMAIN, *chunk]) + cur.execute(sql, [DOMAIN, *chunk]) for row in cur.fetchall(): existing.add(row[0]) finally: cur.close() - return existing - def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]: - if not self.db: - return 0, 0 + def _load_areas(self): + condition = "level = 2 and domain='法律快车'" + tables = ("area_new", "area", "area2") + last_error = None + for table in tables: + try: + rows = self.db.select_data(table, "pinyin, province, city", condition) or [] + except Exception as exc: + last_error = exc + continue + if rows: + missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip()) + print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}") + return rows - rows: List[Dict[str, str]] = [] - for record in records: - row = self._to_legacy_lawyer_row(record) - if row: - rows.append(row) - if not rows: - return 0, 0 + if last_error: + print(f"[法律快车] 加载地区数据失败: {last_error}") + print("[法律快车] 无城市数据(已尝试 area_new/area/area2)") + return [] - existing = self._existing_phones_in_db([row["phone"] for row in rows]) - inserted = 0 - skipped = 0 + def _get(self, url: str, max_retries: int = 3) -> Optional[str]: + return self._get_with_session(self.session, url, max_retries=max_retries, is_thread=False) - for row in rows: - phone = row.get("phone", "") - if not phone or phone in existing: - skipped += 1 + def _get_with_session(self, session: requests.Session, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]: + for attempt in range(max_retries): + try: + resp = session.get(url, timeout=15, verify=False) + status_code = resp.status_code + text = resp.text + resp.close() + if status_code == 403: + if attempt < max_retries - 1: + wait_time = 2 ** attempt + random.uniform(0.3, 1.0) + print(f"请求失败 {url}: 403,{wait_time}秒后重试 ({attempt + 1}/{max_retries})") + if is_thread: + self._refresh_thread_session() + session = self._get_thread_session() + else: + self._refresh_session() + session = self.session + time.sleep(wait_time) + continue + print(f"请求失败 {url}: 403 Forbidden") + return None + if status_code >= 400: + raise requests.exceptions.HTTPError(f"{status_code} Error: {url}") + return text + except requests.exceptions.RequestException as exc: + print(f"请求失败 {url}: {exc}") + return None + return None + + def _parse_list(self, html: str, province: str, city: str) -> int: + soup = BeautifulSoup(html, "html.parser") + links = [a.get("href", "") for a in soup.select("a.hide_link")] + links = [link.replace("lll", "int") for link in links if link] + if not links: + return 0 + + detail_urls = [urljoin(DETAIL_BASE, link) for link in links] + + results: List[Dict[str, str]] = [] + with ThreadPoolExecutor(max_workers=self.max_workers) as ex: + futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls] + for fut in as_completed(futs): + try: + data = fut.result() + except Exception as exc: + print(f" 详情解析异常: {exc}") + continue + if data and data.get("phone"): + results.append(data) + + if not results: + return len(detail_urls) + + phones = [d["phone"] for d in results if d.get("phone")] + existing = self._existing_phones(phones) + + for data in results: + phone = data.get("phone") + if not phone: + continue + if phone in existing: + print(f" -- 已存在: {data['name']} ({phone})") continue try: - self.db.insert_data("lawyer", row) - existing.add(phone) - inserted += 1 + 
self.db.insert_data("lawyer", data) + print(f" -> 新增: {data['name']} ({phone})") except Exception as exc: - skipped += 1 - print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}") + print(f" 插入失败 {data.get('url')}: {exc}") - return inserted, skipped + return len(detail_urls) - def crawl( - self, - output_path: str, - max_cities: int = 0, - city_filter: Optional[str] = None, - ) -> None: - cities = self.discover_cities() - print(f"[discover] 共发现城市 {len(cities)} 个") + def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]: + html = None + sess = self._get_thread_session() + html = self._get_with_session(sess, url, max_retries=3, is_thread=True) + if not html: + return None - if city_filter: - key = city_filter.strip().lower() - cities = [ - c for c in cities - if key in c.city_py.lower() or key in c.city_name.lower() - ] - print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}") + soup = BeautifulSoup(html, "html.parser") + text = soup.get_text(" ") - if max_cities > 0: - cities = cities[:max_cities] - print(f"[discover] 截断城市数 {len(cities)}") + name = "" + title_tag = soup.find("title") + if title_tag: + match = re.search(r"(\S+)律师", title_tag.get_text()) + if match: + name = match.group(1) + if not name: + intl_div = soup.find("div", class_="intl") + if intl_div: + match = re.search(r"(\S+)律师", intl_div.get_text()) + if match: + name = match.group(1) - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + phone = "" + phone_pattern = r"1[3-9]\d{9}" + for item in soup.select("div.item.flex"): + label = item.find("div", class_="label") + desc = item.find("div", class_="desc") + if not label or not desc: + continue + label_text = label.get_text() + desc_text = desc.get_text().replace("-", "") + if "联系电话" in label_text or "电话" in label_text: + matches = re.findall(phone_pattern, desc_text) + if matches: + phone = matches[0] + break + if not phone: + matches = re.findall(phone_pattern, text.replace("-", "")) + if matches: + phone = matches[0] + if not phone: + print(f" 无手机号: {url}") + return None - seen_ids: Set[str] = set() - if os.path.exists(output_path): - with open(output_path, "r", encoding="utf-8") as old_file: - for line in old_file: - line = line.strip() - if not line: - continue - try: - item = json.loads(line) - except Exception: - continue - rid = item.get("record_id") - if rid: - seen_ids.add(rid) - print(f"[resume] 已有记录 {len(seen_ids)} 条") + law_firm = "" + for item in soup.select("div.item.flex"): + label = item.find("div", class_="label") + desc = item.find("div", class_="desc") + if not label or not desc: + continue + if "执业律所" in label.get_text() or "律所" in label.get_text(): + law_firm = desc.get_text(strip=True).replace("已认证", "") + break - total_new_json = 0 - total_new_db = 0 - total_skip_db = 0 + params = { + "list_url": url, + "province": province, + "city": city, + } - with open(output_path, "a", encoding="utf-8") as out: - for idx, target in enumerate(cities, start=1): - print( - f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} " - f"({target.city_py})" - ) - city_records = list(self.crawl_city(target)) + return { + "name": name or "", + "law_firm": law_firm, + "province": province, + "city": city, + "phone": phone, + "url": url, + "domain": DOMAIN, + "create_time": int(time.time()), + "params": json.dumps(params, ensure_ascii=False) + } - city_new_json = 0 - for record in city_records: - rid = record["record_id"] - if rid in seen_ids: - continue - out.write(json.dumps(record, 
ensure_ascii=False) + "\n") - seen_ids.add(rid) - city_new_json += 1 - total_new_json += 1 + def run(self): + print("启动法律快车采集...") + areas = self._load_areas() + if not areas: + print("无地区数据") + return - city_new_db, city_skip_db = self._write_records_to_db(city_records) - total_new_db += city_new_db - total_skip_db += city_skip_db - - print( - f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, " - f"DB新增{city_new_db}条, DB跳过{city_skip_db}条" - ) - - print( - f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, " - f"DB跳过{total_skip_db}条, 输出: {output_path}" - ) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="法律快车全新采集脚本(站点数据直采)") - parser.add_argument( - "--output", - default="/www/wwwroot/lawyers/data/lawtime_records_all.jsonl", - help="输出 jsonl 文件路径", - ) - parser.add_argument( - "--max-cities", - type=int, - default=0, - help="最多采集多少个城市,0 表示不限", - ) - parser.add_argument( - "--max-pages", - type=int, - default=9999, - help="每个城市最多采集多少页", - ) - parser.add_argument( - "--city-filter", - default="", - help="按城市拼音或城市名过滤,如 beijing", - ) - parser.add_argument( - "--sleep", - type=float, - default=0.1, - help="详情页请求间隔秒数", - ) - parser.add_argument( - "--direct", - action="store_true", - help="直连模式,不使用 proxy_settings.json 代理", - ) - parser.add_argument( - "--no-db", - action="store_true", - help="只输出 JSONL,不写入数据库", - ) - return parser.parse_args() - - -def main(): - args = parse_args() - - if args.no_db: - crawler = LawtimeCrawler( - max_pages=args.max_pages, - sleep_seconds=args.sleep, - use_proxy=not args.direct, - db_connection=None, - ) - crawler.crawl( - output_path=args.output, - max_cities=args.max_cities, - city_filter=args.city_filter or None, - ) - return - - with Db() as db: - crawler = LawtimeCrawler( - max_pages=args.max_pages, - sleep_seconds=args.sleep, - use_proxy=not args.direct, - db_connection=db, - ) - crawler.crawl( - output_path=args.output, - max_cities=args.max_cities, - city_filter=args.city_filter or None, - ) + for area in areas: + pinyin = area.get("pinyin") + province = area.get("province", "") + city = area.get("city", "") + if not pinyin: + continue + page = 1 + while True: + list_url = LIST_BASE.format(pinyin=pinyin, page=page) + print(f"采集 {province}-{city} 第 {page} 页: {list_url}") + html = self._get(list_url) + if not html: + break + link_count = self._parse_list(html, province, city) + if link_count == 0: + break + page += 1 + print("法律快车采集完成") if __name__ == "__main__": - main() + with Db() as db: + spider = LawtimeSpider(db) + spider.run() diff --git a/common_sites/six4365.py b/common_sites/six4365.py index 84da880..9fb3651 100644 --- a/common_sites/six4365.py +++ b/common_sites/six4365.py @@ -1,17 +1,11 @@ -import argparse -import hashlib import json import os -import random -import re import sys import time -from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Set, Tuple -from urllib.parse import urljoin - -import urllib3 -from bs4 import BeautifulSoup +import random +from typing import Dict, Optional, List, Set +from concurrent.futures import ThreadPoolExecutor, as_completed +import threading current_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.dirname(current_dir) @@ -21,237 +15,165 @@ if request_dir not in sys.path: if project_root not in sys.path: sys.path.append(project_root) -from Db import Db -from request.requests_client import RequestClientError, RequestsClient -from utils.rate_limiter import wait_for_request +import requests +import 
urllib3 +from bs4 import BeautifulSoup +from request.proxy_config import get_proxies, report_proxy_status urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) -SITE_NAME = "64365" -LEGACY_DOMAIN = "律图" -SITE_BASE = "https://m.64365.com" -AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js" -LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/" +from Db import Db -PHONE_RE = re.compile(r"1[3-9]\d{9}") -YEAR_RE = re.compile(r"(\d+)\s*年") +DOMAIN = "律图" +LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/" -@dataclass -class CityTarget: - area_id: str - province_id: str - province_name: str - province_py: str - city_name: str - city_py: str - - -@dataclass -class ListCard: - detail_url: str - name: str - specialties: List[str] - score_text: str - service_text: str - - -def normalize_phone(text: str) -> str: - compact = re.sub(r"\D", "", text or "") - match = PHONE_RE.search(compact) - return match.group(0) if match else "" - - -class Six4365Crawler: - def __init__( - self, - max_pages: int = 9999, - sleep_seconds: float = 0.1, - use_proxy: bool = True, - db_connection=None, - ): - self.max_pages = max_pages - self.sleep_seconds = max(0.0, sleep_seconds) +class Six4365Spider: + def __init__(self, db_connection): self.db = db_connection - self.client = RequestsClient( - headers={ - "User-Agent": ( - "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 " - "Mobile/15E148 Safari/604.1" - ), - "Accept": "text/html, */*; q=0.01", - "Connection": "close", - }, - use_proxy=use_proxy, - retry_total=2, - retry_backoff_factor=1, - retry_status_forcelist=(429, 500, 502, 503, 504), - retry_allowed_methods=("GET", "POST"), - ) + self.session = self._build_session() + self.max_workers = int(os.getenv("SPIDER_WORKERS", "8")) + self._tls = threading.local() + self.cities = self._load_cities() - def _request_text( - self, - method: str, - url: str, - *, - timeout: int = 20, - max_retries: int = 3, - referer: str = SITE_BASE, - data: Optional[Dict] = None, - ) -> str: - headers = {"Referer": referer} - last_error: Optional[Exception] = None + def _build_session(self) -> requests.Session: + report_proxy_status() + session = requests.Session() + session.trust_env = False + proxies = get_proxies() + if proxies: + session.proxies.update(proxies) + else: + session.proxies.clear() + session.headers.update({ + "User-Agent": ( + "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 " + "Mobile/15E148 Safari/604.1" + ), + "Connection": "close", + }) + return session - for attempt in range(max_retries): - wait_for_request() + def _refresh_session(self) -> None: + try: + self.session.close() + except Exception: + pass + self.session = self._build_session() + + def _get_thread_session(self) -> requests.Session: + """requests.Session 不是严格线程安全:每个线程用独立 session(但共享同样代理/headers)""" + s = getattr(self._tls, "session", None) + if s is not None: + return s + s = self._build_session() + s.headers.update(dict(self.session.headers)) + self._tls.session = s + return s + + def _refresh_thread_session(self) -> None: + s = getattr(self._tls, "session", None) + if s is not None: try: - if method.upper() == "POST": - resp = self.client.post_text( - url, - timeout=timeout, - verify=False, - headers=headers, - data=data, - ) - else: - resp = self.client.get_text( - url, - timeout=timeout, - verify=False, - headers=headers, - 
) + s.close() + except Exception: + pass + self._tls.session = None - code = resp.status_code - if code == 403: - if attempt < max_retries - 1: - self.client.refresh() - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - raise RequestClientError(f"{code} Error: {url}") - if code >= 500 and attempt < max_retries - 1: - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - if code >= 400: - raise RequestClientError(f"{code} Error: {url}") - return resp.text + def _existing_urls(self, urls: List[str]) -> Set[str]: + """批量查重,减少 N 次 is_data_exist""" + if not urls: + return set() + existing: Set[str] = set() + cur = self.db.db.cursor() + try: + # IN 参数过多会失败,分批 + chunk_size = 500 + for i in range(0, len(urls), chunk_size): + chunk = urls[i:i + chunk_size] + placeholders = ",".join(["%s"] * len(chunk)) + sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})" + cur.execute(sql, chunk) + for row in cur.fetchall(): + # pymysql 默认返回 tuple + existing.add(row[0]) + finally: + cur.close() + return existing + + def _load_cities(self): + tables = ("area_new", "area2", "area") + last_error = None + for table in tables: + try: + provinces = self.db.select_data( + table, + "id, code, province", + "domain='64365' AND level=1" + ) or [] + cities = self.db.select_data( + table, + "code, city, province, pid", + "domain='64365' AND level=2" + ) or [] except Exception as exc: last_error = exc - if attempt < max_retries - 1: - time.sleep((2 ** attempt) + random.uniform(0.2, 0.8)) - continue - raise + continue - if last_error is not None: - raise last_error - raise RequestClientError(f"Unknown request error: {url}") + if not cities: + continue - def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str: - return self._request_text( - "GET", - url, - timeout=timeout, - max_retries=max_retries, - referer=referer, - ) + province_map = {row.get('id'): row for row in provinces} + data = {} + for city in cities: + province_row = province_map.get(city.get('pid'), {}) or {} + data[str(city.get('code'))] = { + "name": city.get('city'), + "province": city.get('province'), + "province_name": province_row.get('province', city.get('province')), + } + print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}") + return data - def _post_text( - self, - url: str, - *, - data: Dict, - timeout: int = 20, - max_retries: int = 3, - referer: str = SITE_BASE, - ) -> str: - return self._request_text( - "POST", - url, - timeout=timeout, - max_retries=max_retries, - referer=referer, - data=data, - ) + if last_error: + print(f"[律图] 加载地区数据失败: {last_error}") + print("[律图] 无城市数据(已尝试 area_new/area2/area)") + return {} - def _extract_area_data(self, text: str) -> List[Dict]: - match = re.search( - r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData", - text, - re.S, - ) - if not match: - return [] - - raw = match.group(1) - try: - data = json.loads(raw) - except Exception: - return [] - return data if isinstance(data, list) else [] - - def discover_cities(self) -> List[CityTarget]: - text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/") - provinces = self._extract_area_data(text) - - targets: List[CityTarget] = [] - seen_area: Set[str] = set() - - for province in provinces: - province_id = str(province.get("id") or "").strip() - province_name = str(province.get("name") or "").strip() - province_py = str(province.get("py") or "").strip() - child_rows = province.get("child") or [] - - # 常规省份 child 是地级市;直辖市 child 是区县,此时使用省级 id 抓取 - if 
child_rows and any((row.get("child") or []) for row in child_rows): - for city in child_rows: - area_id = str(city.get("id") or "").strip() - city_name = str(city.get("name") or "").strip() - city_py = str(city.get("py") or "").strip() - if not area_id or not city_name: + def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]: + for attempt in range(max_retries): + try: + resp = self.session.post(LIST_URL, data=payload, timeout=10, verify=False) + status_code = resp.status_code + text = resp.text + resp.close() + if status_code == 403: + if attempt < max_retries - 1: + wait_time = 2 ** attempt + random.uniform(0.3, 1.0) + print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})") + self._refresh_session() + time.sleep(wait_time) continue - if area_id in seen_area: - continue - seen_area.add(area_id) - targets.append( - CityTarget( - area_id=area_id, - province_id=province_id, - province_name=province_name, - province_py=province_py, - city_name=city_name, - city_py=city_py, - ) - ) - else: - if not province_id or not province_name: - continue - if province_id in seen_area: - continue - seen_area.add(province_id) - targets.append( - CityTarget( - area_id=province_id, - province_id=province_id, - province_name=province_name, - province_py=province_py, - city_name=province_name, - city_py=province_py, - ) - ) + print("请求失败: 403 Forbidden") + return None + if status_code >= 400: + raise requests.exceptions.HTTPError(f"{status_code} Error") + return text + except requests.exceptions.RequestException as exc: + print(f"请求失败: {exc}") + return None + return None - return targets - - def _build_payload(self, area_id: str, page: int) -> Dict[str, str]: - ua = self.client.headers.get("User-Agent", "") + def _build_payload(self, city_code: str, page: int) -> Dict[str, str]: return { "AdCode": "", - "RegionId": str(area_id), + "RegionId": str(city_code), "CategoryId": "", "MaxNumber": "", "OnlyData": "true", "IgnoreButton": "", - "LawyerRecommendRequest[AreaId]": str(area_id), + "LawyerRecommendRequest[AreaId]": str(city_code), "LawyerRecommendRequest[LawCategoryIds]": "", "LawyerRecommendRequest[LawFirmPersonCount]": "", "LawyerRecommendRequest[LawFirmScale]": "", @@ -268,429 +190,163 @@ class Six4365Crawler: "LawyerRecommendRequest[RefferUrl]": "", "LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/", "LawyerRecommendRequest[resource_type_name]": "", - "LawyerRecommendRequest[UserAgent]": ua, + "LawyerRecommendRequest[UserAgent]": self.session.headers["User-Agent"], "LawyerRecommendRequest[AddLawyerWithNoData]": "false", "ShowCaseButton": "true", } - def fetch_list_html(self, target: CityTarget, page: int) -> str: - payload = self._build_payload(target.area_id, page) - return self._post_text( - LIST_API_URL, - data=payload, - referer=SITE_BASE + "/findlawyer/", - ) - - def parse_list_cards(self, html: str) -> List[ListCard]: + def _parse_list(self, html: str, province: str, city: str) -> int: soup = BeautifulSoup(html, "html.parser") - cards: List[ListCard] = [] - seen: Set[str] = set() + lawyers = soup.find_all("a", class_="lawyer") + if not lawyers: + return 0 - for anchor in soup.select("a.lawyer[href]"): - href = (anchor.get("href") or "").strip() + detail_urls: List[str] = [] + for lawyer in lawyers: + href = lawyer.get("href") if not href: continue - detail_url = urljoin(SITE_BASE, href) - if detail_url in seen: + detail_urls.append(f"{href.rstrip('/')}/info/") + + if not detail_urls: + return 0 + + results: List[Dict[str, str]] = [] + with 
ThreadPoolExecutor(max_workers=self.max_workers) as ex: + futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls] + for fut in as_completed(futs): + try: + data = fut.result() + except Exception as exc: + print(f" 详情解析异常: {exc}") + continue + if data: + results.append(data) + + if not results: + return len(detail_urls) + + existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")]) + for data in results: + if not data: continue - seen.add(detail_url) + url = data.get("url", "") + if not url: + continue + if url in existing: + print(f" -- 已存在URL: {url}") + continue + try: + self.db.insert_data("lawyer", data) + print(f" -> 新增: {data['name']} ({data['phone']})") + except Exception as exc: + print(f" 插入失败 {url}: {exc}") - name = "" - name_tag = anchor.select_one("b.name") - if name_tag: - name = name_tag.get_text(strip=True) + return len(detail_urls) - specialties: List[str] = [] - skill_tag = anchor.select_one("div.skill") - if skill_tag: - raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "") - specialties = [x.strip() for x in re.split(r"[、,,]", raw) if x.strip()] + def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]: + html = self._get_detail(url) + if not html: + return None - score_text = "" - score_tag = anchor.select_one("div.info span[title='评分'] em") - if score_tag: - score_text = score_tag.get_text(strip=True) - - service_text = "" - service_tag = anchor.select_one("div.info") - if service_tag: - service_text = service_tag.get_text(" ", strip=True) - - cards.append( - ListCard( - detail_url=detail_url, - name=name, - specialties=specialties, - score_text=score_text, - service_text=service_text, - ) - ) - - return cards - - def parse_detail(self, detail_url: str) -> Dict: - info_url = detail_url.rstrip("/") + "/info/" - html = self._get_text(info_url, referer=detail_url) soup = BeautifulSoup(html, "html.parser") + base_info = soup.find("ul", class_="intro-basic-bar") + if not base_info: + return None name = "" law_firm = "" phone = "" - practice_years: Optional[int] = None - office_area = "" - address = "" - specialties: List[str] = [] - for li in soup.select("ul.intro-basic-bar li"): - label_tag = li.select_one("span.label") - value_tag = li.select_one("div.txt") - if not label_tag or not value_tag: + for li in base_info.find_all("li"): + label = li.find("span", class_="label") + txt = li.find("div", class_="txt") + if not label or not txt: continue + label_text = label.get_text(strip=True) + if "姓名" in label_text: + name = txt.get_text(strip=True) + if "执业律所" in label_text: + law_firm = txt.get_text(strip=True) - label = label_tag.get_text(" ", strip=True).replace(":", "") - value = value_tag.get_text(" ", strip=True) + more_section = soup.find("div", class_="more-intro-basic") + if more_section: + phone_ul = more_section.find("ul", class_="intro-basic-bar") + if phone_ul: + for li in phone_ul.find_all("li"): + label = li.find("span", class_="label") + txt = li.find("div", class_="txt") + if label and txt and "联系电话" in label.get_text(strip=True): + phone = txt.get_text(strip=True).replace(" ", "") + break - if "姓名" in label and not name: - name = value - elif "执业律所" in label and not law_firm: - law_firm = value - elif "联系电话" in label and not phone: - phone = normalize_phone(value) - elif "执业年限" in label and practice_years is None: - year_match = YEAR_RE.search(value) - if year_match: - try: - practice_years = int(year_match.group(1)) - except Exception: - practice_years = None - elif "办公地区" 
in label and not office_area: - office_area = value - elif "办公地址" in label and not address: - address = value - - text = soup.get_text(" ", strip=True) - if not phone: - phone = normalize_phone(text) - - if not name and soup.title: - title = soup.title.get_text(" ", strip=True) - match = re.search(r"([^\s_,,。]+?)律师", title) - if match: - name = match.group(1).strip() - - skill_match = re.search(r"擅长:([^\n]+)", text) - if skill_match: - specialties = [x.strip() for x in re.split(r"[、,,]", skill_match.group(1)) if x.strip()] - - return { - "name": name, - "law_firm": law_firm, - "phone": phone, - "practice_years": practice_years, - "office_area": office_area, - "address": address, - "specialties": specialties, - "detail_url": detail_url, - "info_url": info_url, - } - - def crawl_city(self, target: CityTarget) -> Iterable[Dict]: - seen_detail_urls: Set[str] = set() - page_first_seen: Set[str] = set() - - for page in range(1, self.max_pages + 1): - try: - html = self.fetch_list_html(target, page) - except Exception as exc: - print(f"[list] 失败 area={target.area_id} p{page}: {exc}") - break - - cards = self.parse_list_cards(html) - if not cards: - break - - first_url = cards[0].detail_url - if first_url in page_first_seen: - break - page_first_seen.add(first_url) - - for card in cards: - if card.detail_url in seen_detail_urls: - continue - seen_detail_urls.add(card.detail_url) - - try: - detail = self.parse_detail(card.detail_url) - except Exception as exc: - print(f"[detail] 失败 {card.detail_url}: {exc}") - continue - - now = int(time.time()) - uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url) - uid = uid_match.group(1) if uid_match else card.detail_url - record_id = hashlib.md5(uid.encode("utf-8")).hexdigest() - - yield { - "record_id": record_id, - "collected_at": now, - "source": { - "site": SITE_NAME, - "province_id": target.province_id, - "province": target.province_name, - "province_py": target.province_py, - "area_id": target.area_id, - "city": target.city_name, - "city_py": target.city_py, - "page": page, - "detail_url": card.detail_url, - "info_url": detail.get("info_url", ""), - }, - "list_snapshot": { - "name": card.name, - "specialties": card.specialties, - "score_text": card.score_text, - "service_text": card.service_text, - }, - "profile": { - "name": detail.get("name") or card.name, - "law_firm": detail.get("law_firm") or "", - "phone": detail.get("phone") or "", - "practice_years": detail.get("practice_years"), - "office_area": detail.get("office_area") or "", - "address": detail.get("address") or "", - "specialties": detail.get("specialties") or card.specialties, - }, - } - - if self.sleep_seconds: - time.sleep(self.sleep_seconds) - - def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]: - source = record.get("source", {}) or {} - profile = record.get("profile", {}) or {} - - phone = normalize_phone(profile.get("phone", "")) - if not phone: + phone = phone.replace('-', '').strip() + if not name or not phone: return None - province = (source.get("province") or "").strip() - city = (source.get("city") or province).strip() - return { - "name": (profile.get("name") or "").strip(), - "law_firm": (profile.get("law_firm") or "").strip(), + data = { + "phone": phone, "province": province, "city": city, - "phone": phone, - "url": (source.get("info_url") or source.get("detail_url") or "").strip(), - "domain": LEGACY_DOMAIN, - "create_time": int(record.get("collected_at") or time.time()), - "params": json.dumps(record, ensure_ascii=False), + "law_firm": 
law_firm, + "url": url, + "domain": DOMAIN, + "name": name, + "create_time": int(time.time()), + "params": json.dumps({"province": province, "city": city}, ensure_ascii=False) } + return data - def _existing_phones_in_db(self, phones: List[str]) -> Set[str]: - if not self.db or not phones: - return set() - - deduped = sorted({p for p in phones if p}) - if not deduped: - return set() - - existing: Set[str] = set() - cur = self.db.db.cursor() - try: - chunk_size = 500 - for i in range(0, len(deduped), chunk_size): - chunk = deduped[i:i + chunk_size] - placeholders = ",".join(["%s"] * len(chunk)) - sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})" - cur.execute(sql, [LEGACY_DOMAIN, *chunk]) - for row in cur.fetchall(): - existing.add(row[0]) - finally: - cur.close() - - return existing - - def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]: - if not self.db: - return 0, 0 - - rows: List[Dict[str, str]] = [] - for record in records: - row = self._to_legacy_lawyer_row(record) - if row: - rows.append(row) - if not rows: - return 0, 0 - - existing = self._existing_phones_in_db([row["phone"] for row in rows]) - inserted = 0 - skipped = 0 - - for row in rows: - phone = row.get("phone", "") - if not phone or phone in existing: - skipped += 1 - continue + def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]: + session = self._get_thread_session() + for attempt in range(max_retries): try: - self.db.insert_data("lawyer", row) - existing.add(phone) - inserted += 1 - except Exception as exc: - skipped += 1 - print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}") - - return inserted, skipped - - def crawl( - self, - output_path: str, - max_cities: int = 0, - city_filter: Optional[str] = None, - ) -> None: - cities = self.discover_cities() - print(f"[discover] 共发现地区 {len(cities)} 个") - - if city_filter: - key = city_filter.strip().lower() - cities = [ - c for c in cities - if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id - ] - print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}") - - if max_cities > 0: - cities = cities[:max_cities] - print(f"[discover] 截断地区数 {len(cities)}") - - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) - - seen_ids: Set[str] = set() - if os.path.exists(output_path): - with open(output_path, "r", encoding="utf-8") as old_file: - for line in old_file: - line = line.strip() - if not line: + resp = session.get(url, timeout=10, verify=False) + status_code = resp.status_code + text = resp.text + resp.close() + if status_code == 403: + if attempt < max_retries - 1: + wait_time = 2 ** attempt + random.uniform(0.3, 1.0) + print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})") + self._refresh_thread_session() + session = self._get_thread_session() + time.sleep(wait_time) continue - try: - item = json.loads(line) - except Exception: - continue - rid = item.get("record_id") - if rid: - seen_ids.add(rid) - print(f"[resume] 已有记录 {len(seen_ids)} 条") + print(" 请求失败: 403 Forbidden") + return None + if status_code >= 400: + raise requests.exceptions.HTTPError(f"{status_code} Error") + return text + except requests.exceptions.RequestException as exc: + print(f" 请求失败: {exc}") + return None + return None - total_new_json = 0 - total_new_db = 0 - total_skip_db = 0 + def run(self): + print("启动律图采集...") + if not self.cities: + print("无城市数据") + return - with open(output_path, "a", encoding="utf-8") as out: - for idx, target in enumerate(cities, start=1): - print( - 
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} " - f"(area={target.area_id})" - ) - city_records = list(self.crawl_city(target)) - - city_new_json = 0 - for record in city_records: - rid = record["record_id"] - if rid in seen_ids: - continue - out.write(json.dumps(record, ensure_ascii=False) + "\n") - seen_ids.add(rid) - city_new_json += 1 - total_new_json += 1 - - city_new_db, city_skip_db = self._write_records_to_db(city_records) - total_new_db += city_new_db - total_skip_db += city_skip_db - - print( - f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, " - f"DB新增{city_new_db}条, DB跳过{city_skip_db}条" - ) - - print( - f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, " - f"DB跳过{total_skip_db}条, 输出: {output_path}" - ) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)") - parser.add_argument( - "--output", - default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl", - help="输出 jsonl 文件路径", - ) - parser.add_argument( - "--max-cities", - type=int, - default=0, - help="最多采集多少个地区,0 表示不限", - ) - parser.add_argument( - "--max-pages", - type=int, - default=9999, - help="每个地区最多采集多少页", - ) - parser.add_argument( - "--city-filter", - default="", - help="按城市名称/拼音/编码过滤", - ) - parser.add_argument( - "--sleep", - type=float, - default=0.1, - help="详情页请求间隔秒数", - ) - parser.add_argument( - "--direct", - action="store_true", - help="直连模式,不使用 proxy_settings.json 代理", - ) - parser.add_argument( - "--no-db", - action="store_true", - help="只输出 JSONL,不写入数据库", - ) - return parser.parse_args() - - -def main(): - args = parse_args() - - if args.no_db: - crawler = Six4365Crawler( - max_pages=args.max_pages, - sleep_seconds=args.sleep, - use_proxy=not args.direct, - db_connection=None, - ) - crawler.crawl( - output_path=args.output, - max_cities=args.max_cities, - city_filter=args.city_filter or None, - ) - return - - with Db() as db: - crawler = Six4365Crawler( - max_pages=args.max_pages, - sleep_seconds=args.sleep, - use_proxy=not args.direct, - db_connection=db, - ) - crawler.crawl( - output_path=args.output, - max_cities=args.max_cities, - city_filter=args.city_filter or None, - ) + for city_code, info in self.cities.items(): + province = info.get("province_name", "") + city = info.get("name", "") + print(f"采集 {province}-{city}") + page = 1 + while True: + payload = self._build_payload(city_code, page) + html = self._post(payload) + if not html: + break + link_count = self._parse_list(html, province, city) + if link_count == 0: + break + page += 1 + print("律图采集完成") if __name__ == "__main__": - main() + with Db() as db: + spider = Six4365Spider(db) + spider.run() diff --git a/common_sites/start.sh b/common_sites/start.sh index 9f849b8..a4d2a7d 100755 --- a/common_sites/start.sh +++ b/common_sites/start.sh @@ -1,80 +1,13 @@ #!/usr/bin/env bash set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" -LOG_DIR="${PROJECT_ROOT}/logs" -DATA_DIR="${PROJECT_ROOT}/data" +# 切换到脚本所在目录,确保相对路径正确 +cd "$(dirname "$0")" -mkdir -p "${LOG_DIR}" "${DATA_DIR}" +echo "使用 request/proxy_settings.json 读取代理配置" -if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then - PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python" -else - PYTHON_BIN="python3" -fi - -RUN_MODE="${RUN_MODE:-parallel}" # parallel | sequential - -echo "[start] project=${PROJECT_ROOT}" -echo "[start] python=${PYTHON_BIN}" -echo "[start] mode=${RUN_MODE}" -echo "[start] proxy=request/proxy_settings.json" - -# 大律师(新结构采集 + 写库)可通过环境变量控制 -DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}" -DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}" -DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}" -DLS_SLEEP="${DLS_SLEEP:-0.2}" -DLS_CITY_FILTER="${DLS_CITY_FILTER:-}" -DLS_EXTRA_ARGS=() - -if [[ "${DLS_MAX_CITIES}" != "0" ]]; then - DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}") -fi -if [[ "${DLS_MAX_PAGES}" != "0" ]]; then - DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}") -fi -if [[ -n "${DLS_CITY_FILTER}" ]]; then - DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}") -fi -DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}") - -if [[ "${DLS_DIRECT:-0}" == "1" ]]; then - DLS_EXTRA_ARGS+=(--direct) -fi -if [[ "${DLS_NO_DB:-0}" == "1" ]]; then - DLS_EXTRA_ARGS+=(--no-db) -fi - -run_bg() { - local name="$1" - shift - local logfile="${LOG_DIR}/${name}.log" - nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 & - echo "[start] ${name} pid=$! log=${logfile}" -} - -run_fg() { - local name="$1" - shift - local logfile="${LOG_DIR}/${name}.log" - echo "[start] ${name} fg log=${logfile}" - env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 -} - -if [[ "${RUN_MODE}" == "sequential" ]]; then - run_fg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}" - run_fg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py" - run_fg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py" - run_fg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py" - run_fg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py" - echo "[done] sequential completed" -else - run_bg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}" - run_bg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py" - run_bg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py" - run_bg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py" - run_bg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py" - echo "[done] all crawlers started in background" -fi +nohup python ../common_sites/dls.py > dls.log 2>&1 & # 大律师 +nohup python ../common_sites/findlaw.py > findlaw.log 2>&1 & # 找法网 +nohup python ../common_sites/lawtime.py > lawtime.log 2>&1 & # 法律快车 +nohup python ../common_sites/six4365.py > six4365.log 2>&1 & # 律图 +nohup python ../common_sites/hualv.py > hualv.log 2>&1 & # 华律 diff --git a/config.py b/config.py index 4afd5ea..f290702 100644 --- a/config.py +++ b/config.py @@ -1,22 +1,119 @@ -# common_sites 独立项目配置 - +# 数据库连接配置 DB_CONFIG = { - "host": "8.134.219.222", - "user": "lawyer", - "password": "CTxr8yGwsSX3NdfJ", - "database": "lawyer", + "host": "8.134.219.222", # 数据库地址 + "user": "lawyer", # 数据库用户名 + "password": "CTxr8yGwsSX3NdfJ", # 数据库密码 + "database": "lawyer", # 数据库名称 "charset": "utf8mb4", } -HEADERS = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", - "Accept": "*/*", - "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7", - "X-Requested-With": "XMLHttpRequest", +# 微信爬虫特定的配置 
+WEIXIN_CONFIG = { + "TOKEN": "756858506", # 您的Token + "FINGERPRINT": "1caa5fc52ac489e20a175e153dd3ef21", + "COOKIE": { + "appmsglist_action_3258147150": "card", + "mm_lang": "zh_CN", + "ts_uid": "8295434560", + "markHashId_L": "417c7f0e-5d9f-4048-b844-28f78ed2a838", + "_qimei_uuid42": "19b0d0b0c2d100de3df57d2afbc5018a9b4ae103e1", + "_qimei_i_3": "59c5508a935b04dac7c1ab340fd172b5a5eba4f7160d5683e2867a5a7094713e616364943989e2a29e9f", + "_qimei_h38": "b885c955f8e9995f103aac140200000421811e", + "_qimei_i_1": "4ddd76d09d525588c892fb6653d17ae9feebf2f0125852d3e78e2c582493206c616333973981e3dd83abc2e0", + "_qpsvr_localtk": "0.2780749298744084", + "RK": "ZGEMOpzbOS", + "ptcz": "90084a2b43c84a92d1b9082da98fd0e92369fcde4f2edbbc85661539c7917055", + "pac_uid": "0_HXj3iphPm0Y4a", + "_qimei_fingerprint": "bd1870aaecd7a9bb84aa53b9ad9a2c55", + "rewardsn": "", + "wxtokenkey": "777", + "omgid": "0_HXj3iphPm0Y4a", + "sig_login": "h01218fdccf5b63c15a6c5edb19ce20d0481c52723ee44ab56b9fc1415ff39c9ff0dd2000e12f1de8ae", + "ua_id": "QXSOTQUjDFjoH63yAAAAAPILc15EwzRTwdqntEiCGSE=", + "mp_token": "1331492699", + "appletToken": "2105598806", + "__wx_phantom_mark__": "breQbE92JS", + "mmad_session": "2bd2e1824d701b521c16fa35de0378e55273ce93a68ac0cc9ca30e8ad5b2e9f6fc419dd5fed1cd17f0a57fc3c327e03ccf325c1e1e97dde41374a9d8067d9aa700c8b87a29b0d3caf7f949761d8f4eeb56a1e3ddbc5a5d3a573e5b83971cd92e11de1c56c245721266e7088080fefde3", + "pgv_info": "ssid=s5739471549", + "pgv_pvid": "2616937300", + "_gcl_au": "1.1.954868153.1769494261", + "wxuin": "69676812527831", + "_clck": "3258147150|1|g35|0", + "uuid": "e07aa2889db56b1901e1fb6b1286d9a7", + "rand_info": "CAESIBnfIxLJoUVe5wP4SI/ADWnrnPUBlJDb4yyA7Et1+ZfF", + "slave_bizuin": "3258147150", + "data_bizuin": "3258147150", + "bizuin": "3258147150", + "data_ticket": "kv+SnLJADgPlcKQPIbYnfbEAxogpIMfAo/n0/HjtChnfDmQSogWvkO82/mUtzpcc", + "slave_sid": "eFNMcEZ3bnhvRkppZVNkTDE4dFFnM0ZzdFM1REhpemZORHRnVnlnRHhKU29vY1ZBY0dJZkFHcXB5Nko4aV9pbVlnRTBRVDE0NzdIUDF4T3NTSDVzdXBJS2d3WFFuR3hiMWVVbG5ZTURfYmh3YTFTallIb2JXOWpyTWxXS25jbVFRVmtXWHVaWGdCN2lqZzVm", + "slave_user": "gh_fe76760560d0", + "xid": "34f577adf2c28e5b9f04de93c614c5c4", + "_clsk": "639w4k|1769742296130|3|1|mp.weixin.qq.com/weheat-agent/payload/record" + }, + "COUNT": 20, # 单页条数 + "REQUESTS_PER_SECOND": 8, # 每秒最大请求数(调高更快,但有风控风险) + "PAGE_DELAY": 0.8, # 每页采集后的等待秒数 + "CITY_DELAY": 0.3, # 每城市采集后的等待秒数 } +# 通用请求头 +HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36', + 'Accept': '*/*', + 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7', + 'X-Requested-With': 'XMLHttpRequest', +} + +# 法律快车爬虫配置 LAWTIME_CONFIG = { "HEADERS": { "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1" } } + +# Redis配置 - 用于采集索引和断点恢复 +REDIS_CONFIG = { + "host": "127.0.0.1", + "port": 6379, + "password": "", + "db": 0, # 使用数据库0 + "decode_responses": True, # 自动解码响应 + "socket_timeout": 5, # 连接超时时间 + "socket_connect_timeout": 5, # 连接建立超时时间 + "health_check_interval": 30, # 健康检查间隔 + "retry_on_timeout": True, # 超时重试 + "max_connections": 20, # 最大连接数 +} + +# Redis键名配置 +REDIS_KEYS = { + "spider_progress": "lawyer:spider:progress:{spider_name}", # 爬虫进度 + "url_processed": "lawyer:url:processed:{spider_name}", # 已处理URL集合 + "url_failed": "lawyer:url:failed:{spider_name}", # 失败URL集合 + "spider_stats": "lawyer:stats:{spider_name}", # 爬虫统计信息 + "global_stats": "lawyer:global:stats", # 全局统计 + 
"session_info": "lawyer:session:{session_id}", # 会话信息 + "url_queue": "lawyer:queue:{spider_name}", # URL队列 + "duplicate_filter": "lawyer:duplicate:{spider_name}", # 去重过滤器 +} + +# MongoDB配置 - 用于日志存储 +MONGO_CONFIG = { + "uri": "mongodb://127.0.0.1:27017/", + "database": "lawyer", + "collections": { + "logs": "logs", # 通用日志 + "spider_logs": "spider_logs", # 爬虫专用日志 + "error_logs": "error_logs", # 错误日志 + "system_logs": "system_logs", # 系统日志 + "performance_logs": "performance_logs" # 性能日志 + }, + "options": { + "maxPoolSize": 10, # 连接池最大连接数 + "minPoolSize": 1, # 连接池最小连接数 + "maxIdleTimeMS": 30000, # 最大空闲时间 + "serverSelectionTimeoutMS": 5000, # 服务器选择超时 + "connectTimeoutMS": 10000, # 连接超时 + "socketTimeoutMS": 30000, # Socket超时 + } +} diff --git a/request/__init__.py b/request/__init__.py index 87b6c07..1918e15 100644 --- a/request/__init__.py +++ b/request/__init__.py @@ -1,19 +1 @@ -from request.requests_client import ( - RequestClientError, - RequestConnectTimeout, - RequestConnectionError, - RequestSSLError, - RequestTimeout, - RequestsClient, - ResponseData, -) - -__all__ = [ - "RequestsClient", - "ResponseData", - "RequestClientError", - "RequestConnectTimeout", - "RequestTimeout", - "RequestConnectionError", - "RequestSSLError", -] +# Package marker for request utilities. diff --git a/requirements.txt b/requirements.txt index 6a3fec7..8bf4aa0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,18 @@ +# 数据库驱动 pymysql>=1.0.2 +pymongo>=4.0.0 + +# 调度器 +schedule>=1.2.0 + +# 其他可能需要的依赖 requests>=2.28.0 beautifulsoup4>=4.11.0 -urllib3>=1.26.0 lxml>=4.9.0 -openpyxl>=3.1.0 +redis>=4.0.0 +pyppeteer>=1.0.2 +# 可选:提升反检测能力 +pyppeteer-stealth>=2.7.4 + +# 日志相关 +python-dateutil>=2.8.2 diff --git a/weixin.py b/weixin.py new file mode 100644 index 0000000..59bf18d --- /dev/null +++ b/weixin.py @@ -0,0 +1,355 @@ +import json +import os +import re +import sys +import time +from html import unescape +from http.cookies import SimpleCookie +from typing import Dict, Optional +from urllib.parse import urlencode + +import requests +import urllib3 + +current_dir = os.path.dirname(os.path.abspath(__file__)) +project_root = os.path.dirname(current_dir) +for path in (current_dir, project_root): + if path not in sys.path: + sys.path.append(path) + +import config as project_config +from utils.rate_limiter import wait_for_request, global_rate_limiter + +API_ENDPOINT = "https://mp.weixin.qq.com/cgi-bin/videosnap" +DOMAIN = "mp.weixin.qq.com" +DEFAULT_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/138.0.0.0 Safari/537.36" + ), + "Accept": "*/*", + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7", + "X-Requested-With": "XMLHttpRequest", +} +DEFAULT_WEIXIN_CONFIG = { + "TOKEN": "32299576", + "FINGERPRINT": "64a1c659b8b944d6e7fe596b0794ab35", + "COOKIE": { + "appmsglist_action_3876849679": "card", + "mm_lang": "zh_CN", + "ts_uid": "8295434560", + "markHashId_L": "417c7f0e-5d9f-4048-b844-28f78ed2a838", + "_qimei_uuid42": "19b0d0b0c2d100de3df57d2afbc5018a9b4ae103e1", + "_qimei_i_3": "59c5508a935b04dac7c1ab340fd172b5a5eba4f7160d5683e2867a5a7094713e616364943989e2a29e9f", + "_qimei_h38": "b885c955f8e9995f103aac140200000421811e", + "RK": "ZGEMOpzbOS", + "ptcz": "90084a2b43c84a92d1b9082da98fd0e92369fcde4f2edbbc85661539c7917055", + "pac_uid": "0_HXj3iphPm0Y4a", + "_qimei_fingerprint": "bd1870aaecd7a9bb84aa53b9ad9a2c55", + "wxuin": "70085167371972", + "omgid": "0_HXj3iphPm0Y4a", + "rewardsn": "", + "wxtokenkey": "777", + "sig_login": 
"h017c22e8921e6bf5a1f8659d9f34ee0db2be31cdcf03786b9ab4b787a9821ad84d3046473d9076181a", + "_qpsvr_localtk": "0.9079082151544442", + "appletToken": "880792228", + "mmad_session": "ae5215dd3c930e6256d8f0656bd8497e719817e0df77a677766e128e2135218486f674b88b349db0d47039f54cb99c8753beb8d4b921ae452b66773db51ad3006ab1f0d19253ae83e2cb9ba53ff5b5b4f45f2fe160db66fd300a1fb4e04a92bd11de1c56c245721266e7088080fefde3", + "qq_domain_video_guid_verify": "6cce52525a146907", + "_qimei_q36": "", + "pgv_info": "ssid=s4741843528", + "pgv_pvid": "9337874960", + "_qimei_i_2": "47e96bdff700", + "_qimei_i_1": "40bb51d09d525588c892fb6653d17ae9feebf2f0125852d3e78e2c582493206c616333973981e3dd838fd0da", + "_qimei_q32": "", + "mp_token": "1555009133", + "ua_id": "390pNywJFJA6BsgOAAAAADO0TqlmW7NBB1GD0Y7OVwk=", + "__wx_phantom_mark__": "UTRZE71JZ7", + "_clck": "3841887471|1|g4a|0", + "uuid": "6ae7cb97104627c5d3b9d1d9ab2eef60", + "rand_info": "CAESIGjvJyiJ58Ii0enQVKBwl6d4IyCrWeN7kzhIAVTgM2lc", + "slave_bizuin": "3876849679", + "data_bizuin": "3876849679", + "bizuin": "3876849679", + "data_ticket": "8wg11/LIrTLHAbJdbAH2HWdqlW/K2jijwP27oPSrH2myYNpuSR1NedfmSbzeq5go", + "slave_sid": "TjBzVV83WThEaThRdUhlcFpqRFhQejFSUzRfOWdGa0l3S0dPSW41QWdkSk9qSkQ2ZTljbWRHa0poQ1lNTXlub25WMUJORVluVU5HaFBGRXVJS19yeG53SUNWWU14YjNQeWpxTUczalBHV1dTY0V3TDZ6aE14bFNaS2ExeGNhb3J0WlRWMlM4NnNmNGFST0ZD", + "slave_user": "gh_6c1283858808", + "xid": "116378d10877a35558158970698ca0c3", + "_clsk": "3okzsf|1773282377657|6|1|mp.weixin.qq.com/weheat-agent/payload/record" + }, + "COUNT": 20, + "REQUESTS_PER_SECOND": 5, + "PAGE_DELAY": 5, + "CITY_DELAY": 2, +} + + +def _parse_cookie_value(cookie_value) -> Dict[str, str]: + if isinstance(cookie_value, dict): + return {str(key): str(value) for key, value in cookie_value.items()} + + if not cookie_value: + return {} + + if isinstance(cookie_value, str): + text = cookie_value.strip() + if not text: + return {} + try: + parsed = json.loads(text) + except json.JSONDecodeError: + parsed = None + if isinstance(parsed, dict): + return {str(key): str(value) for key, value in parsed.items()} + + cookie = SimpleCookie() + cookie.load(text) + return {key: morsel.value for key, morsel in cookie.items()} + + return {} + + +def _load_weixin_config() -> Dict: + config = DEFAULT_WEIXIN_CONFIG.copy() + module_config = getattr(project_config, "WEIXIN_CONFIG", None) + if isinstance(module_config, dict): + config.update(module_config) + + env_mapping = { + "TOKEN": os.getenv("WEIXIN_TOKEN"), + "FINGERPRINT": os.getenv("WEIXIN_FINGERPRINT"), + "COOKIE": os.getenv("WEIXIN_COOKIE"), + "COUNT": os.getenv("WEIXIN_COUNT"), + "REQUESTS_PER_SECOND": os.getenv("WEIXIN_REQUESTS_PER_SECOND"), + "PAGE_DELAY": os.getenv("WEIXIN_PAGE_DELAY"), + "CITY_DELAY": os.getenv("WEIXIN_CITY_DELAY"), + } + for key, value in env_mapping.items(): + if value not in (None, ""): + config[key] = value + + config["COOKIE"] = _parse_cookie_value(config.get("COOKIE")) + + for key in ("COUNT", "REQUESTS_PER_SECOND"): + try: + config[key] = int(config[key]) + except (TypeError, ValueError): + config[key] = DEFAULT_WEIXIN_CONFIG[key] + + for key in ("PAGE_DELAY", "CITY_DELAY"): + try: + config[key] = float(config[key]) + except (TypeError, ValueError): + config[key] = DEFAULT_WEIXIN_CONFIG[key] + + return config + + +def _strip_html(text: str) -> str: + if not text: + return "" + return re.sub(r"<[^>]+>", "", unescape(text)).strip() + + +class WeixinSpider: + """基于 requests 的微信视频号采集器""" + + def __init__(self, db_connection): + self.db = db_connection + self.config = 
_load_weixin_config() + self.token = str(self.config.get("TOKEN", "")).strip() + self.fingerprint = str(self.config.get("FINGERPRINT", "")).strip() + self.cookies = self.config.get("COOKIE", {}) + self.count = str(self.config.get("COUNT", DEFAULT_WEIXIN_CONFIG["COUNT"])) + self.page_delay = max(0.0, float(self.config.get("PAGE_DELAY", DEFAULT_WEIXIN_CONFIG["PAGE_DELAY"]))) + self.city_delay = max(0.0, float(self.config.get("CITY_DELAY", DEFAULT_WEIXIN_CONFIG["CITY_DELAY"]))) + max_rps = self.config.get("REQUESTS_PER_SECOND") + if max_rps: + global_rate_limiter.max_requests = int(max_rps) + + headers = getattr(project_config, "HEADERS", DEFAULT_HEADERS).copy() + headers["Referer"] = "https://mp.weixin.qq.com/" + self.session = requests.Session() + self.session.trust_env = False + self.session.headers.update(headers) + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + def _validate_runtime_config(self) -> bool: + missing = [] + if not self.token: + missing.append("TOKEN") + if not self.fingerprint: + missing.append("FINGERPRINT") + if not self.cookies: + missing.append("COOKIE") + + if not missing: + return True + + print( + "[微信] 配置不完整,缺少: " + + ", ".join(missing) + + "。请在 config.py 的 WEIXIN_CONFIG 中补齐," + + "或通过环境变量 WEIXIN_TOKEN / WEIXIN_FINGERPRINT / WEIXIN_COOKIE 提供。" + ) + return False + + def _load_areas(self): + condition = "domain='maxlaw' AND level=2" + tables = ("area_new", "area", "area2") + last_error = None + for table in tables: + try: + rows = self.db.select_data(table, "province, city", condition) or [] + except Exception as exc: + last_error = exc + continue + if rows: + print(f"[微信] 城市来源表: {table}, 城市数: {len(rows)}") + return rows + + if last_error: + print(f"[微信] 加载地区数据失败: {last_error}") + print("[微信] 无城市数据(已尝试 area_new/area/area2)") + return [] + + def _build_query_url(self, query: str, buffer: str) -> str: + params = { + "action": "search", + "scene": "1", + "query": query, + "count": self.count, + "buffer": buffer, + "fingerprint": self.fingerprint, + "token": self.token, + "lang": "zh_CN", + "f": "json", + "ajax": "1", + } + return f"{API_ENDPOINT}?{urlencode(params)}" + + def _extract_phone(self, text: str) -> Optional[str]: + if not text: + return None + match = re.search(r"1[3-9]\d{9}", text) + return match.group(0) if match else None + + def _parse_name(self, acct: Dict) -> str: + highlight = _strip_html(acct.get("highlight_nickname", "")) + if highlight: + return highlight + return _strip_html(acct.get("nickname", "")) + + def _store_account(self, acct: Dict, province: str, city: str) -> None: + signature = acct.get("signature", "") + phone = self._extract_phone(signature) + if not phone: + return + + if self.db.is_data_exist("lawyer", f"phone='{phone}' and domain='{DOMAIN}'"): + name = self._parse_name(acct) + print(f" -- 已存在律师: {name} ({phone})") + return + + params = json.dumps(acct, ensure_ascii=False) + lawyer_data = { + "phone": phone, + "province": province, + "city": city, + "law_firm": acct.get("auth_info", {}).get("auth_profession"), + "url": f"https://channels.weixin.qq.com/finder/creator/feeds?finder_username={acct.get('username', '')}", + "create_time": int(time.time()), + "domain": DOMAIN, + "name": self._parse_name(acct), + "params": params, + } + + try: + inserted_id = self.db.insert_data("lawyer", lawyer_data) + print(f" -> 新增律师: {lawyer_data['name']} ({phone}), ID: {inserted_id}") + except Exception as exc: + print(f" 插入失败 {lawyer_data['name']} ({phone}): {exc}") + + def _search_city(self, province: str, city: str) -> None: 
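+        # Query the search API with "<city>律所", paging via the `buffer` cursor until
+        # `acct_continue_flag` is falsy; each returned account is passed to _store_account(),
+        # which keeps it only when a mobile number can be extracted from its signature.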
+ city_name = city.replace('市', '') + query = f"{city_name}律所" + print(f"--- [微信] 开始采集城市: {province} - {city_name} ---") + + buffer = "" + has_more = True + page_no = 0 + + while has_more: + page_no += 1 + url = self._build_query_url(query, buffer) + print(f"正在采集 '{query}' 第 {page_no} 页: {url}") + + wait_for_request() + try: + response = self.session.get( + url, + timeout=15, + cookies=self.cookies, + proxies={}, # 明确禁用代理 + verify=False, + ) + response.raise_for_status() + data = response.json() + except requests.exceptions.RequestException as exc: + print(f"网络请求失败: {exc}") + break + except json.JSONDecodeError: + print("解析返回的JSON失败。返回内容:", response.text[:200]) + break + + base_resp = data.get("base_resp", {}) + if base_resp.get("ret") != 0: + print(f"API返回错误: {base_resp.get('err_msg')}") + if "invalid ticket" in (base_resp.get('err_msg') or ""): + print("Token 或 Cookie 可能失效,请更新配置。") + break + + accounts = data.get("acct_list", []) + if not accounts: + print("本页未找到更多律师信息。") + break + + for acct in accounts: + self._store_account(acct, province, city_name) + + has_more = bool(data.get("acct_continue_flag")) + buffer = data.get("last_buff", "") + time.sleep(self.page_delay) + + print(f"--- [微信] 城市: {city_name} 采集完成 ---\n") + + def run(self) -> None: + print("启动微信视频号律师信息采集...") + if not self._validate_runtime_config(): + return + + areas = self._load_areas() + if not areas: + print("[微信] 未能从 `area_new` 表获取到地区信息。") + return + + for area in areas: + province = area.get("province", "") + city = area.get("city", "") + if not city: + continue + try: + self._search_city(province, city) + except Exception as exc: + print(f"采集 {province}-{city} 时发生错误: {exc}") + time.sleep(self.city_delay) + + print("微信视频号律师信息采集完成。") + + +if __name__ == "__main__": + from Db import Db + + with Db() as db: + spider = WeixinSpider(db) + spider.run()
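
Note: the Redis settings added to `config.py` (`REDIS_CONFIG` / `REDIS_KEYS`) are not consumed by any spider in this patch yet. The snippet below is a minimal sketch of how the key templates could back the duplicate filter and checkpoint/resume features they describe; `mark_processed` and `save_progress` are hypothetical helpers, not part of this change.

```python
# Illustrative only -- these helpers are NOT part of this patch.
import json

import redis

from config import REDIS_CONFIG, REDIS_KEYS

# REDIS_CONFIG keys map directly onto redis-py constructor kwargs.
r = redis.Redis(**REDIS_CONFIG)


def mark_processed(spider_name: str, url: str) -> bool:
    """Add a URL to the spider's processed set; True means it was not seen before."""
    key = REDIS_KEYS["url_processed"].format(spider_name=spider_name)
    return r.sadd(key, url) == 1


def save_progress(spider_name: str, progress: dict) -> None:
    """Persist a resumable checkpoint (e.g. current city/page index) as JSON."""
    key = REDIS_KEYS["spider_progress"].format(spider_name=spider_name)
    r.set(key, json.dumps(progress, ensure_ascii=False))
```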