diff --git a/README.md b/README.md index aaeae77..32cb778 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,49 @@ pip install -r requirements.txt bash common_sites/start.sh ``` +## 拆分运行(直连/代理) + +本仓库支持用环境变量 `PROXY_ENABLED` 在一次运行内强制开/关代理: + +- **直连**:`PROXY_ENABLED=0`(不使用代理 IP) +- **代理**:`PROXY_ENABLED=1`(强制使用 `request/proxy_settings.json` 的代理配置) +- **默认**:不设置(跟随 `request/proxy_settings.json` 的 `enabled` 字段) + +对应提供两套入口脚本: + +```bash +# 直连(默认包含:大律师/大律师PC/华律/律图) +bash common_sites/start_direct_twice_weekly.sh + +# 代理(默认全量采集:大律师/大律师PC/找法网/法律快车/华律/律图) +bash common_sites/start_proxy_weekly.sh +``` + +## cron 示例(每周两次直连 + 每周一次代理) + +> 下面仅给示例,你可以按机器负载调整时间;日志会输出到 `common_sites/*.log`。 + +```bash +# 编辑定时任务 +crontab -e + +# 每周二、周五 02:10 直连跑一次 +10 2 * * 2,5 cd /www/wwwroot/lawyers && bash common_sites/start_direct_twice_weekly.sh + +# 每周日 03:20 走代理跑一次(你手动续费代理 IP) +20 3 * * 0 cd /www/wwwroot/lawyers && bash common_sites/start_proxy_weekly.sh +``` + +### 常用参数(可选) + +```bash +# 限流(跨进程共享),直连可适当调高,代理建议保守 +export PROXY_MAX_REQUESTS_PER_SECOND=8 + +# 代理连通性输出(部分脚本会打印测试信息) +export PROXY_TEST=1 +``` + ## 说明 - 当前项目直接复用原项目数据库配置和代理配置。 diff --git a/common_sites/dls.py b/common_sites/dls.py index 4cca085..7951d0e 100644 --- a/common_sites/dls.py +++ b/common_sites/dls.py @@ -86,11 +86,12 @@ class DlsSpider: test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy") timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10")) try: - resp = session.get( - test_url, - timeout=timeout, - headers={"Connection": "close"}, - ) + with request_slot(): + resp = session.get( + test_url, + timeout=timeout, + headers={"Connection": "close"}, + ) print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}") except Exception as exc: print(f"[proxy] test failed: {exc}") diff --git a/common_sites/smoke_test_sites.py b/common_sites/smoke_test_sites.py new file mode 100644 index 0000000..cb41b8c --- /dev/null +++ b/common_sites/smoke_test_sites.py @@ -0,0 +1,220 @@ +import json +import os +import sys +import
time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +current_dir = os.path.dirname(os.path.abspath(__file__)) +project_root = os.path.dirname(current_dir) +request_dir = os.path.join(project_root, "request") +if request_dir not in sys.path: + sys.path.insert(0, request_dir) +if project_root not in sys.path: + sys.path.append(project_root) + +import requests + +from request.proxy_config import get_proxies, report_proxy_status + + +@dataclass +class CheckResult: + site: str + url: str + method: str + ok: bool + status_code: Optional[int] + error: str + hint: str + elapsed_ms: int + + +def _now_ms() -> int: + return int(time.time() * 1000) + + +def _short_hint(text: str) -> str: + s = (text or "").strip().lower() + flags = [] + for key, label in [ + ("403", "403"), + ("429", "429"), + ("captcha", "captcha"), + ("验证码", "captcha_cn"), + ("人机", "bot_check_cn"), + ("access denied", "access_denied"), + ("forbidden", "forbidden"), + ("too many requests", "rate_limited"), + ("cloudflare", "cloudflare"), + ("challenge", "challenge"), + ]: + if key in s: + flags.append(label) + return ",".join(flags)[:120] + + +def _build_session() -> requests.Session: + report_proxy_status() + s = requests.Session() + s.trust_env = False + proxies = get_proxies() + if proxies: + s.proxies.update(proxies) + else: + s.proxies.clear() + s.headers.update( + { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/136.0.0.0 Safari/537.36" + ), + "Accept": "*/*", + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", + "Connection": "close", + } + ) + return s + + +def _check( + session: requests.Session, + *, + site: str, + method: str, + url: str, + timeout: Tuple[float, float] = (10.0, 15.0), + headers: Optional[Dict[str, str]] = None, + data: Optional[Dict[str, Any]] = None, +) -> CheckResult: + start = _now_ms() + try: + resp = session.request( + method=method, + url=url, + timeout=timeout, + 
verify=False, + headers=headers, + data=data, + ) + text = resp.text or "" + status = resp.status_code + hint = _short_hint(text[:1200]) + ok = 200 <= status < 400 + return CheckResult( + site=site, + url=url, + method=method, + ok=ok, + status_code=status, + error="", + hint=hint, + elapsed_ms=_now_ms() - start, + ) + except Exception as exc: + return CheckResult( + site=site, + url=url, + method=method, + ok=False, + status_code=None, + error=str(exc)[:200], + hint="", + elapsed_ms=_now_ms() - start, + ) + finally: + try: + resp.close() # type: ignore[name-defined] + except Exception: + pass + + +def _tests() -> List[Dict[str, Any]]: + # 每个站点选一个“代表性列表/API”作为冒烟:能快速暴露 403/验证码/限频。 + return [ + { + "site": "大律师(m站)", + "method": "GET", + "url": "https://m.maxlaw.cn/", + }, + { + "site": "大律师(PC站)", + "method": "GET", + "url": "https://www.maxlaw.cn/law/beijing?page=1", + "headers": {"Referer": "https://www.maxlaw.cn/"}, + }, + { + "site": "找法网(m站)", + "method": "GET", + "url": "https://m.findlaw.cn/beijing/q_lawyer/p1?ajax=1&order=0&sex=-1", + "headers": { + "Referer": "https://m.findlaw.cn/beijing/q_lawyer/", + "X-Requested-With": "XMLHttpRequest", + "Accept": "application/json, text/javascript, */*; q=0.01", + }, + }, + { + "site": "法律快车(m站)", + "method": "GET", + "url": "https://m.lawtime.cn/beijing/lawyer/?page=1", + }, + { + "site": "律图(m站)", + "method": "POST", + "url": "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/", + "data": { + "RegionId": "110100", # 北京市 + "OnlyData": "true", + "LawyerRecommendRequest[AreaId]": "110100", + "LawyerRecommendRequest[PageIndex]": "1", + "LawyerRecommendRequest[PageSize]": "10", + "LawyerRecommendRequest[OrderType]": "0", + "LawyerRecommendRequest[Type]": "1", + }, + }, + { + "site": "华律(m站)", + "method": "POST", + "url": "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/", + "data": { + "pid": "110000", # 北京 + "cid": "110100", # 北京市 + "page": "1", + }, + }, + ] + + +def main() -> int: + mode = 
os.getenv("PROXY_ENABLED") + print(f"[smoke] PROXY_ENABLED={mode!r}") + s = _build_session() + results: List[CheckResult] = [] + for item in _tests(): + res = _check( + s, + site=item["site"], + method=item["method"], + url=item["url"], + headers=item.get("headers"), + data=item.get("data"), + ) + results.append(res) + print( + f"[smoke] {res.site} {res.method} {res.status_code} ok={res.ok} " + f"{res.elapsed_ms}ms hint={res.hint or '-'} err={res.error or '-'}" + ) + time.sleep(0.3) + + summary = { + "proxy_enabled": mode, + "results": [res.__dict__ for res in results], + } + print("[smoke] summary_json=" + json.dumps(summary, ensure_ascii=False)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/common_sites/start.sh b/common_sites/start.sh index f117cbc..60d42e8 100755 --- a/common_sites/start.sh +++ b/common_sites/start.sh @@ -6,11 +6,27 @@ cd "$(dirname "$0")" echo "使用 request/proxy_settings.json 读取代理配置" export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}" +export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}" + +is_job_running() { + local script="$1" + local script_regex="${script//./\\.}" + pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true +} start_job() { local script="$1" local log_file="$2" local label="$3" + local existing + + existing="$(is_job_running "${script}")" + if [[ -n "${existing}" ]]; then + echo "跳过 ${label}: ${script} 已在运行" + echo "${existing}" | head -n 1 + return 0 + fi + nohup python "../common_sites/${script}" > "${log_file}" 2>&1 & echo "启动 ${label}: ${script} -> ${log_file}" sleep 1 diff --git a/common_sites/start_direct_twice_weekly.sh b/common_sites/start_direct_twice_weekly.sh new file mode 100755 index 0000000..75ad938 --- /dev/null +++ b/common_sites/start_direct_twice_weekly.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -euo pipefail + +# 切换到脚本所在目录,确保相对路径正确 +cd "$(dirname "$0")" + +# 强制直连:不使用代理 IP +export PROXY_ENABLED=0 + +# 
直连模式建议更保守一些,降低被临时风控的概率 +export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}" +export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}" + +is_job_running() { + local script="$1" + local script_regex="${script//./\\.}" + pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true +} + +start_job() { + local script="$1" + local log_file="$2" + local label="$3" + local existing + + existing="$(is_job_running "${script}")" + if [[ -n "${existing}" ]]; then + echo "跳过 ${label}: ${script} 已在运行" + echo "${existing}" | head -n 1 + return 0 + fi + + nohup python "../common_sites/${script}" > "${log_file}" 2>&1 & + echo "启动 ${label}: ${script} -> ${log_file}" + sleep 1 +} + +echo "直连模式(PROXY_ENABLED=0),每周两次建议用 cron 调度" +echo "当前归入直连组:大律师(m/PC)、华律、律图" + +# 直连优先站点: +# - 大律师(m站/PC站):当前可直接访问,未见明显强风控 +# - 华律:当前网页可直接访问,未见明显强风控 +# - 律图:当前网页可直接访问,未见明显强风控 +start_job "dls.py" "direct_dls.log" "大律师(直连)" +start_job "dls_pc.py" "direct_dls_pc.log" "大律师PC站(直连)" +start_job "hualv.py" "direct_hualv.log" "华律(直连)" +start_job "six4365.py" "direct_six4365.log" "律图(直连)" diff --git a/common_sites/start_proxy_weekly.sh b/common_sites/start_proxy_weekly.sh new file mode 100755 index 0000000..d7da22e --- /dev/null +++ b/common_sites/start_proxy_weekly.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +# 切换到脚本所在目录,确保相对路径正确 +cd "$(dirname "$0")" + +# 强制开启代理:用于容易被限频/拦截的站点 +export PROXY_ENABLED=1 + +# 代理模式下默认更保守一点,避免冲爆代理与触发风控 +export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}" +export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}" + +# 可选:开启代理连通性测试输出(部分脚本会打印测试信息/代理状态) +export PROXY_TEST="${PROXY_TEST:-0}" + +is_job_running() { + local script="$1" + local script_regex="${script//./\\.}" + pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true +} + +start_job() { + local script="$1" + local log_file="$2" + local label="$3" + local existing + + existing="$(is_job_running "${script}")" + if [[ 
-n "${existing}" ]]; then + echo "跳过 ${label}: ${script} 已在运行" + echo "${existing}" | head -n 1 + return 0 + fi + + nohup python "../common_sites/${script}" > "${log_file}" 2>&1 & + echo "启动 ${label}: ${script} -> ${log_file}" + sleep 1 +} + +echo "代理模式(PROXY_ENABLED=1),每周一次建议用 cron 调度" +echo "代理配置读取自 request/proxy_settings.json" +echo "每周一次代理任务 = 全量采集所有站点" + +# 每周一次代理任务做全量采集: +# - 强风控/更敏感站点:找法网、法律快车 +# - 其余站点也一并跑,保证每周至少有一次“全量最新数据”刷新 +start_job "dls.py" "proxy_dls.log" "大律师(代理全量)" +start_job "dls_pc.py" "proxy_dls_pc.log" "大律师PC站(代理全量)" +start_job "findlaw.py" "proxy_findlaw.log" "找法网(代理)" +start_job "lawtime.py" "proxy_lawtime.log" "法律快车(代理)" +start_job "hualv.py" "proxy_hualv.log" "华律(代理全量)" +start_job "six4365.py" "proxy_six4365.log" "律图(代理全量)" diff --git a/compare_firm_phones.py b/compare_firm_phones.py new file mode 100644 index 0000000..d2da77e --- /dev/null +++ b/compare_firm_phones.py @@ -0,0 +1,565 @@ +#!/usr/bin/env python3 +import argparse +import re +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Sequence, Tuple + +import pymysql +from openpyxl import Workbook, load_workbook +from openpyxl.styles import Font + +from config import DB_CONFIG + + +@dataclass(frozen=True) +class LawyerRecord: + id: int + name: str + phone: str + law_firm: str + province: str + city: str + domain: str + create_time: int + + +@dataclass(frozen=True) +class PhoneBackfill: + matched_phones: List[str] + records: List[LawyerRecord] + best_name: str + best_law_firm: str + best_domain: str + candidate_names: List[str] + candidate_firms: List[str] + candidate_domains: List[str] + + +DOMAIN_PRIORITY = { + "华律": 90, + "大律师": 85, + "找法网": 82, + "法律快车": 80, + "律图": 72, + "众法利单页": 68, + "众法利": 66, + "六四三六五": 64, + "智飞律师在线": 40, + "高德地图": 10, +} + +GENERIC_FIRMS = {"高德搜索"} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="按律所名从数据库补手机号并导出对比表") + 
parser.add_argument("--input", default="man.xlsx", help="原始 xlsx 文件路径") + parser.add_argument( + "--output", + default="man_firm_phone_compare.xlsx", + help="输出 xlsx 文件路径", + ) + return parser.parse_args() + + +def normalize_text(value: object) -> str: + text = str(value or "").strip() + text = text.replace("(", "(").replace(")", ")") + text = re.sub(r"\s+", "", text) + return text + + +def normalize_firm(value: object) -> str: + text = normalize_text(value) + text = text.replace("本地大所", "").replace("特色律所", "") + return text + + +def normalize_name(value: object) -> str: + text = normalize_text(value) + return text.replace("律师", "") + + +def normalize_province(value: object) -> str: + text = str(value or "").strip() + mapping = { + "北京市": "北京", + "天津市": "天津", + "上海市": "上海", + "重庆市": "重庆", + "内蒙古自治区": "内蒙古", + "广西壮族自治区": "广西", + "宁夏回族自治区": "宁夏", + "新疆维吾尔自治区": "新疆", + "西藏自治区": "西藏", + "香港特别行政区": "香港", + "澳门特别行政区": "澳门", + "新疆生产建设兵团": "新疆", + } + if text in mapping: + return mapping[text] + if text.endswith("省") and len(text) > 1: + return text[:-1] + return text + + +def normalize_city(value: object) -> str: + text = str(value or "").strip() + for suffix in ("市", "地区", "盟"): + if text.endswith(suffix) and len(text) > len(suffix): + return text[: -len(suffix)] + return text + + +def split_phones(value: object) -> List[str]: + return re.findall(r"1\d{10}", str(value or "")) + + +def unique_phones(records: Sequence[LawyerRecord]) -> List[str]: + output: List[str] = [] + seen = set() + for record in sorted(records, key=lambda item: (item.create_time, item.id), reverse=True): + if record.phone and record.phone not in seen: + seen.add(record.phone) + output.append(record.phone) + return output + + +def unique_values(records: Sequence[LawyerRecord], attr: str) -> List[str]: + output: List[str] = [] + seen = set() + for record in sorted(records, key=lambda item: (item.create_time, item.id), reverse=True): + value = getattr(record, attr, "") + if value and value not in seen: 
+ seen.add(value) + output.append(value) + return output + + +def phone_record_sort_key( + record: LawyerRecord, + target_name: object, + target_province: object, + target_city: object, +) -> Tuple[int, int, int]: + score = 0 + normalized_target_name = normalize_name(target_name) + normalized_target_province = normalize_province(target_province) + normalized_target_city = normalize_city(target_city) + + if normalized_target_name: + if normalize_name(record.name) == normalized_target_name: + score += 400 + elif record.name: + score -= 40 + + if record.law_firm and record.law_firm not in GENERIC_FIRMS: + score += 220 + elif record.law_firm: + score += 40 + + if record.name: + score += 100 + + if normalized_target_city: + if normalize_city(record.city) == normalized_target_city: + score += 45 + elif record.city: + score -= 10 + + if normalized_target_province: + if normalize_province(record.province) == normalized_target_province: + score += 25 + elif record.province: + score -= 5 + + score += DOMAIN_PRIORITY.get(record.domain, 50) + return score, record.create_time, record.id + + +def compare_result(original_phones: Sequence[str], candidate_phones: Sequence[str]) -> str: + if not candidate_phones: + return "未匹配" + if not original_phones: + return "原手机号为空" + + original_set = set(original_phones) + candidate_set = set(candidate_phones) + if original_set == candidate_set: + return "完全一致" + if original_set & candidate_set: + return "候选包含原手机号" + return "不包含原手机号" + + +def infer_firm_from_address(address: object, ordered_firms: Sequence[str]) -> str: + normalized_address = normalize_text(address) + if not normalized_address: + return "" + for firm in ordered_firms: + if len(firm) < 4: + continue + if firm in normalized_address: + return firm + return "" + + +def load_db_indexes() -> Tuple[Dict[str, List[LawyerRecord]], List[str], Dict[str, List[LawyerRecord]]]: + conn = pymysql.connect(**DB_CONFIG) + firm_index: Dict[str, List[LawyerRecord]] = defaultdict(list) + 
phone_index: Dict[str, List[LawyerRecord]] = defaultdict(list) + try: + with conn.cursor() as cur: + cur.execute( + """ + SELECT id, name, phone, law_firm, province, city, domain, create_time + FROM lawyer + WHERE phone IS NOT NULL + AND phone <> '' + """ + ) + for row in cur.fetchall(): + record = LawyerRecord( + id=int(row[0]), + name=str(row[1] or "").strip(), + phone=str(row[2] or "").strip(), + law_firm=str(row[3] or "").strip(), + province=str(row[4] or "").strip(), + city=str(row[5] or "").strip(), + domain=str(row[6] or "").strip(), + create_time=int(row[7] or 0), + ) + phone_index[record.phone].append(record) + normalized_firm = normalize_firm(record.law_firm) + if normalized_firm: + firm_index[normalized_firm].append(record) + finally: + conn.close() + + ordered_firms = sorted(firm_index.keys(), key=len, reverse=True) + return firm_index, ordered_firms, phone_index + + +def build_phone_backfill( + original_phone: object, + name: object, + province: object, + city: object, + phone_index: Dict[str, List[LawyerRecord]], +) -> PhoneBackfill: + def pick_best_name(records: Sequence[LawyerRecord], target_name: object) -> str: + normalized_target_name = normalize_name(target_name) + if normalized_target_name: + for item in records: + if item.name and normalize_name(item.name) == normalized_target_name: + return item.name + for item in records: + if item.name: + return item.name + return "" + + records: List[LawyerRecord] = [] + seen_ids = set() + for phone in split_phones(original_phone): + for record in phone_index.get(phone, []): + if record.id in seen_ids: + continue + seen_ids.add(record.id) + records.append(record) + + sorted_records = sorted( + records, + key=lambda item: phone_record_sort_key(item, name, province, city), + reverse=True, + ) + candidate_names = unique_values(sorted_records, "name") + candidate_firms = unique_values( + [item for item in sorted_records if item.law_firm and item.law_firm not in GENERIC_FIRMS], + "law_firm", + ) + if not 
candidate_firms: + candidate_firms = unique_values( + [item for item in sorted_records if item.law_firm], + "law_firm", + ) + candidate_domains = unique_values(sorted_records, "domain") + matched_phones = unique_values(sorted_records, "phone") + + best_name = pick_best_name(sorted_records, name) + best_law_firm = "" + best_domain = "" + preferred_name = normalize_name(name) or normalize_name(best_name) + + for record in sorted_records: + if not record.law_firm or record.law_firm in GENERIC_FIRMS: + continue + if preferred_name and normalize_name(record.name) != preferred_name: + continue + best_law_firm = record.law_firm + best_domain = record.domain + break + + if not best_law_firm: + for record in sorted_records: + if record.law_firm and record.law_firm not in GENERIC_FIRMS: + best_law_firm = record.law_firm + best_domain = record.domain + break + + if not best_domain and sorted_records: + best_domain = sorted_records[0].domain + + return PhoneBackfill( + matched_phones=matched_phones, + records=sorted_records, + best_name=best_name, + best_law_firm=best_law_firm, + best_domain=best_domain, + candidate_names=candidate_names, + candidate_firms=candidate_firms, + candidate_domains=candidate_domains, + ) + + +def match_row( + name: object, + original_phone: object, + law_firm: object, + province: object, + city: object, + address: object, + phone_backfill: PhoneBackfill, + firm_index: Dict[str, List[LawyerRecord]], + ordered_firms: Sequence[str], +) -> Tuple[str, str, List[LawyerRecord]]: + def add_method(part: str, method_parts: List[str]) -> None: + if part and part not in method_parts: + method_parts.append(part) + + matched_firm = normalize_firm(law_firm) + used_phone_backfill_firm = False + inferred_from_address = False + if not matched_firm: + matched_firm = normalize_firm(phone_backfill.best_law_firm) + used_phone_backfill_firm = bool(matched_firm) + if not matched_firm: + matched_firm = infer_firm_from_address(address, ordered_firms) + inferred_from_address 
= bool(matched_firm) + if not matched_firm: + return "", "无可用律所名", [] + + candidates = firm_index.get(matched_firm, []) + if not candidates: + return matched_firm, "数据库无此律所", [] + + method_parts = ["律所"] + chosen = list(candidates) + + normalized_name = normalize_name(name) + if not normalized_name: + normalized_name = normalize_name(phone_backfill.best_name) + if normalized_name: + name_filtered = [item for item in chosen if normalize_name(item.name) == normalized_name] + if name_filtered: + chosen = name_filtered + add_method("姓名", method_parts) + + if len(unique_phones(chosen)) != 1: + normalized_province = normalize_province(province) + normalized_city = normalize_city(city) + + if normalized_province and normalized_city: + province_city_filtered = [ + item + for item in chosen + if normalize_province(item.province) == normalized_province + and normalize_city(item.city) == normalized_city + ] + if province_city_filtered: + chosen = province_city_filtered + add_method("省份", method_parts) + add_method("城市", method_parts) + + if len(unique_phones(chosen)) != 1 and normalized_city: + city_filtered = [ + item for item in chosen if normalize_city(item.city) == normalized_city + ] + if city_filtered: + chosen = city_filtered + add_method("城市", method_parts) + + if len(unique_phones(chosen)) != 1 and normalized_province: + province_filtered = [ + item + for item in chosen + if normalize_province(item.province) == normalized_province + ] + if province_filtered: + chosen = province_filtered + add_method("省份", method_parts) + + method = "+".join(method_parts) + if used_phone_backfill_firm: + method = "手机号回填律所|" + method + elif inferred_from_address: + method = "地址推断律所|" + method + return matched_firm, method, chosen + + +def autosize_columns(ws) -> None: + for column_cells in ws.columns: + values = [str(cell.value or "") for cell in column_cells] + max_length = min(max((len(value) for value in values), default=0), 60) + column_letter = column_cells[0].column_letter + 
ws.column_dimensions[column_letter].width = max_length + 2 + + +def iter_input_rows(ws) -> Iterable[Tuple[int, List[object]]]: + for row_idx in range(1, ws.max_row + 1): + yield row_idx, [ws.cell(row_idx, col_idx).value for col_idx in range(1, 8)] + + +def build_output(input_path: str, output_path: str) -> Dict[str, int]: + workbook = load_workbook(input_path) + source_ws = workbook.active + + firm_index, ordered_firms, phone_index = load_db_indexes() + + out_wb = Workbook() + out_ws = out_wb.active + out_ws.title = "firm_phone_compare" + headers = [ + "原始行号", + "原姓名", + "原手机号", + "原律所", + "原省份", + "原城市", + "原地址", + "原备注", + "手机号命中记录数", + "手机号命中手机号", + "手机号补全姓名", + "手机号补全律所", + "手机号补全来源", + "手机号候选姓名", + "手机号候选律所", + "用于匹配的律所", + "匹配方式", + "数据库候选手机号", + "候选数量", + "原手机号对比", + "数据库候选姓名", + "数据库候选省市", + "数据库来源", + ] + out_ws.append(headers) + for cell in out_ws[1]: + cell.font = Font(bold=True) + + stats = defaultdict(int) + for row_idx, row in iter_input_rows(source_ws): + name, original_phone, law_firm, province, city, address, remark = row + needs_phone_completion = not normalize_firm(law_firm) + phone_backfill = build_phone_backfill( + original_phone=original_phone, + name=name, + province=province, + city=city, + phone_index=phone_index, + ) + matched_firm, method, matched_records = match_row( + name=name, + original_phone=original_phone, + law_firm=law_firm, + province=province, + city=city, + address=address, + phone_backfill=phone_backfill, + firm_index=firm_index, + ordered_firms=ordered_firms, + ) + candidate_phones = unique_phones(matched_records) + compare = compare_result(split_phones(original_phone), candidate_phones) + candidate_names = unique_values(matched_records, "name") + candidate_domains = unique_values(matched_records, "domain") + city_province_pairs = [] + seen_pairs = set() + for record in matched_records: + pair = f"{record.province}-{record.city}".strip("-") + if pair and pair not in seen_pairs: + seen_pairs.add(pair) + 
city_province_pairs.append(pair) + + out_ws.append( + [ + row_idx, + name or "", + original_phone or "", + law_firm or "", + province or "", + city or "", + address or "", + remark or "", + len(phone_backfill.records) if needs_phone_completion else "", + " / ".join(phone_backfill.matched_phones) if needs_phone_completion else "", + phone_backfill.best_name if needs_phone_completion else "", + phone_backfill.best_law_firm if needs_phone_completion else "", + phone_backfill.best_domain if needs_phone_completion else "", + " / ".join(phone_backfill.candidate_names) if needs_phone_completion else "", + " / ".join(phone_backfill.candidate_firms) if needs_phone_completion else "", + matched_firm or "", + method or "", + " / ".join(candidate_phones) or "", + len(candidate_phones), + compare, + " / ".join(candidate_names) or "", + " / ".join(city_province_pairs) or "", + " / ".join(candidate_domains) or "", + ] + ) + + if needs_phone_completion and phone_backfill.records: + stats["phone_backfill_hit_rows"] += 1 + if needs_phone_completion and phone_backfill.best_name: + stats["phone_backfill_name_rows"] += 1 + if needs_phone_completion and phone_backfill.best_law_firm: + stats["phone_backfill_firm_rows"] += 1 + if needs_phone_completion and method.startswith("手机号回填律所|"): + stats["phone_backfill_used_for_match_rows"] += 1 + + if candidate_phones: + stats["matched_rows"] += 1 + if len(candidate_phones) == 1: + stats["unique_rows"] += 1 + else: + stats["multi_rows"] += 1 + else: + stats["unmatched_rows"] += 1 + + if compare == "完全一致": + stats["same_rows"] += 1 + elif compare == "候选包含原手机号": + stats["contains_rows"] += 1 + elif compare == "不包含原手机号": + stats["diff_rows"] += 1 + elif compare == "原手机号为空": + stats["blank_phone_rows"] += 1 + + out_ws.freeze_panes = "A2" + autosize_columns(out_ws) + out_wb.save(output_path) + return dict(stats) + + +def main() -> None: + args = parse_args() + stats = build_output(args.input, args.output) + print(f"已生成: {args.output}") + for key in 
sorted(stats): + print(f"{key}={stats[key]}") + + +if __name__ == "__main__": + main() diff --git a/config.py b/config.py index f290702..01902cf 100644 --- a/config.py +++ b/config.py @@ -7,48 +7,43 @@ DB_CONFIG = { "charset": "utf8mb4", } +# 高德地图 API 配置 +GAODE_CONFIG = { + "API_KEY": "f261575fb28003761c433f6c9379e89d", +} + # 微信爬虫特定的配置 WEIXIN_CONFIG = { - "TOKEN": "756858506", # 您的Token - "FINGERPRINT": "1caa5fc52ac489e20a175e153dd3ef21", + "TOKEN": "553117235", # 您的Token + "FINGERPRINT": "3c02c35093184e9a9a668ac3c81e53f9", "COOKIE": { "appmsglist_action_3258147150": "card", + "_qimei_uuid42": "1a302160d051008226aec905b63f99ff3989f30009", + "_qimei_i_3": "63b22b84c15204dfc595ac6452d722b1f0bdf0f6145b568ae68a7c0e70947438686637943989e2a1d792", + "_qimei_h38": "215986ce26aec905b63f99ff0200000e81a302", + "ua_id": "S7gglu0eZh9NkAzLAAAAADH8dynpnFZVN29lxm7BQo0=", + "wxuin": "73074968761097", "mm_lang": "zh_CN", - "ts_uid": "8295434560", - "markHashId_L": "417c7f0e-5d9f-4048-b844-28f78ed2a838", - "_qimei_uuid42": "19b0d0b0c2d100de3df57d2afbc5018a9b4ae103e1", - "_qimei_i_3": "59c5508a935b04dac7c1ab340fd172b5a5eba4f7160d5683e2867a5a7094713e616364943989e2a29e9f", - "_qimei_h38": "b885c955f8e9995f103aac140200000421811e", - "_qimei_i_1": "4ddd76d09d525588c892fb6653d17ae9feebf2f0125852d3e78e2c582493206c616333973981e3dd83abc2e0", - "_qpsvr_localtk": "0.2780749298744084", - "RK": "ZGEMOpzbOS", - "ptcz": "90084a2b43c84a92d1b9082da98fd0e92369fcde4f2edbbc85661539c7917055", - "pac_uid": "0_HXj3iphPm0Y4a", - "_qimei_fingerprint": "bd1870aaecd7a9bb84aa53b9ad9a2c55", - "rewardsn": "", - "wxtokenkey": "777", - "omgid": "0_HXj3iphPm0Y4a", - "sig_login": "h01218fdccf5b63c15a6c5edb19ce20d0481c52723ee44ab56b9fc1415ff39c9ff0dd2000e12f1de8ae", - "ua_id": "QXSOTQUjDFjoH63yAAAAAPILc15EwzRTwdqntEiCGSE=", - "mp_token": "1331492699", - "appletToken": "2105598806", - "__wx_phantom_mark__": "breQbE92JS", - "mmad_session": 
"2bd2e1824d701b521c16fa35de0378e55273ce93a68ac0cc9ca30e8ad5b2e9f6fc419dd5fed1cd17f0a57fc3c327e03ccf325c1e1e97dde41374a9d8067d9aa700c8b87a29b0d3caf7f949761d8f4eeb56a1e3ddbc5a5d3a573e5b83971cd92e11de1c56c245721266e7088080fefde3", - "pgv_info": "ssid=s5739471549", - "pgv_pvid": "2616937300", - "_gcl_au": "1.1.954868153.1769494261", - "wxuin": "69676812527831", - "_clck": "3258147150|1|g35|0", - "uuid": "e07aa2889db56b1901e1fb6b1286d9a7", - "rand_info": "CAESIBnfIxLJoUVe5wP4SI/ADWnrnPUBlJDb4yyA7Et1+ZfF", + "eas_sid": "91X7I7K4K5k364U2z3k2I980F5", + "_qimei_q36": "", + "_qimei_fingerprint": "d895c46d5fda98cab67d9daec00068ed", + "_qimei_i_1": "4dc76680945f59d3c7c4ab325dd526b3feeea1a31458558bbdd97e582493206c6163629d39d8e1dcd4b2c28f", + "pgv_pvid": "6923507145", + "ts_uid": "9585717820", + "_t_qbtool_uid": "aaaa2vn5byd280l00iglw701zci788cb", + "_ga": "GA1.1.1323926288.1775838938", + "_ga_TPFW0KPXC1": "GS2.1.s1775841484$o2$g1$t1775841485$j59$l0$h0", + "uuid": "20d1cfb540221c6e7b6d665ab1d4a8f7", + "rand_info": "CAESIA8LYV6dvWh5dYrgQLPhZb8TXwUJoWdcdDzN0TTdztSj", "slave_bizuin": "3258147150", "data_bizuin": "3258147150", "bizuin": "3258147150", - "data_ticket": "kv+SnLJADgPlcKQPIbYnfbEAxogpIMfAo/n0/HjtChnfDmQSogWvkO82/mUtzpcc", - "slave_sid": "eFNMcEZ3bnhvRkppZVNkTDE4dFFnM0ZzdFM1REhpemZORHRnVnlnRHhKU29vY1ZBY0dJZkFHcXB5Nko4aV9pbVlnRTBRVDE0NzdIUDF4T3NTSDVzdXBJS2d3WFFuR3hiMWVVbG5ZTURfYmh3YTFTallIb2JXOWpyTWxXS25jbVFRVmtXWHVaWGdCN2lqZzVm", + "data_ticket": "dgLFmSrI8f1q6JnYOd2Y/sKJIWjh6YlLSau1n1+Mv5iOTR5hgsm1qjNLypWflGd6", + "slave_sid": "VGVnNmM5NmFpV19ESElmVlZOTGZfVVJfWE5HanlHNjN0WEswZVkxVk9vc2FTenQzVGRsWUxDT0xGQVBJRVZzU0JNVV9RckRJVE9jSVUwbjl4Z2VHaEZKSzE5WVc3THRCRW96T0Z1V1VwbnBLSnkxSWdKaHdaN1dYdzI1SmdpZ0IyOFJtUE45OTR2Q2NvM1FB", "slave_user": "gh_fe76760560d0", - "xid": "34f577adf2c28e5b9f04de93c614c5c4", - "_clsk": "639w4k|1769742296130|3|1|mp.weixin.qq.com/weheat-agent/payload/record" + "xid": "4893c62dc8518b6a1628fd34bc9aa276", + "_clck": "3258147150|1|g5g|0", + "_clsk": 
"1p4oo3h|1776957001796|5|1|mp.weixin.qq.com/weheat-agent/payload/record" }, "COUNT": 20, # 单页条数 "REQUESTS_PER_SECOND": 8, # 每秒最大请求数(调高更快,但有风控风险) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..083234b --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +services: + mongodb: + image: mongo:7 + container_name: lawyers_mongodb + restart: always + ports: + - "27017:27017" + volumes: + - mongodb_data:/data/db + environment: + MONGO_INITDB_DATABASE: lawyer + +volumes: + mongodb_data: diff --git a/gaode.py b/gaode.py index 8ed32f0..8a9b5ab 100644 --- a/gaode.py +++ b/gaode.py @@ -23,6 +23,7 @@ if project_root not in sys.path: sys.path.append(project_root) from Db import Db # 你的 DB 封装 +import config as project_config # logging 配置 logging.basicConfig( @@ -45,9 +46,14 @@ class GaodeSpider: sleep_between_cities: float = 3.0, ): self.db = db_connection - self.api_key = (api_key or os.environ.get("AMAP_API_KEY", "")).strip() + config_api_key = "" + gaode_config = getattr(project_config, "GAODE_CONFIG", None) + if isinstance(gaode_config, dict): + config_api_key = str(gaode_config.get("API_KEY", "")).strip() + + self.api_key = (api_key or os.environ.get("AMAP_API_KEY", "") or config_api_key).strip() if not self.api_key: - raise ValueError("AMAP_API_KEY environment variable is required") + raise ValueError("高德 API Key 未配置,请在 config.py 的 GAODE_CONFIG.API_KEY 或环境变量 AMAP_API_KEY 中填写") self.api_base = "https://restapi.amap.com/v3/place/text" self.offset = offset self.session = self._build_session() @@ -369,7 +375,7 @@ class GaodeSpider: return total_stored = 0 - keywords_suffix = "律所" + keywords_suffix = "律师" for city_id, city_info in self.cities.items(): try: diff --git a/man.xlsx b/man.xlsx new file mode 100644 index 0000000..875fc44 Binary files /dev/null and b/man.xlsx differ diff --git a/man_firm_phone_compare.xlsx b/man_firm_phone_compare.xlsx new file mode 100644 index 0000000..1d94368 Binary files /dev/null and 
b/man_firm_phone_compare.xlsx differ diff --git a/request/proxy_config.py b/request/proxy_config.py index 6cbc1ae..38c700b 100644 --- a/request/proxy_config.py +++ b/request/proxy_config.py @@ -24,6 +24,19 @@ def _normalize_bool(value, default: bool = True) -> bool: return text not in ("0", "false", "no", "off", "") +def _env_proxy_override() -> Optional[bool]: + """ + 环境变量覆盖代理开关: + - PROXY_ENABLED 未设置:返回 None(不覆盖,仍读取 proxy_settings.json) + - PROXY_ENABLED=0/false/off:强制关闭代理 + - PROXY_ENABLED=1/true/on:强制开启代理(前提是配置字段齐全) + """ + raw = os.getenv("PROXY_ENABLED") + if raw is None: + return None + return _normalize_bool(raw, True) + + def _load_config() -> Dict[str, str]: if not os.path.exists(CONFIG_PATH): return dict(DEFAULT_CONFIG) @@ -48,7 +61,12 @@ def report_proxy_status() -> None: _PROXY_STATUS_REPORTED = True config = _load_config() - enabled = _normalize_bool(config.get("enabled"), True) + override = _env_proxy_override() + if override is False: + print("[proxy] disabled by env (PROXY_ENABLED=0)") + return + + enabled = _normalize_bool(config.get("enabled"), True) if override is None else True if not enabled: print("[proxy] disabled by config") return @@ -66,7 +84,10 @@ def get_proxies() -> Optional[Dict[str, str]]: 代理配置从 proxy_settings.json 读取,不依赖环境变量。 """ config = _load_config() - if not _normalize_bool(config.get("enabled"), True): + override = _env_proxy_override() + if override is False: + return None + if override is None and not _normalize_bool(config.get("enabled"), True): return None tunnel = str(config.get("tunnel") or "").strip() @@ -95,3 +116,22 @@ def apply_proxy(session) -> Optional[Dict[str, str]]: __all__ = ["get_proxies", "apply_proxy", "report_proxy_status"] + + +def is_proxy_enabled() -> bool: + """ + 判断当前进程是否启用了代理。 + + 优先遵循环境变量 PROXY_ENABLED; + 未设置时回退到 proxy_settings.json 的 enabled 配置。 + """ + config = _load_config() + override = _env_proxy_override() + if override is False: + return False + if override is True: + return True + return 
_normalize_bool(config.get("enabled"), True) + + +__all__ = ["get_proxies", "apply_proxy", "report_proxy_status", "is_proxy_enabled"] diff --git a/utils/rate_limiter.py b/utils/rate_limiter.py index 23ef0b5..7b22f59 100644 --- a/utils/rate_limiter.py +++ b/utils/rate_limiter.py @@ -14,6 +14,7 @@ from pathlib import Path from uuid import uuid4 import fcntl +from request.proxy_config import is_proxy_enabled class RateLimiter: @@ -167,13 +168,27 @@ global_rate_limiter = RateLimiter( ) +def _should_limit_proxy_requests() -> bool: + """ + 仅在当前进程实际启用代理时启用全局代理限流。 + """ + try: + return is_proxy_enabled() + except Exception: + return True + + def wait_for_request(): """等待直到可以发起请求。""" + if not _should_limit_proxy_requests(): + return global_rate_limiter.acquire() def can_request_now() -> bool: """检查是否可以立即发起请求。""" + if not _should_limit_proxy_requests(): + return True return global_rate_limiter.can_make_request() @@ -184,6 +199,10 @@ def request_slot(): 这样既能限制“每秒启动多少请求”,也能限制“同时在飞多少请求”。 """ + if not _should_limit_proxy_requests(): + yield + return + token = global_rate_limiter.try_acquire_slot() try: yield diff --git a/weixin.py b/weixin.py index 59bf18d..16c707d 100644 --- a/weixin.py +++ b/weixin.py @@ -1,3 +1,4 @@ +import copy import json import os import re @@ -26,64 +27,73 @@ DEFAULT_HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/138.0.0.0 Safari/537.36" + "Chrome/146.0.0.0 Safari/537.36" ), "Accept": "*/*", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7", + "DNT": "1", + "Priority": "u=1, i", + "Sec-CH-UA": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"', + "Sec-CH-UA-Mobile": "?0", + "Sec-CH-UA-Platform": '"Windows"', + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", "X-Requested-With": "XMLHttpRequest", } DEFAULT_WEIXIN_CONFIG = { - "TOKEN": "32299576", - "FINGERPRINT": "64a1c659b8b944d6e7fe596b0794ab35", + "TOKEN": 
"609153506", + "FINGERPRINT": "46a7e6ac6ccf205986adc0aa99127860", "COOKIE": { - "appmsglist_action_3876849679": "card", + "appmsglist_action_3258147150": "card", + "_qimei_uuid42": "1a302160d051008226aec905b63f99ff3989f30009", + "_qimei_i_3": "63b22b84c15204dfc595ac6452d722b1f0bdf0f6145b568ae68a7c0e70947438686637943989e2a1d792", + "_qimei_h38": "215986ce26aec905b63f99ff0200000e81a302", + "ua_id": "S7gglu0eZh9NkAzLAAAAADH8dynpnFZVN29lxm7BQo0=", + "wxuin": "73074968761097", "mm_lang": "zh_CN", - "ts_uid": "8295434560", - "markHashId_L": "417c7f0e-5d9f-4048-b844-28f78ed2a838", - "_qimei_uuid42": "19b0d0b0c2d100de3df57d2afbc5018a9b4ae103e1", - "_qimei_i_3": "59c5508a935b04dac7c1ab340fd172b5a5eba4f7160d5683e2867a5a7094713e616364943989e2a29e9f", - "_qimei_h38": "b885c955f8e9995f103aac140200000421811e", - "RK": "ZGEMOpzbOS", - "ptcz": "90084a2b43c84a92d1b9082da98fd0e92369fcde4f2edbbc85661539c7917055", - "pac_uid": "0_HXj3iphPm0Y4a", - "_qimei_fingerprint": "bd1870aaecd7a9bb84aa53b9ad9a2c55", - "wxuin": "70085167371972", - "omgid": "0_HXj3iphPm0Y4a", - "rewardsn": "", - "wxtokenkey": "777", - "sig_login": "h017c22e8921e6bf5a1f8659d9f34ee0db2be31cdcf03786b9ab4b787a9821ad84d3046473d9076181a", - "_qpsvr_localtk": "0.9079082151544442", - "appletToken": "880792228", - "mmad_session": "ae5215dd3c930e6256d8f0656bd8497e719817e0df77a677766e128e2135218486f674b88b349db0d47039f54cb99c8753beb8d4b921ae452b66773db51ad3006ab1f0d19253ae83e2cb9ba53ff5b5b4f45f2fe160db66fd300a1fb4e04a92bd11de1c56c245721266e7088080fefde3", - "qq_domain_video_guid_verify": "6cce52525a146907", + "eas_sid": "91X7I7K4K5k364U2z3k2I980F5", "_qimei_q36": "", - "pgv_info": "ssid=s4741843528", - "pgv_pvid": "9337874960", - "_qimei_i_2": "47e96bdff700", - "_qimei_i_1": "40bb51d09d525588c892fb6653d17ae9feebf2f0125852d3e78e2c582493206c616333973981e3dd838fd0da", - "_qimei_q32": "", - "mp_token": "1555009133", - "ua_id": "390pNywJFJA6BsgOAAAAADO0TqlmW7NBB1GD0Y7OVwk=", - "__wx_phantom_mark__": "UTRZE71JZ7", - "_clck": 
"3841887471|1|g4a|0", - "uuid": "6ae7cb97104627c5d3b9d1d9ab2eef60", - "rand_info": "CAESIGjvJyiJ58Ii0enQVKBwl6d4IyCrWeN7kzhIAVTgM2lc", - "slave_bizuin": "3876849679", - "data_bizuin": "3876849679", - "bizuin": "3876849679", - "data_ticket": "8wg11/LIrTLHAbJdbAH2HWdqlW/K2jijwP27oPSrH2myYNpuSR1NedfmSbzeq5go", - "slave_sid": "TjBzVV83WThEaThRdUhlcFpqRFhQejFSUzRfOWdGa0l3S0dPSW41QWdkSk9qSkQ2ZTljbWRHa0poQ1lNTXlub25WMUJORVluVU5HaFBGRXVJS19yeG53SUNWWU14YjNQeWpxTUczalBHV1dTY0V3TDZ6aE14bFNaS2ExeGNhb3J0WlRWMlM4NnNmNGFST0ZD", - "slave_user": "gh_6c1283858808", - "xid": "116378d10877a35558158970698ca0c3", - "_clsk": "3okzsf|1773282377657|6|1|mp.weixin.qq.com/weheat-agent/payload/record" - }, - "COUNT": 20, + "_qimei_fingerprint": "d895c46d5fda98cab67d9daec00068ed", + "_clck": "501quy|1|g4t|0", + "uuid": "210d1c199a63afd4c774eccd9a06a27f", + "rand_info": "CAESIE4WqrFFVVjqrrNflbCUM7wPD5NXjuGbjfHolAEsMmEm", + "slave_bizuin": "3258147150", + "data_bizuin": "3258147150", + "bizuin": "3258147150", + "data_ticket": "tpcLjRB7B7AlUY3rFe/ILEjtCKs7dEEGsn8kXnHVzdTb9dgIpSPN1aP8FlE6FDhj", + "slave_sid": "U3hfU1Z0UV91N0U5d0lkRDhyTzh3d3hmbnBHMjBnbmFNdzVJeGlJeTJ6OTVxRjJQVVE2VkNhejYzTkxETVVSZkF3eWRORmtRS01XWFBjdnFZZWFLNjR2ZGtwdUJ2MzByclg0NjF4SHlDeVJneEhsczdSYUJVNE45VEhNRWVTQXg1dlpGdWQ0bU5VM3pnRzJN", + "slave_user": "gh_fe76760560d0", + "xid": "ef503a6864cceaef225c615a45606e4a", + "_clsk": "12arnf1|1774975723874|4|1|mp.weixin.qq.com/weheat-agent/payload/record", + "_qimei_i_1": "2ddc6a80945f59d3c7c4ab325dd526b3feeea1a31458558bbdd97e582493206c6163629d39d8e1dcd49fddc7" + }, + "COUNT": 21, + "REFERER": "https://mp.weixin.qq.com/", + "HEADERS": {}, + "REQUEST_PARAMS": { + "action": "search", + "scene": "1", + "lang": "zh_CN", + "f": "json", + "ajax": "1", + }, "REQUESTS_PER_SECOND": 5, "PAGE_DELAY": 5, "CITY_DELAY": 2, } +def _deep_merge_dict(base: Dict, incoming: Dict) -> Dict: + merged = copy.deepcopy(base) + for key, value in incoming.items(): + if isinstance(value, dict) and 
isinstance(merged.get(key), dict): + merged[key] = _deep_merge_dict(merged[key], value) + else: + merged[key] = value + return merged + + def _parse_cookie_value(cookie_value) -> Dict[str, str]: if isinstance(cookie_value, dict): return {str(key): str(value) for key, value in cookie_value.items()} @@ -110,15 +120,16 @@ def _parse_cookie_value(cookie_value) -> Dict[str, str]: def _load_weixin_config() -> Dict: - config = DEFAULT_WEIXIN_CONFIG.copy() + config = copy.deepcopy(DEFAULT_WEIXIN_CONFIG) module_config = getattr(project_config, "WEIXIN_CONFIG", None) if isinstance(module_config, dict): - config.update(module_config) + config = _deep_merge_dict(config, module_config) env_mapping = { "TOKEN": os.getenv("WEIXIN_TOKEN"), "FINGERPRINT": os.getenv("WEIXIN_FINGERPRINT"), "COOKIE": os.getenv("WEIXIN_COOKIE"), + "REFERER": os.getenv("WEIXIN_REFERER"), "COUNT": os.getenv("WEIXIN_COUNT"), "REQUESTS_PER_SECOND": os.getenv("WEIXIN_REQUESTS_PER_SECOND"), "PAGE_DELAY": os.getenv("WEIXIN_PAGE_DELAY"), @@ -161,17 +172,32 @@ class WeixinSpider: self.fingerprint = str(self.config.get("FINGERPRINT", "")).strip() self.cookies = self.config.get("COOKIE", {}) self.count = str(self.config.get("COUNT", DEFAULT_WEIXIN_CONFIG["COUNT"])) + self.referer = str(self.config.get("REFERER", DEFAULT_WEIXIN_CONFIG["REFERER"])).strip() + self.request_params = { + str(key): str(value) + for key, value in (self.config.get("REQUEST_PARAMS", {}) or {}).items() + if value is not None + } self.page_delay = max(0.0, float(self.config.get("PAGE_DELAY", DEFAULT_WEIXIN_CONFIG["PAGE_DELAY"]))) self.city_delay = max(0.0, float(self.config.get("CITY_DELAY", DEFAULT_WEIXIN_CONFIG["CITY_DELAY"]))) max_rps = self.config.get("REQUESTS_PER_SECOND") if max_rps: global_rate_limiter.max_requests = int(max_rps) - headers = getattr(project_config, "HEADERS", DEFAULT_HEADERS).copy() - headers["Referer"] = "https://mp.weixin.qq.com/" + headers = DEFAULT_HEADERS.copy() + project_headers = getattr(project_config, 
"HEADERS", None) + if isinstance(project_headers, dict): + headers.update(project_headers) + config_headers = self.config.get("HEADERS", {}) + if isinstance(config_headers, dict): + headers.update({str(key): str(value) for key, value in config_headers.items()}) + if self.referer: + headers["Referer"] = self.referer self.session = requests.Session() self.session.trust_env = False self.session.headers.update(headers) + if self.cookies: + self.session.cookies.update(self.cookies) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) def _validate_runtime_config(self) -> bool: @@ -214,18 +240,14 @@ class WeixinSpider: return [] def _build_query_url(self, query: str, buffer: str) -> str: - params = { - "action": "search", - "scene": "1", + params = self.request_params.copy() + params.update({ "query": query, "count": self.count, "buffer": buffer, "fingerprint": self.fingerprint, "token": self.token, - "lang": "zh_CN", - "f": "json", - "ajax": "1", - } + }) return f"{API_ENDPOINT}?{urlencode(params)}" def _extract_phone(self, text: str) -> Optional[str]: