chore: 暂存本地修改
This commit is contained in:
@@ -20,6 +20,49 @@ pip install -r requirements.txt
|
|||||||
bash common_sites/start.sh
|
bash common_sites/start.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 拆分运行(直连/代理)
|
||||||
|
|
||||||
|
本仓库支持用环境变量 `PROXY_ENABLED` 在一次运行内强制开/关代理:
|
||||||
|
|
||||||
|
- **直连**:`PROXY_ENABLED=0`(不使用代理 IP)
|
||||||
|
- **代理**:`PROXY_ENABLED=1`(强制使用 `request/proxy_settings.json` 的代理配置)
|
||||||
|
- **默认**:不设置(跟随 `request/proxy_settings.json` 的 `enabled` 字段)
|
||||||
|
|
||||||
|
对应提供两套入口脚本:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 直连(默认包含:大律师/大律师PC/华律/律图)
|
||||||
|
bash common_sites/start_direct_twice_weekly.sh
|
||||||
|
|
||||||
|
# 代理(全量采集:大律师/大律师PC/找法网/法律快车/华律/律图)
|
||||||
|
bash common_sites/start_proxy_weekly.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## cron 示例(每周两次直连 + 每周一次代理)
|
||||||
|
|
||||||
|
> 下面仅给示例,你可以按机器负载调整时间;日志会输出到 `common_sites/*.log`。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 编辑定时任务
|
||||||
|
crontab -e
|
||||||
|
|
||||||
|
# 每周二、周五 02:10 直连跑一次
|
||||||
|
10 2 * * 2,5 cd /www/wwwroot/lawyers && bash common_sites/start_direct_twice_weekly.sh
|
||||||
|
|
||||||
|
# 每周日 03:20 走代理跑一次(你手动续费代理 IP)
|
||||||
|
20 3 * * 0 cd /www/wwwroot/lawyers && bash common_sites/start_proxy_weekly.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### 常用参数(可选)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 限流(跨进程共享),直连可适当调高,代理建议保守
|
||||||
|
export PROXY_MAX_REQUESTS_PER_SECOND=8
|
||||||
|
|
||||||
|
# 代理连通性输出(部分脚本会打印测试信息)
|
||||||
|
export PROXY_TEST=1
|
||||||
|
```
|
||||||
|
|
||||||
## 说明
|
## 说明
|
||||||
|
|
||||||
- 当前项目直接复用原项目数据库配置和代理配置。
|
- 当前项目直接复用原项目数据库配置和代理配置。
|
||||||
|
|||||||
@@ -86,6 +86,7 @@ class DlsSpider:
|
|||||||
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
|
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
|
||||||
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
|
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
|
||||||
try:
|
try:
|
||||||
|
with request_slot():
|
||||||
resp = session.get(
|
resp = session.get(
|
||||||
test_url,
|
test_url,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
|
|||||||
@@ -0,0 +1,220 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
project_root = os.path.dirname(current_dir)
|
||||||
|
request_dir = os.path.join(project_root, "request")
|
||||||
|
if request_dir not in sys.path:
|
||||||
|
sys.path.insert(0, request_dir)
|
||||||
|
if project_root not in sys.path:
|
||||||
|
sys.path.append(project_root)
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from request.proxy_config import get_proxies, report_proxy_status
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CheckResult:
    """Outcome of one smoke-test request against a single site endpoint."""

    site: str  # human-readable site label taken from the test spec
    url: str  # URL that was requested
    method: str  # HTTP method that was used
    ok: bool  # True only when a response arrived with 200 <= status < 400
    status_code: Optional[int]  # None when the request raised before a response
    error: str  # exception text (truncated by the caller); "" on success
    hint: str  # comma-joined block/rate-limit markers found in the body
    elapsed_ms: int  # wall-clock duration of the check in milliseconds
|
||||||
|
|
||||||
|
|
||||||
|
def _now_ms() -> int:
    """Return the current wall-clock time as whole milliseconds since the epoch."""
    seconds = time.time()
    return int(seconds * 1000)
|
||||||
|
|
||||||
|
|
||||||
|
def _short_hint(text: str) -> str:
    """Summarise anti-bot / rate-limit markers found in a response body.

    The text is stripped and lower-cased, then scanned for a fixed table of
    substrings. The labels of the markers that occur are joined with commas
    in table order, and the result is capped at 120 characters. A falsy
    *text* yields an empty string.
    """
    markers = (
        ("403", "403"),
        ("429", "429"),
        ("captcha", "captcha"),
        ("验证码", "captcha_cn"),
        ("人机", "bot_check_cn"),
        ("access denied", "access_denied"),
        ("forbidden", "forbidden"),
        ("too many requests", "rate_limited"),
        ("cloudflare", "cloudflare"),
        ("challenge", "challenge"),
    )
    haystack = (text or "").strip().lower()
    found = [label for needle, label in markers if needle in haystack]
    return ",".join(found)[:120]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_session() -> requests.Session:
    """Create the HTTP session shared by all smoke checks.

    Calls ``report_proxy_status()`` first, then builds a session whose proxy
    map comes only from ``get_proxies()`` and which sends browser-like
    default headers.
    """
    report_proxy_status()

    session = requests.Session()
    # Proxy choice must come from get_proxies() alone, not from the process
    # environment.
    session.trust_env = False

    proxy_map = get_proxies()
    if not proxy_map:
        session.proxies.clear()
    else:
        session.proxies.update(proxy_map)

    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/136.0.0.0 Safari/537.36"
        ),
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "close",
    }
    session.headers.update(browser_headers)
    return session
|
||||||
|
|
||||||
|
|
||||||
|
def _check(
    session: requests.Session,
    *,
    site: str,
    method: str,
    url: str,
    timeout: Tuple[float, float] = (10.0, 15.0),
    headers: Optional[Dict[str, str]] = None,
    data: Optional[Dict[str, Any]] = None,
) -> CheckResult:
    """Send one request and describe the outcome as a ``CheckResult``.

    Args:
        session: Session used to send the request.
        site: Human-readable site label, copied into the result.
        method: HTTP method passed to ``session.request``.
        url: Target URL.
        timeout: Timeout value forwarded to ``session.request``.
        headers: Optional per-request headers.
        data: Optional form payload.

    Returns:
        A result whose ``ok`` is true when the status is in ``[200, 400)``.
        Any exception is captured instead of raised: ``ok`` is false,
        ``status_code`` is ``None`` and ``error`` holds the first 200
        characters of the exception text.
    """
    start = _now_ms()
    # Bound up front so the cleanup below never touches an undefined name
    # when the request itself fails.
    resp = None
    try:
        resp = session.request(
            method=method,
            url=url,
            timeout=timeout,
            verify=False,  # certificate validation is deliberately skipped
            headers=headers,
            data=data,
        )
        text = resp.text or ""
        status = resp.status_code
        return CheckResult(
            site=site,
            url=url,
            method=method,
            ok=200 <= status < 400,
            status_code=status,
            error="",
            # Only the head of the body is scanned for block markers.
            hint=_short_hint(text[:1200]),
            elapsed_ms=_now_ms() - start,
        )
    except Exception as exc:
        # A smoke check reports failures; it must not abort the whole run.
        return CheckResult(
            site=site,
            url=url,
            method=method,
            ok=False,
            status_code=None,
            error=str(exc)[:200],
            hint="",
            elapsed_ms=_now_ms() - start,
        )
    finally:
        if resp is not None:
            try:
                resp.close()
            except Exception:
                # Best-effort cleanup: a failing close must not replace the
                # result that is already being returned.
                pass
|
||||||
|
|
||||||
|
|
||||||
|
def _tests() -> List[Dict[str, Any]]:
    """Return the smoke-test request specs, one endpoint per site.

    Every spec carries ``"site"``, ``"method"`` and ``"url"``; ``"headers"``
    and ``"data"`` are present only where the endpoint needs them.
    """
    # One representative list page / API per site, chosen so that 403s,
    # captchas and rate limiting show up quickly.
    maxlaw_mobile = {
        "site": "大律师(m站)",
        "method": "GET",
        "url": "https://m.maxlaw.cn/",
    }
    maxlaw_pc = {
        "site": "大律师(PC站)",
        "method": "GET",
        "url": "https://www.maxlaw.cn/law/beijing?page=1",
        "headers": {"Referer": "https://www.maxlaw.cn/"},
    }
    findlaw_mobile = {
        "site": "找法网(m站)",
        "method": "GET",
        "url": "https://m.findlaw.cn/beijing/q_lawyer/p1?ajax=1&order=0&sex=-1",
        "headers": {
            "Referer": "https://m.findlaw.cn/beijing/q_lawyer/",
            "X-Requested-With": "XMLHttpRequest",
            "Accept": "application/json, text/javascript, */*; q=0.01",
        },
    }
    lawtime_mobile = {
        "site": "法律快车(m站)",
        "method": "GET",
        "url": "https://m.lawtime.cn/beijing/lawyer/?page=1",
    }
    lvtu_mobile = {
        "site": "律图(m站)",
        "method": "POST",
        "url": "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/",
        "data": {
            "RegionId": "110100",  # Beijing city
            "OnlyData": "true",
            "LawyerRecommendRequest[AreaId]": "110100",
            "LawyerRecommendRequest[PageIndex]": "1",
            "LawyerRecommendRequest[PageSize]": "10",
            "LawyerRecommendRequest[OrderType]": "0",
            "LawyerRecommendRequest[Type]": "1",
        },
    }
    hualv_mobile = {
        "site": "华律(m站)",
        "method": "POST",
        "url": "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/",
        "data": {
            "pid": "110000",  # Beijing (province level)
            "cid": "110100",  # Beijing city
            "page": "1",
        },
    }
    return [
        maxlaw_mobile,
        maxlaw_pc,
        findlaw_mobile,
        lawtime_mobile,
        lvtu_mobile,
        hualv_mobile,
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run every smoke check once and print the results.

    Prints one ``[smoke]`` line per site and a final ``summary_json`` line.
    Always returns 0, including when individual checks fail.
    """
    mode = os.getenv("PROXY_ENABLED")
    print(f"[smoke] PROXY_ENABLED={mode!r}")

    session = _build_session()
    outcomes: List[CheckResult] = []
    for spec in _tests():
        outcome = _check(
            session,
            site=spec["site"],
            method=spec["method"],
            url=spec["url"],
            headers=spec.get("headers"),
            data=spec.get("data"),
        )
        outcomes.append(outcome)
        print(
            f"[smoke] {outcome.site} {outcome.method} {outcome.status_code} ok={outcome.ok} "
            f"{outcome.elapsed_ms}ms hint={outcome.hint or '-'} err={outcome.error or '-'}"
        )
        # Short pause between consecutive checks.
        time.sleep(0.3)

    payload = {
        "proxy_enabled": mode,
        "results": [vars(outcome) for outcome in outcomes],
    }
    print("[smoke] summary_json=" + json.dumps(payload, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
|
|
||||||
@@ -6,11 +6,27 @@ cd "$(dirname "$0")"
|
|||||||
|
|
||||||
echo "使用 request/proxy_settings.json 读取代理配置"
|
echo "使用 request/proxy_settings.json 读取代理配置"
|
||||||
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
|
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
|
||||||
|
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"
|
||||||
|
|
||||||
|
# Print the pgrep listing (PID and command line) of every process whose
# command line contains the given script name, delimited by start/whitespace/
# slash on the left and whitespace/end on the right.
# Prints nothing, and still succeeds, when no process matches.
is_job_running() {
  local name="$1"
  # Escape dots so they match literally in the pgrep regex.
  local escaped="${name//./\\.}"
  local pattern="(^|[[:space:]/])${escaped}([[:space:]]|$)"
  pgrep -af "${pattern}" || true
}
|
||||||
|
|
||||||
start_job() {
|
start_job() {
|
||||||
local script="$1"
|
local script="$1"
|
||||||
local log_file="$2"
|
local log_file="$2"
|
||||||
local label="$3"
|
local label="$3"
|
||||||
|
local existing
|
||||||
|
|
||||||
|
existing="$(is_job_running "${script}")"
|
||||||
|
if [[ -n "${existing}" ]]; then
|
||||||
|
echo "跳过 ${label}: ${script} 已在运行"
|
||||||
|
echo "${existing}" | head -n 1
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
|
nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
|
||||||
echo "启动 ${label}: ${script} -> ${log_file}"
|
echo "启动 ${label}: ${script} -> ${log_file}"
|
||||||
sleep 1
|
sleep 1
|
||||||
|
|||||||
Executable
+48
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch the direct-connection crawler group in the background, one log file
# per crawler. Crawlers that are already running are skipped.
set -euo pipefail

# Run from the script's own directory so relative paths resolve.
cd "$(dirname "$0")"

# Force direct connection: never use a proxy IP.
export PROXY_ENABLED=0

# Conservative defaults for direct mode to lower the chance of temporary
# blocking; values already set by the caller take precedence.
: "${PROXY_MAX_REQUESTS_PER_SECOND:=5}"
: "${PROXY_MAX_CONCURRENT_REQUESTS:=5}"
export PROXY_MAX_REQUESTS_PER_SECOND PROXY_MAX_CONCURRENT_REQUESTS

# Print the pgrep listing of processes whose command line contains the given
# script name; prints nothing and still succeeds when there is no match.
is_job_running() {
  local name="$1"
  # Escape dots so they match literally in the pgrep regex.
  local escaped="${name//./\\.}"
  local pattern="(^|[[:space:]/])${escaped}([[:space:]]|$)"
  pgrep -af "${pattern}" || true
}

# start_job SCRIPT LOG_FILE LABEL
# Start SCRIPT under nohup with output redirected to LOG_FILE, unless a
# process matching SCRIPT is already running.
start_job() {
  local script="$1" log_file="$2" label="$3"
  local running

  running="$(is_job_running "${script}")"
  if [[ -z "${running}" ]]; then
    nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
    echo "启动 ${label}: ${script} -> ${log_file}"
    sleep 1
    return 0
  fi

  echo "跳过 ${label}: ${script} 已在运行"
  echo "${running}" | head -n 1
  return 0
}

echo "直连模式(PROXY_ENABLED=0),每周两次建议用 cron 调度"
echo "当前归入直连组:大律师(m/PC)、华律、律图"

# Sites run over a direct connection:
# - maxlaw (mobile / PC): currently reachable directly, no obvious strong
#   anti-bot control observed
# - hualv: currently reachable directly, no obvious strong anti-bot control
# - lvtu (64365): currently reachable directly, no obvious strong anti-bot
#   control
start_job "dls.py" "direct_dls.log" "大律师(直连)"
start_job "dls_pc.py" "direct_dls_pc.log" "大律师PC站(直连)"
start_job "hualv.py" "direct_hualv.log" "华律(直连)"
start_job "six4365.py" "direct_six4365.log" "律图(直连)"
|
||||||
Executable
+53
@@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch every crawler in the background with the proxy forced on, one log
# file per crawler. Crawlers that are already running are skipped.
set -euo pipefail

# Run from the script's own directory so relative paths resolve.
cd "$(dirname "$0")"

# Force the proxy on: intended for sites that rate-limit or block easily.
export PROXY_ENABLED=1

# Conservative defaults in proxy mode to avoid overloading the proxy and
# triggering anti-bot controls; caller-provided values take precedence.
: "${PROXY_MAX_REQUESTS_PER_SECOND:=5}"
: "${PROXY_MAX_CONCURRENT_REQUESTS:=5}"
export PROXY_MAX_REQUESTS_PER_SECOND PROXY_MAX_CONCURRENT_REQUESTS

# Optional: proxy connectivity test output (some crawlers print test
# information / proxy status). Off unless the caller sets it.
: "${PROXY_TEST:=0}"
export PROXY_TEST

# Print the pgrep listing of processes whose command line contains the given
# script name; prints nothing and still succeeds when there is no match.
is_job_running() {
  local name="$1"
  # Escape dots so they match literally in the pgrep regex.
  local escaped="${name//./\\.}"
  local pattern="(^|[[:space:]/])${escaped}([[:space:]]|$)"
  pgrep -af "${pattern}" || true
}

# start_job SCRIPT LOG_FILE LABEL
# Start SCRIPT under nohup with output redirected to LOG_FILE, unless a
# process matching SCRIPT is already running.
start_job() {
  local script="$1" log_file="$2" label="$3"
  local running

  running="$(is_job_running "${script}")"
  if [[ -z "${running}" ]]; then
    nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
    echo "启动 ${label}: ${script} -> ${log_file}"
    sleep 1
    return 0
  fi

  echo "跳过 ${label}: ${script} 已在运行"
  echo "${running}" | head -n 1
  return 0
}

echo "代理模式(PROXY_ENABLED=1),每周一次建议用 cron 调度"
echo "代理配置读取自 request/proxy_settings.json"
echo "每周一次代理任务 = 全量采集所有站点"

# The weekly proxy job is a full crawl:
# - strongly protected / more sensitive sites: findlaw, lawtime
# - the remaining sites run too, so every site gets at least one full
#   refresh per week
start_job "dls.py" "proxy_dls.log" "大律师(代理全量)"
start_job "dls_pc.py" "proxy_dls_pc.log" "大律师PC站(代理全量)"
start_job "findlaw.py" "proxy_findlaw.log" "找法网(代理)"
start_job "lawtime.py" "proxy_lawtime.log" "法律快车(代理)"
start_job "hualv.py" "proxy_hualv.log" "华律(代理全量)"
start_job "six4365.py" "proxy_six4365.log" "律图(代理全量)"
|
||||||
@@ -0,0 +1,565 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||||
|
|
||||||
|
import pymysql
|
||||||
|
from openpyxl import Workbook, load_workbook
|
||||||
|
from openpyxl.styles import Font
|
||||||
|
|
||||||
|
from config import DB_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class LawyerRecord:
    """Immutable snapshot of one row selected from the ``lawyer`` table."""

    id: int
    name: str
    phone: str
    law_firm: str
    province: str
    city: str
    domain: str  # source label; looked up in DOMAIN_PRIORITY when ranking
    create_time: int  # recency key when sorting; 0 when the column is empty
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class PhoneBackfill:
    """What the database knows about the phone numbers of one input row."""

    matched_phones: List[str]  # distinct phones of the matched records
    records: List[LawyerRecord]  # matched records, best-scored first
    best_name: str  # preferred lawyer name, "" when none is available
    best_law_firm: str  # preferred non-generic firm, "" when none
    best_domain: str  # source of the record that supplied the best firm
    candidate_names: List[str]  # distinct names among the matched records
    candidate_firms: List[str]  # distinct firms (non-generic ones preferred)
    candidate_domains: List[str]  # distinct sources among the matched records
|
||||||
|
|
||||||
|
|
||||||
|
# Per-source weight added to a record's score in phone_record_sort_key;
# sources missing from this table get 50 there.
DOMAIN_PRIORITY = {
    "华律": 90,
    "大律师": 85,
    "找法网": 82,
    "法律快车": 80,
    "律图": 72,
    "众法利单页": 68,
    "众法利": 66,
    "六四三六五": 64,
    "智飞律师在线": 40,
    "高德地图": 10,
}

# law_firm values treated as placeholders rather than real firm names when
# scoring records and picking a firm.
GENERIC_FIRMS = {"高德搜索"}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options ``--input`` and ``--output``."""
    cli = argparse.ArgumentParser(description="按律所名从数据库补手机号并导出对比表")
    options = (
        ("--input", "man.xlsx", "原始 xlsx 文件路径"),
        ("--output", "man_firm_phone_compare.xlsx", "输出 xlsx 文件路径"),
    )
    for flag, default, help_text in options:
        cli.add_argument(flag, default=default, help=help_text)
    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_text(value: object) -> str:
    """Return *value* as a compact key for text comparison.

    Falsy values (``None``, ``""``, ``0``) become ``""``. Full-width
    parentheses are folded to their ASCII forms and every whitespace
    character is removed.
    """
    text = str(value or "").strip()
    # Fold full-width parentheses (U+FF08 / U+FF09) to ASCII so that both
    # spellings of the same name produce the same key. The previous code
    # replaced "(" with "(" and ")" with ")", which changed nothing.
    text = text.replace("\uff08", "(").replace("\uff09", ")")
    return re.sub(r"\s+", "", text)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_firm(value: object) -> str:
    """Normalize a law-firm name and drop the listing badges embedded in it.

    The value goes through ``normalize_text`` and then loses every
    occurrence of the badge texts "本地大所" and "特色律所".
    """
    cleaned = normalize_text(value)
    for badge in ("本地大所", "特色律所"):
        cleaned = cleaned.replace(badge, "")
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_name(value: object) -> str:
    """Normalize a lawyer name and remove every "律师" occurrence from it."""
    pieces = normalize_text(value).split("律师")
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_province(value: object) -> str:
    """Reduce a province-level name to its short form.

    Municipalities, autonomous regions and special administrative regions
    use a fixed alias table; any other name longer than one character loses
    a trailing "省". Everything else is returned stripped but unchanged.
    """
    short_names = {
        "北京市": "北京",
        "天津市": "天津",
        "上海市": "上海",
        "重庆市": "重庆",
        "内蒙古自治区": "内蒙古",
        "广西壮族自治区": "广西",
        "宁夏回族自治区": "宁夏",
        "新疆维吾尔自治区": "新疆",
        "西藏自治区": "西藏",
        "香港特别行政区": "香港",
        "澳门特别行政区": "澳门",
        "新疆生产建设兵团": "新疆",
    }
    province = str(value or "").strip()
    alias = short_names.get(province)
    if alias is not None:
        return alias
    has_suffix = len(province) > 1 and province.endswith("省")
    return province[:-1] if has_suffix else province
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_city(value: object) -> str:
    """Strip one administrative suffix ("市", "地区" or "盟") from a city name.

    Suffixes are tried in that order and only the first applicable one is
    removed. A name consisting solely of a suffix is left unchanged.
    """
    city = str(value or "").strip()
    applicable = (
        suffix
        for suffix in ("市", "地区", "盟")
        if len(city) > len(suffix) and city.endswith(suffix)
    )
    suffix = next(applicable, None)
    if suffix is None:
        return city
    return city[: len(city) - len(suffix)]
|
||||||
|
|
||||||
|
|
||||||
|
def split_phones(value: object) -> List[str]:
    """Extract every 11-digit run starting with ``1`` from *value*.

    The value is converted with ``str``; falsy values yield an empty list.
    Matches are returned in order of appearance, duplicates included.
    """
    raw = str(value) if value else ""
    return [match.group(0) for match in re.finditer(r"1\d{10}", raw)]
|
||||||
|
|
||||||
|
|
||||||
|
def unique_phones(records: Sequence[LawyerRecord]) -> List[str]:
    """Return the distinct non-empty phones of *records*, newest record first.

    Records are ordered by ``(create_time, id)`` descending; each phone
    keeps the position of its first appearance in that order.
    """
    newest_first = sorted(
        records,
        key=lambda item: (item.create_time, item.id),
        reverse=True,
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    ordered = dict.fromkeys(rec.phone for rec in newest_first if rec.phone)
    return list(ordered)
|
||||||
|
|
||||||
|
|
||||||
|
def unique_values(records: Sequence[LawyerRecord], attr: str) -> List[str]:
    """Return the distinct truthy values of attribute *attr*, newest first.

    Records are ordered by ``(create_time, id)`` descending; a record that
    lacks the attribute contributes nothing.
    """
    newest_first = sorted(
        records,
        key=lambda item: (item.create_time, item.id),
        reverse=True,
    )
    values = (getattr(rec, attr, "") for rec in newest_first)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(value for value in values if value))
|
||||||
|
|
||||||
|
|
||||||
|
def phone_record_sort_key(
    record: LawyerRecord,
    target_name: object,
    target_province: object,
    target_city: object,
) -> Tuple[int, int, int]:
    """Build the ranking key ``(score, create_time, id)`` for one record.

    The score rewards agreement with the target name, city and province,
    rewards records that carry a name and a (preferably non-generic) law
    firm, and adds the source weight from ``DOMAIN_PRIORITY``. Higher keys
    rank better.
    """

    def agreement(target: str, actual: str, raw: str, bonus: int, penalty: int) -> int:
        # No target: neutral. Equal after normalization: bonus. A different,
        # non-empty value: penalty. An empty record value: neutral.
        if not target:
            return 0
        if actual == target:
            return bonus
        return -penalty if raw else 0

    if not record.law_firm:
        firm_points = 0
    elif record.law_firm in GENERIC_FIRMS:
        firm_points = 40
    else:
        firm_points = 220

    score = sum(
        (
            agreement(normalize_name(target_name), normalize_name(record.name), record.name, 400, 40),
            firm_points,
            100 if record.name else 0,
            agreement(normalize_city(target_city), normalize_city(record.city), record.city, 45, 10),
            agreement(
                normalize_province(target_province),
                normalize_province(record.province),
                record.province,
                25,
                5,
            ),
            DOMAIN_PRIORITY.get(record.domain, 50),
        )
    )
    return score, record.create_time, record.id
|
||||||
|
|
||||||
|
|
||||||
|
def compare_result(original_phones: Sequence[str], candidate_phones: Sequence[str]) -> str:
    """Classify how the candidate phones relate to the original phones.

    Returns "未匹配" when there are no candidates, "原手机号为空" when there
    are candidates but no original phone, "完全一致" when both sets are
    equal, "候选包含原手机号" when they merely overlap, and "不包含原手机号"
    when they are disjoint.
    """
    if not candidate_phones:
        return "未匹配"
    if not original_phones:
        return "原手机号为空"

    originals = frozenset(original_phones)
    candidates = frozenset(candidate_phones)
    if originals == candidates:
        return "完全一致"
    return "不包含原手机号" if originals.isdisjoint(candidates) else "候选包含原手机号"
|
||||||
|
|
||||||
|
|
||||||
|
def infer_firm_from_address(address: object, ordered_firms: Sequence[str]) -> str:
    """Return the first known firm name that occurs inside *address*.

    The address is normalized with ``normalize_text``. Firm names shorter
    than four characters are ignored, and ``ordered_firms`` is scanned in
    the given order. Returns ``""`` when nothing matches.
    """
    haystack = normalize_text(address)
    if not haystack:
        return ""
    hits = (firm for firm in ordered_firms if len(firm) >= 4 and firm in haystack)
    return next(hits, "")
|
||||||
|
|
||||||
|
|
||||||
|
def load_db_indexes() -> Tuple[Dict[str, List[LawyerRecord]], List[str], Dict[str, List[LawyerRecord]]]:
    """Load every lawyer row that has a phone and index the rows.

    Returns:
        ``(firm_index, ordered_firms, phone_index)``: records grouped by
        normalized firm name, those firm names sorted longest first, and
        records grouped by phone. Records whose normalized firm name is
        empty appear only in ``phone_index``.
    """
    query = """
        SELECT id, name, phone, law_firm, province, city, domain, create_time
        FROM lawyer
        WHERE phone IS NOT NULL
          AND phone <> ''
    """
    connection = pymysql.connect(**DB_CONFIG)
    by_firm: Dict[str, List[LawyerRecord]] = defaultdict(list)
    by_phone: Dict[str, List[LawyerRecord]] = defaultdict(list)
    try:
        with connection.cursor() as cursor:
            cursor.execute(query)
            for row in cursor.fetchall():
                # Text columns: NULL becomes "", surrounding blanks removed.
                name, phone, law_firm, province, city, domain = (
                    str(row[position] or "").strip() for position in range(1, 7)
                )
                entry = LawyerRecord(
                    id=int(row[0]),
                    name=name,
                    phone=phone,
                    law_firm=law_firm,
                    province=province,
                    city=city,
                    domain=domain,
                    create_time=int(row[7] or 0),
                )
                by_phone[entry.phone].append(entry)
                firm_key = normalize_firm(entry.law_firm)
                if firm_key:
                    by_firm[firm_key].append(entry)
    finally:
        connection.close()

    # Longest names first, so a scan over this list meets the most specific
    # firm name before any shorter name contained in it.
    longest_first = sorted(by_firm.keys(), key=len, reverse=True)
    return by_firm, longest_first, by_phone
|
||||||
|
|
||||||
|
|
||||||
|
def build_phone_backfill(
    original_phone: object,
    name: object,
    province: object,
    city: object,
    phone_index: Dict[str, List[LawyerRecord]],
) -> PhoneBackfill:
    """Collect what the database knows about the phones of one input row.

    Every phone extracted from *original_phone* is looked up in
    *phone_index*. The matched records are de-duplicated by id, ranked with
    ``phone_record_sort_key`` and summarised into a ``PhoneBackfill``.
    """
    # De-duplicate by record id while keeping first-seen order.
    hits: Dict[int, LawyerRecord] = {}
    for phone in split_phones(original_phone):
        for rec in phone_index.get(phone, []):
            hits.setdefault(rec.id, rec)

    ranked = sorted(
        hits.values(),
        key=lambda item: phone_record_sort_key(item, name, province, city),
        reverse=True,
    )

    with_firm = [rec for rec in ranked if rec.law_firm]
    specific = [rec for rec in with_firm if rec.law_firm not in GENERIC_FIRMS]

    candidate_names = unique_values(ranked, "name")
    # Generic firm names are listed only when nothing more specific exists.
    candidate_firms = unique_values(specific, "law_firm") or unique_values(with_firm, "law_firm")
    candidate_domains = unique_values(ranked, "domain")
    matched_phones = unique_values(ranked, "phone")

    # Best name: the best-ranked record agreeing with the target name,
    # otherwise the best-ranked record that has any name.
    wanted_name = normalize_name(name)
    named = [rec for rec in ranked if rec.name]
    best_name = ""
    if wanted_name:
        best_name = next(
            (rec.name for rec in named if normalize_name(rec.name) == wanted_name),
            "",
        )
    if not best_name and named:
        best_name = named[0].name

    # Best firm: first specific firm whose record agrees with the preferred
    # name, otherwise the best-ranked specific firm.
    preferred_name = wanted_name or normalize_name(best_name)
    firm_source = next(
        (
            rec
            for rec in specific
            if not preferred_name or normalize_name(rec.name) == preferred_name
        ),
        None,
    )
    if firm_source is None and specific:
        firm_source = specific[0]

    best_law_firm = ""
    best_domain = ""
    if firm_source is not None:
        best_law_firm = firm_source.law_firm
        best_domain = firm_source.domain
    if not best_domain and ranked:
        best_domain = ranked[0].domain

    return PhoneBackfill(
        matched_phones=matched_phones,
        records=ranked,
        best_name=best_name,
        best_law_firm=best_law_firm,
        best_domain=best_domain,
        candidate_names=candidate_names,
        candidate_firms=candidate_firms,
        candidate_domains=candidate_domains,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def match_row(
    name: object,
    original_phone: object,
    law_firm: object,
    province: object,
    city: object,
    address: object,
    phone_backfill: PhoneBackfill,
    firm_index: Dict[str, List[LawyerRecord]],
    ordered_firms: Sequence[str],
) -> Tuple[str, str, List[LawyerRecord]]:
    """Find the database records that belong to one input row.

    The firm used for the lookup is the row's own firm, else the firm
    recovered from the phone backfill, else a firm name found inside the
    address. The firm's records are then narrowed by name and, while the
    remaining records still do not resolve to exactly one phone, by
    province+city, city and province. A narrowing step is applied only when
    it leaves at least one record.

    Returns:
        ``(firm_key, method, records)``. ``method`` lists the criteria that
        were applied, prefixed when the firm was backfilled or inferred; it
        holds an explanation instead when no firm is available or the firm
        is unknown to the database, in which case ``records`` is empty.
        ``original_phone`` is accepted but not used.
    """
    firm_key = normalize_firm(law_firm)
    prefix = ""
    if not firm_key:
        firm_key = normalize_firm(phone_backfill.best_law_firm)
        if firm_key:
            prefix = "手机号回填律所|"
        else:
            firm_key = infer_firm_from_address(address, ordered_firms)
            if not firm_key:
                return "", "无可用律所名", []
            prefix = "地址推断律所|"

    pool = firm_index.get(firm_key, [])
    if not pool:
        return firm_key, "数据库无此律所", []

    chosen = list(pool)
    steps = ["律所"]

    def narrow(keep, *labels: str) -> None:
        # Apply the filter only when it does not eliminate every record.
        nonlocal chosen
        subset = [item for item in chosen if keep(item)]
        if not subset:
            return
        chosen = subset
        for label in labels:
            if label not in steps:
                steps.append(label)

    def ambiguous() -> bool:
        return len(unique_phones(chosen)) != 1

    wanted_name = normalize_name(name) or normalize_name(phone_backfill.best_name)
    if wanted_name:
        narrow(lambda item: normalize_name(item.name) == wanted_name, "姓名")

    if ambiguous():
        wanted_province = normalize_province(province)
        wanted_city = normalize_city(city)

        def same_province(item: LawyerRecord) -> bool:
            return normalize_province(item.province) == wanted_province

        def same_city(item: LawyerRecord) -> bool:
            return normalize_city(item.city) == wanted_city

        if wanted_province and wanted_city:
            narrow(lambda item: same_province(item) and same_city(item), "省份", "城市")
        if wanted_city and ambiguous():
            narrow(same_city, "城市")
        if wanted_province and ambiguous():
            narrow(same_province, "省份")

    return firm_key, prefix + "+".join(steps), chosen
|
||||||
|
|
||||||
|
|
||||||
|
def autosize_columns(ws) -> None:
    """Set each column's width from the widest value it contains.

    Width is measured in display cells: East Asian wide and full-width
    characters count as two, everything else as one, so columns of Chinese
    text are not set too narrow. The measured width is capped at 60 and
    padded by 2. ``None`` cells count as empty, and a column without cells
    is skipped.

    Args:
        ws: Worksheet-like object exposing ``columns`` and
            ``column_dimensions``.
    """
    from unicodedata import east_asian_width

    def display_width(text: str) -> int:
        # "W" (wide) and "F" (full-width) characters take two cells.
        return sum(2 if east_asian_width(char) in ("W", "F") else 1 for char in text)

    for column_cells in ws.columns:
        if not column_cells:
            continue
        widest = max(
            display_width("" if cell.value is None else str(cell.value))
            for cell in column_cells
        )
        letter = column_cells[0].column_letter
        ws.column_dimensions[letter].width = min(widest, 60) + 2
|
||||||
|
|
||||||
|
|
||||||
|
def iter_input_rows(ws) -> Iterable[Tuple[int, List[object]]]:
    """Yield ``(row_number, values)`` for every row of the worksheet.

    Rows are numbered from 1 up to ``ws.max_row`` (row 1 is included), and
    ``values`` holds the contents of the first seven columns.
    """
    last_row = ws.max_row
    for row_number in range(1, last_row + 1):
        cells = (ws.cell(row_number, column) for column in range(1, 8))
        yield row_number, [cell.value for cell in cells]
|
||||||
|
|
||||||
|
|
||||||
|
def build_output(input_path: str, output_path: str) -> Dict[str, int]:
    """Match every input row against the database and write the comparison sheet.

    Reads the active sheet of *input_path* (seven columns: name, phone, law
    firm, province, city, address, remark), looks each row up in the
    indexes from ``load_db_indexes`` and writes one output row per input
    row to *output_path*. The phone-backfill columns are filled only for
    rows that have no usable law firm of their own.

    Returns:
        Counters describing the run; a counter is present only when it was
        incremented at least once.
    """
    source_ws = load_workbook(input_path).active
    firm_index, ordered_firms, phone_index = load_db_indexes()

    out_wb = Workbook()
    out_ws = out_wb.active
    out_ws.title = "firm_phone_compare"
    out_ws.append(
        [
            "原始行号",
            "原姓名",
            "原手机号",
            "原律所",
            "原省份",
            "原城市",
            "原地址",
            "原备注",
            "手机号命中记录数",
            "手机号命中手机号",
            "手机号补全姓名",
            "手机号补全律所",
            "手机号补全来源",
            "手机号候选姓名",
            "手机号候选律所",
            "用于匹配的律所",
            "匹配方式",
            "数据库候选手机号",
            "候选数量",
            "原手机号对比",
            "数据库候选姓名",
            "数据库候选省市",
            "数据库来源",
        ]
    )
    for header_cell in out_ws[1]:
        header_cell.font = Font(bold=True)

    # Counter bumped for each comparison verdict; other verdicts have none.
    compare_counters = {
        "完全一致": "same_rows",
        "候选包含原手机号": "contains_rows",
        "不包含原手机号": "diff_rows",
        "原手机号为空": "blank_phone_rows",
    }

    stats = defaultdict(int)
    for row_idx, row in iter_input_rows(source_ws):
        name, original_phone, law_firm, province, city, address, remark = row
        needs_phone_completion = not normalize_firm(law_firm)

        phone_backfill = build_phone_backfill(
            original_phone=original_phone,
            name=name,
            province=province,
            city=city,
            phone_index=phone_index,
        )
        matched_firm, method, matched_records = match_row(
            name=name,
            original_phone=original_phone,
            law_firm=law_firm,
            province=province,
            city=city,
            address=address,
            phone_backfill=phone_backfill,
            firm_index=firm_index,
            ordered_firms=ordered_firms,
        )

        candidate_phones = unique_phones(matched_records)
        compare = compare_result(split_phones(original_phone), candidate_phones)
        candidate_names = unique_values(matched_records, "name")
        candidate_domains = unique_values(matched_records, "domain")

        # Distinct "province-city" labels in record order.
        region_labels = (
            f"{record.province}-{record.city}".strip("-") for record in matched_records
        )
        city_province_pairs = list(dict.fromkeys(label for label in region_labels if label))

        if needs_phone_completion:
            backfill_cells = [
                len(phone_backfill.records),
                " / ".join(phone_backfill.matched_phones),
                phone_backfill.best_name,
                phone_backfill.best_law_firm,
                phone_backfill.best_domain,
                " / ".join(phone_backfill.candidate_names),
                " / ".join(phone_backfill.candidate_firms),
            ]
        else:
            backfill_cells = [""] * 7

        original_cells = [
            row_idx,
            name or "",
            original_phone or "",
            law_firm or "",
            province or "",
            city or "",
            address or "",
            remark or "",
        ]
        match_cells = [
            matched_firm or "",
            method or "",
            " / ".join(candidate_phones) or "",
            len(candidate_phones),
            compare,
            " / ".join(candidate_names) or "",
            " / ".join(city_province_pairs) or "",
            " / ".join(candidate_domains) or "",
        ]
        out_ws.append(original_cells + backfill_cells + match_cells)

        if needs_phone_completion:
            if phone_backfill.records:
                stats["phone_backfill_hit_rows"] += 1
            if phone_backfill.best_name:
                stats["phone_backfill_name_rows"] += 1
            if phone_backfill.best_law_firm:
                stats["phone_backfill_firm_rows"] += 1
            if method.startswith("手机号回填律所|"):
                stats["phone_backfill_used_for_match_rows"] += 1

        if not candidate_phones:
            stats["unmatched_rows"] += 1
        else:
            stats["matched_rows"] += 1
            stats["unique_rows" if len(candidate_phones) == 1 else "multi_rows"] += 1

        counter = compare_counters.get(compare)
        if counter is not None:
            stats[counter] += 1

    out_ws.freeze_panes = "A2"
    autosize_columns(out_ws)
    out_wb.save(output_path)
    return dict(stats)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point.

    Parses the CLI arguments, runs ``build_output`` on the input/output
    paths, then prints a confirmation line followed by one ``key=value``
    line per returned statistic, in sorted key order.
    """
    cli_args = parse_args()
    summary = build_output(cli_args.input, cli_args.output)

    print(f"已生成: {cli_args.output}")
    for stat_name in sorted(summary):
        stat_value = summary[stat_name]
        print(f"{stat_name}={stat_value}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -7,48 +7,43 @@ DB_CONFIG = {
|
|||||||
"charset": "utf8mb4",
|
"charset": "utf8mb4",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Gaode (AMap) Maps API configuration.
# SECURITY NOTE(review): the API key below is committed in plain text.
# Prefer supplying it through the AMAP_API_KEY environment variable and
# keeping this value empty in version control; consider rotating this key.
GAODE_CONFIG = {
    "API_KEY": "f261575fb28003761c433f6c9379e89d",
}
|
||||||
|
|
||||||
# 微信爬虫特定的配置
|
# 微信爬虫特定的配置
|
||||||
WEIXIN_CONFIG = {
|
WEIXIN_CONFIG = {
|
||||||
"TOKEN": "756858506", # 您的Token
|
"TOKEN": "553117235", # 您的Token
|
||||||
"FINGERPRINT": "1caa5fc52ac489e20a175e153dd3ef21",
|
"FINGERPRINT": "3c02c35093184e9a9a668ac3c81e53f9",
|
||||||
"COOKIE": {
|
"COOKIE": {
|
||||||
"appmsglist_action_3258147150": "card",
|
"appmsglist_action_3258147150": "card",
|
||||||
|
"_qimei_uuid42": "1a302160d051008226aec905b63f99ff3989f30009",
|
||||||
|
"_qimei_i_3": "63b22b84c15204dfc595ac6452d722b1f0bdf0f6145b568ae68a7c0e70947438686637943989e2a1d792",
|
||||||
|
"_qimei_h38": "215986ce26aec905b63f99ff0200000e81a302",
|
||||||
|
"ua_id": "S7gglu0eZh9NkAzLAAAAADH8dynpnFZVN29lxm7BQo0=",
|
||||||
|
"wxuin": "73074968761097",
|
||||||
"mm_lang": "zh_CN",
|
"mm_lang": "zh_CN",
|
||||||
"ts_uid": "8295434560",
|
"eas_sid": "91X7I7K4K5k364U2z3k2I980F5",
|
||||||
"markHashId_L": "417c7f0e-5d9f-4048-b844-28f78ed2a838",
|
"_qimei_q36": "",
|
||||||
"_qimei_uuid42": "19b0d0b0c2d100de3df57d2afbc5018a9b4ae103e1",
|
"_qimei_fingerprint": "d895c46d5fda98cab67d9daec00068ed",
|
||||||
"_qimei_i_3": "59c5508a935b04dac7c1ab340fd172b5a5eba4f7160d5683e2867a5a7094713e616364943989e2a29e9f",
|
"_qimei_i_1": "4dc76680945f59d3c7c4ab325dd526b3feeea1a31458558bbdd97e582493206c6163629d39d8e1dcd4b2c28f",
|
||||||
"_qimei_h38": "b885c955f8e9995f103aac140200000421811e",
|
"pgv_pvid": "6923507145",
|
||||||
"_qimei_i_1": "4ddd76d09d525588c892fb6653d17ae9feebf2f0125852d3e78e2c582493206c616333973981e3dd83abc2e0",
|
"ts_uid": "9585717820",
|
||||||
"_qpsvr_localtk": "0.2780749298744084",
|
"_t_qbtool_uid": "aaaa2vn5byd280l00iglw701zci788cb",
|
||||||
"RK": "ZGEMOpzbOS",
|
"_ga": "GA1.1.1323926288.1775838938",
|
||||||
"ptcz": "90084a2b43c84a92d1b9082da98fd0e92369fcde4f2edbbc85661539c7917055",
|
"_ga_TPFW0KPXC1": "GS2.1.s1775841484$o2$g1$t1775841485$j59$l0$h0",
|
||||||
"pac_uid": "0_HXj3iphPm0Y4a",
|
"uuid": "20d1cfb540221c6e7b6d665ab1d4a8f7",
|
||||||
"_qimei_fingerprint": "bd1870aaecd7a9bb84aa53b9ad9a2c55",
|
"rand_info": "CAESIA8LYV6dvWh5dYrgQLPhZb8TXwUJoWdcdDzN0TTdztSj",
|
||||||
"rewardsn": "",
|
|
||||||
"wxtokenkey": "777",
|
|
||||||
"omgid": "0_HXj3iphPm0Y4a",
|
|
||||||
"sig_login": "h01218fdccf5b63c15a6c5edb19ce20d0481c52723ee44ab56b9fc1415ff39c9ff0dd2000e12f1de8ae",
|
|
||||||
"ua_id": "QXSOTQUjDFjoH63yAAAAAPILc15EwzRTwdqntEiCGSE=",
|
|
||||||
"mp_token": "1331492699",
|
|
||||||
"appletToken": "2105598806",
|
|
||||||
"__wx_phantom_mark__": "breQbE92JS",
|
|
||||||
"mmad_session": "2bd2e1824d701b521c16fa35de0378e55273ce93a68ac0cc9ca30e8ad5b2e9f6fc419dd5fed1cd17f0a57fc3c327e03ccf325c1e1e97dde41374a9d8067d9aa700c8b87a29b0d3caf7f949761d8f4eeb56a1e3ddbc5a5d3a573e5b83971cd92e11de1c56c245721266e7088080fefde3",
|
|
||||||
"pgv_info": "ssid=s5739471549",
|
|
||||||
"pgv_pvid": "2616937300",
|
|
||||||
"_gcl_au": "1.1.954868153.1769494261",
|
|
||||||
"wxuin": "69676812527831",
|
|
||||||
"_clck": "3258147150|1|g35|0",
|
|
||||||
"uuid": "e07aa2889db56b1901e1fb6b1286d9a7",
|
|
||||||
"rand_info": "CAESIBnfIxLJoUVe5wP4SI/ADWnrnPUBlJDb4yyA7Et1+ZfF",
|
|
||||||
"slave_bizuin": "3258147150",
|
"slave_bizuin": "3258147150",
|
||||||
"data_bizuin": "3258147150",
|
"data_bizuin": "3258147150",
|
||||||
"bizuin": "3258147150",
|
"bizuin": "3258147150",
|
||||||
"data_ticket": "kv+SnLJADgPlcKQPIbYnfbEAxogpIMfAo/n0/HjtChnfDmQSogWvkO82/mUtzpcc",
|
"data_ticket": "dgLFmSrI8f1q6JnYOd2Y/sKJIWjh6YlLSau1n1+Mv5iOTR5hgsm1qjNLypWflGd6",
|
||||||
"slave_sid": "eFNMcEZ3bnhvRkppZVNkTDE4dFFnM0ZzdFM1REhpemZORHRnVnlnRHhKU29vY1ZBY0dJZkFHcXB5Nko4aV9pbVlnRTBRVDE0NzdIUDF4T3NTSDVzdXBJS2d3WFFuR3hiMWVVbG5ZTURfYmh3YTFTallIb2JXOWpyTWxXS25jbVFRVmtXWHVaWGdCN2lqZzVm",
|
"slave_sid": "VGVnNmM5NmFpV19ESElmVlZOTGZfVVJfWE5HanlHNjN0WEswZVkxVk9vc2FTenQzVGRsWUxDT0xGQVBJRVZzU0JNVV9RckRJVE9jSVUwbjl4Z2VHaEZKSzE5WVc3THRCRW96T0Z1V1VwbnBLSnkxSWdKaHdaN1dYdzI1SmdpZ0IyOFJtUE45OTR2Q2NvM1FB",
|
||||||
"slave_user": "gh_fe76760560d0",
|
"slave_user": "gh_fe76760560d0",
|
||||||
"xid": "34f577adf2c28e5b9f04de93c614c5c4",
|
"xid": "4893c62dc8518b6a1628fd34bc9aa276",
|
||||||
"_clsk": "639w4k|1769742296130|3|1|mp.weixin.qq.com/weheat-agent/payload/record"
|
"_clck": "3258147150|1|g5g|0",
|
||||||
|
"_clsk": "1p4oo3h|1776957001796|5|1|mp.weixin.qq.com/weheat-agent/payload/record"
|
||||||
},
|
},
|
||||||
"COUNT": 20, # 单页条数
|
"COUNT": 20, # 单页条数
|
||||||
"REQUESTS_PER_SECOND": 8, # 每秒最大请求数(调高更快,但有风控风险)
|
"REQUESTS_PER_SECOND": 8, # 每秒最大请求数(调高更快,但有风控风险)
|
||||||
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
# Docker Compose definition of the project's MongoDB service.
services:
  mongodb:
    image: mongo:7
    container_name: lawyers_mongodb
    restart: always
    # NOTE(review): this publishes the MongoDB port on the host. Without a
    # host IP prefix Docker presumably binds all interfaces; confirm, and
    # consider "127.0.0.1:27017:27017" if only local access is needed.
    ports:
      - "27017:27017"
    volumes:
      # Named volume (declared at the bottom) mounted at the data directory.
      - mongodb_data:/data/db
    environment:
      # Only the initial database name is set; no root username/password is
      # configured in this file. NOTE(review): presumably the instance then
      # starts without access control; verify against the image docs.
      MONGO_INITDB_DATABASE: lawyer

volumes:
  mongodb_data:
|
||||||
@@ -23,6 +23,7 @@ if project_root not in sys.path:
|
|||||||
sys.path.append(project_root)
|
sys.path.append(project_root)
|
||||||
|
|
||||||
from Db import Db # 你的 DB 封装
|
from Db import Db # 你的 DB 封装
|
||||||
|
import config as project_config
|
||||||
|
|
||||||
# logging 配置
|
# logging 配置
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@@ -45,9 +46,14 @@ class GaodeSpider:
|
|||||||
sleep_between_cities: float = 3.0,
|
sleep_between_cities: float = 3.0,
|
||||||
):
|
):
|
||||||
self.db = db_connection
|
self.db = db_connection
|
||||||
self.api_key = (api_key or os.environ.get("AMAP_API_KEY", "")).strip()
|
config_api_key = ""
|
||||||
|
gaode_config = getattr(project_config, "GAODE_CONFIG", None)
|
||||||
|
if isinstance(gaode_config, dict):
|
||||||
|
config_api_key = str(gaode_config.get("API_KEY", "")).strip()
|
||||||
|
|
||||||
|
self.api_key = (api_key or os.environ.get("AMAP_API_KEY", "") or config_api_key).strip()
|
||||||
if not self.api_key:
|
if not self.api_key:
|
||||||
raise ValueError("AMAP_API_KEY environment variable is required")
|
raise ValueError("高德 API Key 未配置,请在 config.py 的 GAODE_CONFIG.API_KEY 或环境变量 AMAP_API_KEY 中填写")
|
||||||
self.api_base = "https://restapi.amap.com/v3/place/text"
|
self.api_base = "https://restapi.amap.com/v3/place/text"
|
||||||
self.offset = offset
|
self.offset = offset
|
||||||
self.session = self._build_session()
|
self.session = self._build_session()
|
||||||
@@ -369,7 +375,7 @@ class GaodeSpider:
|
|||||||
return
|
return
|
||||||
|
|
||||||
total_stored = 0
|
total_stored = 0
|
||||||
keywords_suffix = "律所"
|
keywords_suffix = "律师"
|
||||||
|
|
||||||
for city_id, city_info in self.cities.items():
|
for city_id, city_info in self.cities.items():
|
||||||
try:
|
try:
|
||||||
|
|||||||
Binary file not shown.
+42
-2
@@ -24,6 +24,19 @@ def _normalize_bool(value, default: bool = True) -> bool:
|
|||||||
return text not in ("0", "false", "no", "off", "")
|
return text not in ("0", "false", "no", "off", "")
|
||||||
|
|
||||||
|
|
||||||
|
def _env_proxy_override() -> Optional[bool]:
|
||||||
|
"""
|
||||||
|
环境变量覆盖代理开关:
|
||||||
|
- PROXY_ENABLED 未设置:返回 None(不覆盖,仍读取 proxy_settings.json)
|
||||||
|
- PROXY_ENABLED=0/false/off:强制关闭代理
|
||||||
|
- PROXY_ENABLED=1/true/on:强制开启代理(前提是配置字段齐全)
|
||||||
|
"""
|
||||||
|
raw = os.getenv("PROXY_ENABLED")
|
||||||
|
if raw is None:
|
||||||
|
return None
|
||||||
|
return _normalize_bool(raw, True)
|
||||||
|
|
||||||
|
|
||||||
def _load_config() -> Dict[str, str]:
|
def _load_config() -> Dict[str, str]:
|
||||||
if not os.path.exists(CONFIG_PATH):
|
if not os.path.exists(CONFIG_PATH):
|
||||||
return dict(DEFAULT_CONFIG)
|
return dict(DEFAULT_CONFIG)
|
||||||
@@ -48,7 +61,12 @@ def report_proxy_status() -> None:
|
|||||||
_PROXY_STATUS_REPORTED = True
|
_PROXY_STATUS_REPORTED = True
|
||||||
|
|
||||||
config = _load_config()
|
config = _load_config()
|
||||||
enabled = _normalize_bool(config.get("enabled"), True)
|
override = _env_proxy_override()
|
||||||
|
if override is False:
|
||||||
|
print("[proxy] disabled by env (PROXY_ENABLED=0)")
|
||||||
|
return
|
||||||
|
|
||||||
|
enabled = _normalize_bool(config.get("enabled"), True) if override is None else True
|
||||||
if not enabled:
|
if not enabled:
|
||||||
print("[proxy] disabled by config")
|
print("[proxy] disabled by config")
|
||||||
return
|
return
|
||||||
@@ -66,7 +84,10 @@ def get_proxies() -> Optional[Dict[str, str]]:
|
|||||||
代理配置从 proxy_settings.json 读取,不依赖环境变量。
|
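Proxy settings are read from proxy_settings.json; the on/off switch can additionally be overridden by the PROXY_ENABLED environment variable.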
|
||||||
"""
|
"""
|
||||||
config = _load_config()
|
config = _load_config()
|
||||||
if not _normalize_bool(config.get("enabled"), True):
|
override = _env_proxy_override()
|
||||||
|
if override is False:
|
||||||
|
return None
|
||||||
|
if override is None and not _normalize_bool(config.get("enabled"), True):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
tunnel = str(config.get("tunnel") or "").strip()
|
tunnel = str(config.get("tunnel") or "").strip()
|
||||||
@@ -95,3 +116,22 @@ def apply_proxy(session) -> Optional[Dict[str, str]]:
|
|||||||
|
|
||||||
|
|
||||||
__all__ = ["get_proxies", "apply_proxy", "report_proxy_status"]
|
__all__ = ["get_proxies", "apply_proxy", "report_proxy_status"]
|
||||||
|
|
||||||
|
|
||||||
|
def is_proxy_enabled() -> bool:
    """Return whether the proxy is enabled for the current process.

    The ``PROXY_ENABLED`` environment variable takes precedence: when it is
    set, its parsed value is returned directly. Only when it is not set is
    the proxy config loaded and its ``enabled`` field interpreted by
    ``_normalize_bool`` (with ``True`` passed as the default).

    Returns:
        ``True`` if the proxy should be used, ``False`` otherwise.
    """
    override = _env_proxy_override()
    if override is not None:
        # The environment decides on its own; skip loading the config so an
        # explicit override never depends on reading the settings file.
        return override
    return _normalize_bool(_load_config().get("enabled"), True)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["get_proxies", "apply_proxy", "report_proxy_status", "is_proxy_enabled"]
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from pathlib import Path
|
|||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
import fcntl
|
import fcntl
|
||||||
|
from request.proxy_config import is_proxy_enabled
|
||||||
|
|
||||||
|
|
||||||
class RateLimiter:
|
class RateLimiter:
|
||||||
@@ -167,13 +168,27 @@ global_rate_limiter = RateLimiter(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _should_limit_proxy_requests() -> bool:
|
||||||
|
"""
|
||||||
|
仅在当前进程实际启用代理时启用全局代理限流。
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return is_proxy_enabled()
|
||||||
|
except Exception:
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def wait_for_request():
    """Wait until a request may be issued.

    Does nothing when proxy rate limiting does not apply to this process;
    otherwise acquires from the global rate limiter.
    """
    if _should_limit_proxy_requests():
        global_rate_limiter.acquire()
|
||||||
|
|
||||||
|
|
||||||
def can_request_now() -> bool:
    """Check whether a request could be issued right now.

    Returns ``True`` unconditionally when proxy rate limiting does not apply
    to this process; otherwise returns the global rate limiter's answer.
    """
    limiting_active = _should_limit_proxy_requests()
    if limiting_active:
        return global_rate_limiter.can_make_request()
    return True
|
||||||
|
|
||||||
|
|
||||||
@@ -184,6 +199,10 @@ def request_slot():
|
|||||||
|
|
||||||
这样既能限制“每秒启动多少请求”,也能限制“同时在飞多少请求”。
|
这样既能限制“每秒启动多少请求”,也能限制“同时在飞多少请求”。
|
||||||
"""
|
"""
|
||||||
|
if not _should_limit_proxy_requests():
|
||||||
|
yield
|
||||||
|
return
|
||||||
|
|
||||||
token = global_rate_limiter.try_acquire_slot()
|
token = global_rate_limiter.try_acquire_slot()
|
||||||
try:
|
try:
|
||||||
yield
|
yield
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import copy
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
@@ -26,64 +27,73 @@ DEFAULT_HEADERS = {
|
|||||||
"User-Agent": (
|
"User-Agent": (
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
"Chrome/138.0.0.0 Safari/537.36"
|
"Chrome/146.0.0.0 Safari/537.36"
|
||||||
),
|
),
|
||||||
"Accept": "*/*",
|
"Accept": "*/*",
|
||||||
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
|
||||||
|
"DNT": "1",
|
||||||
|
"Priority": "u=1, i",
|
||||||
|
"Sec-CH-UA": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"',
|
||||||
|
"Sec-CH-UA-Mobile": "?0",
|
||||||
|
"Sec-CH-UA-Platform": '"Windows"',
|
||||||
|
"Sec-Fetch-Dest": "empty",
|
||||||
|
"Sec-Fetch-Mode": "cors",
|
||||||
|
"Sec-Fetch-Site": "same-origin",
|
||||||
"X-Requested-With": "XMLHttpRequest",
|
"X-Requested-With": "XMLHttpRequest",
|
||||||
}
|
}
|
||||||
DEFAULT_WEIXIN_CONFIG = {
|
DEFAULT_WEIXIN_CONFIG = {
|
||||||
"TOKEN": "32299576",
|
"TOKEN": "609153506",
|
||||||
"FINGERPRINT": "64a1c659b8b944d6e7fe596b0794ab35",
|
"FINGERPRINT": "46a7e6ac6ccf205986adc0aa99127860",
|
||||||
"COOKIE": {
|
"COOKIE": {
|
||||||
"appmsglist_action_3876849679": "card",
|
"appmsglist_action_3258147150": "card",
|
||||||
|
"_qimei_uuid42": "1a302160d051008226aec905b63f99ff3989f30009",
|
||||||
|
"_qimei_i_3": "63b22b84c15204dfc595ac6452d722b1f0bdf0f6145b568ae68a7c0e70947438686637943989e2a1d792",
|
||||||
|
"_qimei_h38": "215986ce26aec905b63f99ff0200000e81a302",
|
||||||
|
"ua_id": "S7gglu0eZh9NkAzLAAAAADH8dynpnFZVN29lxm7BQo0=",
|
||||||
|
"wxuin": "73074968761097",
|
||||||
"mm_lang": "zh_CN",
|
"mm_lang": "zh_CN",
|
||||||
"ts_uid": "8295434560",
|
"eas_sid": "91X7I7K4K5k364U2z3k2I980F5",
|
||||||
"markHashId_L": "417c7f0e-5d9f-4048-b844-28f78ed2a838",
|
|
||||||
"_qimei_uuid42": "19b0d0b0c2d100de3df57d2afbc5018a9b4ae103e1",
|
|
||||||
"_qimei_i_3": "59c5508a935b04dac7c1ab340fd172b5a5eba4f7160d5683e2867a5a7094713e616364943989e2a29e9f",
|
|
||||||
"_qimei_h38": "b885c955f8e9995f103aac140200000421811e",
|
|
||||||
"RK": "ZGEMOpzbOS",
|
|
||||||
"ptcz": "90084a2b43c84a92d1b9082da98fd0e92369fcde4f2edbbc85661539c7917055",
|
|
||||||
"pac_uid": "0_HXj3iphPm0Y4a",
|
|
||||||
"_qimei_fingerprint": "bd1870aaecd7a9bb84aa53b9ad9a2c55",
|
|
||||||
"wxuin": "70085167371972",
|
|
||||||
"omgid": "0_HXj3iphPm0Y4a",
|
|
||||||
"rewardsn": "",
|
|
||||||
"wxtokenkey": "777",
|
|
||||||
"sig_login": "h017c22e8921e6bf5a1f8659d9f34ee0db2be31cdcf03786b9ab4b787a9821ad84d3046473d9076181a",
|
|
||||||
"_qpsvr_localtk": "0.9079082151544442",
|
|
||||||
"appletToken": "880792228",
|
|
||||||
"mmad_session": "ae5215dd3c930e6256d8f0656bd8497e719817e0df77a677766e128e2135218486f674b88b349db0d47039f54cb99c8753beb8d4b921ae452b66773db51ad3006ab1f0d19253ae83e2cb9ba53ff5b5b4f45f2fe160db66fd300a1fb4e04a92bd11de1c56c245721266e7088080fefde3",
|
|
||||||
"qq_domain_video_guid_verify": "6cce52525a146907",
|
|
||||||
"_qimei_q36": "",
|
"_qimei_q36": "",
|
||||||
"pgv_info": "ssid=s4741843528",
|
"_qimei_fingerprint": "d895c46d5fda98cab67d9daec00068ed",
|
||||||
"pgv_pvid": "9337874960",
|
"_clck": "501quy|1|g4t|0",
|
||||||
"_qimei_i_2": "47e96bdff700",
|
"uuid": "210d1c199a63afd4c774eccd9a06a27f",
|
||||||
"_qimei_i_1": "40bb51d09d525588c892fb6653d17ae9feebf2f0125852d3e78e2c582493206c616333973981e3dd838fd0da",
|
"rand_info": "CAESIE4WqrFFVVjqrrNflbCUM7wPD5NXjuGbjfHolAEsMmEm",
|
||||||
"_qimei_q32": "",
|
"slave_bizuin": "3258147150",
|
||||||
"mp_token": "1555009133",
|
"data_bizuin": "3258147150",
|
||||||
"ua_id": "390pNywJFJA6BsgOAAAAADO0TqlmW7NBB1GD0Y7OVwk=",
|
"bizuin": "3258147150",
|
||||||
"__wx_phantom_mark__": "UTRZE71JZ7",
|
"data_ticket": "tpcLjRB7B7AlUY3rFe/ILEjtCKs7dEEGsn8kXnHVzdTb9dgIpSPN1aP8FlE6FDhj",
|
||||||
"_clck": "3841887471|1|g4a|0",
|
"slave_sid": "U3hfU1Z0UV91N0U5d0lkRDhyTzh3d3hmbnBHMjBnbmFNdzVJeGlJeTJ6OTVxRjJQVVE2VkNhejYzTkxETVVSZkF3eWRORmtRS01XWFBjdnFZZWFLNjR2ZGtwdUJ2MzByclg0NjF4SHlDeVJneEhsczdSYUJVNE45VEhNRWVTQXg1dlpGdWQ0bU5VM3pnRzJN",
|
||||||
"uuid": "6ae7cb97104627c5d3b9d1d9ab2eef60",
|
"slave_user": "gh_fe76760560d0",
|
||||||
"rand_info": "CAESIGjvJyiJ58Ii0enQVKBwl6d4IyCrWeN7kzhIAVTgM2lc",
|
"xid": "ef503a6864cceaef225c615a45606e4a",
|
||||||
"slave_bizuin": "3876849679",
|
"_clsk": "12arnf1|1774975723874|4|1|mp.weixin.qq.com/weheat-agent/payload/record",
|
||||||
"data_bizuin": "3876849679",
|
"_qimei_i_1": "2ddc6a80945f59d3c7c4ab325dd526b3feeea1a31458558bbdd97e582493206c6163629d39d8e1dcd49fddc7"
|
||||||
"bizuin": "3876849679",
|
},
|
||||||
"data_ticket": "8wg11/LIrTLHAbJdbAH2HWdqlW/K2jijwP27oPSrH2myYNpuSR1NedfmSbzeq5go",
|
"COUNT": 21,
|
||||||
"slave_sid": "TjBzVV83WThEaThRdUhlcFpqRFhQejFSUzRfOWdGa0l3S0dPSW41QWdkSk9qSkQ2ZTljbWRHa0poQ1lNTXlub25WMUJORVluVU5HaFBGRXVJS19yeG53SUNWWU14YjNQeWpxTUczalBHV1dTY0V3TDZ6aE14bFNaS2ExeGNhb3J0WlRWMlM4NnNmNGFST0ZD",
|
"REFERER": "https://mp.weixin.qq.com/",
|
||||||
"slave_user": "gh_6c1283858808",
|
"HEADERS": {},
|
||||||
"xid": "116378d10877a35558158970698ca0c3",
|
"REQUEST_PARAMS": {
|
||||||
"_clsk": "3okzsf|1773282377657|6|1|mp.weixin.qq.com/weheat-agent/payload/record"
|
"action": "search",
|
||||||
|
"scene": "1",
|
||||||
|
"lang": "zh_CN",
|
||||||
|
"f": "json",
|
||||||
|
"ajax": "1",
|
||||||
},
|
},
|
||||||
"COUNT": 20,
|
|
||||||
"REQUESTS_PER_SECOND": 5,
|
"REQUESTS_PER_SECOND": 5,
|
||||||
"PAGE_DELAY": 5,
|
"PAGE_DELAY": 5,
|
||||||
"CITY_DELAY": 2,
|
"CITY_DELAY": 2,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _deep_merge_dict(base: Dict, incoming: Dict) -> Dict:
|
||||||
|
merged = copy.deepcopy(base)
|
||||||
|
for key, value in incoming.items():
|
||||||
|
if isinstance(value, dict) and isinstance(merged.get(key), dict):
|
||||||
|
merged[key] = _deep_merge_dict(merged[key], value)
|
||||||
|
else:
|
||||||
|
merged[key] = value
|
||||||
|
return merged
|
||||||
|
|
||||||
|
|
||||||
def _parse_cookie_value(cookie_value) -> Dict[str, str]:
|
def _parse_cookie_value(cookie_value) -> Dict[str, str]:
|
||||||
if isinstance(cookie_value, dict):
|
if isinstance(cookie_value, dict):
|
||||||
return {str(key): str(value) for key, value in cookie_value.items()}
|
return {str(key): str(value) for key, value in cookie_value.items()}
|
||||||
@@ -110,15 +120,16 @@ def _parse_cookie_value(cookie_value) -> Dict[str, str]:
|
|||||||
|
|
||||||
|
|
||||||
def _load_weixin_config() -> Dict:
|
def _load_weixin_config() -> Dict:
|
||||||
config = DEFAULT_WEIXIN_CONFIG.copy()
|
config = copy.deepcopy(DEFAULT_WEIXIN_CONFIG)
|
||||||
module_config = getattr(project_config, "WEIXIN_CONFIG", None)
|
module_config = getattr(project_config, "WEIXIN_CONFIG", None)
|
||||||
if isinstance(module_config, dict):
|
if isinstance(module_config, dict):
|
||||||
config.update(module_config)
|
config = _deep_merge_dict(config, module_config)
|
||||||
|
|
||||||
env_mapping = {
|
env_mapping = {
|
||||||
"TOKEN": os.getenv("WEIXIN_TOKEN"),
|
"TOKEN": os.getenv("WEIXIN_TOKEN"),
|
||||||
"FINGERPRINT": os.getenv("WEIXIN_FINGERPRINT"),
|
"FINGERPRINT": os.getenv("WEIXIN_FINGERPRINT"),
|
||||||
"COOKIE": os.getenv("WEIXIN_COOKIE"),
|
"COOKIE": os.getenv("WEIXIN_COOKIE"),
|
||||||
|
"REFERER": os.getenv("WEIXIN_REFERER"),
|
||||||
"COUNT": os.getenv("WEIXIN_COUNT"),
|
"COUNT": os.getenv("WEIXIN_COUNT"),
|
||||||
"REQUESTS_PER_SECOND": os.getenv("WEIXIN_REQUESTS_PER_SECOND"),
|
"REQUESTS_PER_SECOND": os.getenv("WEIXIN_REQUESTS_PER_SECOND"),
|
||||||
"PAGE_DELAY": os.getenv("WEIXIN_PAGE_DELAY"),
|
"PAGE_DELAY": os.getenv("WEIXIN_PAGE_DELAY"),
|
||||||
@@ -161,17 +172,32 @@ class WeixinSpider:
|
|||||||
self.fingerprint = str(self.config.get("FINGERPRINT", "")).strip()
|
self.fingerprint = str(self.config.get("FINGERPRINT", "")).strip()
|
||||||
self.cookies = self.config.get("COOKIE", {})
|
self.cookies = self.config.get("COOKIE", {})
|
||||||
self.count = str(self.config.get("COUNT", DEFAULT_WEIXIN_CONFIG["COUNT"]))
|
self.count = str(self.config.get("COUNT", DEFAULT_WEIXIN_CONFIG["COUNT"]))
|
||||||
|
self.referer = str(self.config.get("REFERER", DEFAULT_WEIXIN_CONFIG["REFERER"])).strip()
|
||||||
|
self.request_params = {
|
||||||
|
str(key): str(value)
|
||||||
|
for key, value in (self.config.get("REQUEST_PARAMS", {}) or {}).items()
|
||||||
|
if value is not None
|
||||||
|
}
|
||||||
self.page_delay = max(0.0, float(self.config.get("PAGE_DELAY", DEFAULT_WEIXIN_CONFIG["PAGE_DELAY"])))
|
self.page_delay = max(0.0, float(self.config.get("PAGE_DELAY", DEFAULT_WEIXIN_CONFIG["PAGE_DELAY"])))
|
||||||
self.city_delay = max(0.0, float(self.config.get("CITY_DELAY", DEFAULT_WEIXIN_CONFIG["CITY_DELAY"])))
|
self.city_delay = max(0.0, float(self.config.get("CITY_DELAY", DEFAULT_WEIXIN_CONFIG["CITY_DELAY"])))
|
||||||
max_rps = self.config.get("REQUESTS_PER_SECOND")
|
max_rps = self.config.get("REQUESTS_PER_SECOND")
|
||||||
if max_rps:
|
if max_rps:
|
||||||
global_rate_limiter.max_requests = int(max_rps)
|
global_rate_limiter.max_requests = int(max_rps)
|
||||||
|
|
||||||
headers = getattr(project_config, "HEADERS", DEFAULT_HEADERS).copy()
|
headers = DEFAULT_HEADERS.copy()
|
||||||
headers["Referer"] = "https://mp.weixin.qq.com/"
|
project_headers = getattr(project_config, "HEADERS", None)
|
||||||
|
if isinstance(project_headers, dict):
|
||||||
|
headers.update(project_headers)
|
||||||
|
config_headers = self.config.get("HEADERS", {})
|
||||||
|
if isinstance(config_headers, dict):
|
||||||
|
headers.update({str(key): str(value) for key, value in config_headers.items()})
|
||||||
|
if self.referer:
|
||||||
|
headers["Referer"] = self.referer
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
self.session.trust_env = False
|
self.session.trust_env = False
|
||||||
self.session.headers.update(headers)
|
self.session.headers.update(headers)
|
||||||
|
if self.cookies:
|
||||||
|
self.session.cookies.update(self.cookies)
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
def _validate_runtime_config(self) -> bool:
|
def _validate_runtime_config(self) -> bool:
|
||||||
@@ -214,18 +240,14 @@ class WeixinSpider:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
def _build_query_url(self, query: str, buffer: str) -> str:
    """Build the search request URL for the given query and buffer.

    Starts from the configured ``self.request_params`` and then sets the
    per-request fields (query, count, buffer, fingerprint, token), which
    override any same-named configured parameter. The result is the API
    endpoint followed by the URL-encoded parameters.
    """
    per_request_fields = {
        "query": query,
        "count": self.count,
        "buffer": buffer,
        "fingerprint": self.fingerprint,
        "token": self.token,
    }
    query_string = urlencode({**self.request_params, **per_request_fields})
    return f"{API_ENDPOINT}?{query_string}"
|
||||||
|
|
||||||
def _extract_phone(self, text: str) -> Optional[str]:
|
def _extract_phone(self, text: str) -> Optional[str]:
|
||||||
|
|||||||
Reference in New Issue
Block a user