chore: 暂存本地修改
This commit is contained in:
+6
-5
@@ -86,11 +86,12 @@ class DlsSpider:
|
||||
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
|
||||
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
|
||||
try:
|
||||
resp = session.get(
|
||||
test_url,
|
||||
timeout=timeout,
|
||||
headers={"Connection": "close"},
|
||||
)
|
||||
with request_slot():
|
||||
resp = session.get(
|
||||
test_url,
|
||||
timeout=timeout,
|
||||
headers={"Connection": "close"},
|
||||
)
|
||||
print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
|
||||
except Exception as exc:
|
||||
print(f"[proxy] test failed: {exc}")
|
||||
|
||||
@@ -0,0 +1,220 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
# Make the sibling "request" package importable when this module is run as a
# plain script (e.g. `python common_sites/xxx.py`) instead of as a package.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    # Prepend so the project's request/ modules win over same-named ones.
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)
|
||||
|
||||
import requests
|
||||
|
||||
from request.proxy_config import get_proxies, report_proxy_status
|
||||
|
||||
|
||||
@dataclass
class CheckResult:
    """Outcome of one smoke-test request against a single site."""

    site: str                   # human-readable site label
    url: str                    # URL that was requested
    method: str                 # HTTP method used
    ok: bool                    # True when the status code was in [200, 400)
    status_code: Optional[int]  # None when the request raised
    error: str                  # truncated exception text; "" on success
    hint: str                   # anti-bot / rate-limit markers found in the body
    elapsed_ms: int             # wall-clock duration of the attempt
|
||||
|
||||
|
||||
def _now_ms() -> int:
|
||||
return int(time.time() * 1000)
|
||||
|
||||
|
||||
def _short_hint(text: str) -> str:
|
||||
s = (text or "").strip().lower()
|
||||
flags = []
|
||||
for key, label in [
|
||||
("403", "403"),
|
||||
("429", "429"),
|
||||
("captcha", "captcha"),
|
||||
("验证码", "captcha_cn"),
|
||||
("人机", "bot_check_cn"),
|
||||
("access denied", "access_denied"),
|
||||
("forbidden", "forbidden"),
|
||||
("too many requests", "rate_limited"),
|
||||
("cloudflare", "cloudflare"),
|
||||
("challenge", "challenge"),
|
||||
]:
|
||||
if key in s:
|
||||
flags.append(label)
|
||||
return ",".join(flags)[:120]
|
||||
|
||||
|
||||
def _build_session() -> requests.Session:
    """Create a requests session pre-configured for the smoke tests.

    Prints the project proxy status, ignores OS proxy environment
    variables, applies the configured proxies (if any), and installs
    browser-like default headers.
    """
    report_proxy_status()
    session = requests.Session()
    # Never pick up HTTP(S)_PROXY from the environment; proxying is
    # controlled solely by the project config below.
    session.trust_env = False
    proxies = get_proxies()
    if proxies:
        session.proxies.update(proxies)
    else:
        session.proxies.clear()
    session.headers.update(
        {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/136.0.0.0 Safari/537.36"
            ),
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
        }
    )
    return session
|
||||
|
||||
|
||||
def _check(
    session: requests.Session,
    *,
    site: str,
    method: str,
    url: str,
    timeout: Tuple[float, float] = (10.0, 15.0),
    headers: Optional[Dict[str, str]] = None,
    data: Optional[Dict[str, Any]] = None,
) -> CheckResult:
    """Fire one request and capture the outcome as a CheckResult.

    Never raises: transport failures land in ``error`` with
    ``status_code=None``, and the first 1200 chars of a successful
    response body are scanned for anti-bot markers via _short_hint().

    Fix over the original: the finally block used to call
    ``resp.close()`` on a name that is unbound when ``session.request``
    itself raises, relying on the broad ``except`` to swallow the
    NameError. ``resp`` is now initialized up front and closed only
    when it was actually assigned.
    """
    start = _now_ms()
    resp = None  # assigned only if session.request() returns
    try:
        resp = session.request(
            method=method,
            url=url,
            timeout=timeout,
            # NOTE(review): TLS verification is deliberately disabled for
            # this smoke test; do not copy this into production requests.
            verify=False,
            headers=headers,
            data=data,
        )
        text = resp.text or ""
        status = resp.status_code
        hint = _short_hint(text[:1200])
        ok = 200 <= status < 400
        return CheckResult(
            site=site,
            url=url,
            method=method,
            ok=ok,
            status_code=status,
            error="",
            hint=hint,
            elapsed_ms=_now_ms() - start,
        )
    except Exception as exc:
        return CheckResult(
            site=site,
            url=url,
            method=method,
            ok=False,
            status_code=None,
            error=str(exc)[:200],
            hint="",
            elapsed_ms=_now_ms() - start,
        )
    finally:
        if resp is not None:
            try:
                resp.close()
            except Exception:
                # Best effort: a failed close must not mask the result.
                pass
|
||||
|
||||
|
||||
def _tests() -> List[Dict[str, Any]]:
|
||||
# 每个站点选一个“代表性列表/API”作为冒烟:能快速暴露 403/验证码/限频。
|
||||
return [
|
||||
{
|
||||
"site": "大律师(m站)",
|
||||
"method": "GET",
|
||||
"url": "https://m.maxlaw.cn/",
|
||||
},
|
||||
{
|
||||
"site": "大律师(PC站)",
|
||||
"method": "GET",
|
||||
"url": "https://www.maxlaw.cn/law/beijing?page=1",
|
||||
"headers": {"Referer": "https://www.maxlaw.cn/"},
|
||||
},
|
||||
{
|
||||
"site": "找法网(m站)",
|
||||
"method": "GET",
|
||||
"url": "https://m.findlaw.cn/beijing/q_lawyer/p1?ajax=1&order=0&sex=-1",
|
||||
"headers": {
|
||||
"Referer": "https://m.findlaw.cn/beijing/q_lawyer/",
|
||||
"X-Requested-With": "XMLHttpRequest",
|
||||
"Accept": "application/json, text/javascript, */*; q=0.01",
|
||||
},
|
||||
},
|
||||
{
|
||||
"site": "法律快车(m站)",
|
||||
"method": "GET",
|
||||
"url": "https://m.lawtime.cn/beijing/lawyer/?page=1",
|
||||
},
|
||||
{
|
||||
"site": "律图(m站)",
|
||||
"method": "POST",
|
||||
"url": "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/",
|
||||
"data": {
|
||||
"RegionId": "110100", # 北京市
|
||||
"OnlyData": "true",
|
||||
"LawyerRecommendRequest[AreaId]": "110100",
|
||||
"LawyerRecommendRequest[PageIndex]": "1",
|
||||
"LawyerRecommendRequest[PageSize]": "10",
|
||||
"LawyerRecommendRequest[OrderType]": "0",
|
||||
"LawyerRecommendRequest[Type]": "1",
|
||||
},
|
||||
},
|
||||
{
|
||||
"site": "华律(m站)",
|
||||
"method": "POST",
|
||||
"url": "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/",
|
||||
"data": {
|
||||
"pid": "110000", # 北京
|
||||
"cid": "110100", # 北京市
|
||||
"page": "1",
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def main() -> int:
    """Run the per-site smoke checks and print a JSON summary line.

    Always returns 0: the point is to report per-site status, not to
    fail the pipeline.
    """
    mode = os.getenv("PROXY_ENABLED")
    print(f"[smoke] PROXY_ENABLED={mode!r}")
    session = _build_session()
    results: List[CheckResult] = []
    for spec in _tests():
        result = _check(
            session,
            site=spec["site"],
            method=spec["method"],
            url=spec["url"],
            headers=spec.get("headers"),
            data=spec.get("data"),
        )
        results.append(result)
        print(
            f"[smoke] {result.site} {result.method} {result.status_code} ok={result.ok} "
            f"{result.elapsed_ms}ms hint={result.hint or '-'} err={result.error or '-'}"
        )
        # Small pause between sites to stay polite.
        time.sleep(0.3)

    summary = {
        "proxy_enabled": mode,
        "results": [result.__dict__ for result in results],
    }
    print("[smoke] summary_json=" + json.dumps(summary, ensure_ascii=False))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return code, same as the
    # bare `raise SystemExit(main())` form.
    sys.exit(main())
|
||||
|
||||
@@ -6,11 +6,27 @@ cd "$(dirname "$0")"
|
||||
|
||||
echo "使用 request/proxy_settings.json 读取代理配置"
|
||||
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
|
||||
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"
|
||||
|
||||
is_job_running() {
    # Print every running process whose command line contains the given
    # script name as a standalone token; print nothing when none match.
    # `|| true` keeps `set -e` from aborting when pgrep finds nothing.
    local script="$1"
    local script_regex="${script//./\\.}"   # escape dots for the regex
    pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
}
|
||||
|
||||
start_job() {
|
||||
local script="$1"
|
||||
local log_file="$2"
|
||||
local label="$3"
|
||||
local existing
|
||||
|
||||
existing="$(is_job_running "${script}")"
|
||||
if [[ -n "${existing}" ]]; then
|
||||
echo "跳过 ${label}: ${script} 已在运行"
|
||||
echo "${existing}" | head -n 1
|
||||
return 0
|
||||
fi
|
||||
|
||||
nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
|
||||
echo "启动 ${label}: ${script} -> ${log_file}"
|
||||
sleep 1
|
||||
|
||||
Executable
+48
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Run from the script's own directory so relative paths resolve.
cd "$(dirname "$0")"

# Direct-connection mode: never route through proxy IPs.
export PROXY_ENABLED=0

# Stay conservative when hitting sites directly, to lower the chance of
# tripping temporary rate limiting.
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"

is_job_running() {
    # Print matching processes for the given spider script, if any.
    local script="$1"
    local script_regex="${script//./\\.}"   # escape dots for the regex
    pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
}

start_job() {
    # Launch one spider in the background unless it is already running.
    local script="$1"
    local log_file="$2"
    local label="$3"
    local existing

    existing="$(is_job_running "${script}")"
    if [[ -n "${existing}" ]]; then
        echo "跳过 ${label}: ${script} 已在运行"
        echo "${existing}" | head -n 1
        return 0
    fi

    nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
    echo "启动 ${label}: ${script} -> ${log_file}"
    sleep 1
}

echo "直连模式(PROXY_ENABLED=0),每周两次建议用 cron 调度"
echo "当前归入直连组:大律师(m/PC)、华律、律图"

# Direct-first sites — no strong anti-bot measures observed so far:
# - maxlaw (mobile/PC), 66law (hualv), 64365 (lvtu)
start_job "dls.py" "direct_dls.log" "大律师(直连)"
start_job "dls_pc.py" "direct_dls_pc.log" "大律师PC站(直连)"
start_job "hualv.py" "direct_hualv.log" "华律(直连)"
start_job "six4365.py" "direct_six4365.log" "律图(直连)"
|
||||
Executable
+53
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Run from the script's own directory so relative paths resolve.
cd "$(dirname "$0")"

# Proxy mode: force proxy usage for sites that rate-limit or block easily.
export PROXY_ENABLED=1

# Default to conservative throughput so we neither exhaust the proxy pool
# nor trip anti-bot protections.
export PROXY_MAX_REQUESTS_PER_SECOND="${PROXY_MAX_REQUESTS_PER_SECOND:-5}"
export PROXY_MAX_CONCURRENT_REQUESTS="${PROXY_MAX_CONCURRENT_REQUESTS:-5}"

# Optional: enable proxy connectivity test output (some spiders print the
# test result / proxy status when this is set).
export PROXY_TEST="${PROXY_TEST:-0}"

is_job_running() {
    # Print matching processes for the given spider script, if any.
    local script="$1"
    local script_regex="${script//./\\.}"   # escape dots for the regex
    pgrep -af "(^|[[:space:]/])${script_regex}([[:space:]]|$)" || true
}

start_job() {
    # Launch one spider in the background unless it is already running.
    local script="$1"
    local log_file="$2"
    local label="$3"
    local existing

    existing="$(is_job_running "${script}")"
    if [[ -n "${existing}" ]]; then
        echo "跳过 ${label}: ${script} 已在运行"
        echo "${existing}" | head -n 1
        return 0
    fi

    nohup python "../common_sites/${script}" > "${log_file}" 2>&1 &
    echo "启动 ${label}: ${script} -> ${log_file}"
    sleep 1
}

echo "代理模式(PROXY_ENABLED=1),每周一次建议用 cron 调度"
echo "代理配置读取自 request/proxy_settings.json"
echo "每周一次代理任务 = 全量采集所有站点"

# The weekly proxy run is a full crawl:
# - heavily protected sites: findlaw, lawtime
# - everything else runs too, so there is at least one full refresh per week
start_job "dls.py" "proxy_dls.log" "大律师(代理全量)"
start_job "dls_pc.py" "proxy_dls_pc.log" "大律师PC站(代理全量)"
start_job "findlaw.py" "proxy_findlaw.log" "找法网(代理)"
start_job "lawtime.py" "proxy_lawtime.log" "法律快车(代理)"
start_job "hualv.py" "proxy_hualv.log" "华律(代理全量)"
start_job "six4365.py" "proxy_six4365.log" "律图(代理全量)"
|
||||
Reference in New Issue
Block a user