chore: initialize lawyers crawler project

This commit is contained in:
hello-dd-code
2026-03-02 00:19:48 +08:00
commit 03847a4b8e
17 changed files with 1928 additions and 0 deletions
+19
View File
@@ -0,0 +1,19 @@
from request.requests_client import (
RequestClientError,
RequestConnectTimeout,
RequestConnectionError,
RequestSSLError,
RequestTimeout,
RequestsClient,
ResponseData,
)
# Public API of the `request` package: re-export the client, its response
# container, and the unified exception hierarchy so callers never need to
# import from the submodule directly.
__all__ = [
    "RequestsClient",
    "ResponseData",
    "RequestClientError",
    "RequestConnectTimeout",
    "RequestTimeout",
    "RequestConnectionError",
    "RequestSSLError",
]
+97
View File
@@ -0,0 +1,97 @@
import json
import os
from typing import Dict, Optional
# Path of the JSON settings file, located next to this module.
CONFIG_PATH = os.path.join(os.path.dirname(__file__), "proxy_settings.json")

# Fallback values used when the config file is absent or unreadable.
# NOTE(review): live tunnel credentials are hardcoded here (and duplicated in
# proxy_settings.json) — consider moving these secrets out of source control.
DEFAULT_CONFIG = {
    "enabled": True,
    "tunnel": "t133.kdltps.com:15818",
    "username": "t16766298346583",
    "password": "zyn0vb20",
    "scheme": "http",
}

# Process-wide guard so report_proxy_status() prints at most once.
_PROXY_STATUS_REPORTED = False
def _normalize_bool(value, default: bool = True) -> bool:
if value is None:
return default
if isinstance(value, bool):
return value
text = str(value).strip().lower()
return text not in ("0", "false", "no", "off", "")
def _load_config() -> Dict[str, str]:
    """Read proxy settings from CONFIG_PATH, layered over DEFAULT_CONFIG.

    Missing file or any read/parse failure falls back to the defaults;
    explicit ``null`` values in the JSON are ignored rather than applied.
    """
    merged = dict(DEFAULT_CONFIG)
    if not os.path.exists(CONFIG_PATH):
        return merged
    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as fh:
            loaded = json.load(fh) or {}
    except Exception as exc:
        print(f"[proxy] 配置读取失败: {exc}, 使用默认配置")
        return merged
    merged.update({key: val for key, val in loaded.items() if val is not None})
    return merged
def report_proxy_status() -> None:
    """Print the effective proxy status, at most once per process."""
    global _PROXY_STATUS_REPORTED
    if _PROXY_STATUS_REPORTED:
        return
    _PROXY_STATUS_REPORTED = True

    config = _load_config()
    if not _normalize_bool(config.get("enabled"), True):
        print("[proxy] disabled by config")
        return

    required = ("tunnel", "username", "password")
    missing = [field for field in required if not config.get(field)]
    if missing:
        print(f"[proxy] enabled but missing fields: {', '.join(missing)}")
    else:
        print(f"[proxy] enabled=True tunnel={config.get('tunnel')}")
def get_proxies() -> Optional[Dict[str, str]]:
    """
    Return the unified proxy mapping, or None when enabled=false in config.
    Settings come from proxy_settings.json only; environment variables are
    never consulted.
    """
    config = _load_config()
    if not _normalize_bool(config.get("enabled"), True):
        return None

    tunnel = str(config.get("tunnel") or "").strip()
    user = str(config.get("username") or "").strip()
    secret = str(config.get("password") or "").strip()
    scheme = str(config.get("scheme") or "http").strip().lower()

    if not (tunnel and user and secret):
        print("[proxy] missing proxy credentials, proxy disabled")
        return None

    endpoint = f"{scheme}://{user}:{secret}@{tunnel}/"
    return {"http": endpoint, "https": endpoint}
def apply_proxy(session) -> Optional[Dict[str, str]]:
    """Install the unified proxy config on a requests.Session.

    Returns the proxy mapping that was applied, or None when disabled.
    """
    report_proxy_status()
    # Block environment-based proxy discovery so only our config applies.
    session.trust_env = False
    proxies = get_proxies()
    if not proxies:
        session.proxies.clear()
        return None
    session.proxies.update(proxies)
    return proxies
# Public API of this module; the remaining helpers are internal.
__all__ = ["get_proxies", "apply_proxy", "report_proxy_status"]
+7
View File
@@ -0,0 +1,7 @@
{
"enabled": true,
"tunnel": "t133.kdltps.com:15818",
"username": "t16766298346583",
"password": "zyn0vb20",
"scheme": "http"
}
+168
View File
@@ -0,0 +1,168 @@
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from request.proxy_config import apply_proxy
# A single timeout in seconds, or a (connect, read) pair as accepted by requests.
TimeoutType = Union[float, Tuple[float, float]]
class RequestClientError(Exception):
    """Base exception for the request client; all others derive from it."""
class RequestConnectTimeout(RequestClientError):
    """Timed out while establishing the connection."""
class RequestTimeout(RequestClientError):
    """Request timed out."""
class RequestConnectionError(RequestClientError):
    """Connection-level failure."""
class RequestSSLError(RequestClientError):
    """SSL/TLS failure."""
@dataclass
class ResponseData:
    """Minimal snapshot of an HTTP response.

    Only the fields the crawling layer reliably needs are kept, so the raw
    requests Response object is never leaked to upper layers.
    """

    status_code: int
    text: str
    url: str
    headers: Dict[str, str]
class RequestsClient:
    """
    Unified requests client:
    - automatically applies the shared proxy configuration
    - supports optional adapter-level retries
    - translates requests exceptions into this module's exception family
      so business code never depends on requests.exceptions
    """

    def __init__(
        self,
        headers: Optional[Mapping[str, str]] = None,
        *,
        retry_total: int = 0,
        retry_backoff_factor: float = 0.0,
        retry_status_forcelist: Optional[Iterable[int]] = None,
        retry_allowed_methods: Optional[Iterable[str]] = None,
        default_timeout: Optional[TimeoutType] = None,
    ) -> None:
        """Create a client.

        Args:
            headers: base headers injected into every request; individual
                calls may override them via the ``headers`` keyword.
            retry_total: number of adapter-level retries; 0 disables retries.
            retry_backoff_factor: urllib3 backoff factor between retries.
            retry_status_forcelist: HTTP status codes that trigger a retry.
            retry_allowed_methods: methods eligible for retry (default
                GET and POST).
            default_timeout: timeout used when a call omits ``timeout``.
        """
        self._base_headers: Dict[str, str] = dict(headers or {})
        self.retry_total = int(retry_total)
        self.retry_backoff_factor = float(retry_backoff_factor)
        self.retry_status_forcelist = tuple(retry_status_forcelist or ())
        self.retry_allowed_methods = tuple(retry_allowed_methods or ("GET", "POST"))
        self.default_timeout = default_timeout
        self._session = self._build_session()

    def _build_session(self) -> requests.Session:
        """Build a Session with proxy, retry and base headers applied."""
        session = requests.Session()
        # Inject the proxy from proxy_settings.json and shield the session
        # from any system environment proxy settings.
        apply_proxy(session)
        if self.retry_total > 0:
            # Adapter-level retries: covers connection flakiness and
            # transient failures on the configured status codes.
            retries = Retry(
                total=self.retry_total,
                backoff_factor=self.retry_backoff_factor,
                status_forcelist=self.retry_status_forcelist,
                allowed_methods=frozenset(self.retry_allowed_methods),
                raise_on_status=False,
            )
            adapter = HTTPAdapter(max_retries=retries)
            session.mount("https://", adapter)
            session.mount("http://", adapter)
        if self._base_headers:
            # Base headers are injected once at session build time;
            # per-request headers can temporarily override them.
            session.headers.update(self._base_headers)
        return session

    @property
    def headers(self):
        # Live view of the session headers (includes requests' defaults).
        return self._session.headers

    @property
    def proxies(self) -> Dict[str, str]:
        # Snapshot copy so callers cannot mutate session state.
        return dict(self._session.proxies)

    def refresh(self) -> None:
        """Force-rebuild the session — the "new connection" move after a
        403 or a connection failure."""
        self.close()
        self._session = self._build_session()

    def close(self) -> None:
        """Close the underlying session; shutdown errors are ignored."""
        try:
            self._session.close()
        except Exception:
            pass

    def clone(self) -> "RequestsClient":
        """Return a same-configuration client with its own connection pool.

        Recommended for threaded use. NOTE(review): this copies the live
        session headers (which include requests' defaults), not just the
        original base headers — confirm this is intended.
        """
        clone_client = RequestsClient(
            headers=dict(self.headers),
            retry_total=self.retry_total,
            retry_backoff_factor=self.retry_backoff_factor,
            retry_status_forcelist=self.retry_status_forcelist,
            retry_allowed_methods=self.retry_allowed_methods,
            default_timeout=self.default_timeout,
        )
        return clone_client

    def request_text(
        self,
        method: str,
        url: str,
        *,
        timeout: Optional[TimeoutType] = None,
        verify: bool = True,
        headers: Optional[Mapping[str, str]] = None,
        **kwargs: Any,
    ) -> ResponseData:
        """Perform a request and return a ResponseData snapshot.

        Raises:
            RequestConnectTimeout, RequestTimeout, RequestSSLError,
            RequestConnectionError, RequestClientError: unified wrappers
            around the corresponding requests exceptions.
        """
        response = None
        # Fall back to the client-level default when the caller omits timeout.
        real_timeout = self.default_timeout if timeout is None else timeout
        try:
            response = self._session.request(
                method=method,
                url=url,
                timeout=real_timeout,
                verify=verify,
                headers=headers,
                **kwargs,
            )
            return ResponseData(
                status_code=response.status_code,
                text=response.text,
                url=response.url,
                headers=dict(response.headers),
            )
        # Collapse concrete requests exceptions into the unified hierarchy.
        # Handler order matters: most-derived classes first. In particular,
        # requests' SSLError subclasses ConnectionError, so it must be
        # caught BEFORE ConnectionError — the previous ordering made the
        # SSLError branch unreachable and RequestSSLError was never raised.
        except requests.exceptions.ConnectTimeout as exc:
            raise RequestConnectTimeout(str(exc)) from exc
        except requests.exceptions.Timeout as exc:
            raise RequestTimeout(str(exc)) from exc
        except requests.exceptions.SSLError as exc:
            raise RequestSSLError(str(exc)) from exc
        except requests.exceptions.ConnectionError as exc:
            raise RequestConnectionError(str(exc)) from exc
        except requests.exceptions.RequestException as exc:
            raise RequestClientError(str(exc)) from exc
        finally:
            if response is not None:
                # Release the underlying connection immediately to avoid
                # connection pile-up during heavy crawling.
                response.close()

    def get_text(self, url: str, **kwargs: Any) -> ResponseData:
        """GET convenience wrapper around request_text."""
        return self.request_text("GET", url, **kwargs)

    def post_text(self, url: str, **kwargs: Any) -> ResponseData:
        """POST convenience wrapper around request_text."""
        return self.request_text("POST", url, **kwargs)