19cf9ce901
- 统一五个站点采集逻辑与启动脚本\n- 新增 dls_fresh 采集流程与日志优化\n- 新增 export_lawyers_excel 按时间条件导出\n- 默认导出近7天并支持扩展字段解析\n- 整理 .gitignore,忽略 data/logs 本地产物
177 lines
6.0 KiB
Python
177 lines
6.0 KiB
Python
from dataclasses import dataclass
|
|
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
|
|
|
|
import requests
|
|
from requests.adapters import HTTPAdapter
|
|
from urllib3.util.retry import Retry
|
|
|
|
from request.proxy_config import apply_proxy
|
|
|
|
TimeoutType = Union[float, Tuple[float, float]]
|
|
|
|
|
|
class RequestClientError(Exception):
    """Generic base exception for the request client."""
|
|
|
|
|
|
class RequestConnectTimeout(RequestClientError):
    """Connection timed out while establishing the connection."""
|
|
|
|
|
|
class RequestTimeout(RequestClientError):
    """Request timed out."""
|
|
|
|
|
|
class RequestConnectionError(RequestClientError):
    """Connection error."""
|
|
|
|
|
|
class RequestSSLError(RequestClientError):
    """SSL error."""
|
|
|
|
|
|
@dataclass
class ResponseData:
    """Detached snapshot of an HTTP response.

    Only the fields the scraping side stably needs are kept, to avoid
    leaking the raw requests Response object to upper layers.
    """

    # HTTP status code of the response.
    status_code: int
    # Response body decoded to text (taken from ``response.text``).
    text: str
    # URL of the response as reported by requests.
    url: str
    # Response headers copied into a plain dict.
    headers: Dict[str, str]
|
|
|
|
|
|
class RequestsClient:
    """
    Unified requests client:

    - Automatically applies the shared proxy configuration.
    - Supports optional adapter-level retries.
    - Raises this module's unified exception types instead of
      ``requests.exceptions.*``, so business code carries no
      dependency on requests internals.
    """

    def __init__(
        self,
        headers: Optional[Mapping[str, str]] = None,
        *,
        use_proxy: bool = True,
        retry_total: int = 0,
        retry_backoff_factor: float = 0.0,
        retry_status_forcelist: Optional[Iterable[int]] = None,
        retry_allowed_methods: Optional[Iterable[str]] = None,
        default_timeout: Optional[TimeoutType] = None,
    ) -> None:
        """Create a client.

        Args:
            headers: Base headers injected into every session this client
                builds; per-request ``headers`` can temporarily override them.
            use_proxy: When True, inject proxies via ``apply_proxy``;
                when False, force a direct connection.
            retry_total: Total adapter-level retries; 0 disables retry.
            retry_backoff_factor: urllib3 backoff factor between retries.
            retry_status_forcelist: HTTP status codes that trigger a retry.
            retry_allowed_methods: HTTP methods eligible for retry
                (defaults to GET and POST).
            default_timeout: Fallback timeout used when a request does not
                pass its own ``timeout``.
        """
        self._base_headers: Dict[str, str] = dict(headers or {})
        self.use_proxy = bool(use_proxy)
        self.retry_total = int(retry_total)
        self.retry_backoff_factor = float(retry_backoff_factor)
        self.retry_status_forcelist = tuple(retry_status_forcelist or ())
        self.retry_allowed_methods = tuple(retry_allowed_methods or ("GET", "POST"))
        self.default_timeout = default_timeout
        self._session = self._build_session()

    def _build_session(self) -> requests.Session:
        """Build a fresh session with proxy, retry and base headers applied."""
        session = requests.Session()
        if self.use_proxy:
            # Inject proxies from the shared proxy settings and shield the
            # session from interference by system environment proxies.
            apply_proxy(session)
        else:
            # Force direct connection: ignore both environment proxies and
            # the configured proxy file.
            session.trust_env = False
            session.proxies.clear()
        if self.retry_total > 0:
            # Adapter-level retry: mainly covers connection flakiness and
            # transient failures on the configured status codes.
            retries = Retry(
                total=self.retry_total,
                backoff_factor=self.retry_backoff_factor,
                status_forcelist=self.retry_status_forcelist,
                allowed_methods=frozenset(self.retry_allowed_methods),
                raise_on_status=False,
            )
            adapter = HTTPAdapter(max_retries=retries)
            session.mount("https://", adapter)
            session.mount("http://", adapter)
        if self._base_headers:
            # Base headers are injected only when the session is built;
            # individual requests may override them via ``headers``.
            session.headers.update(self._base_headers)
        return session

    @property
    def headers(self):
        """Live header mapping of the underlying session (mutable)."""
        return self._session.headers

    @property
    def proxies(self) -> Dict[str, str]:
        """A copy of the session's proxy mapping."""
        return dict(self._session.proxies)

    def refresh(self) -> None:
        """Force-rebuild the session.

        Intended for "switch connection" scenarios after a 403 or a
        connection-level failure.
        """
        self.close()
        self._session = self._build_session()

    def close(self) -> None:
        """Close the underlying session, swallowing any close-time error."""
        try:
            self._session.close()
        except Exception:
            # Best-effort close: a failure here must not mask the caller's
            # own error handling.
            pass

    def clone(self) -> "RequestsClient":
        """Return a new client with identical configuration.

        Recommended for threaded use: same settings, but an independent
        session / connection pool.
        """
        return RequestsClient(
            headers=dict(self.headers),
            use_proxy=self.use_proxy,
            retry_total=self.retry_total,
            retry_backoff_factor=self.retry_backoff_factor,
            retry_status_forcelist=self.retry_status_forcelist,
            retry_allowed_methods=self.retry_allowed_methods,
            default_timeout=self.default_timeout,
        )

    def request_text(
        self,
        method: str,
        url: str,
        *,
        timeout: Optional[TimeoutType] = None,
        verify: bool = True,
        headers: Optional[Mapping[str, str]] = None,
        **kwargs: Any,
    ) -> ResponseData:
        """Perform a request and return a detached ``ResponseData`` snapshot.

        Raises:
            RequestConnectTimeout: connect-phase timeout.
            RequestTimeout: read/overall timeout.
            RequestSSLError: TLS/SSL failure.
            RequestConnectionError: other connection-level failure.
            RequestClientError: any other requests-level error.
        """
        response = None
        # Fall back to the client-level default timeout when the caller
        # did not pass one.
        real_timeout = self.default_timeout if timeout is None else timeout
        try:
            response = self._session.request(
                method=method,
                url=url,
                timeout=real_timeout,
                verify=verify,
                headers=headers,
                **kwargs,
            )
            return ResponseData(
                status_code=response.status_code,
                text=response.text,
                url=response.url,
                headers=dict(response.headers),
            )
        # Collapse requests' concrete exceptions into this module's unified
        # types so business code need not depend on requests.exceptions.
        # Handler order matters — most specific subclass first. In
        # particular, requests' SSLError is a subclass of ConnectionError,
        # so it must be caught BEFORE ConnectionError (the previous ordering
        # made the SSLError branch unreachable and RequestSSLError was
        # never raised).
        except requests.exceptions.ConnectTimeout as exc:
            raise RequestConnectTimeout(str(exc)) from exc
        except requests.exceptions.Timeout as exc:
            raise RequestTimeout(str(exc)) from exc
        except requests.exceptions.SSLError as exc:
            raise RequestSSLError(str(exc)) from exc
        except requests.exceptions.ConnectionError as exc:
            raise RequestConnectionError(str(exc)) from exc
        except requests.exceptions.RequestException as exc:
            raise RequestClientError(str(exc)) from exc
        finally:
            if response is not None:
                # Release the underlying connection immediately to avoid
                # connection pile-up during heavy scraping.
                response.close()

    def get_text(self, url: str, **kwargs: Any) -> ResponseData:
        """GET convenience wrapper around ``request_text``."""
        return self.request_text("GET", url, **kwargs)

    def post_text(self, url: str, **kwargs: Any) -> ResponseData:
        """POST convenience wrapper around ``request_text``."""
        return self.request_text("POST", url, **kwargs)
|