from dataclasses import dataclass from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from request.proxy_config import apply_proxy TimeoutType = Union[float, Tuple[float, float]] class RequestClientError(Exception): """请求客户端通用异常。""" class RequestConnectTimeout(RequestClientError): """连接超时。""" class RequestTimeout(RequestClientError): """请求超时。""" class RequestConnectionError(RequestClientError): """连接错误。""" class RequestSSLError(RequestClientError): """SSL 错误。""" @dataclass class ResponseData: # 只保留采集侧稳定需要的字段,避免直接向上层泄露原始 Response 对象 status_code: int text: str url: str headers: Dict[str, str] class RequestsClient: """ 统一 requests 客户端: - 自动应用代理配置 - 支持可选重试 - 对外抛出统一异常类型 """ def __init__( self, headers: Optional[Mapping[str, str]] = None, *, retry_total: int = 0, retry_backoff_factor: float = 0.0, retry_status_forcelist: Optional[Iterable[int]] = None, retry_allowed_methods: Optional[Iterable[str]] = None, default_timeout: Optional[TimeoutType] = None, ) -> None: self._base_headers: Dict[str, str] = dict(headers or {}) self.retry_total = int(retry_total) self.retry_backoff_factor = float(retry_backoff_factor) self.retry_status_forcelist = tuple(retry_status_forcelist or ()) self.retry_allowed_methods = tuple(retry_allowed_methods or ("GET", "POST")) self.default_timeout = default_timeout self._session = self._build_session() def _build_session(self) -> requests.Session: session = requests.Session() # 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰 apply_proxy(session) if self.retry_total > 0: # 适配器级重试:主要处理连接波动与指定状态码的瞬时失败 retries = Retry( total=self.retry_total, backoff_factor=self.retry_backoff_factor, status_forcelist=self.retry_status_forcelist, allowed_methods=frozenset(self.retry_allowed_methods), raise_on_status=False, ) adapter = HTTPAdapter(max_retries=retries) session.mount("https://", adapter) session.mount("http://", adapter) if self._base_headers: # 基础头只在建 session 时注入,业务请求可通过 headers 临时覆盖 session.headers.update(self._base_headers) return session @property def headers(self): return self._session.headers @property def proxies(self) -> Dict[str, str]: return dict(self._session.proxies) def refresh(self) -> None: # 强制重建 session,用于 403/连接异常后的“换连接”场景 self.close() self._session = self._build_session() def close(self) -> None: try: self._session.close() except Exception: pass def clone(self) -> "RequestsClient": # 线程场景建议 clone:复用同配置,但使用独立连接池 clone_client = RequestsClient( headers=dict(self.headers), retry_total=self.retry_total, retry_backoff_factor=self.retry_backoff_factor, retry_status_forcelist=self.retry_status_forcelist, retry_allowed_methods=self.retry_allowed_methods, default_timeout=self.default_timeout, ) return clone_client def request_text( self, method: str, url: str, *, timeout: Optional[TimeoutType] = None, verify: bool = True, headers: Optional[Mapping[str, str]] = None, **kwargs: Any, ) -> ResponseData: response = None # 调用方未传 timeout 时,回退到客户端默认超时 real_timeout = self.default_timeout if timeout is None else timeout try: response = self._session.request( method=method, url=url, timeout=real_timeout, verify=verify, headers=headers, **kwargs, ) return ResponseData( status_code=response.status_code, text=response.text, url=response.url, headers=dict(response.headers), ) # 把 requests 的具体异常统一收敛,业务层无需依赖 requests.exceptions except requests.exceptions.ConnectTimeout as exc: raise RequestConnectTimeout(str(exc)) from exc except requests.exceptions.Timeout as exc: raise RequestTimeout(str(exc)) from exc except requests.exceptions.ConnectionError as exc: raise RequestConnectionError(str(exc)) from exc except requests.exceptions.SSLError as exc: raise RequestSSLError(str(exc)) from exc except requests.exceptions.RequestException as exc: raise RequestClientError(str(exc)) from exc finally: if response is not None: # 立即释放底层连接,避免大量采集时连接堆积 response.close() def get_text(self, url: str, **kwargs: Any) -> ResponseData: return self.request_text("GET", url, **kwargs) def post_text(self, url: str, **kwargs: Any) -> ResponseData: return self.request_text("POST", url, **kwargs)