Files
lawyers/request/requests_client.py
T
2026-03-02 00:19:48 +08:00

169 lines
5.7 KiB
Python

from dataclasses import dataclass
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from request.proxy_config import apply_proxy
TimeoutType = Union[float, Tuple[float, float]]
class RequestClientError(Exception):
"""请求客户端通用异常。"""
class RequestConnectTimeout(RequestClientError):
"""连接超时。"""
class RequestTimeout(RequestClientError):
"""请求超时。"""
class RequestConnectionError(RequestClientError):
"""连接错误。"""
class RequestSSLError(RequestClientError):
"""SSL 错误。"""
@dataclass
class ResponseData:
# 只保留采集侧稳定需要的字段,避免直接向上层泄露原始 Response 对象
status_code: int
text: str
url: str
headers: Dict[str, str]
class RequestsClient:
"""
统一 requests 客户端:
- 自动应用代理配置
- 支持可选重试
- 对外抛出统一异常类型
"""
def __init__(
self,
headers: Optional[Mapping[str, str]] = None,
*,
retry_total: int = 0,
retry_backoff_factor: float = 0.0,
retry_status_forcelist: Optional[Iterable[int]] = None,
retry_allowed_methods: Optional[Iterable[str]] = None,
default_timeout: Optional[TimeoutType] = None,
) -> None:
self._base_headers: Dict[str, str] = dict(headers or {})
self.retry_total = int(retry_total)
self.retry_backoff_factor = float(retry_backoff_factor)
self.retry_status_forcelist = tuple(retry_status_forcelist or ())
self.retry_allowed_methods = tuple(retry_allowed_methods or ("GET", "POST"))
self.default_timeout = default_timeout
self._session = self._build_session()
def _build_session(self) -> requests.Session:
session = requests.Session()
# 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
apply_proxy(session)
if self.retry_total > 0:
# 适配器级重试:主要处理连接波动与指定状态码的瞬时失败
retries = Retry(
total=self.retry_total,
backoff_factor=self.retry_backoff_factor,
status_forcelist=self.retry_status_forcelist,
allowed_methods=frozenset(self.retry_allowed_methods),
raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retries)
session.mount("https://", adapter)
session.mount("http://", adapter)
if self._base_headers:
# 基础头只在建 session 时注入,业务请求可通过 headers 临时覆盖
session.headers.update(self._base_headers)
return session
@property
def headers(self):
return self._session.headers
@property
def proxies(self) -> Dict[str, str]:
return dict(self._session.proxies)
def refresh(self) -> None:
# 强制重建 session,用于 403/连接异常后的“换连接”场景
self.close()
self._session = self._build_session()
def close(self) -> None:
try:
self._session.close()
except Exception:
pass
def clone(self) -> "RequestsClient":
# 线程场景建议 clone:复用同配置,但使用独立连接池
clone_client = RequestsClient(
headers=dict(self.headers),
retry_total=self.retry_total,
retry_backoff_factor=self.retry_backoff_factor,
retry_status_forcelist=self.retry_status_forcelist,
retry_allowed_methods=self.retry_allowed_methods,
default_timeout=self.default_timeout,
)
return clone_client
def request_text(
self,
method: str,
url: str,
*,
timeout: Optional[TimeoutType] = None,
verify: bool = True,
headers: Optional[Mapping[str, str]] = None,
**kwargs: Any,
) -> ResponseData:
response = None
# 调用方未传 timeout 时,回退到客户端默认超时
real_timeout = self.default_timeout if timeout is None else timeout
try:
response = self._session.request(
method=method,
url=url,
timeout=real_timeout,
verify=verify,
headers=headers,
**kwargs,
)
return ResponseData(
status_code=response.status_code,
text=response.text,
url=response.url,
headers=dict(response.headers),
)
# 把 requests 的具体异常统一收敛,业务层无需依赖 requests.exceptions
except requests.exceptions.ConnectTimeout as exc:
raise RequestConnectTimeout(str(exc)) from exc
except requests.exceptions.Timeout as exc:
raise RequestTimeout(str(exc)) from exc
except requests.exceptions.ConnectionError as exc:
raise RequestConnectionError(str(exc)) from exc
except requests.exceptions.SSLError as exc:
raise RequestSSLError(str(exc)) from exc
except requests.exceptions.RequestException as exc:
raise RequestClientError(str(exc)) from exc
finally:
if response is not None:
# 立即释放底层连接,避免大量采集时连接堆积
response.close()
def get_text(self, url: str, **kwargs: Any) -> ResponseData:
return self.request_text("GET", url, **kwargs)
def post_text(self, url: str, **kwargs: Any) -> ResponseData:
return self.request_text("POST", url, **kwargs)