重构采集脚本并新增按时间导出Excel
- 统一五个站点采集逻辑与启动脚本
- 新增 dls_fresh 采集流程与日志优化
- 新增 export_lawyers_excel 按时间条件导出
- 默认导出近7天并支持扩展字段解析
- 整理 .gitignore,忽略 data/logs 本地产物
This commit is contained in:
@@ -51,6 +51,7 @@ class RequestsClient:
|
||||
self,
|
||||
headers: Optional[Mapping[str, str]] = None,
|
||||
*,
|
||||
use_proxy: bool = True,
|
||||
retry_total: int = 0,
|
||||
retry_backoff_factor: float = 0.0,
|
||||
retry_status_forcelist: Optional[Iterable[int]] = None,
|
||||
@@ -58,6 +59,7 @@ class RequestsClient:
|
||||
default_timeout: Optional[TimeoutType] = None,
|
||||
) -> None:
|
||||
self._base_headers: Dict[str, str] = dict(headers or {})
|
||||
self.use_proxy = bool(use_proxy)
|
||||
self.retry_total = int(retry_total)
|
||||
self.retry_backoff_factor = float(retry_backoff_factor)
|
||||
self.retry_status_forcelist = tuple(retry_status_forcelist or ())
|
||||
@@ -67,8 +69,13 @@ class RequestsClient:
|
||||
|
||||
def _build_session(self) -> requests.Session:
|
||||
session = requests.Session()
|
||||
# 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
|
||||
apply_proxy(session)
|
||||
if self.use_proxy:
|
||||
# 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
|
||||
apply_proxy(session)
|
||||
else:
|
||||
# 强制直连:不读取环境代理,不走配置文件代理
|
||||
session.trust_env = False
|
||||
session.proxies.clear()
|
||||
if self.retry_total > 0:
|
||||
# 适配器级重试:主要处理连接波动与指定状态码的瞬时失败
|
||||
retries = Retry(
|
||||
@@ -109,6 +116,7 @@ class RequestsClient:
|
||||
# 线程场景建议 clone:复用同配置,但使用独立连接池
|
||||
clone_client = RequestsClient(
|
||||
headers=dict(self.headers),
|
||||
use_proxy=self.use_proxy,
|
||||
retry_total=self.retry_total,
|
||||
retry_backoff_factor=self.retry_backoff_factor,
|
||||
retry_status_forcelist=self.retry_status_forcelist,
|
||||
|
||||
Reference in New Issue
Block a user