重构采集脚本并新增按时间导出Excel

- 统一五个站点采集逻辑与启动脚本
- 新增 dls_fresh 采集流程与日志优化
- 新增 export_lawyers_excel 按时间条件导出
- 默认导出近7天并支持扩展字段解析
- 整理 .gitignore，忽略 data/logs 本地产物
This commit is contained in:
hello-dd-code
2026-03-02 11:46:05 +08:00
parent 03847a4b8e
commit 19cf9ce901
12 changed files with 3545 additions and 1059 deletions
+10 -2
View File
@@ -51,6 +51,7 @@ class RequestsClient:
self,
headers: Optional[Mapping[str, str]] = None,
*,
use_proxy: bool = True,
retry_total: int = 0,
retry_backoff_factor: float = 0.0,
retry_status_forcelist: Optional[Iterable[int]] = None,
@@ -58,6 +59,7 @@ class RequestsClient:
default_timeout: Optional[TimeoutType] = None,
) -> None:
self._base_headers: Dict[str, str] = dict(headers or {})
self.use_proxy = bool(use_proxy)
self.retry_total = int(retry_total)
self.retry_backoff_factor = float(retry_backoff_factor)
self.retry_status_forcelist = tuple(retry_status_forcelist or ())
@@ -67,8 +69,13 @@ class RequestsClient:
def _build_session(self) -> requests.Session:
session = requests.Session()
# 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
apply_proxy(session)
if self.use_proxy:
# 统一从 proxy_settings.json 注入代理,并屏蔽系统环境代理干扰
apply_proxy(session)
else:
# 强制直连:不读取环境代理,不走配置文件代理
session.trust_env = False
session.proxies.clear()
if self.retry_total > 0:
# 适配器级重试:主要处理连接波动与指定状态码的瞬时失败
retries = Retry(
@@ -109,6 +116,7 @@ class RequestsClient:
# 线程场景建议 clone:复用同配置,但使用独立连接池
clone_client = RequestsClient(
headers=dict(self.headers),
use_proxy=self.use_proxy,
retry_total=self.retry_total,
retry_backoff_factor=self.retry_backoff_factor,
retry_status_forcelist=self.retry_status_forcelist,