feat: enhance project configuration and improve data export functionality

- Updated `.gitignore` to streamline ignored files and added logging for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to utilize a session-based approach for HTTP requests, improving error handling and proxy management.
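- Updated `export_lawyers_excel.py` to default the export start time to a fixed timestamp (`DEFAULT_EXPORT_START_TS`) instead of the last 7 days, and to always include the `URL` column in the exported sheet.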
This commit is contained in:
hello-dd-code
2026-03-18 10:02:25 +08:00
parent c2b77975c1
commit 38e7c284e8
14 changed files with 1665 additions and 3004 deletions
+11 -6
View File
@@ -19,6 +19,9 @@ if project_root not in sys.path:
from Db import Db
DEFAULT_EXPORT_START_TS = 1772932103
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
parser.add_argument(
@@ -30,7 +33,10 @@ def parse_args() -> argparse.Namespace:
"--start-ts",
type=int,
default=None,
help="create_time 起始时间戳(含),不传时默认取最近7天",
help=(
"create_time 起始时间戳(含),"
f"不传时默认取 {DEFAULT_EXPORT_START_TS} 之后的数据"
),
)
parser.add_argument(
"--end-ts",
@@ -83,9 +89,9 @@ def parse_args() -> argparse.Namespace:
def apply_default_time_filter(args: argparse.Namespace) -> None:
# 未显式传时间范围时,默认导出最近7天的数据
# 未显式传时间范围时,默认导出指定时间戳之后的数据
if args.start_ts is None and args.end_ts is None:
args.start_ts = int(time.time()) - 7 * 24 * 3600
args.start_ts = DEFAULT_EXPORT_START_TS
args.end_ts = 0
return
if args.start_ts is None:
@@ -211,11 +217,10 @@ def export_to_excel(
ws = wb.active
ws.title = "lawyers"
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"]
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain", "URL"]
if include_extra:
headers.extend(
[
"URL",
"站点",
"create_time",
"create_time_text",
@@ -270,12 +275,12 @@ def export_to_excel(
row.get("city", "") or "",
site_name,
row.get("domain", "") or "",
row.get("url", "") or "",
]
if include_extra:
line.extend(
[
row.get("url", "") or "",
row.get("domain", "") or "",
row.get("create_time", "") or "",
ts_to_text(row.get("create_time")),