feat: enhance project configuration and improve data export functionality
- Updated `.gitignore` to streamline ignored files and added logging for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to utilize a session-based approach for HTTP requests, improving error handling and proxy management.
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
This commit is contained in:
@@ -19,6 +19,9 @@ if project_root not in sys.path:
|
||||
from Db import Db
|
||||
|
||||
|
||||
DEFAULT_EXPORT_START_TS = 1772932103
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
|
||||
parser.add_argument(
|
||||
@@ -30,7 +33,10 @@ def parse_args() -> argparse.Namespace:
|
||||
"--start-ts",
|
||||
type=int,
|
||||
default=None,
|
||||
help="create_time 起始时间戳(含),不传时默认取最近7天",
|
||||
help=(
|
||||
"create_time 起始时间戳(含),"
|
||||
f"不传时默认取 {DEFAULT_EXPORT_START_TS} 之后的数据"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--end-ts",
|
||||
@@ -83,9 +89,9 @@ def parse_args() -> argparse.Namespace:
|
||||
|
||||
|
||||
def apply_default_time_filter(args: argparse.Namespace) -> None:
|
||||
# 未显式传时间范围时,默认导出最近7天的数据
|
||||
# 未显式传时间范围时,默认导出指定时间戳之后的数据
|
||||
if args.start_ts is None and args.end_ts is None:
|
||||
args.start_ts = int(time.time()) - 7 * 24 * 3600
|
||||
args.start_ts = DEFAULT_EXPORT_START_TS
|
||||
args.end_ts = 0
|
||||
return
|
||||
if args.start_ts is None:
|
||||
@@ -211,11 +217,10 @@ def export_to_excel(
|
||||
ws = wb.active
|
||||
ws.title = "lawyers"
|
||||
|
||||
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"]
|
||||
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain", "URL"]
|
||||
if include_extra:
|
||||
headers.extend(
|
||||
[
|
||||
"URL",
|
||||
"站点",
|
||||
"create_time",
|
||||
"create_time_text",
|
||||
@@ -270,12 +275,12 @@ def export_to_excel(
|
||||
row.get("city", "") or "",
|
||||
site_name,
|
||||
row.get("domain", "") or "",
|
||||
row.get("url", "") or "",
|
||||
]
|
||||
|
||||
if include_extra:
|
||||
line.extend(
|
||||
[
|
||||
row.get("url", "") or "",
|
||||
row.get("domain", "") or "",
|
||||
row.get("create_time", "") or "",
|
||||
ts_to_text(row.get("create_time")),
|
||||
|
||||
Reference in New Issue
Block a user