重构采集脚本并新增按时间导出Excel
- 统一五个站点采集逻辑与启动脚本
- 新增 dls_fresh 采集流程与日志优化
- 新增 export_lawyers_excel 按时间条件导出
- 默认导出近7天并支持扩展字段解析
- 整理 .gitignore,忽略 data/logs 本地产物
This commit is contained in:
+75
-8
@@ -1,13 +1,80 @@
|
||||
#!/usr/bin/env bash
|
||||
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail
|
||||
|
||||
# Switch to the script's directory so that relative paths resolve correctly.
|
||||
cd "$(dirname "$0")"
|
||||
# NOTE(review): this view appears to interleave removed and added diff lines.
# If the `cd` above and the assignment below both run, a relative "$0" is
# resolved a second time from the already-changed working directory —
# confirm which of the two lines is current.
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
# Absolute path of the parent directory of SCRIPT_DIR.
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||
# Directory that receives the per-crawler log files.
LOG_DIR="${PROJECT_ROOT}/logs"
|
||||
# Directory used as the default location for crawler output.
DATA_DIR="${PROJECT_ROOT}/data"
|
||||
|
||||
# Informational message only (it says the proxy configuration is read from
# request/proxy_settings.json); nothing in the visible script reads that file.
echo "使用 request/proxy_settings.json 读取代理配置"
|
||||
# Create the log and data directories when they do not exist yet.
mkdir -p "${LOG_DIR}" "${DATA_DIR}"
|
||||
|
||||
# Starts each crawler in the background with nohup, using a script path
# relative to the current directory and writing stdout/stderr to a
# per-crawler log file in the current directory.
# NOTE(review): these look like the removed side of the diff — the same
# crawlers are started again further down through run_bg/run_fg; confirm
# these lines are no longer part of the script.
nohup python3 dls.py > dls.log 2>&1 & # Dalvshi
|
||||
nohup python3 findlaw.py > findlaw.log 2>&1 & # Findlaw
|
||||
nohup python3 lawtime.py > lawtime.log 2>&1 & # Lawtime
|
||||
nohup python3 six4365.py > six4365.log 2>&1 & # Lvtu (64365)
|
||||
nohup python3 hualv.py > hualv.log 2>&1 & # Hualv
|
||||
# Choose the interpreter: prefer the project's virtualenv when it provides an
# executable python, otherwise fall back to whatever `python3` is on PATH.
if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
  PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
else
  PYTHON_BIN="python3"
fi

# Launch mode, overridable from the environment: parallel | sequential.
RUN_MODE="${RUN_MODE:-parallel}"

# Start-up banner describing the effective settings.
echo "[start] project=${PROJECT_ROOT}"
echo "[start] python=${PYTHON_BIN}"
echo "[start] mode=${RUN_MODE}"
echo "[start] proxy=request/proxy_settings.json"
|
||||
|
||||
# Dalvshi crawler (new-structure scrape + database write): every setting below
# can be overridden through an environment variable of the same name.
DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
DLS_SLEEP="${DLS_SLEEP:-0.2}"
DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"

# Command-line arguments for dls_fresh.py, kept as an array so values that
# contain spaces survive as single words.
DLS_EXTRA_ARGS=()

# A value of "0" means the flag is not passed at all.
if [[ "${DLS_MAX_CITIES}" != "0" ]]; then
  DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
fi
if [[ "${DLS_MAX_PAGES}" != "0" ]]; then
  DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
fi
# The city filter is only passed when it is non-empty.
if [[ -n "${DLS_CITY_FILTER}" ]]; then
  DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
fi

# These two are always passed.
DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")

# Optional boolean switches, enabled by setting the variable to "1".
if [[ "${DLS_DIRECT:-0}" == "1" ]]; then
  DLS_EXTRA_ARGS+=(--direct)
fi
if [[ "${DLS_NO_DB:-0}" == "1" ]]; then
  DLS_EXTRA_ARGS+=(--no-db)
fi
|
||||
|
||||
#######################################
# Start a command in the background, detached with nohup.
# Globals:   LOG_DIR (read)
# Arguments: $1 - job name, used for the log file name
#            $2.. - command and its arguments
# Outputs:   one "[start]" line with the pid and log path on stdout;
#            the command's stdout and stderr go to ${LOG_DIR}/<name>.log,
#            which is truncated on every start
#######################################
run_bg() {
  local name="$1"
  shift
  local logfile="${LOG_DIR}/${name}.log"
  # PYTHONUNBUFFERED=1 is exported to the child so its output reaches the
  # log file without Python-side buffering.
  nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 &
  local pid=$!
  echo "[start] ${name} pid=${pid} log=${logfile}"
}
|
||||
|
||||
#######################################
# Run a command in the foreground and wait for it to finish.
# Globals:   LOG_DIR (read)
# Arguments: $1 - job name, used for the log file name
#            $2.. - command and its arguments
# Outputs:   one "[start]" line on stdout; on failure one "[fail]" line on
#            stderr; the command's stdout and stderr go to
#            ${LOG_DIR}/<name>.log, which is truncated on every run
# Returns:   the command's exit status
#######################################
run_fg() {
  local name="$1"
  shift
  local logfile="${LOG_DIR}/${name}.log"
  local status=0
  echo "[start] ${name} fg log=${logfile}"
  # Capture the status instead of letting `set -e` kill the script right here,
  # so the failure can be reported on the console before it is propagated.
  env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 || status=$?
  if (( status != 0 )); then
    printf '[fail] %s exit=%d log=%s\n' "${name}" "${status}" "${logfile}" >&2
  fi
  # A non-zero return still aborts a caller running under `set -e`.
  return "${status}"
}
|
||||
|
||||
# Pick the launcher for the requested mode. Any value other than
# "sequential" runs the crawlers in the background, as before; an
# unrecognised value is now reported so a typo does not go unnoticed.
case "${RUN_MODE}" in
  sequential)
    launcher="run_fg"
    ;;
  parallel)
    launcher="run_bg"
    ;;
  *)
    printf '[warn] unknown RUN_MODE=%s, falling back to parallel\n' "${RUN_MODE}" >&2
    launcher="run_bg"
    ;;
esac

# dls_fresh is the only crawler that takes extra command-line arguments.
"${launcher}" "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"

# The remaining crawlers share one shape: job name == script file stem.
for site in findlaw lawtime six4365 hualv; do
  "${launcher}" "${site}" "${PYTHON_BIN}" "${SCRIPT_DIR}/${site}.py"
done

if [[ "${launcher}" == "run_fg" ]]; then
  echo "[done] sequential completed"
else
  echo "[done] all crawlers started in background"
fi
|
||||
|
||||
Reference in New Issue
Block a user