# Commit 19cf9ce901
# - 统一五个站点采集逻辑与启动脚本
# - 新增 dls_fresh 采集流程与日志优化
# - 新增 export_lawyers_excel 按时间条件导出
# - 默认导出近7天并支持扩展字段解析
# - 整理 .gitignore,忽略 data/logs 本地产物
# (81 lines, 2.5 KiB, Bash, executable)
#!/usr/bin/env bash
#
# Launch all five crawler scripts; dls_fresh is configurable via env vars.
set -euo pipefail

# Resolve absolute paths relative to this script's own location so the
# launcher works no matter where it is invoked from.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/logs"
DATA_DIR="${PROJECT_ROOT}/data"

# Make sure the log/data directories exist before anything writes to them.
mkdir -p "${LOG_DIR}" "${DATA_DIR}"
# Interpreter selection: default to python3 from PATH, but prefer the
# project's virtualenv interpreter whenever it exists and is executable.
PYTHON_BIN="python3"
if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
  PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
fi
# Execution mode: "parallel" (default) launches crawlers in the background;
# "sequential" runs them one after another in the foreground.
RUN_MODE="${RUN_MODE:-parallel}"

# Startup banner: echo the effective configuration for the operator.
printf '%s\n' "[start] project=${PROJECT_ROOT}"
printf '%s\n' "[start] python=${PYTHON_BIN}"
printf '%s\n' "[start] mode=${RUN_MODE}"
printf '%s\n' "[start] proxy=request/proxy_settings.json"
# dls_fresh crawler (new-structure scrape + DB write) — every knob below
# can be overridden through an environment variable of the same name.
DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
DLS_SLEEP="${DLS_SLEEP:-0.2}"
DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"
DLS_EXTRA_ARGS=()

# Forward the city/page limits only when they differ from the literal "0"
# sentinel, which means "no limit".
case "${DLS_MAX_CITIES}" in
  0) ;;
  *) DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}") ;;
esac
case "${DLS_MAX_PAGES}" in
  0) ;;
  *) DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}") ;;
esac
if [[ -n "${DLS_CITY_FILTER}" ]]; then
  DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
fi
# Sleep interval and output path are always passed through.
DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")

# Boolean toggles: set the variable to "1" to enable the flag.
case "${DLS_DIRECT:-0}" in
  1) DLS_EXTRA_ARGS+=(--direct) ;;
esac
case "${DLS_NO_DB:-0}" in
  1) DLS_EXTRA_ARGS+=(--no-db) ;;
esac
# run_bg NAME CMD [ARGS...]
# Launch CMD in the background via nohup with unbuffered Python output,
# redirecting stdout+stderr to ${LOG_DIR}/NAME.log, then report its PID.
run_bg() {
  local job="$1"; shift
  local out="${LOG_DIR}/${job}.log"
  nohup env PYTHONUNBUFFERED=1 "$@" > "${out}" 2>&1 &
  echo "[start] ${job} pid=$! log=${out}"
}
# run_fg NAME CMD [ARGS...]
# Run CMD in the foreground (blocking) with unbuffered Python output,
# redirecting stdout+stderr to ${LOG_DIR}/NAME.log.
run_fg() {
  local job="$1"; shift
  local out="${LOG_DIR}/${job}.log"
  echo "[start] ${job} fg log=${out}"
  env PYTHONUNBUFFERED=1 "$@" > "${out}" 2>&1
}
# Dispatch: pick the launcher once, then start every crawler with it.
# "sequential" blocks on each crawler in turn; any other mode (the default
# "parallel") fires them all off in the background and returns immediately.
if [[ "${RUN_MODE}" == "sequential" ]]; then
  launch=run_fg
else
  launch=run_bg
fi

# dls_fresh takes the extra CLI arguments assembled above; the remaining
# four crawlers are invoked with no arguments.
"${launch}" "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
for site in findlaw lawtime six4365 hualv; do
  "${launch}" "${site}" "${PYTHON_BIN}" "${SCRIPT_DIR}/${site}.py"
done

if [[ "${RUN_MODE}" == "sequential" ]]; then
  echo "[done] sequential completed"
else
  echo "[done] all crawlers started in background"
fi