feat: enhance project configuration and improve data export functionality

- Updated `.gitignore` to streamline ignored files and added logging for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to utilize a session-based approach for HTTP requests, improving error handling and proxy management.
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
This commit is contained in:
hello-dd-code
2026-03-18 10:02:25 +08:00
parent c2b77975c1
commit 38e7c284e8
14 changed files with 1665 additions and 3004 deletions
+8 -75
View File
@@ -1,80 +1,13 @@
#!/usr/bin/env bash
set -euo pipefail

# Resolve absolute paths for the script directory and the project layout.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/logs"
DATA_DIR="${PROJECT_ROOT}/data"

# Switch into the script directory so any relative paths resolve correctly.
cd "${SCRIPT_DIR}"

mkdir -p "${LOG_DIR}" "${DATA_DIR}"

echo "使用 request/proxy_settings.json 读取代理配置"

# Prefer the project virtualenv interpreter; fall back to system python3.
if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
  PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
else
  PYTHON_BIN="python3"
fi
# Execution mode: "parallel" (default) starts all crawlers in the background;
# "sequential" runs them one after another in the foreground.
RUN_MODE="${RUN_MODE:-parallel}"

echo "[start] project=${PROJECT_ROOT}"
echo "[start] python=${PYTHON_BIN}"
echo "[start] mode=${RUN_MODE}"
echo "[start] proxy=request/proxy_settings.json"

# DLS crawler (new-structure collection + DB write) tunables; every value can
# be overridden from the environment before invoking this script.
DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"  # JSONL output path
DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"                      # 0 = no city limit
DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"                        # 0 = no page limit
DLS_SLEEP="${DLS_SLEEP:-0.2}"                              # inter-request delay
DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"                     # optional city filter

# Assemble the extra CLI arguments; optional flags are appended only when set.
DLS_EXTRA_ARGS=()
[[ "${DLS_MAX_CITIES}" != "0" ]] && DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
[[ "${DLS_MAX_PAGES}" != "0" ]] && DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
[[ -n "${DLS_CITY_FILTER}" ]] && DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")
[[ "${DLS_DIRECT:-0}" == "1" ]] && DLS_EXTRA_ARGS+=(--direct)
[[ "${DLS_NO_DB:-0}" == "1" ]] && DLS_EXTRA_ARGS+=(--no-db)
# Launch one crawler in the background, detached with nohup.
#   $1 - job name (also the log file stem under ${LOG_DIR})
#   $@ - remaining args: the command line to execute
# stdout/stderr are redirected to ${LOG_DIR}/<name>.log; the PID is printed.
run_bg() {
  local job_name="$1"
  shift
  local log_path="${LOG_DIR}/${job_name}.log"
  nohup env PYTHONUNBUFFERED=1 "$@" > "${log_path}" 2>&1 &
  echo "[start] ${job_name} pid=$! log=${log_path}"
}
# Run one crawler in the foreground, writing its output to a log file.
#   $1 - job name (also the log file stem under ${LOG_DIR})
#   $@ - remaining args: the command line to execute
# Blocks until the command exits; its status propagates to the caller.
run_fg() {
  local job_name="$1"
  shift
  local log_path="${LOG_DIR}/${job_name}.log"
  echo "[start] ${job_name} fg log=${log_path}"
  env PYTHONUNBUFFERED=1 "$@" > "${log_path}" 2>&1
}
# Dispatch on RUN_MODE: "sequential" runs the crawlers one by one in the
# foreground (fail-fast under set -e); anything else starts them all in the
# background, matching the original parallel default.
case "${RUN_MODE}" in
  sequential)
    run_fg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
    run_fg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
    run_fg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
    run_fg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
    run_fg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
    echo "[done] sequential completed"
    ;;
  *)
    run_bg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
    run_bg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
    run_bg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
    run_bg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
    run_bg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
    echo "[done] all crawlers started in background"
    ;;
esac
# NOTE: legacy launch lines removed here. They unconditionally re-started the
# same five crawlers a second time (bare `python`, `../common_sites/` paths,
# logs dropped into the current directory), duplicating the run_bg/run_fg
# dispatch above — a double launch in parallel mode and a stray background
# launch even after a sequential run completed.