feat: enhance project configuration and improve data export functionality
- Updated `.gitignore` to streamline ignored files and added logging for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to utilize a session-based approach for HTTP requests, improving error handling and proxy management.
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
This commit is contained in:
+8
-75
@@ -1,80 +1,13 @@
|
||||
#!/usr/bin/env bash
# Resolve the script and project directories, then make sure the log and
# data directories exist before anything is launched.
|
||||
# Abort on any failing command, on use of an unset variable, and when any
# stage of a pipeline fails.
set -euo pipefail
|
||||
|
||||
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
# Absolute path of the parent directory of SCRIPT_DIR.
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||
# Directory that receives the log files.
LOG_DIR="${PROJECT_ROOT}/logs"
|
||||
# Directory used for data output.
DATA_DIR="${PROJECT_ROOT}/data"
|
||||
# Switch to the script's directory so that relative paths resolve correctly
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
# Create both directories up front; -p makes this a no-op when they exist.
mkdir -p "${LOG_DIR}" "${DATA_DIR}"
|
||||
# Informational message only (Chinese for "reading proxy configuration from
# request/proxy_settings.json"); nothing is loaded by this script itself.
echo "使用 request/proxy_settings.json 读取代理配置"
|
||||
|
||||
# Prefer the project's virtualenv interpreter when it exists and is
# executable; otherwise fall back to whatever python3 is found on PATH.
if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
|
||||
PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
|
||||
else
|
||||
PYTHON_BIN="python3"
|
||||
fi
|
||||
|
||||
# Execution mode, overridable through the RUN_MODE environment variable;
# defaults to parallel when unset or empty.
RUN_MODE="${RUN_MODE:-parallel}" # parallel | sequential
|
||||
|
||||
# Startup banner summarising the resolved settings.
echo "[start] project=${PROJECT_ROOT}"
|
||||
echo "[start] python=${PYTHON_BIN}"
|
||||
echo "[start] mode=${RUN_MODE}"
|
||||
echo "[start] proxy=request/proxy_settings.json"
|
||||
|
||||
# Dalvshi crawler (new-structure scraping + database writes) can be
# controlled through environment variables. Each DLS_* value below falls back
# to the shown default when the variable is unset or empty.
|
||||
DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
|
||||
DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
|
||||
DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
|
||||
DLS_SLEEP="${DLS_SLEEP:-0.2}"
|
||||
DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"
|
||||
# Command-line arguments for dls_fresh.py, built as an array so that values
# containing spaces stay intact.
DLS_EXTRA_ARGS=()
|
||||
|
||||
# The literal value "0" means the --max-cities flag is not passed at all
# (presumably "no limit" — confirm against dls_fresh.py).
if [[ "${DLS_MAX_CITIES}" != "0" ]]; then
|
||||
DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
|
||||
fi
|
||||
# Same convention for --max-pages: omitted when the value is "0".
if [[ "${DLS_MAX_PAGES}" != "0" ]]; then
|
||||
DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
|
||||
fi
|
||||
# --city-filter is passed only when a non-empty filter was supplied.
if [[ -n "${DLS_CITY_FILTER}" ]]; then
|
||||
DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
|
||||
fi
|
||||
# --sleep and --output are always passed.
DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")
|
||||
|
||||
# DLS_DIRECT=1 adds the --direct switch; any other value leaves it off.
if [[ "${DLS_DIRECT:-0}" == "1" ]]; then
|
||||
DLS_EXTRA_ARGS+=(--direct)
|
||||
fi
|
||||
# DLS_NO_DB=1 adds the --no-db switch; any other value leaves it off.
if [[ "${DLS_NO_DB:-0}" == "1" ]]; then
|
||||
DLS_EXTRA_ARGS+=(--no-db)
|
||||
fi
|
||||
|
||||
#######################################
# Start a command in the background, detached with nohup, and log its output.
# Globals:   LOG_DIR (read) - directory that receives "<name>.log"
# Arguments: $1   - job name; used for the log file name and the status line
#            $2.. - command to run, followed by its arguments
# Outputs:   one "[start] <name> pid=<pid> log=<file>" line on stdout;
#            a usage message on stderr when arguments are missing
# Returns:   2 when the name or the command is missing. The background job's
#            own exit status is not collected here.
#######################################
run_bg() {
  # Without a command, `env PYTHONUNBUFFERED=1` would merely print the
  # environment into the log and report a successful start.
  if (( $# < 2 )); then
    echo "usage: run_bg <name> <command> [args...]" >&2
    return 2
  fi

  local name="$1"
  shift
  local logfile="${LOG_DIR}/${name}.log"

  # PYTHONUNBUFFERED=1 is set for the child so Python writes reach the log
  # immediately. `>` truncates any log left over from a previous run, and
  # stderr is merged into the same file.
  nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 &
  echo "[start] ${name} pid=$! log=${logfile}"
}
|
||||
|
||||
#######################################
# Run a command in the foreground and send its output to a log file.
# Globals:   LOG_DIR (read) - directory that receives "<name>.log"
# Arguments: $1   - job name; used for the log file name and the status line
#            $2.. - command to run, followed by its arguments
# Outputs:   one "[start] <name> fg log=<file>" line on stdout before the
#            command runs; a usage message on stderr when arguments are missing
# Returns:   2 when the name or the command is missing; otherwise the exit
#            status of the command itself.
#######################################
run_fg() {
  # Without a command, `env PYTHONUNBUFFERED=1` would merely print the
  # environment into the log and return success.
  if (( $# < 2 )); then
    echo "usage: run_fg <name> <command> [args...]" >&2
    return 2
  fi

  local name="$1"
  shift
  local logfile="${LOG_DIR}/${name}.log"

  echo "[start] ${name} fg log=${logfile}"
  # PYTHONUNBUFFERED=1 is set for the child so Python writes reach the log
  # immediately. `>` truncates any log left over from a previous run, and
  # stderr is merged into the same file.
  env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1
}
|
||||
|
||||
# Launch every crawler with the launcher that matches RUN_MODE.
#   sequential - run_fg: one crawler at a time. The script runs under
#                `set -e`, so a crawler that exits non-zero stops the
#                sequence at that point.
#   parallel   - run_bg: every crawler is started as a background job.
# Any other value still runs in parallel, as before, but a warning is now
# printed so that a mistyped mode does not go unnoticed.
case "${RUN_MODE}" in
  sequential)
    launcher=run_fg
    ;;
  parallel)
    launcher=run_bg
    ;;
  *)
    echo "[warn] unknown RUN_MODE='${RUN_MODE}', falling back to parallel" >&2
    launcher=run_bg
    ;;
esac

# dls_fresh is the only crawler that takes extra command-line arguments.
"${launcher}" "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"

# The remaining crawlers share one shape: job name == script base name.
for site in findlaw lawtime six4365 hualv; do
  "${launcher}" "${site}" "${PYTHON_BIN}" "${SCRIPT_DIR}/${site}.py"
done

if [[ "${launcher}" == "run_fg" ]]; then
  echo "[done] sequential completed"
else
  echo "[done] all crawlers started in background"
fi
|
||||
# Start each site crawler as a nohup background job. stdout and stderr of
# every job go to a per-site log file in the current working directory, and
# the script paths are relative to that directory as well.
nohup python ../common_sites/dls.py > dls.log 2>&1 & # Dalvshi
|
||||
nohup python ../common_sites/findlaw.py > findlaw.log 2>&1 & # Findlaw (China)
|
||||
nohup python ../common_sites/lawtime.py > lawtime.log 2>&1 & # Lawtime
|
||||
nohup python ../common_sites/six4365.py > six4365.log 2>&1 & # Lvtu (64365)
|
||||
nohup python ../common_sites/hualv.py > hualv.log 2>&1 & # Hualv
|
||||
|
||||
Reference in New Issue
Block a user