落地结构化日志与基础监控告警

This commit is contained in:
hello-dd-code
2026-02-28 16:37:37 +08:00
parent 78f488fbbb
commit 5666dc61a0
7 changed files with 345 additions and 0 deletions
+67
View File
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
set -e -u -o pipefail

# Usage:
#   METRICS_URL=http://127.0.0.1:8080/metrics/basic \
#   SERVER_ERROR_RATE_THRESHOLD=5 AVG_LATENCY_THRESHOLD_MS=800 \
#   OPS_ALERT_WEBHOOK=https://example.com/webhook ./scripts/ops/check_basic_metrics.sh
#
# Every setting below can be overridden from the environment; a value that is
# unset or empty falls back to the default shown here.
: "${METRICS_URL:=http://127.0.0.1:8080/metrics/basic}"
: "${SERVER_ERROR_RATE_THRESHOLD:=5}"
: "${AVG_LATENCY_THRESHOLD_MS:=800}"
# An empty webhook is a valid value, so the default is the empty string.
: "${OPS_ALERT_WEBHOOK:=}"
: "${ALERT_TITLE:=[wx_service] 基础监控告警}"
#######################################
# Escape a string so it can be embedded inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; any
# other control characters are passed through unchanged.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_json_escape() {
  local text="$1"
  local bs='\' dq='"'
  # Backslashes must be doubled first, otherwise the escapes added below
  # would themselves be escaped again.
  text=${text//"${bs}"/"${bs}${bs}"}
  text=${text//"${dq}"/"${bs}${dq}"}
  text=${text//$'\n'/"${bs}n"}
  text=${text//$'\r'/"${bs}r"}
  text=${text//$'\t'/"${bs}t"}
  printf '%s' "${text}"
}

#######################################
# Deliver an alert message.
# Without a webhook the alert is printed to stderr. With a webhook, a JSON
# document {"title": ..., "message": ...} is POSTed to it; if that POST
# fails the alert is printed to stderr instead so it is not lost.
# Globals:   OPS_ALERT_WEBHOOK (read), ALERT_TITLE (read)
# Arguments: $1 - alert message text
# Outputs:   alert text on stderr when no webhook is set or delivery fails
# Returns:   0 always (delivery is best-effort)
#######################################
send_alert() {
  local message="$1"
  if [[ -z "${OPS_ALERT_WEBHOOK:-}" ]]; then
    printf '%s\n' "ALERT: ${message}" >&2
    return
  fi

  # Escape both fields: the message may carry arbitrary text (URLs, checker
  # output) that would otherwise produce invalid JSON.
  local payload
  payload="$(printf '{"title":"%s","message":"%s"}' \
    "$(_json_escape "${ALERT_TITLE:-}")" \
    "$(_json_escape "${message}")")"

  if ! curl -fsS -X POST "${OPS_ALERT_WEBHOOK}" \
    -H "Content-Type: application/json" \
    -d "${payload}" >/dev/null; then
    printf '%s\n' "ALERT (webhook delivery failed): ${message}" >&2
  fi
}
# Fetch the metrics document. Any curl failure (-f also turns HTTP error
# statuses into failures) leaves ${response} empty, which is reported below.
response="$(curl -fsS "${METRICS_URL}" || true)"
if [[ -z "${response}" ]]; then
  send_alert "无法访问 metrics 接口:${METRICS_URL}"
  exit 1
fi

# Threshold checker. It is passed to python3 with -c so that stdin stays free
# for the JSON document; feeding the program through a heredoc on stdin would
# override the piped response and the checker would always see empty input.
# Exit codes: 0 = within thresholds, 1 = threshold exceeded,
#             2 = response or thresholds could not be interpreted.
checker_script="$(cat <<'PY'
import json
import sys

try:
    server_th = float(sys.argv[1])
    latency_th = float(sys.argv[2])
except (IndexError, ValueError) as exc:
    print(f"invalid threshold: {exc}")
    sys.exit(2)

try:
    payload = json.load(sys.stdin)
    data = payload.get("data", {})
    # Metrics absent from the payload are treated as 0.
    server_error = float(data.get("server_error_rate_pct", 0.0))
    avg_latency = float(data.get("avg_latency_ms", 0.0))
except (ValueError, TypeError, AttributeError) as exc:
    print(f"invalid metrics payload: {exc}")
    sys.exit(2)

problems = []
if server_error > server_th:
    problems.append(f"server_error_rate_pct={server_error:.2f}% > {server_th:.2f}%")
if avg_latency > latency_th:
    problems.append(f"avg_latency_ms={avg_latency:.2f} > {latency_th:.2f}")

if problems:
    print("; ".join(problems))
    sys.exit(1)

print(f"ok server_error_rate_pct={server_error:.2f}% avg_latency_ms={avg_latency:.2f}")
PY
)"

# Capture the checker's status without tripping `set -e`.
check_exit=0
check_output="$(python3 -c "${checker_script}" \
  "${SERVER_ERROR_RATE_THRESHOLD}" "${AVG_LATENCY_THRESHOLD_MS}" \
  <<<"${response}")" || check_exit=$?

case "${check_exit}" in
  0)
    printf '%s\n' "${check_output}"
    ;;
  1)
    send_alert "基础监控阈值触发:${check_output}"
    exit 1
    ;;
  *)
    # The checker itself could not run or could not parse its input (for
    # example python3 missing, or a non-JSON response body).
    send_alert "metrics 检查执行失败:${check_output:-exit code ${check_exit}}"
    exit 1
    ;;
esac