落地结构化日志与基础监控告警
This commit is contained in:
Executable
+67
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Basic metrics check: compares the values reported by a metrics endpoint
# against alert thresholds.
#
# Usage:
#   METRICS_URL=http://127.0.0.1:8080/metrics/basic \
#   SERVER_ERROR_RATE_THRESHOLD=5 AVG_LATENCY_THRESHOLD_MS=800 \
#   OPS_ALERT_WEBHOOK=https://example.com/webhook ./scripts/ops/check_basic_metrics.sh
#
# Every setting below can be overridden from the environment; an unset or
# empty variable falls back to the default shown.

# Endpoint that serves the metrics JSON document.
METRICS_URL="${METRICS_URL:-http://127.0.0.1:8080/metrics/basic}"
# Alert when server_error_rate_pct is strictly greater than this value.
SERVER_ERROR_RATE_THRESHOLD="${SERVER_ERROR_RATE_THRESHOLD:-5}"
# Alert when avg_latency_ms is strictly greater than this value.
AVG_LATENCY_THRESHOLD_MS="${AVG_LATENCY_THRESHOLD_MS:-800}"
# Webhook that receives alerts; when empty, alerts are printed to stderr.
OPS_ALERT_WEBHOOK="${OPS_ALERT_WEBHOOK:-}"
# Title placed in the alert payload.
ALERT_TITLE="${ALERT_TITLE:-[wx_service] 基础监控告警}"
|
||||
|
||||
#######################################
# Deliver an alert message on a best-effort basis.
# Globals:
#   OPS_ALERT_WEBHOOK (read) - webhook URL; when empty the alert is printed
#                              to stderr instead of being posted
#   ALERT_TITLE (read)       - value of the "title" field in the JSON payload
# Arguments:
#   $1 - alert message text
# Outputs:
#   "ALERT: <message>" on stderr when no webhook is configured or the payload
#   cannot be built; a warning on stderr when the POST fails
# Returns:
#   0 always, so a broken alert channel never masks the caller's own
#   failure handling
#######################################
send_alert() {
  local message="$1"
  local payload

  if [[ -z "${OPS_ALERT_WEBHOOK}" ]]; then
    printf '%s\n' "ALERT: ${message}" >&2
    return 0
  fi

  # Serialize with json.dumps so quotes, backslashes and newlines in the
  # title or message cannot produce an invalid JSON document. The default
  # ASCII-only output (\uXXXX escapes) is used so the result does not depend
  # on the locale's output encoding.
  if ! payload="$(python3 -c \
    'import json, sys; print(json.dumps({"title": sys.argv[1], "message": sys.argv[2]}))' \
    "${ALERT_TITLE}" "${message}")"; then
    printf '%s\n' "ALERT: ${message}" >&2
    printf '%s\n' "WARN: could not build the webhook payload" >&2
    return 0
  fi

  # Delivery is intentionally best effort: report a failed POST but do not
  # propagate it.
  curl -fsS -X POST "${OPS_ALERT_WEBHOOK}" \
    -H "Content-Type: application/json" \
    -d "${payload}" >/dev/null \
    || printf '%s\n' "WARN: failed to deliver alert to webhook" >&2
}
|
||||
|
||||
# Fetch the metrics document. The request is bounded by --max-time
# (METRICS_TIMEOUT_SECONDS, default 10) so a hung endpoint cannot block the
# check forever. "|| true" is deliberate: a failed request must reach the
# empty-response branch below and raise an alert instead of aborting the
# script under "set -e".
response="$(curl -fsS --max-time "${METRICS_TIMEOUT_SECONDS:-10}" "${METRICS_URL}" || true)"
if [[ -z "${response}" ]]; then
  send_alert "无法访问 metrics 接口:${METRICS_URL}"
  exit 1
fi
|
||||
|
||||
# Threshold checker, run by python3.
#   stdin : the metrics JSON document
#   argv  : <server error rate threshold> <average latency threshold in ms>
#   stdout: a one-line summary, the list of breached thresholds, or an error
#   exit  : 0 = within thresholds, 1 = threshold breached,
#           2 = unusable payload or thresholds
# The program is handed to python3 with -c rather than on stdin: stdin can
# carry only one stream, and it is needed for the metrics JSON.
checker_program="$(cat <<'PY'
import json
import sys

try:
    payload = json.load(sys.stdin)
    data = payload.get("data", {})
    server_error = float(data.get("server_error_rate_pct", 0.0))
    avg_latency = float(data.get("avg_latency_ms", 0.0))
    server_th = float(sys.argv[1])
    latency_th = float(sys.argv[2])
except (ValueError, TypeError, AttributeError) as exc:
    # Malformed JSON, an unexpected document shape or a non-numeric value.
    print(f"invalid metrics payload or thresholds: {exc}")
    sys.exit(2)

problems = []
if server_error > server_th:
    problems.append(f"server_error_rate_pct={server_error:.2f}% > {server_th:.2f}%")
if avg_latency > latency_th:
    problems.append(f"avg_latency_ms={avg_latency:.2f} > {latency_th:.2f}")

if problems:
    print("; ".join(problems))
    sys.exit(1)

print(f"ok server_error_rate_pct={server_error:.2f}% avg_latency_ms={avg_latency:.2f}")
PY
)"

# Capture the checker's status without tripping "set -e".
check_exit=0
check_output="$(printf '%s' "${response}" \
  | python3 -c "${checker_program}" \
    "${SERVER_ERROR_RATE_THRESHOLD}" "${AVG_LATENCY_THRESHOLD_MS}")" \
  || check_exit=$?

case "${check_exit}" in
  0)
    printf '%s\n' "${check_output}"
    ;;
  1)
    send_alert "基础监控阈值触发:${check_output}"
    exit 1
    ;;
  *)
    # The check itself could not run (bad payload, bad thresholds, python3
    # unavailable); report it separately from a real threshold breach.
    send_alert "基础监控检查失败:${check_output}"
    exit 1
    ;;
esac
|
||||
|
||||
Reference in New Issue
Block a user