#!/usr/bin/env bash
#
# Basic metrics check: fetches the JSON document served at METRICS_URL,
# compares the reported server error rate and average latency against the
# configured thresholds, and raises an alert when a threshold is exceeded or
# the endpoint cannot be read.
#
# Usage:
#   METRICS_URL=http://127.0.0.1:8080/metrics/basic \
#   SERVER_ERROR_RATE_THRESHOLD=5 AVG_LATENCY_THRESHOLD_MS=800 \
#   OPS_ALERT_WEBHOOK=https://example.com/webhook ./scripts/ops/check_basic_metrics.sh
#
# Environment (all optional):
#   METRICS_URL                  URL of the metrics endpoint.
#   SERVER_ERROR_RATE_THRESHOLD  Alert when server_error_rate_pct is above this.
#   AVG_LATENCY_THRESHOLD_MS     Alert when avg_latency_ms is above this.
#   OPS_ALERT_WEBHOOK            Webhook that receives alerts as a JSON POST;
#                                when empty, alerts are written to stderr.
#   ALERT_TITLE                  Title placed in the alert payload.
#
# Exit status: 0 when all metrics are within thresholds, 1 otherwise.

set -euo pipefail

METRICS_URL="${METRICS_URL:-http://127.0.0.1:8080/metrics/basic}"
SERVER_ERROR_RATE_THRESHOLD="${SERVER_ERROR_RATE_THRESHOLD:-5}"
AVG_LATENCY_THRESHOLD_MS="${AVG_LATENCY_THRESHOLD_MS:-800}"
OPS_ALERT_WEBHOOK="${OPS_ALERT_WEBHOOK:-}"
ALERT_TITLE="${ALERT_TITLE:-[wx_service] 基础监控告警}"

#######################################
# Escape a string so it can be embedded between double quotes in JSON.
# Backslash, double quote, newline, carriage return and tab are escaped;
# any other control character is replaced by a space so the result is
# always a valid JSON string body.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_json_escape() {
  local s="$1"
  s=${s//\\/\\\\} # must run first so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  s=${s//[[:cntrl:]]/ }
  printf '%s' "${s}"
}

#######################################
# Deliver an alert message.
# Without a webhook the message is written to stderr. With a webhook it is
# POSTed as {"title": ..., "message": ...}; delivery is best-effort, so a
# failed POST never aborts the script, but the alert is then echoed to
# stderr so it is not lost.
# Globals:   OPS_ALERT_WEBHOOK (read), ALERT_TITLE (read),
#            CURL_TIMEOUT_SECONDS (read, optional, default 10)
# Arguments: $1 - alert message
# Outputs:   alert text on stderr when no webhook is set or delivery fails
# Returns:   0 always
#######################################
send_alert() {
  local message="$1"
  local payload

  if [[ -z "${OPS_ALERT_WEBHOOK:-}" ]]; then
    echo "ALERT: ${message}" >&2
    return 0
  fi

  # Escape both fields: raw interpolation would produce invalid JSON as soon
  # as the title or message contains a quote, backslash or newline.
  payload="$(printf '{"title":"%s","message":"%s"}' \
    "$(_json_escape "${ALERT_TITLE:-}")" "$(_json_escape "${message}")")"

  # --max-time keeps a stalled webhook from hanging the check.
  if ! curl -fsS --max-time "${CURL_TIMEOUT_SECONDS:-10}" \
    -X POST "${OPS_ALERT_WEBHOOK}" \
    -H "Content-Type: application/json" \
    -d "${payload}" >/dev/null; then
    echo "ALERT (webhook delivery failed): ${message}" >&2
  fi
  return 0
}

# Fetch the metrics document.
# - `|| true` keeps `set -e` from aborting on a curl failure, so that an
#   unreachable endpoint is reported through send_alert below.
# - With -f, an HTTP error status makes curl fail instead of printing the
#   error page, so the emptiness check below also covers HTTP errors.
# - --max-time (CURL_TIMEOUT_SECONDS, default 10) bounds the request so a
#   stalled endpoint cannot hang the check.
response="$(curl -fsS --max-time "${CURL_TIMEOUT_SECONDS:-10}" "${METRICS_URL}" || true)"
if [[ -z "${response}" ]]; then
  send_alert "无法访问 metrics 接口:${METRICS_URL}"
  exit 1
fi

# Python program that evaluates the metrics payload.
#   stdin : the JSON document returned by the metrics endpoint
#   argv  : <server error rate threshold (%)> <average latency threshold (ms)>
#   stdout: one summary line ("ok ..." or the list of exceeded thresholds)
#   exit  : 0 = within thresholds, 1 = threshold exceeded,
#           2 = payload or thresholds could not be parsed
#
# The program is handed to `python3 -c` instead of `python3 - <<HEREDOC`:
# a here-document replaces the command's stdin, so with the latter form the
# piped response never reaches json.load() and the check always fails.
check_script="$(
  cat <<'PY'
import json
import sys

try:
    payload = json.load(sys.stdin)
    data = payload.get("data", {})
    server_error = float(data.get("server_error_rate_pct", 0.0))
    avg_latency = float(data.get("avg_latency_ms", 0.0))
    server_th = float(sys.argv[1])
    latency_th = float(sys.argv[2])
except (ValueError, TypeError, AttributeError, IndexError) as exc:
    print(f"invalid metrics payload or threshold: {exc}")
    sys.exit(2)

problems = []
if server_error > server_th:
    problems.append(f"server_error_rate_pct={server_error:.2f}% > {server_th:.2f}%")
if avg_latency > latency_th:
    problems.append(f"avg_latency_ms={avg_latency:.2f} > {latency_th:.2f}")

if problems:
    print("; ".join(problems))
    sys.exit(1)

print(f"ok server_error_rate_pct={server_error:.2f}% avg_latency_ms={avg_latency:.2f}")
PY
)"

# `|| check_exit=$?` records the status without tripping `set -e`.
check_exit=0
check_output="$(
  python3 -c "${check_script}" \
    "${SERVER_ERROR_RATE_THRESHOLD}" "${AVG_LATENCY_THRESHOLD_MS}" \
    <<<"${response}"
)" || check_exit=$?

case "${check_exit}" in
  0)
    printf '%s\n' "${check_output}"
    ;;
  1)
    send_alert "基础监控阈值触发:${check_output}"
    exit 1
    ;;
  *)
    # Parse failure, missing python3, or any other unexpected status: report
    # it as a check failure rather than as a threshold breach.
    send_alert "基础监控检查失败:${check_output:-python3 exited with status ${check_exit}}"
    exit 1
    ;;
esac