#!/usr/bin/env bash
#
# Basic metrics health check.
#
# Fetches a JSON document from METRICS_URL, compares
# data.server_error_rate_pct and data.avg_latency_ms against the configured
# thresholds, and raises an alert (webhook POST, or stderr when no webhook is
# configured) when a threshold is exceeded, the payload cannot be evaluated,
# or the endpoint cannot be read.
#
# Usage:
#   METRICS_URL=http://127.0.0.1:8080/metrics/basic \
#   SERVER_ERROR_RATE_THRESHOLD=5 AVG_LATENCY_THRESHOLD_MS=800 \
#   OPS_ALERT_WEBHOOK=https://example.com/webhook ./scripts/ops/check_basic_metrics.sh
#
# Exit status: 0 when both metrics are within their thresholds, 1 otherwise.

set -euo pipefail

METRICS_URL="${METRICS_URL:-http://127.0.0.1:8080/metrics/basic}"
SERVER_ERROR_RATE_THRESHOLD="${SERVER_ERROR_RATE_THRESHOLD:-5}"
AVG_LATENCY_THRESHOLD_MS="${AVG_LATENCY_THRESHOLD_MS:-800}"
OPS_ALERT_WEBHOOK="${OPS_ALERT_WEBHOOK:-}"
ALERT_TITLE="${ALERT_TITLE:-[wx_service] 基础监控告警}"

#######################################
# Deliver an alert message.
# Globals:   OPS_ALERT_WEBHOOK (read), ALERT_TITLE (read)
# Arguments: $1 - alert message text
# Outputs:   writes "ALERT: <message>" to stderr when no webhook is set or
#            the JSON body cannot be built; otherwise POSTs
#            {"title": ..., "message": ...} to the webhook
# Returns:   always 0 (delivery is best-effort)
#######################################
send_alert() {
  local message="$1"
  local payload

  if [[ -z "${OPS_ALERT_WEBHOOK}" ]]; then
    printf 'ALERT: %s\n' "${message}" >&2
    return 0
  fi

  # Serialise with json.dumps instead of string interpolation so that quotes,
  # backslashes or newlines in the title/message cannot yield invalid JSON.
  if ! payload="$(python3 -c \
    'import json, sys; print(json.dumps({"title": sys.argv[1], "message": sys.argv[2]}))' \
    "${ALERT_TITLE}" "${message}")"; then
    printf 'ALERT: %s\n' "${message}" >&2
    return 0
  fi

  # Intentionally best-effort: a failing webhook must not abort the check
  # (curl -S still reports the failure on stderr).
  curl -fsS -X POST "${OPS_ALERT_WEBHOOK}" \
    -H "Content-Type: application/json" \
    -d "${payload}" >/dev/null || true
}

#######################################
# Evaluate a metrics JSON document against the thresholds.
# Arguments: $1 - server error rate threshold (percent)
#            $2 - average latency threshold (milliseconds)
# Inputs:    metrics JSON on stdin
# Outputs:   one summary line on stdout ("ok ...", the list of exceeded
#            thresholds, or a description of the evaluation error)
# Returns:   0 within thresholds, 1 threshold exceeded, 2 payload or
#            thresholds could not be evaluated
#######################################
evaluate_metrics() {
  # The program is passed with -c so that stdin stays free for the JSON.
  # Feeding the program through a here-document on stdin (python3 - <<PY)
  # would replace the piped metrics, leaving json.load() with empty input.
  python3 -c '
import json
import sys

try:
    payload = json.load(sys.stdin)
    data = payload.get("data", {})
    server_error = float(data.get("server_error_rate_pct", 0.0))
    avg_latency = float(data.get("avg_latency_ms", 0.0))
    server_th = float(sys.argv[1])
    latency_th = float(sys.argv[2])
except (ValueError, TypeError, AttributeError) as exc:
    print(f"{type(exc).__name__}: {exc}")
    sys.exit(2)

problems = []
if server_error > server_th:
    problems.append(f"server_error_rate_pct={server_error:.2f}% > {server_th:.2f}%")
if avg_latency > latency_th:
    problems.append(f"avg_latency_ms={avg_latency:.2f} > {latency_th:.2f}")

if problems:
    print("; ".join(problems))
    sys.exit(1)

print(f"ok server_error_rate_pct={server_error:.2f}% avg_latency_ms={avg_latency:.2f}")
' "$@"
}

#######################################
# Fetch the metrics, evaluate them, and alert on any problem.
# Globals:   METRICS_URL, SERVER_ERROR_RATE_THRESHOLD,
#            AVG_LATENCY_THRESHOLD_MS (all read)
# Outputs:   the "ok ..." summary on stdout when the check passes
# Returns:   0 when the check passes, 1 otherwise
#######################################
main() {
  local response
  local check_output
  local check_exit=0

  # A failed fetch is reported through the alert below, so it must not trip
  # `set -e` here.
  response="$(curl -fsS "${METRICS_URL}" || true)"
  if [[ -z "${response}" ]]; then
    send_alert "无法访问 metrics 接口:${METRICS_URL}"
    return 1
  fi

  # `|| check_exit=$?` captures the status without toggling `set -e`.
  check_output="$(evaluate_metrics \
    "${SERVER_ERROR_RATE_THRESHOLD}" "${AVG_LATENCY_THRESHOLD_MS}" \
    <<<"${response}")" || check_exit=$?

  case "${check_exit}" in
    0)
      printf '%s\n' "${check_output}"
      ;;
    1)
      send_alert "基础监控阈值触发:${check_output}"
      return 1
      ;;
    *)
      # Malformed payload, bad threshold value, or python3 unavailable.
      send_alert "基础监控检查失败:${check_output}"
      return 1
      ;;
  esac
}

main "$@"