监控体系概述
完善的监控体系是保障服务稳定运行的关键。Claude Code 可以帮你快速搭建 Prometheus + Grafana 监控方案。
监控黄金指标
可用性(Availability)、延迟(Latency)、吞吐量(Throughput)、错误率(Errors)、饱和度(Saturation)。
Prometheus 配置
Docker Compose 部署
bash
帮我创建一个监控系统的 docker-compose 配置,包含:
- Prometheus
- Grafana
- Node Exporter
- 应用指标采集
yaml
# docker-compose.monitoring.yml
# Monitoring stack: Prometheus (scraping, storage, rule evaluation),
# Grafana (dashboards) and Node Exporter (host metrics).
version: "3.9"

services:
  prometheus:
    image: prom/prometheus:v2.47.0
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules:/etc/prometheus/rules
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Allows reloading the configuration over HTTP (POST /-/reload).
      - '--web.enable-lifecycle'
    restart: unless-stopped

  grafana:
    image: grafana/grafana:10.1.0
    container_name: grafana
    ports:
      # Host port 3001 -> Grafana's port 3000 inside the container.
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # `:?` makes `docker compose up` abort when GRAFANA_PASSWORD is unset or empty,
      # instead of silently substituting an empty admin password.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:?GRAFANA_PASSWORD must be set}
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
      - grafana_data:/var/lib/grafana
    depends_on:
      - prometheus
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:v1.6.1
    container_name: node-exporter
    ports:
      - "9100:9100"
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # The host root is mounted at /rootfs below; without this flag the
      # exporter reports on the container's own filesystem instead of the host's.
      - '--path.rootfs=/rootfs'
      # `$$` is Compose's escape for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    restart: unless-stopped

volumes:
  prometheus_data:
  grafana_data:

Prometheus 配置
yaml
# prometheus/prometheus.yml
# Scrape and rule-evaluation settings for the Prometheus server.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        # No Alertmanager target is configured here.
        - targets: []

rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  # Prometheus scraping itself
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # Node Exporter - host/system metrics
  - job_name: "node-exporter"
    static_configs:
      - targets:
          - "node-exporter:9100"

  # Application metrics exposed by the app's /metrics endpoint
  - job_name: "myapp"
    metrics_path: /metrics
    scrape_interval: 10s
    static_configs:
      - targets:
          - "app:3000"

  # API latency: scrapes the same target as "myapp" but keeps only the
  # http_request_duration* series.
  - job_name: "api-latency"
    metrics_path: /metrics
    static_configs:
      - targets:
          - "app:3000"
    metric_relabel_configs:
      - source_labels:
          - __name__
        regex: "http_request_duration.*"
        action: keep

告警规则
yaml
# prometheus/rules/app.yml
# Alerting rules for the "myapp" scrape job.
groups:
  - name: myapp
    rules:
      # Instance is down
      - alert: InstanceDown
        expr: up{job="myapp"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 宕机"
          description: "{{ $labels.job }} 的实例 {{ $labels.instance }} 已经宕机超过 1 分钟"

      # CPU usage too high (percentage of one core)
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total{job="myapp"}[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "实例 {{ $labels.instance }} 的 CPU 使用率超过 80%"

      # Memory usage too high.
      # The previous denominator, process_working_set_bytes, is not exported by
      # prom-client, so the expression returned nothing and could never fire.
      # This compares the app's resident memory with the host's total memory as
      # reported by Node Exporter.
      # NOTE(review): `on() group_left()` assumes a single node-exporter target;
      # with several hosts, join on a shared host label instead.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes{job="myapp"} / on() group_left() node_memory_MemTotal_bytes{job="node-exporter"} > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "实例 {{ $labels.instance }} 的内存使用率超过 90%"

      # Request latency too high.
      # Buckets are summed by `le` so one service-wide P95 is evaluated, and the
      # job filter avoids counting the duplicate "api-latency" scrape.
      - alert: HighLatency
        expr: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket{job="myapp"}[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "请求延迟过高"
          description: "P95 延迟超过 2 秒"

      # Error rate too high.
      # Both sides must be aggregated: without sum(), each 5xx series is divided
      # by the identically-labelled series on the right (i.e. by itself), so the
      # ratio is always 1 whenever any 5xx response occurs.
      - alert: HighErrorRate
        expr: sum(rate(http_requests_total{job="myapp", status=~"5.."}[5m])) / sum(rate(http_requests_total{job="myapp"}[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "错误率过高"
          description: "5xx 错误率超过 5%"

应用指标埋点
Express 应用指标
bash
帮我为 Express 应用添加 Prometheus 指标埋点。
javascript
// lib/metrics/index.js
import client from "prom-client";
// Pull the pieces used below off the prom-client default export.
const { collectDefaultMetrics, register, Counter, Gauge, Histogram } = client;

// Collect prom-client's default metrics (memory, CPU, ...) on the default registry.
collectDefaultMetrics({ register });

/** Counter of HTTP requests, labelled by method, path and status. */
export const httpRequestsTotal = new Counter({
  name: "http_requests_total",
  help: "Total number of HTTP requests",
  labelNames: ["method", "path", "status"],
});

/** Histogram of HTTP request latency in seconds. */
export const httpRequestDuration = new Histogram({
  name: "http_request_duration_seconds",
  help: "HTTP request latency in seconds",
  labelNames: ["method", "path", "status"],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
});

/** Counter of database queries, labelled by operation and table. */
export const dbQueryTotal = new Counter({
  name: "db_query_total",
  help: "Total number of database queries",
  labelNames: ["operation", "table"],
});

/** Histogram of database query latency in seconds. */
export const dbQueryDuration = new Histogram({
  name: "db_query_duration_seconds",
  help: "Database query latency in seconds",
  labelNames: ["operation", "table"],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
});

/** Gauge of currently active connections, labelled by connection type. */
export const activeConnections = new Gauge({
  name: "active_connections",
  help: "Number of active connections",
  labelNames: ["type"],
});

/** Counter of cache hits, labelled by cache name and result. */
export const cacheHits = new Counter({
  name: "cache_hits_total",
  help: "Total number of cache hits",
  labelNames: ["cache", "result"],
});

指标中间件
javascript
// lib/middleware/metrics.js
import { httpRequestsTotal, httpRequestDuration } from "../metrics/index.js";
/**
 * Express middleware that records one counter increment and one latency
 * observation per request, once the response has finished.
 *
 * @param {object} req - Express request; `method`, `route`, `baseUrl` and `path` are read.
 * @param {object} res - Express response; its "finish" event triggers the recording.
 * @param {Function} next - Express continuation, called immediately.
 * @returns {void}
 */
export function metricsMiddleware(req, res, next) {
  const startedAt = process.hrtime.bigint();

  res.on("finish", () => {
    // hrtime is in nanoseconds; the histogram is declared in seconds.
    const elapsedSeconds = Number(process.hrtime.bigint() - startedAt) / 1e9;

    // Prefer the matched route pattern over the concrete URL to keep label
    // cardinality low. route.path is relative to the router it was declared
    // on, so baseUrl (the mount point) is prefixed; otherwise identical
    // patterns on different routers would share one label. The template
    // literal also turns RegExp/array route paths into a string, which
    // normalizePath requires.
    const rawPath = req.route
      ? `${req.baseUrl ?? ""}${req.route.path}`
      : req.path;

    const labels = {
      method: req.method,
      path: normalizePath(rawPath),
      status: res.statusCode,
    };

    httpRequestsTotal.inc(labels);
    httpRequestDuration.observe(labels, elapsedSeconds);
  });

  next();
}
/**
 * Collapses variable path segments into placeholders so that per-request
 * URLs do not create an unbounded number of metric label values.
 *
 * - A segment that is a UUID becomes `/:uuid`.
 * - A segment made only of digits becomes `/:id`.
 * - The result is truncated to 100 characters.
 *
 * @param {string} path - Request path or route pattern.
 * @returns {string} The normalized path.
 */
function normalizePath(path) {
  // UUIDs are replaced first: many start with digits, and running the numeric
  // rule first would rewrite "/123e4567-..." into "/:ide4567-...".
  // Both patterns require the match to end at a segment boundary so that a
  // segment such as "/2024report" is left untouched.
  return String(path)
    .replace(/\/[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}(?=\/|$)/gi, "/:uuid")
    .replace(/\/\d+(?=\/|$)/g, "/:id")
    .substring(0, 100);
}
// ---- 指标端点 (Metrics endpoint) ----
javascript
// routes/metrics.js
import express from "express";
import client from "prom-client";
import { activeConnections } from "../metrics/index.js";
const router = express.Router();
router.get("/metrics", async (req, res) => {
try {
// 更新活跃连接数
activeConnections.set({ type: "http" }, activeConnections._get().values().length);
res.set("Content-Type", client.register.contentType);
res.end(await client.register.metrics());
} catch (error) {
res.status(500).end(error.message);
}
});
export default router;Grafana 配置
数据源配置
yaml
# grafana/provisioning/datasources/datasources.yml
# Registers the Prometheus container as the default, read-only datasource.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    access: proxy
    isDefault: true
    editable: false

Dashboard 配置
json
// grafana/provisioning/dashboards/app.json
// File-provisioned dashboards must be the bare dashboard model: the
// {"dashboard": {...}} envelope is only used by the HTTP API, and with it
// Grafana finds no top-level "title" and refuses to load the file.
// Error rate and P99 are aggregated so each panel shows one service-wide series.
{
  "title": "MyApp 监控面板",
  "panels": [
    {
      "title": "请求率",
      "type": "graph",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
      "targets": [
        {
          "expr": "rate(http_requests_total{job=\"myapp\"}[5m])",
          "legendFormat": "{{method}} {{path}} {{status}}"
        }
      ]
    },
    {
      "title": "P99 延迟",
      "type": "graph",
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
      "targets": [
        {
          "expr": "histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket{job=\"myapp\"}[5m])))",
          "legendFormat": "P99"
        }
      ]
    },
    {
      "title": "错误率",
      "type": "graph",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{job=\"myapp\", status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"myapp\"}[5m]))",
          "legendFormat": "5xx 错误率"
        }
      ]
    },
    {
      "title": "CPU 使用率",
      "type": "graph",
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
      "targets": [
        {
          "expr": "rate(process_cpu_seconds_total{job=\"myapp\"}[5m]) * 100",
          "legendFormat": "CPU %"
        }
      ]
    }
  ]
}

Alertmanager 告警通知
Alertmanager 配置
yaml
# alertmanager/alertmanager.yml
# Routes alerts to webhook receivers by severity; critical alerts also go to PagerDuty.
global:
  resolve_timeout: 5m

route:
  # Fallback receiver for alerts that match no child route.
  receiver: "default"
  group_by:
    - "alertname"
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  routes:
    - receiver: "critical-alerts"
      match:
        severity: critical
      # Keep evaluating the sibling routes below after this one matches.
      continue: true
    - receiver: "warning-alerts"
      match:
        severity: warning

receivers:
  - name: "default"
    webhook_configs:
      - url: "http://webhook:5000/alert"

  - name: "critical-alerts"
    webhook_configs:
      - url: "http://webhook:5000/alert/critical"
    pagerduty_configs:
      # NOTE(review): Alertmanager is not known to expand environment variables
      # in its config file - confirm this placeholder is substituted (e.g. by a
      # templating step) before the file is loaded.
      - service_key: "${PAGERDUTY_SERVICE_KEY}"
        severity: critical

  - name: "warning-alerts"
    webhook_configs:
      - url: "http://webhook:5000/alert/warning"

常用 PromQL
promql
# Request rate per series (requests per second)
rate(http_requests_total[5m])
# P50 latency - buckets are summed by `le` to get one service-wide quantile
histogram_quantile(0.5, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
# P95 latency
histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
# P99 latency
histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
# Error rate - both sides aggregated; without sum() each 5xx series is divided
# by itself and the result is always 1
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))
# CPU usage (percentage of one core)
rate(process_cpu_seconds_total[5m]) * 100
# Resident memory in MiB
process_resident_memory_bytes / 1024 / 1024
# QPS - total across all series
sum(rate(http_requests_total[5m]))
# Active connections
active_connections{type="http"}

监控最佳实践
遵循 USE 方法(Utilization、Saturation、Errors)和 RED 方法(Rate、Errors、Duration)设计指标。
总结
使用 Claude Code 搭建监控体系:
- 部署 Prometheus + Grafana
- 配置应用指标埋点
- 设置合理的告警规则
- 配置 Alertmanager 通知
- 创建直观的 Grafana 面板
- 定期巡检监控数据