Prometheus监控
指标类型
| 类型 | 描述 | 示例 |
|---|---|---|
| Counter(计数器) | 只增不减(仅在进程重启时重置为 0) | http_requests_total、errors_total |
| Gauge(仪表) | 可增可减 | memory_usage_bytes、active_connections |
| Histogram(直方图) | 将观测值按桶计数,查询时可在服务端计算分位数 | http_request_duration_seconds |
| Summary(摘要) | 客户端滑动窗口分位数计算 | rpc_duration_seconds |
PromQL 示例
# Instant vector: the most recent sample of every matching series
http_requests_total
# Instant vector narrowed by label matchers
http_requests_total{job="api", status="200"}
# Range vector: all samples from the last 5 minutes
http_requests_total[5m]
# Per-second average rate of increase over the last 5 minutes
rate(http_requests_total[5m])
# Error ratio: error rate divided by total request rate
rate(http_requests_errors_total[5m])
  / rate(http_requests_total[5m])
# P95 latency estimated from the histogram buckets
histogram_quantile(
  0.95,
  rate(http_request_duration_seconds_bucket[5m])
)
# Request rate summed per service label
sum by (service) (rate(http_requests_total[5m]))
# CPU usage in percent: 100 minus the average idle share
100 - 100 * avg(rate(node_cpu_seconds_total{mode="idle"}[5m]))
告警规则
# alerts.yml
# Prometheus alerting rules, collected in a single group named "api-alerts".
# Each rule fires only after its `expr` has held continuously for `for`.
groups:
  - name: api-alerts
    rules:
      # Error requests exceed 5% of all requests (ratio of the two
      # 5-minute per-second rates) for 5 minutes.
      # NOTE(review): the summary reads $labels.service, so the series are
      # assumed to carry a `service` label -- confirm against the exporter.
      - alert: HighErrorRate
        expr: |
          rate(http_requests_errors_total[5m])
          / rate(http_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.service }} 错误率过高"
          description: "错误率为 {{ $value | humanizePercentage }}(阈值:5%)"

      # P95 request latency, estimated from the histogram buckets, stays
      # above 2 seconds for 10 minutes.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            rate(http_request_duration_seconds_bucket[5m])
          ) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "P95 延迟过高:{{ $value }}s"

      # A scrape target's `up` series has been 0 for 1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical