第 50 章
Prometheus + Grafana 监控体系
第50章:Prometheus + Grafana 监控体系
导语
"你无法管理你无法测量的东西。" 对于 Hermes Agent 而言,这句话尤为正确。Agent 的行为比传统 Web 服务更复杂:每一步推理都消耗 Token,每次工具调用都可能失败,每个会话都有生命周期。没有完善的监控,你对系统的实际状态一无所知。本章构建完整的 Prometheus + Grafana 监控体系,覆盖从基础指标采集到生产级告警的全流程。
50.1 需要监控的关键指标
在设计监控体系之前,先明确"我们关心什么"。Hermes Agent 的监控指标分为五个维度:
mindmap
  root((Hermes 监控指标))
    性能维度
      请求延迟 P50/P95/P99
      Agent 步骤执行时间
      工具调用耗时
    可靠性维度
      请求成功率
      工具调用成功率
      错误率分类
    成本维度
      Token 消耗量
      API 调用费用
      GPU 利用率
    容量维度
      活跃会话数
      队列深度
      内存使用
    业务维度
      任务完成率
      用户满意度评分
      会话平均步骤数
核心指标定义
| 指标名称 | 类型 | 说明 | 告警阈值 |
|---|---|---|---|
| `hermes_request_duration_seconds` | Histogram | 端到端请求延迟 | P95 > 30s |
| `hermes_step_duration_seconds` | Histogram | 单步推理延迟 | P95 > 10s |
| `hermes_tool_call_total` | Counter | 工具调用次数(按工具名/状态) | - |
| `hermes_tool_call_errors_total` | Counter | 工具调用失败次数 | 错误率 > 5% |
| `hermes_tokens_total` | Counter | Token 消耗量(prompt/completion) | 日均 > 预算 |
| `hermes_active_sessions` | Gauge | 当前活跃会话数 | > 实例容量 |
| `hermes_session_steps_total` | Histogram | 每个会话的步骤数分布 | - |
| `hermes_cost_usd_total` | Counter | API 调用成本(美元) | 日均 > 预算 |
| `hermes_llm_errors_total` | Counter | LLM API 错误次数 | - |
50.2 Prometheus 指标采集实现
在 Hermes Agent 中嵌入指标
# hermes_metrics.py
import time
import functools
from typing import Callable, Optional
from prometheus_client import (
Counter, Histogram, Gauge, Summary,
generate_latest, CONTENT_TYPE_LATEST,
CollectorRegistry, start_http_server
)
# ─── Metric definitions ────────────────────────────────────────
# No registry is passed, so every metric below lands on prometheus_client's
# default registry.

# End-to-end latency of one request, in seconds.
REQUEST_DURATION = Histogram(
    name="hermes_request_duration_seconds",
    documentation="End-to-end request latency in seconds",
    labelnames=("method", "endpoint", "status"),
    buckets=(0.5, 1, 2, 5, 10, 30, 60, 120, 300),
)

# Latency of a single Agent step, in seconds.
STEP_DURATION = Histogram(
    name="hermes_step_duration_seconds",
    documentation="Duration of a single Agent reasoning step",
    labelnames=("model", "step_type"),
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30),
)

# Tool-call outcome counter; `status` is one of success / error / timeout.
TOOL_CALLS_TOTAL = Counter(
    name="hermes_tool_call_total",
    documentation="Total number of tool calls",
    labelnames=("tool_name", "status"),
)

# Tool execution time, in seconds, per tool.
TOOL_DURATION = Histogram(
    name="hermes_tool_duration_seconds",
    documentation="Tool execution duration",
    labelnames=("tool_name",),
    buckets=(0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30),
)

# Token consumption; `token_type` is prompt or completion.
TOKENS_TOTAL = Counter(
    name="hermes_tokens_total",
    documentation="Total tokens consumed",
    labelnames=("model", "token_type"),
)

# Number of sessions currently in flight.
# NOTE(review): Prometheus attaches its own `instance` target label when it
# scrapes; with the default honor_labels=false this exported label is expected
# to show up as `exported_instance` -- confirm, or rename the label.
ACTIVE_SESSIONS = Gauge(
    name="hermes_active_sessions",
    documentation="Number of currently active Agent sessions",
    labelnames=("instance",),
)

# Distribution of the number of steps per session.
# NOTE(review): the `_total` suffix is conventionally reserved for counters;
# the name is kept because rules and dashboards may already reference it.
SESSION_STEPS = Histogram(
    name="hermes_session_steps_total",
    documentation="Number of steps per completed session",
    buckets=(1, 2, 5, 10, 20, 50, 100),
)

# Cumulative API spend in USD.
COST_TOTAL = Counter(
    name="hermes_cost_usd_total",
    documentation="Cumulative API cost in USD",
    labelnames=("model", "provider"),
)

# LLM API errors; `error_type` is one of
# rate_limit / context_too_long / timeout / api_error.
LLM_ERRORS_TOTAL = Counter(
    name="hermes_llm_errors_total",
    documentation="LLM API errors",
    labelnames=("model", "error_type"),
)
# ─── Decorator: automatic tool-call tracking ───────────────────
def track_tool_call(tool_name: str):
    """Build a decorator that records outcome and latency of an async tool.

    The wrapped coroutine increments TOOL_CALLS_TOTAL exactly once per call
    with status 'success', 'timeout' or 'error', and always observes the
    elapsed wall time in TOOL_DURATION. Exceptions are re-raised unchanged.

    Args:
        tool_name: Value used for the `tool_name` label on both metrics.

    Returns:
        A decorator for `async def` callables.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import asyncio

    # Before Python 3.11 asyncio.TimeoutError (raised by asyncio.wait_for) is
    # a different class from the builtin TimeoutError; catching only the
    # builtin would book those timeouts as plain errors.
    timeout_errors = (TimeoutError, asyncio.TimeoutError)

    def decorator(func: Callable):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            start = time.monotonic()
            try:
                result = await func(*args, **kwargs)
            except timeout_errors:
                TOOL_CALLS_TOTAL.labels(tool_name=tool_name, status='timeout').inc()
                raise
            except Exception:
                TOOL_CALLS_TOTAL.labels(tool_name=tool_name, status='error').inc()
                raise
            else:
                TOOL_CALLS_TOTAL.labels(tool_name=tool_name, status='success').inc()
                return result
            finally:
                # Runs on every path, so failed calls contribute latency too.
                duration = time.monotonic() - start
                TOOL_DURATION.labels(tool_name=tool_name).observe(duration)
        return wrapper
    return decorator
# ─── Hermes Agent monitoring integration ───────────────────────
class MonitoredHermesAgent:
    """Hermes Agent wrapper that records Prometheus metrics for each session.

    run_session() maintains the active-session gauge and records per-step
    latency, token and cost counters, the steps-per-session histogram and the
    end-to-end request latency. The tool methods are instrumented with the
    track_tool_call decorator.
    """

    # Model price in USD per 1000 tokens, keyed by model name.
    PRICING = {
        "NousResearch/Hermes-3-Llama-3.1-8B": {"input": 0.0002, "output": 0.0002},
        "NousResearch/Hermes-3-Llama-3.1-70B": {"input": 0.0009, "output": 0.0009},
    }

    def __init__(self, model: str, instance_id: str = "default"):
        """Store the model name and instance id and zero the session gauge.

        Args:
            model: Model name; used as the `model` label and PRICING key.
            instance_id: Value of the `instance` label on ACTIVE_SESSIONS.
        """
        self.model = model
        self.instance_id = instance_id
        # Set the gauge explicitly so the labelled series starts at 0.
        ACTIVE_SESSIONS.labels(instance=instance_id).set(0)

    async def run_session(self, session_id: str, task: str) -> dict:
        """Run one complete Agent session and record its metrics.

        Args:
            session_id: Session identifier; not used by this method.
            task: Task passed to the agent's run_stream().

        Returns:
            Dict with "output" (content of the final step, or "" when no step
            was marked final) and "steps" (number of steps consumed).

        Raises:
            Exception: Any exception from the agent is re-raised after the
                LLM error counter and the error-status latency are recorded.
        """
        ACTIVE_SESSIONS.labels(instance=self.instance_id).inc()
        start = time.monotonic()
        steps = 0
        try:
            from hermes import HermesAgent, AgentConfig
            agent = HermesAgent(AgentConfig(model=self.model))
            result = {"output": "", "steps": 0}
            pricing = self.PRICING.get(self.model)

            # The clock must start BEFORE the stream produces a step: the time
            # spent waiting for the next item is the step's duration. Starting
            # it inside the loop body would only time the bookkeeping below.
            step_start = time.monotonic()
            async for step_result in agent.run_stream(task):
                step_duration = time.monotonic() - step_start
                steps += 1

                # Token usage is optional; tolerate a missing or None value.
                usage = getattr(step_result, 'usage', None)
                if usage is not None:
                    pt = usage.prompt_tokens
                    ct = usage.completion_tokens
                    TOKENS_TOTAL.labels(
                        model=self.model, token_type='prompt'
                    ).inc(pt)
                    TOKENS_TOTAL.labels(
                        model=self.model, token_type='completion'
                    ).inc(ct)
                    # Cost is only tracked for models with a known price.
                    if pricing is not None:
                        cost = (pt / 1000 * pricing["input"] +
                                ct / 1000 * pricing["output"])
                        COST_TOTAL.labels(
                            model=self.model, provider='nous_research'
                        ).inc(cost)

                STEP_DURATION.labels(
                    model=self.model,
                    step_type=step_result.step_type  # 'reasoning'/'tool_call'/'final'
                ).observe(step_duration)

                if step_result.is_final:
                    result["output"] = step_result.content
                    break

                # Restart the clock for the next step.
                step_start = time.monotonic()

            result["steps"] = steps
            SESSION_STEPS.observe(steps)

            # End-to-end latency of the successful session.
            total_duration = time.monotonic() - start
            REQUEST_DURATION.labels(
                method='POST', endpoint='/run', status='success'
            ).observe(total_duration)
            return result
        except Exception as e:
            # Classify the failure, count it and record the latency as an error.
            error_type = self._classify_error(e)
            LLM_ERRORS_TOTAL.labels(model=self.model, error_type=error_type).inc()
            total_duration = time.monotonic() - start
            REQUEST_DURATION.labels(
                method='POST', endpoint='/run', status='error'
            ).observe(total_duration)
            raise
        finally:
            ACTIVE_SESSIONS.labels(instance=self.instance_id).dec()

    def _classify_error(self, error: Exception) -> str:
        """Map an exception to an `error_type` label by matching its message.

        Returns 'rate_limit', 'context_too_long', 'timeout' or, when nothing
        matches, 'api_error'. Matching is case-insensitive on str(error).
        """
        error_str = str(error).lower()
        if 'rate limit' in error_str:
            return 'rate_limit'
        if 'context' in error_str and 'length' in error_str:
            return 'context_too_long'
        if 'timeout' in error_str:
            return 'timeout'
        return 'api_error'

    @track_tool_call("web_search")
    async def web_search(self, query: str) -> dict:
        """Monitored web-search tool (placeholder; returns None for now)."""
        # Actual search implementation goes here.
        pass

    @track_tool_call("code_exec")
    async def code_exec(self, code: str) -> dict:
        """Monitored code-execution tool (placeholder; returns None for now)."""
        # Actual code-execution implementation goes here.
        pass
# ─── FastAPI 指标端点 ──────────────────────────────────────────
from fastapi import FastAPI, Response
app = FastAPI()


@app.get("/metrics")
async def metrics():
    """Endpoint that Prometheus scrapes; returns the serialised metrics."""
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)
50.3 Prometheus 配置文件
# prometheus.yml
global:
  scrape_interval: 15s       # scrape targets every 15 seconds
  evaluation_interval: 15s   # evaluate alerting/recording rules every 15 seconds
  external_labels:
    cluster: 'prod'
    region: 'us-east-1'

# ─── Alertmanager ──────────────────────────────────────────────
# Without this section the alert rules are evaluated but firing alerts are
# never delivered to Alertmanager. The address below is an assumption that
# follows the naming of the other targets -- adjust it to your deployment.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# ─── Rule files ────────────────────────────────────────────────
rule_files:
  - "hermes_alerts.yml"
  - "hermes_recording_rules.yml"

# ─── Scrape configuration ──────────────────────────────────────
scrape_configs:
  # Hermes Agent instances
  - job_name: 'hermes-agent'
    scrape_interval: 10s
    metrics_path: '/metrics'
    static_configs:
      - targets:
          - '10.0.1.10:8000'
          - '10.0.1.11:8000'
          - '10.0.1.12:8000'
        labels:
          service: 'hermes-agent'
          environment: 'production'
    # Use service discovery instead when running on Kubernetes
    # kubernetes_sd_configs:
    #   - role: pod
    # relabel_configs:
    #   - source_labels: [__meta_kubernetes_pod_label_app]
    #     action: keep
    #     regex: hermes-agent

  # Redis (redis_exporter)
  - job_name: 'redis'
    static_configs:
      - targets: ['redis:9121']
        labels:
          service: 'redis'

  # Nginx (nginx-prometheus-exporter)
  - job_name: 'nginx'
    static_configs:
      - targets: ['nginx:9113']
        labels:
          service: 'nginx'

  # Node Exporter (host metrics)
  - job_name: 'node'
    static_configs:
      - targets:
          - '10.0.1.10:9100'
          - '10.0.1.11:9100'
          - '10.0.1.12:9100'

  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

# ─── Remote write (optional, for long-term storage) ────────────
# remote_write:
#   - url: "https://prometheus-remote-write.example.com/api/v1/push"
#     basic_auth:
#       username: "user"
#       password: "pass"
预计算规则(Recording Rules)
# hermes_recording_rules.yml
# Precomputed series used by the dashboards and alerts; nesting restored so
# the file parses as a rule group.
groups:
  - name: hermes_recording_rules
    interval: 30s
    rules:
      # Tool-call success ratio per tool over the last 5 minutes
      - record: hermes:tool_success_rate:5m
        expr: |
          sum(rate(hermes_tool_call_total{status="success"}[5m])) by (tool_name)
          /
          sum(rate(hermes_tool_call_total[5m])) by (tool_name)

      # P95 request latency per endpoint over the last 5 minutes
      - record: hermes:request_duration_p95:5m
        expr: |
          histogram_quantile(0.95,
          sum(rate(hermes_request_duration_seconds_bucket[5m])) by (le, endpoint)
          )

      # Token consumption per minute (per-second rate * 60)
      - record: hermes:tokens_per_minute
        expr: |
          sum(rate(hermes_tokens_total[1m])) by (model, token_type) * 60

      # Cost per hour in USD (per-second rate * 3600)
      - record: hermes:cost_per_hour_usd
        expr: |
          sum(rate(hermes_cost_usd_total[1h])) by (model) * 3600
50.4 告警规则配置
# hermes_alerts.yml
groups:
  - name: hermes_performance
    rules:
      # High latency (warning)
      - alert: HermesHighLatency
        expr: |
          histogram_quantile(0.95,
          sum(rate(hermes_request_duration_seconds_bucket[5m])) by (le)
          ) > 30
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Hermes Agent P95 延迟过高"
          description: "P95 请求延迟 {{ $value | humanizeDuration }} 超过 30 秒,持续 5 分钟"
          runbook: "https://wiki.example.com/runbooks/hermes-high-latency"

      # Critical latency
      - alert: HermesCriticalLatency
        expr: |
          histogram_quantile(0.99,
          sum(rate(hermes_request_duration_seconds_bucket[5m])) by (le)
          ) > 120
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Hermes Agent P99 延迟严重超标"
          description: "P99 延迟超过 2 分钟,需立即介入"

  - name: hermes_reliability
    rules:
      # Tool-call failure ratio. The counter's status label is one of
      # success / error / timeout; every non-success outcome is a failure, so
      # timeouts are included here to stay consistent with the success rate.
      - alert: HermesToolErrorRate
        expr: |
          (
          sum(rate(hermes_tool_call_total{status!="success"}[5m])) by (tool_name)
          /
          sum(rate(hermes_tool_call_total[5m])) by (tool_name)
          ) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "工具 {{ $labels.tool_name }} 错误率超过 5%"
          description: "工具调用错误率: {{ $value | humanizePercentage }}"

      # Spike in LLM API errors (more than 0.5 errors per second per type)
      - alert: HermesLLMErrorSpike
        expr: |
          sum(rate(hermes_llm_errors_total[5m])) by (error_type) > 0.5
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "LLM API 错误类型 {{ $labels.error_type }} 发生突增"
          description: "每秒错误率: {{ $value }}"

      # Too many active sessions (capacity warning)
      - alert: HermesCapacityWarning
        expr: |
          sum(hermes_active_sessions) > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Hermes Agent 活跃会话数接近容量上限"
          description: "当前 {{ $value }} 个活跃会话"

  - name: hermes_cost
    rules:
      # Hourly cost over budget (uses the hermes:cost_per_hour_usd recording rule)
      - alert: HermesCostOverrun
        expr: |
          sum(hermes:cost_per_hour_usd) > 100
        for: 15m
        labels:
          severity: warning
          team: finance
        annotations:
          summary: "Hermes Agent 小时 API 成本超过 $100"
          description: "当前小时成本: ${{ $value }}"

      # Abnormal token spike (possibly prompt injection or a bug):
      # current 5m rate is more than twice its 1-hour average
      - alert: HermesTokenSpike
        expr: |
          sum(rate(hermes_tokens_total[5m]))
          > 2 * avg_over_time(sum(rate(hermes_tokens_total[5m]))[1h:5m])
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Token 消耗量异常激增(超过1小时均值2倍)"

  - name: hermes_infrastructure
    rules:
      # Agent instance down
      - alert: HermesInstanceDown
        expr: up{job="hermes-agent"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Hermes Agent 实例 {{ $labels.instance }} 宕机"

      # Redis exporter unreachable
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis 实例 {{ $labels.instance }} 不可达,Agent 状态存储失效"
50.5 完整 Grafana Dashboard JSON
{
"title": "Hermes Agent 监控总览",
"uid": "hermes-agent-overview",
"tags": ["hermes", "agent", "ai"],
"timezone": "browser",
"refresh": "30s",
"time": {
"from": "now-3h",
"to": "now"
},
"panels": [
{
"id": 1,
"title": "活跃会话数",
"type": "stat",
"gridPos": {"x": 0, "y": 0, "w": 4, "h": 4},
"targets": [{
"expr": "sum(hermes_active_sessions)",
"legendFormat": "活跃会话"
}],
"options": {
"reduceOptions": {"calcs": ["lastNotNull"]},
"colorMode": "background",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 100},
{"color": "red", "value": 200}
]
}
}
},
{
"id": 2,
"title": "请求延迟 (P50/P95/P99)",
"type": "timeseries",
"gridPos": {"x": 4, "y": 0, "w": 10, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(hermes_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "P50"
},
{
"expr": "histogram_quantile(0.95, sum(rate(hermes_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "P95"
},
{
"expr": "histogram_quantile(0.99, sum(rate(hermes_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "P99"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 10},
{"color": "red", "value": 30}
]
}
}
}
},
{
"id": 3,
"title": "工具调用成功率",
"type": "timeseries",
"gridPos": {"x": 0, "y": 8, "w": 8, "h": 6},
"targets": [{
"expr": "hermes:tool_success_rate:5m * 100",
"legendFormat": "{{ tool_name }}"
}],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
"id": 4,
"title": "Token 消耗速率",
"type": "timeseries",
"gridPos": {"x": 8, "y": 8, "w": 8, "h": 6},
"targets": [
{
"expr": "sum(rate(hermes_tokens_total{token_type='prompt'}[5m])) by (model) * 60",
"legendFormat": "Prompt - {{ model }}"
},
{
"expr": "sum(rate(hermes_tokens_total{token_type='completion'}[5m])) by (model) * 60",
"legendFormat": "Completion - {{ model }}"
}
],
"fieldConfig": {
"defaults": {"unit": "tokens/min"}
}
},
{
"id": 5,
"title": "累计 API 成本 (近 24 小时)",
"type": "stat",
"gridPos": {"x": 16, "y": 0, "w": 4, "h": 4},
"targets": [{
"expr": "sum(increase(hermes_cost_usd_total[24h]))",
"legendFormat": "近 24 小时成本"
}],
"fieldConfig": {
"defaults": {"unit": "currencyUSD"}
}
},
{
"id": 6,
"title": "LLM 错误分类",
"type": "piechart",
"gridPos": {"x": 16, "y": 4, "w": 8, "h": 8},
"targets": [{
"expr": "sum(increase(hermes_llm_errors_total[1h])) by (error_type)",
"legendFormat": "{{ error_type }}"
}]
}
]
}
50.6 告警通知配置(Alertmanager)
# alertmanager.yml
global:
  slack_api_url: 'https://hooks.slack.com/services/T.../B.../...'
  smtp_smarthost: 'smtp.example.com:587'
  # The sender address was garbled by e-mail obfuscation in the original text;
  # this value is a placeholder -- replace it with your real sender address.
  smtp_from: 'alertmanager@example.com'

route:
  group_by: ['alertname', 'cluster']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default'
  routes:
    # Cost alerts go to finance; matching stops here (no `continue`), so they
    # are not forwarded to the routes below.
    - match:
        team: finance
      receiver: 'finance-slack'
    # Critical alerts page via PagerDuty, then keep matching
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true
    # Remaining warning/critical alerts go to the platform Slack channel;
    # alerts without a matching severity fall back to the 'default' receiver.
    - match_re:
        severity: (warning|critical)
      receiver: 'platform-slack'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#hermes-alerts'
        title: '{{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'platform-slack'
    slack_configs:
      - channel: '#platform-alerts'
        send_resolved: true
        title: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *告警*: {{ .Annotations.summary }}
          *详情*: {{ .Annotations.description }}
          *Runbook*: {{ .Annotations.runbook }}
          {{ end }}

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'your_pagerduty_key'
        # At the top level of a notification template only group-wide fields
        # exist; per-alert `.Annotations` is available only inside
        # `range .Alerts`, so the shared annotations are used here.
        description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'

  - name: 'finance-slack'
    slack_configs:
      - channel: '#finance-alerts'
        title: '成本告警: {{ .GroupLabels.alertname }}'
小结
本章构建了 Hermes Agent 的完整 Prometheus + Grafana 监控体系:
- 五维指标体系:性能(延迟)、可靠性(成功率/错误率)、成本(Token/费用)、容量(会话数)、业务(完成率)。
- 代码级集成:通过装饰器(`track_tool_call`)和 Agent 包装类(`MonitoredHermesAgent`)在核心代码中透明嵌入指标采集。
- Prometheus 配置:包含静态抓取配置(附 Kubernetes 服务发现的注释示例)、预计算规则(减少查询压力)。
- 多级告警:警告级 → 关键级,按团队路由(平台/财务),结合 Alertmanager + Slack + PagerDuty。
- Grafana Dashboard:覆盖关键指标的可视化面板,支持快速定位问题。
思考题
- 对于 LLM 推理成本,除了按 Token 计费,还有哪些维度的成本值得监控?
- 如果 Hermes Agent 部署在用户私有环境(无法连接外部 Prometheus),应如何设计监控方案?
- 工具调用成功率的"成功"如何定义?HTTP 200 是否就等于业务成功?
- 告警疲劳(Alert Fatigue)是监控体系的常见问题,如何设计抑制规则(Inhibit Rules)避免告警风暴?