第 75 章
模型行为调优:Opus 4.7 字面执行特性 / 默认风格覆盖 / Subagent 频率控制
第七十五章:Prompt 评估体系:自动化测试、人工评估与 LLM-as-Judge
75.1 为什么需要系统化的评估体系
"这个 Prompt 好不好"是一个无法凭直觉可靠回答的问题。人类的直觉偏差、样本选择偏差、以及 LLM 输出的高方差特性,使得主观判断极不可靠。没有系统化评估体系的 Prompt 工程,本质上是在黑暗中摸索。
一个完整的 Prompt 评估体系包含三个相互补充的层次:
- 自动化测试(Automated Testing):基于精确匹配、正则表达式、或结构化验证的确定性评估,速度快、成本低,适合持续集成
- 人工评估(Human Evaluation):由人类评审员对输出质量进行主观判断,质量高但成本昂贵且不可扩展
- LLM-as-Judge:使用语言模型代替人类进行评估,结合了自动化的规模优势和接近人工质量的判断能力
三者不是替代关系,而是分工协作:自动化测试把守基本质量门槛,LLM-as-Judge 负责主观质量维度,人工评估作为最终仲裁和黄金标准校准。
75.2 自动化测试
75.2.1 确定性评估指标
对于有明确正确答案的任务,确定性评估是最可靠的方法:
import re
import json
from typing import Callable
class AutomatedEvaluator:
    """Deterministic scoring helpers for model outputs.

    Each method compares a model output (``prediction``) against some
    expectation and returns a float score where 1.0 means the check fully
    passed and 0.0 means it fully failed.
    """

    # JSON-Schema "type" names mapped to the Python types that satisfy them.
    _JSON_TYPES = {
        "string": str,
        "number": (int, float),
        "integer": int,
        "boolean": bool,
        "array": list,
        "object": dict,
    }

    def exact_match(self, prediction: str, ground_truth: str) -> float:
        """Return 1.0 if both strings are equal after strip + lowercase, else 0.0."""
        pred = prediction.strip().lower()
        truth = ground_truth.strip().lower()
        return 1.0 if pred == truth else 0.0

    def contains_match(self, prediction: str, expected_phrases: list) -> float:
        """Return the fraction of expected phrases found in the prediction.

        Matching is case-insensitive substring search. An empty
        ``expected_phrases`` list scores 0.0.
        """
        if not expected_phrases:
            return 0.0
        pred_lower = prediction.lower()
        matches = sum(1 for phrase in expected_phrases if phrase.lower() in pred_lower)
        return matches / len(expected_phrases)

    @staticmethod
    def _extract_json_object(text: str):
        """Return the first decodable JSON object embedded in ``text``, or None.

        Each ``{`` is tried in turn as a start position. ``raw_decode`` stops
        at the end of the object, so surrounding prose (even prose containing
        braces) does not break parsing the way a greedy ``{.*}`` regex does.
        """
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                start = text.find("{", start + 1)
                continue
            return parsed
        return None

    @classmethod
    def _matches_type(cls, value, expected_type) -> bool:
        """Check ``value`` against a JSON-Schema type name.

        Unknown, missing or non-string type declarations are not enforced.
        """
        if not isinstance(expected_type, str):
            return True
        python_type = cls._JSON_TYPES.get(expected_type)
        if python_type is None:
            return True
        # bool is a subclass of int in Python, but JSON true/false must not
        # satisfy "number" or "integer".
        if isinstance(value, bool) and expected_type != "boolean":
            return False
        if expected_type == "integer" and isinstance(value, float):
            return value.is_integer()
        return isinstance(value, python_type)

    def json_schema_validation(self, prediction: str, schema: dict) -> float:
        """Score how well a JSON object embedded in the prediction fits ``schema``.

        Only the ``required`` list and the per-property ``type`` entries of
        the schema are checked.

        Returns:
            1.0 if a JSON object is found, all required fields are present
            and every present property has the declared type;
            0.5 if a JSON object is found but one of those checks fails;
            0.0 if no JSON object can be decoded from the prediction.
        """
        parsed = self._extract_json_object(prediction)
        if parsed is None:
            return 0.0

        fields_present = all(
            field in parsed for field in schema.get("required", [])
        )
        type_correct = all(
            self._matches_type(parsed[field], field_schema.get("type"))
            for field, field_schema in schema.get("properties", {}).items()
            if field in parsed
        )
        return 1.0 if (fields_present and type_correct) else 0.5

    def regex_match(self, prediction: str, pattern: str) -> float:
        """Return 1.0 if ``pattern`` matches anywhere in the prediction (case-insensitive)."""
        return 1.0 if re.search(pattern, prediction, re.IGNORECASE) else 0.0

    def length_check(self, prediction: str, min_words: int = 0, max_words: float = float('inf')) -> float:
        """Score the prediction's word count against ``[min_words, max_words]``.

        Words are whitespace-separated tokens, so text written without spaces
        (e.g. Chinese) counts as very few words.

        Returns 1.0 inside the range; otherwise a ratio: ``count / min_words``
        when too short, ``max_words / count`` when too long.
        """
        word_count = len(prediction.split())
        if min_words <= word_count <= max_words:
            return 1.0
        if word_count < min_words:
            return word_count / min_words
        return max_words / word_count
75.2.2 测试套件的设计
class PromptTestSuite:
    """Runs a system prompt against a list of test cases and scores the outputs.

    Each test case is sent to the model as a single user message with the
    suite's prompt as the system prompt; the reply is then scored by the
    checks attached to that case.
    """

    # Minimum score for a single check, or for a whole case, to count as passed.
    PASS_THRESHOLD = 0.8
    # Value passed as ``max_tokens`` for every test-case request.
    MAX_TOKENS = 1024

    def __init__(self, prompt: str, model: str = "claude-opus-4-5"):
        self.prompt = prompt
        self.model = model
        self.test_cases = []
        self.evaluator = AutomatedEvaluator()

    def add_test_case(
        self,
        input_text: str,
        expected_output: str = None,
        checks: list = None,
        tags: list = None
    ):
        """Register one test case.

        Args:
            input_text: User message sent to the model.
            expected_output: Reference answer. It is only consulted by
                ``exact_match`` checks that do not carry their own
                ``"expected"`` value.
            checks: List of check specifications, each a dict such as
                {"type": "exact_match", "expected": "..."},
                {"type": "contains", "phrases": [...]},
                {"type": "json_schema", "schema": {...}},
                {"type": "regex", "pattern": "..."},
                {"type": "length", "min": 10, "max": 500},
                {"type": "custom", "fn": lambda output: float}
            tags: Free-form labels copied into the case's result.
        """
        self.test_cases.append({
            "input": input_text,
            "expected": expected_output,
            "checks": checks or [],
            "tags": tags or []
        })

    @staticmethod
    def _first_text(response) -> str:
        """Return the text of the first content block, or "" if there is none."""
        blocks = response.content
        if not blocks:
            return ""
        return getattr(blocks[0], "text", "")

    def run(self, client, verbose: bool = False) -> dict:
        """Execute every test case and return a summary report.

        Args:
            client: Object exposing ``messages.create(model=..., max_tokens=...,
                system=..., messages=...)``.
            verbose: When true, print a PASS/FAIL line per case.

        Returns:
            Dict with ``total_cases``, ``passed``, ``failed``, ``pass_rate``,
            ``avg_score`` and the per-case ``results`` list. A case without
            checks scores 1.0.
        """
        results = []
        for i, case in enumerate(self.test_cases):
            response = client.messages.create(
                model=self.model,
                max_tokens=self.MAX_TOKENS,
                system=self.prompt,
                messages=[{"role": "user", "content": case["input"]}]
            )
            output = self._first_text(response)

            # The "expected" key is always stored (possibly as None), so a
            # .get() default would never apply; normalise None to "" here.
            expected = case.get("expected") or ""

            check_scores = []
            check_details = []
            for check in case["checks"]:
                score = self._run_check(check, output, expected)
                check_scores.append(score)
                check_details.append({
                    "type": check["type"],
                    "score": score,
                    "passed": score >= self.PASS_THRESHOLD
                })

            case_score = sum(check_scores) / len(check_scores) if check_scores else 1.0
            result = {
                "case_index": i,
                "input": case["input"],
                "output": output,
                "score": case_score,
                "passed": case_score >= self.PASS_THRESHOLD,
                "check_details": check_details,
                "tags": case.get("tags", [])
            }
            results.append(result)
            if verbose:
                status = "PASS" if result["passed"] else "FAIL"
                print(f"[{status}] Case {i+1}: {case_score:.2f}")

        # Aggregate statistics over all cases.
        total = len(results)
        passed = sum(1 for r in results if r["passed"])
        return {
            "total_cases": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": passed / total if total > 0 else 0,
            "avg_score": sum(r["score"] for r in results) / total if total > 0 else 0,
            "results": results
        }

    def _run_check(self, check: dict, output: str, expected: str) -> float:
        """Dispatch one check specification to the evaluator.

        Unknown check types score 0.0.
        """
        check_type = check["type"]
        if check_type == "exact_match":
            # A check-level "expected" overrides the case-level one; a missing
            # or None value falls back to the empty string.
            target = check.get("expected", expected)
            return self.evaluator.exact_match(output, "" if target is None else target)
        if check_type == "contains":
            return self.evaluator.contains_match(output, check["phrases"])
        if check_type == "json_schema":
            return self.evaluator.json_schema_validation(output, check["schema"])
        if check_type == "regex":
            return self.evaluator.regex_match(output, check["pattern"])
        if check_type == "length":
            return self.evaluator.length_check(output, check.get("min", 0), check.get("max", float('inf')))
        if check_type == "custom":
            return check["fn"](output)
        return 0.0
75.2.3 持续集成中的 Prompt 测试
# .github/workflows/prompt-ci.yml
# Runs the prompt test script on pull requests.
name: Prompt CI
on:
pull_request:
# Trigger only when prompt files or prompt tests are touched.
paths:
- 'prompts/**'
- 'tests/prompt_tests/**'
jobs:
test-prompts:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Run Prompt Tests
env:
# The API key is taken from the repository secrets.
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
# The script receives a minimum pass rate and a maximum regression rate.
run: |
python tests/run_prompt_tests.py \
--min-pass-rate 0.85 \
--max-regression-rate 0.05
75.3 人工评估
75.3.1 人工评估的适用场景
人工评估无法被完全自动化替代,尤其在以下情况下:
- 主观质量维度:风格是否符合品牌调性、回答是否有温度感、创意是否足够
- 新任务基准建立:在 LLM-as-Judge 能可靠工作之前,需要人工构建黄金标准
- 安全性评估:有害内容、偏见检测,需要经过训练的人工审核员
- 专业领域准确性:医疗、法律、金融建议的专业准确性需要领域专家评审
75.3.2 标注指南设计
清晰的标注指南是人工评估一致性的基础:
## 客服回复质量评估指南
### 评分维度
**1. 问题解决性(1-5分)**
- 5分:完全准确地解决了用户的具体问题
- 4分:基本解决了问题,但有轻微遗漏
- 3分:部分解决了问题,用户可能仍需追问
- 2分:回答方向正确但信息不足以解决问题
- 1分:未解决问题或回答与问题无关
**2. 语气与专业性(1-5分)**
- 5分:温暖专业,让用户感到被重视
- 4分:专业礼貌,无明显问题
- 3分:中性,不温不火
- 2分:略显生硬或过于正式
- 1分:不专业或让用户感到不舒适
**3. 信息准确性(1-5分)**
- 5分:所有信息完全准确
- 4分:主要信息准确,细节有小误
- 3分:大部分准确,有一处明显错误
- 2分:多处信息错误
- 1分:信息严重错误或具有误导性
### 边界情况处理
- 当用户情绪激动时,语气分应额外加权
- 当问题涉及账单/退款时,准确性分加权
75.3.3 评估者间一致性
from itertools import combinations
import numpy as np
def compute_inter_rater_agreement(ratings: dict) -> dict:
    """Compute pairwise agreement statistics between raters.

    Note: this is not Cohen's Kappa. For every pair of raters it reports the
    Pearson correlation, the mean absolute error and the exact-agreement rate
    of their scores.

    Args:
        ratings: {"rater_a": [scores], "rater_b": [scores], ...}. All score
            lists must have the same length and be index-aligned.

    Returns:
        {"<r1>_vs_<r2>": {"correlation", "mae", "exact_agreement",
        "acceptable"}} with plain Python floats and bools. ``correlation`` is
        NaN when it is undefined (fewer than two items, or a rater whose
        scores never vary).

    Raises:
        ValueError: If two raters scored a different number of items.
    """
    nan = float("nan")
    agreements = {}
    for r1, r2 in combinations(list(ratings.keys()), 2):
        scores1 = np.asarray(ratings[r1], dtype=float)
        scores2 = np.asarray(ratings[r2], dtype=float)
        if scores1.shape != scores2.shape:
            raise ValueError(
                f"raters {r1!r} and {r2!r} scored a different number of items"
            )

        if scores1.size == 0:
            mae = nan
            exact_agreement = nan
        else:
            mae = float(np.mean(np.abs(scores1 - scores2)))
            exact_agreement = float(np.mean(scores1 == scores2))

        # Pearson correlation divides by both standard deviations; skip the
        # call when it would only produce NaN plus a runtime warning.
        if scores1.size >= 2 and scores1.std() > 0 and scores2.std() > 0:
            correlation = float(np.corrcoef(scores1, scores2)[0, 1])
        else:
            correlation = nan

        # Identical ratings are acceptable even when the correlation is
        # undefined (e.g. both raters gave the same constant score).
        acceptable = bool(
            mae < 0.8 and (correlation > 0.7 or exact_agreement == 1.0)
        )

        agreements[f"{r1}_vs_{r2}"] = {
            "correlation": correlation,
            "mae": mae,
            "exact_agreement": exact_agreement,
            "acceptable": acceptable,
        }
    return agreements
75.4 LLM-as-Judge
75.4.1 核心概念与适用场景
LLM-as-Judge 是指使用语言模型(通常是能力较强的模型,如 Claude Opus)来评估另一个模型的输出质量。
适用场景:
- 需要主观判断但有清晰标准的任务(写作流畅性、逻辑连贯性)
- 人工评估成本过高的大规模评估
- 开发迭代过程中的快速反馈循环
核心挑战:
- 位置偏见(Position Bias):模型倾向于给第一个或最后一个选项更高评分
- 冗长偏见(Verbosity Bias):模型倾向于偏爱更长的回答
- 自我偏见(Self-Enhancement Bias):模型倾向于偏爱与自己风格相似的输出
- 谄媚偏见(Sycophancy Bias):当被追问或质疑时,模型倾向于改变评分以迎合提问者
75.4.2 高质量 Judge Prompt 设计
from anthropic import Anthropic
client = Anthropic()
JUDGE_SYSTEM_PROMPT = """你是一位客观、严格的 AI 输出质量评审员。
你的评估必须:
1. 基于提供的评估标准,而非个人偏好
2. 保持一致性:相同质量的输出应获得相同的评分
3. 提供可追溯的理由:每个评分都需要具体的文本证据支撑
4. 抵抗以下偏见:
- 不因回答更长就给更高分
- 不因位置(A还是B先展示)而影响判断
- 不因语气讨好而忽视实质内容
重要:在开始评分之前,先完整阅读所有待评估的输出。"""
def llm_judge_single(
    task_description: str,
    user_input: str,
    model_output: str,
    evaluation_criteria: dict
) -> dict:
    """Ask the judge model to score a single model output.

    Args:
        task_description: Description of the task the output was produced for.
        user_input: The user message that was given to the evaluated model.
        model_output: The reply to be judged.
        evaluation_criteria: Mapping of dimension name to a dict that provides
            at least ``"weight"`` and ``"description"``.

    Returns:
        Whatever ``parse_judge_response`` produces from the judge's raw reply.
    """
    # One bullet line per evaluation dimension.
    dimension_lines = []
    for dimension, spec in evaluation_criteria.items():
        dimension_lines.append(
            f"- {dimension}(权重 {spec['weight']}):{spec['description']}"
        )
    criteria_text = "\n".join(dimension_lines)

    # Per-dimension placeholder entries for the JSON skeleton in <scores>.
    score_skeleton = ", ".join(
        f'"{dimension}": {{"score": 1-10, "reasoning": "..."}}'
        for dimension in evaluation_criteria
    )

    judge_prompt = f"""请评估以下 AI 回复的质量。
任务描述:{task_description}
用户输入:
{user_input}
AI 回复:
{model_output}
评估维度:
{criteria_text}
请按以下格式输出评估结果:
<evaluation>
对每个维度进行分析:
[对每个维度的具体分析,引用回复中的具体文本作为证据]
</evaluation>
<scores>
{{
{score_skeleton}
}}
</scores>
<overall>
{{
"weighted_score": 综合加权分(0-10),
"verdict": "excellent/good/acceptable/poor/unacceptable",
"key_strength": "最突出的优点",
"key_weakness": "最需要改进的方面"
}}
</overall>"""

    user_turn = {"role": "user", "content": judge_prompt}
    judge_reply = client.messages.create(
        model="claude-opus-4-5",
        max_tokens=2000,
        system=JUDGE_SYSTEM_PROMPT,
        messages=[user_turn]
    )
    raw_text = judge_reply.content[0].text
    return parse_judge_response(raw_text)
75.4.3 对比评估(Pairwise Comparison)
对比评估比绝对评分更可靠,因为它减少了校准偏差:
def llm_judge_pairwise(
    task_description: str,
    user_input: str,
    output_a: str,
    output_b: str,
    evaluation_criteria: dict,
    randomize_order: bool = True
) -> dict:
    """Ask the judge model which of two outputs is better.

    Args:
        task_description: Description of the task both outputs address.
        user_input: The user message given to the evaluated model(s).
        output_a: First candidate reply.
        output_b: Second candidate reply.
        evaluation_criteria: Mapping of dimension name to a dict that provides
            at least ``"description"``.
        randomize_order: When true, the two replies are shown in swapped
            order with probability one half, to reduce position bias.

    Returns:
        The dict produced by ``parse_pairwise_response``. Its ``"winner"`` is
        mapped back so that "A"/"B" refer to the caller's ``output_a`` /
        ``output_b`` regardless of the order shown to the judge.
    """
    import random

    # Short-circuit: the RNG is only consulted when randomisation is enabled.
    swapped = bool(randomize_order and random.random() > 0.5)
    if swapped:
        output_a, output_b = output_b, output_a

    criteria_text = "\n".join(
        f"- {dimension}:{spec['description']}"
        for dimension, spec in evaluation_criteria.items()
    )

    judge_prompt = f"""请比较以下两个 AI 回复,判断哪个更好。
任务描述:{task_description}
用户输入:
{user_input}
回复 A:
{output_a}
回复 B:
{output_b}
评估维度:
{criteria_text}
重要提示:
- 不要因为回复更长就认为更好
- 只基于内容质量做判断,忽略位置顺序
- 如果两者质量相当,选择 "tie"
请输出:
<comparison>
对比分析两个回复在各维度上的差异
</comparison>
<verdict>
{{
"winner": "A" 或 "B" 或 "tie",
"confidence": "high/medium/low",
"reasoning": "核心理由(1-2句话)"
}}
</verdict>"""

    judge_reply = client.messages.create(
        model="claude-opus-4-5",
        max_tokens=1500,
        system=JUDGE_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": judge_prompt}]
    )
    verdict = parse_pairwise_response(judge_reply.content[0].text)

    # Translate the label back to the caller's original ordering.
    if swapped:
        shown_winner = verdict.get("winner")
        if shown_winner == "A":
            verdict["winner"] = "B"
        elif shown_winner == "B":
            verdict["winner"] = "A"
    return verdict
75.4.4 偏见校准
def calibrate_judge_bias(judge_fn, calibration_set: list) -> dict:
    """Measure a pairwise judge's accuracy and position bias on known pairs.

    Every calibration item is judged twice, once with the better output in
    position A and once in position B, so any difference in accuracy between
    the two placements can be attributed to position.

    Args:
        judge_fn: Callable ``judge_fn(input, output_a, output_b)`` returning a
            dict with a ``"winner"`` of "A", "B" or something else (e.g.
            "tie"). It is called with the outputs in a fixed order.
        calibration_set: List of dicts of the form
            {
                "input": ...,
                "output_better": ...,  # output confirmed better by humans
                "output_worse": ...,   # output confirmed worse by humans
            }

    Returns:
        Dict with ``accuracy_when_better_is_a``, ``accuracy_when_better_is_b``,
        ``position_bias`` (absolute difference of the two), ``overall_accuracy``
        (their mean) and ``bias_detected`` (position_bias > 0.1).

    Raises:
        ValueError: If ``calibration_set`` is empty; no rates can be computed.
    """
    if not calibration_set:
        raise ValueError("calibration_set must contain at least one item")

    correct_as_a = 0
    correct_as_b = 0
    for item in calibration_set:
        # Pass 1: the better output is shown in position A.
        result_a = judge_fn(item["input"], item["output_better"], item["output_worse"])
        # Pass 2: the better output is shown in position B.
        result_b = judge_fn(item["input"], item["output_worse"], item["output_better"])
        correct_as_a += result_a.get("winner") == "A"
        correct_as_b += result_b.get("winner") == "B"

    total = len(calibration_set)
    position_a_preference = correct_as_a / total
    position_b_preference = correct_as_b / total
    position_bias = abs(position_a_preference - position_b_preference)
    return {
        "accuracy_when_better_is_a": position_a_preference,
        "accuracy_when_better_is_b": position_b_preference,
        "position_bias": position_bias,
        "overall_accuracy": (position_a_preference + position_b_preference) / 2,
        "bias_detected": position_bias > 0.1
    }
75.5 评估体系的架构设计
75.5.1 三层评估流水线
输入测试集
↓
[第一层] 自动化测试(格式检查、基本正确性)
↓ 通过率 < 80%? → 触发告警,阻断部署
↓
[第二层] LLM-as-Judge(主观质量、多维度评分)
↓ 得分下降 > 5%? → 触发人工审查
↓
[第三层] 人工评估(边界案例、安全性、新场景)
↓
评估报告 → 优化决策
75.5.2 评估指标仪表盘
class EvaluationDashboard:
    """In-memory history of evaluation metrics, keyed by prompt version."""

    def __init__(self):
        # One dict per recorded evaluation, in insertion order.
        self.metrics_history = []

    def record_evaluation(
        self,
        prompt_version: str,
        auto_test_results: dict,
        llm_judge_results: dict,
        timestamp: str
    ):
        """Append one evaluation run to the history.

        Args:
            prompt_version: Identifier of the evaluated prompt version.
            auto_test_results: Dict read for ``"pass_rate"`` and
                ``"avg_score"``; missing keys are stored as 0.
            llm_judge_results: Dict read for ``"overall_score"`` (default 0)
                and ``"dimension_scores"`` (default {}).
            timestamp: Stored as given.
        """
        self.metrics_history.append({
            "version": prompt_version,
            "timestamp": timestamp,
            "auto_pass_rate": auto_test_results.get("pass_rate", 0),
            "auto_avg_score": auto_test_results.get("avg_score", 0),
            "llm_overall_score": llm_judge_results.get("overall_score", 0),
            "llm_dimension_scores": llm_judge_results.get("dimension_scores", {}),
        })

    def _latest_record(self, version: str):
        """Return the most recently recorded entry for ``version``, or None."""
        for record in reversed(self.metrics_history):
            if record["version"] == version:
                return record
        return None

    def detect_regression(
        self,
        current_version: str,
        baseline_version: str,
        max_pass_rate_drop: float = 0.05,
        max_llm_score_drop: float = 0.3
    ) -> dict:
        """Compare two versions and flag a regression.

        When a version was recorded more than once, its most recent record is
        used, so a re-run evaluation supersedes the earlier one.

        Args:
            current_version: Version under test.
            baseline_version: Version to compare against.
            max_pass_rate_drop: Tolerated drop in automated pass rate.
            max_llm_score_drop: Tolerated drop in the LLM judge overall score.

        Returns:
            ``{"error": ...}`` if either version has no record; otherwise the
            two metric deltas (current minus baseline) and ``has_regression``,
            which is true when either metric dropped by more than its
            tolerance.
        """
        current = self._latest_record(current_version)
        baseline = self._latest_record(baseline_version)
        if current is None or baseline is None:
            return {"error": "版本不存在"}
        return {
            "auto_pass_rate_change": current["auto_pass_rate"] - baseline["auto_pass_rate"],
            "llm_score_change": current["llm_overall_score"] - baseline["llm_overall_score"],
            "has_regression": (
                current["auto_pass_rate"] < baseline["auto_pass_rate"] - max_pass_rate_drop or
                current["llm_overall_score"] < baseline["llm_overall_score"] - max_llm_score_drop
            )
        }
75.6 评估数据集的构建与维护
75.6.1 测试集构建原则
高质量的测试集需要满足以下条件:
- 代表性:覆盖任务的主要场景分布
- 挑战性:包含足够的困难案例和边界情况
- 无泄露性:测试集不能被用于 Prompt 的迭代优化过程,否则 Prompt 会对测试集过拟合,评估结果将虚高
- 版本化:测试集本身需要版本控制
75.6.2 测试集的持续扩充
def mine_failure_cases_from_production(
    production_logs: list,
    human_feedback: list,
    min_confidence: float = 0.8
) -> list:
    """Mine reviewed failure cases from production logs.

    A log entry becomes a test case when the user marked the reply as
    negative (``user_feedback == "negative"``) and a reviewer confirmed the
    problem (``review_status == "confirmed_issue"``). The review status is
    carried over from the log entry into the case; without that, the final
    filter could never match and the function always returned an empty list.

    Args:
        production_logs: List of dicts with ``"input"`` and ``"output"`` and,
            optionally, ``"user_feedback"`` and ``"review_status"``.
        human_feedback: Accepted for interface compatibility; not used.
            NOTE(review): its schema is not visible here - confirm how it
            should feed into the review decision.
        min_confidence: Accepted for interface compatibility; not used.

    Returns:
        List of case dicts with ``input``, ``output``, ``issue``, ``source``
        and ``review_status``.
    """
    reviewed_cases = []
    for log in production_logs or []:
        # Only replies the user explicitly marked as bad are candidates.
        if log.get("user_feedback") != "negative":
            continue
        # Only candidates confirmed by review enter the official test set.
        review_status = log.get("review_status")
        if review_status != "confirmed_issue":
            continue
        reviewed_cases.append({
            "input": log["input"],
            "output": log["output"],
            "issue": "user_negative_feedback",
            "source": "production",
            "review_status": review_status,
        })
    return reviewed_cases
小结
Prompt 评估体系是从"感觉不错"到"可度量可改进"的关键转变。自动化测试确保基本功能的一致性,人工评估建立质量标准的黄金基准,LLM-as-Judge 在两者之间提供可扩展的主观质量评估。
实践中,三者的比例通常是:自动化测试覆盖 70-80% 的常规场景,LLM-as-Judge 处理主观维度,人工评估聚焦于高价值的安全性和新场景评审。偏见意识、校准机制、以及持续维护的测试集,是这套体系长期有效运行的关键保障。