第 32 章
Skill 测试框架与质量保障
第32章:Skill 测试框架与质量保障
一个没有测试的 Skill 就像一架没有仪表盘的飞机——它可能运行良好,但你不知道何时会出问题,也不知道出了什么问题。本章构建完整的 Skill 测试体系,覆盖功能测试、性能测试、边界测试和集成测试四个维度,并展示如何在 CI/CD 流水线中自动化执行。
32.1 Skill 测试的四个维度
维度全景图
┌─────────────────────────────────────────────────────────┐
│ Skill 测试金字塔 │
│ │
│ ┌──────────────────────┐ │
│ │ 集成测试 (10%) │ ← 与真实 Hermes 联调 │
│ └──────────────────────┘ │
│ ┌────────────────────────────┐ │
│ │ 性能/边界测试 (20%) │ ← 压力、超时、异常 │
│ └────────────────────────────┘ │
│ ┌──────────────────────────────────┐ │
│ │ 功能测试 (70%) │ ← 核心逻辑验证 │
│ └──────────────────────────────────┘ │
│ │
│ 各维度职责: │
│ 功能测试:验证"做对了什么" │
│ 性能测试:验证"做得够快吗" │
│ 边界测试:验证"做错了怎么办" │
│ 集成测试:验证"放在一起能工作吗" │
└─────────────────────────────────────────────────────────┘
各维度的测试目标
| 维度 | 目标 | 关键问题 | 工具 |
|---|---|---|---|
| 功能测试 | 核心逻辑正确性 | Skill 做对了什么? | pytest + Mock |
| 性能测试 | 响应时间、资源使用 | 够快吗?够稳定吗? | pytest-benchmark |
| 边界测试 | 异常输入处理 | 出错时会崩溃吗? | pytest + Hypothesis |
| 集成测试 | 端到端工作流 | 放在真实环境中能跑吗? | Hermes TestHarness |
32.2 单元测试框架搭建
项目测试目录结构
tests/
├── conftest.py # 共享 fixtures 和配置
├── unit/
│ ├── test_input_validation.py # 输入验证测试
│ ├── test_news_fetcher.py # 新闻抓取逻辑测试
│ ├── test_digest_formatter.py # 格式化逻辑测试
│ └── test_error_handling.py # 错误处理测试
├── performance/
│ ├── test_response_time.py # 响应时间测试
│ └── test_concurrent_load.py # 并发负载测试
├── boundary/
│ ├── test_edge_cases.py # 边界值测试
│ └── test_malformed_input.py # 畸形输入测试
├── integration/
│ ├── test_full_workflow.py # 完整工作流测试
│ └── test_hermes_integration.py # Hermes 集成测试
└── fixtures/
├── sample_news_response.json
├── sample_article_content.html
└── edge_case_inputs.json
conftest.py:共享 Fixtures
"""共享测试 fixtures 和配置"""
import pytest
import json
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock
from tools.news_fetcher import NewsFetcher, NewsArticle
from tools.digest_formatter import DigestFormatter
FIXTURES_DIR = Path(__file__).parent / "fixtures"
# ===== 基础 Fixtures =====
@pytest.fixture
def fixtures_dir() -> Path:
    """Expose the directory that holds the static fixture files."""
    directory = FIXTURES_DIR
    return directory
@pytest.fixture
def sample_search_response():
    """Load the canned search-API response from ``sample_news_response.json``.

    The file is opened explicitly as UTF-8 so the fixture does not depend on
    the platform's default locale encoding.

    Returns:
        The decoded JSON document.
    """
    with open(FIXTURES_DIR / "sample_news_response.json", encoding="utf-8") as f:
        return json.load(f)
@pytest.fixture
def sample_articles():
    """Return two pre-built ``NewsArticle`` objects (Reuters first, TechCrunch second)."""
    reuters_article = NewsArticle(
        title="EU AI Act Implementation Accelerates",
        url="https://reuters.com/eu-ai-act",
        snippet="The European Commission announced new timelines...",
        source="reuters.com",
        published_date="2 hours ago",
        full_content="Full article content for EU AI Act...",
    )
    techcrunch_article = NewsArticle(
        title="AI Regulation: Industry Response",
        url="https://techcrunch.com/ai-regulation",
        snippet="Tech companies are preparing for...",
        source="techcrunch.com",
        published_date="5 hours ago",
        full_content="Full article content for industry response...",
    )
    return [reuters_article, techcrunch_article]
@pytest.fixture
def news_fetcher():
    """Build a ``NewsFetcher`` with a dummy API key and ``timeout=5``."""
    fetcher = NewsFetcher(search_api_key="test-key-123", timeout=5)
    return fetcher
@pytest.fixture
def formatter():
    """Build a ``DigestFormatter`` with its default constructor arguments."""
    digest_formatter = DigestFormatter()
    return digest_formatter
@pytest.fixture
def mock_http_client():
    """Return a ``MagicMock`` HTTP client that can be used in a ``with`` statement.

    Entering the context yields the mock itself; leaving it returns ``None``,
    so exceptions raised inside the ``with`` body are not suppressed.
    """
    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=None)
    return client
# ===== Hermes 集成 Fixtures =====
@pytest.fixture
def hermes_test_config():
    """Return the settings dict used by the Hermes integration tests."""
    config = {
        # Smallest model, to keep the cost of running the tests low.
        "model": "claude-3-haiku-20240307",
        "max_tokens": 512,
        "tools": ["web_search", "fetch_url", "write_file"],
    }
    return config
unit/test_input_validation.py
"""输入验证单元测试"""
import pytest
from tools.validation import SkillInputValidator, SkillInputError
from conftest import NEWS_DIGEST_INPUT_SCHEMA
@pytest.fixture
def validator():
    """Build a ``SkillInputValidator`` bound to the news-digest input schema."""
    schema_validator = SkillInputValidator(NEWS_DIGEST_INPUT_SCHEMA)
    return schema_validator
class TestRequiredParameters:
    """Validation of the required ``topics`` parameter."""

    def test_valid_minimal_input(self, validator):
        """A payload with a single topic and nothing else validates cleanly."""
        ok, problems = validator.validate({"topics": ["AI regulation"]})
        assert ok is True
        assert problems == []

    def test_missing_topics_fails(self, validator):
        """Omitting ``topics`` is rejected and the error text names the field."""
        ok, problems = validator.validate({"time_range": "today"})
        assert ok is False
        mentioning_topics = [msg for msg in problems if "topics" in msg]
        assert mentioning_topics

    def test_empty_topics_array_fails(self, validator):
        """An empty ``topics`` list is rejected."""
        ok, _ = validator.validate({"topics": []})
        assert ok is False

    def test_too_many_topics_fails(self, validator):
        """Six topics is rejected; the schema allows at most five."""
        six_topics = ["t1", "t2", "t3", "t4", "t5", "t6"]
        ok, _ = validator.validate({"topics": six_topics})
        assert ok is False
class TestOptionalParameters:
    """Validation of the optional ``time_range`` and ``max_articles_per_topic`` fields."""

    @pytest.mark.parametrize(
        "time_range",
        ["today", "24h", "this_week", "this_month"],
    )
    def test_valid_time_ranges(self, validator, time_range):
        """Each listed ``time_range`` value is accepted."""
        payload = {"topics": ["AI"], "time_range": time_range}
        ok, _ = validator.validate(payload)
        assert ok is True

    def test_invalid_time_range_fails(self, validator):
        """An unlisted ``time_range`` is rejected with an error naming the field or value."""
        payload = {"topics": ["AI"], "time_range": "yesterday"}
        ok, problems = validator.validate(payload)
        assert ok is False
        relevant = [
            msg for msg in problems
            if "time_range" in msg or "yesterday" in msg
        ]
        assert relevant

    @pytest.mark.parametrize(
        "count,expected_valid",
        [
            (1, True),
            (5, True),
            (10, True),
            (0, False),
            (11, False),
            (-1, False),
        ],
    )
    def test_max_articles_range(self, validator, count, expected_valid):
        """``max_articles_per_topic`` is accepted for 1, 5, 10 and rejected for 0, 11, -1."""
        payload = {"topics": ["AI"], "max_articles_per_topic": count}
        ok, _ = validator.validate(payload)
        assert ok is expected_valid
class TestTypeCoercion:
    """Tests for ``coerce_and_validate``, which normalises loosely-typed input."""

    def test_string_int_coercion(self, validator):
        """A numeric string for ``max_articles_per_topic`` is converted to ``int``."""
        data = {"topics": ["AI"], "max_articles_per_topic": "5"}
        coerced = validator.coerce_and_validate(data)
        assert coerced["max_articles_per_topic"] == 5
        assert isinstance(coerced["max_articles_per_topic"], int)

    # Parametrized (like the other tests in this module) so every spelling is
    # reported individually instead of the first failure hiding the rest.
    @pytest.mark.parametrize("true_val", ["true", "True", "yes", "YES", "1"])
    def test_string_bool_coercion(self, validator, true_val):
        """Each truthy string spelling of ``save_to_file`` is converted to ``True``."""
        data = {"topics": ["AI"], "save_to_file": true_val}
        coerced = validator.coerce_and_validate(data)
        assert coerced["save_to_file"] is True

    def test_single_string_to_array(self, validator):
        """A bare string for ``topics`` is wrapped into a one-element list.

        Per the original chapter, passing a string where a list is expected
        is a common LLM mistake.
        """
        data = {"topics": "AI regulation"}  # should have been a list
        coerced = validator.coerce_and_validate(data)
        assert coerced["topics"] == ["AI regulation"]
unit/test_news_fetcher.py
"""新闻抓取器单元测试"""
import pytest
import httpx
from unittest.mock import patch, MagicMock
class TestSearchNews:
    """Tests for ``NewsFetcher.search_news`` with the HTTP client patched out."""

    @staticmethod
    def _json_response(payload):
        """Build a mock HTTP response whose ``json()`` returns *payload*."""
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()
        return response

    def test_search_returns_articles(self, news_fetcher, sample_search_response):
        """One article with ``title`` and ``url`` comes back per entry in ``results``."""
        canned = self._json_response(sample_search_response)
        with patch.object(news_fetcher.client, "get", return_value=canned):
            found = news_fetcher.search_news("AI regulation")

        expected_count = len(sample_search_response["results"])
        assert len(found) == expected_count
        for article in found:
            assert hasattr(article, "title")
        for article in found:
            assert hasattr(article, "url")

    def test_search_handles_api_error(self, news_fetcher):
        """An HTTP 429 raised by ``raise_for_status`` propagates to the caller."""
        rate_limited = MagicMock()
        rate_limited.raise_for_status.side_effect = httpx.HTTPStatusError(
            "429 Too Many Requests",
            request=MagicMock(),
            response=MagicMock(status_code=429),
        )
        with patch.object(news_fetcher.client, "get", return_value=rate_limited):
            with pytest.raises(httpx.HTTPStatusError):
                news_fetcher.search_news("test query")

    def test_search_query_includes_topic(self, news_fetcher, sample_search_response):
        """The ``q`` request parameter contains the topic that was searched for."""
        canned = self._json_response(sample_search_response)
        with patch.object(news_fetcher.client, "get", return_value=canned) as fake_get:
            news_fetcher.search_news("AI regulation Europe")

        sent_params = fake_get.call_args.kwargs.get("params", {})
        assert "AI regulation Europe" in sent_params.get("q", "")
class TestFetchArticle:
    """Tests for ``NewsFetcher.fetch_article_content`` with the HTTP client patched out."""

    def test_fetch_returns_content(self, news_fetcher):
        """The paragraph text of a simple HTML page appears in the returned content."""
        html_content = "<html><body><p>Article content here.</p></body></html>"
        with patch.object(news_fetcher.client, 'get') as mock_get:
            mock_resp = MagicMock()
            mock_resp.text = html_content
            mock_resp.raise_for_status = MagicMock()
            mock_get.return_value = mock_resp
            content = news_fetcher.fetch_article_content("https://example.com")
        assert content is not None
        assert "Article content here" in content

    def test_fetch_returns_none_on_timeout(self, news_fetcher):
        """A timeout from the HTTP client yields ``None`` instead of an exception."""
        with patch.object(news_fetcher.client, 'get') as mock_get:
            mock_get.side_effect = httpx.TimeoutException("Connection timed out")
            content = news_fetcher.fetch_article_content("https://slow.example.com")
        assert content is None  # no exception; the fetcher returns None

    def test_fetch_truncates_long_content(self, news_fetcher):
        """Content longer than 2000 characters is truncated to at most 2000."""
        long_html = f"<html><body><p>{'A' * 5000}</p></body></html>"
        with patch.object(news_fetcher.client, 'get') as mock_get:
            mock_resp = MagicMock()
            mock_resp.text = long_html
            mock_resp.raise_for_status = MagicMock()
            mock_get.return_value = mock_resp
            content = news_fetcher.fetch_article_content("https://example.com")
        # fetch_article_content may return None (see the timeout test); guard
        # so a regression shows up as an assertion failure, not a TypeError.
        assert content is not None
        assert len(content) <= 2000
32.3 Mock 工具调用的方法
Hermes 工具调用 Mock 框架
"""Mock 工具调用,无需真实 API 即可测试完整 Agent 工作流"""
from typing import Any, Callable, Optional
from dataclasses import dataclass, field
import json
@dataclass
class MockToolCall:
    """One recorded invocation of a mocked tool."""

    tool_name: str  # name of the tool that was invoked
    input: dict  # input payload the tool was called with
    timestamp: float  # time.time() value captured when the call was recorded


@dataclass
class MockToolConfig:
    """Behaviour configured for a single mocked tool."""

    response: Any = None  # static value returned when nothing else applies
    error: Optional[Exception] = None  # raised on every call when set
    side_effect: Optional[Callable] = None  # called with the tool input to build a response
    call_limit: Optional[int] = None  # max allowed calls; None = unlimited, 0 = no calls allowed
    call_count: int = field(default=0, init=False)  # calls handled so far


class HermesToolMocker:
    """Stand-in for Hermes tool dispatch that records every call.

    Usage:
        1. Register mocked tools with :meth:`register`.
        2. Run the Skill with :meth:`handle_tool_call` as its tool handler.
        3. Inspect the recorded calls with the ``assert_*`` helpers.
    """

    def __init__(self):
        self._mocks: dict[str, MockToolConfig] = {}
        self._call_history: list[MockToolCall] = []

    def register(
        self,
        tool_name: str,
        response: Any = None,
        error: Optional[Exception] = None,
        side_effect: Optional[Callable] = None,
        call_limit: Optional[int] = None,
    ) -> "HermesToolMocker":
        """Register (or replace) the mock for ``tool_name``.

        Args:
            tool_name: Name the Skill uses to invoke the tool.
            response: Static value to return.
            error: Exception to raise on every call; takes precedence over
                ``side_effect`` and ``response``.
            side_effect: Callable invoked with the tool input; its return
                value becomes the response. Takes precedence over ``response``.
            call_limit: Maximum number of calls allowed; ``None`` means unlimited.

        Returns:
            This mocker, so registrations can be chained.
        """
        self._mocks[tool_name] = MockToolConfig(
            response=response,
            error=error,
            side_effect=side_effect,
            call_limit=call_limit,
        )
        return self

    def handle_tool_call(self, tool_name: str, tool_input: dict) -> Any:
        """Record the call and produce the configured mock response.

        Raises:
            ValueError: ``tool_name`` has no registered mock.
            RuntimeError: the tool's ``call_limit`` was exceeded.
            Exception: whatever ``error`` was registered for the tool.
        """
        import time

        # The call is recorded before any check so that unregistered or
        # over-limit calls still show up in the history.
        self._call_history.append(MockToolCall(
            tool_name=tool_name,
            input=tool_input,
            timestamp=time.time(),
        ))

        if tool_name not in self._mocks:
            raise ValueError(f"Tool '{tool_name}' has no mock configured. "
                             f"Register it with mocker.register('{tool_name}', ...)")

        config = self._mocks[tool_name]
        config.call_count += 1

        # Compare against None rather than truthiness so call_limit=0
        # ("must never be called") is enforced instead of being ignored.
        if config.call_limit is not None and config.call_count > config.call_limit:
            raise RuntimeError(f"Tool '{tool_name}' exceeded call limit of {config.call_limit}")

        if config.error is not None:
            raise config.error

        if config.side_effect is not None:
            return config.side_effect(tool_input)

        return config.response

    # ===== Assertion helpers =====

    def assert_tool_called(self, tool_name: str, times: Optional[int] = None):
        """Assert that ``tool_name`` was called.

        With ``times=None`` at least one call is required; otherwise the
        number of calls must equal ``times`` exactly (``times=0`` therefore
        asserts that the tool was never called).

        Raises:
            AssertionError: the expectation is not met.
        """
        calls = self.get_calls(tool_name)
        if times is None:
            if not calls:
                raise AssertionError(f"Expected tool '{tool_name}' to be called, but it wasn't")
            return
        if len(calls) != times:
            raise AssertionError(
                f"Expected tool '{tool_name}' to be called {times} times, "
                f"but it was called {len(calls)} times"
            )

    def assert_tool_not_called(self, tool_name: str):
        """Assert that ``tool_name`` was never called.

        Raises:
            AssertionError: at least one call was recorded.
        """
        calls = self.get_calls(tool_name)
        if calls:
            raise AssertionError(
                f"Expected tool '{tool_name}' NOT to be called, "
                f"but it was called {len(calls)} times"
            )

    def assert_tool_called_with(self, tool_name: str, **expected_params):
        """Assert that some call to ``tool_name`` carried all ``expected_params``.

        A call matches when every expected key is present in its input with
        an equal value; extra keys in the input are ignored.

        Raises:
            AssertionError: no recorded call matches.
        """
        calls = self.get_calls(tool_name)
        # Require the key to be present: dict.get() would let an expected
        # value of None match a call that never passed the key at all.
        matching = [
            c for c in calls
            if all(k in c.input and c.input[k] == v for k, v in expected_params.items())
        ]
        if not matching:
            all_calls = [c.input for c in calls]
            raise AssertionError(
                f"Tool '{tool_name}' was not called with {expected_params}. "
                f"Actual calls: {all_calls}"
            )

    @property
    def call_count(self) -> dict[str, int]:
        """Number of recorded calls per tool name."""
        counts: dict[str, int] = {}
        for call in self._call_history:
            counts[call.tool_name] = counts.get(call.tool_name, 0) + 1
        return counts

    def get_calls(self, tool_name: str) -> list[MockToolCall]:
        """Return every recorded call to ``tool_name``, in call order."""
        return [c for c in self._call_history if c.tool_name == tool_name]
# 测试使用示例
class TestSkillWithMocks:
    """Example tests that drive ``NewsDigestSkill`` through ``HermesToolMocker``."""

    def test_news_digest_calls_search(self, sample_articles):
        """The Skill calls ``web_search`` and reports success."""
        mocker = HermesToolMocker()
        mocker.register(
            "web_search",
            response={"results": [
                {"title": "AI News", "url": "https://example.com", "snippet": "..."}
            ]}
        ).register(
            "fetch_url",
            response="Full article content about AI..."
        )

        # Run the Skill with the mocker as its tool handler.
        skill = NewsDigestSkill(tool_handler=mocker.handle_tool_call)
        result = skill.run({"topics": ["AI regulation"]})

        mocker.assert_tool_called("web_search")
        # NOTE(review): the expected query embeds a fixed year; confirm it
        # matches how the Skill actually builds its search query.
        mocker.assert_tool_called_with("web_search", query="AI regulation news today 2024")
        assert result["status"] == "success"

    def test_skill_handles_search_failure(self):
        """A failing ``web_search`` yields status ``failed`` with a ``SEARCH_API_ERROR`` entry."""
        mocker = HermesToolMocker()
        mocker.register(
            "web_search",
            error=Exception("API rate limit exceeded")
        )

        skill = NewsDigestSkill(tool_handler=mocker.handle_tool_call)
        result = skill.run({"topics": ["AI"]})

        assert result["status"] == "failed"
        assert any(e["code"] == "SEARCH_API_ERROR" for e in result["errors"])

    def test_skill_saves_file_when_requested(self):
        """With ``save_to_file=True`` the Skill calls ``write_file`` once with a ``.md`` path."""
        mocker = HermesToolMocker()
        # A concrete search result is required here: the literal `[...]` is a
        # list holding the Ellipsis object, not a usable result entry.
        mocker.register("web_search", response={"results": [
            {"title": "AI News", "url": "https://example.com", "snippet": "..."}
        ]})
        mocker.register("fetch_url", response="Content...")
        mocker.register("write_file", response={"success": True})

        skill = NewsDigestSkill(tool_handler=mocker.handle_tool_call)
        result = skill.run({"topics": ["AI"], "save_to_file": True})

        mocker.assert_tool_called("write_file", times=1)
        write_call = mocker.get_calls("write_file")[0]
        assert write_call.input["path"].endswith(".md")
32.4 性能与边界测试
性能测试(pytest-benchmark)
"""性能测试:验证 Skill 在时间约束内完成"""
import pytest
import time
from unittest.mock import patch
class TestPerformance:
    """Benchmarks and a wall-clock limit for the Skill's non-LLM code paths."""

    def test_input_validation_performance(self, benchmark, validator):
        """Benchmark input validation (stated target: 1 ms per call).

        Only the validation outcome is asserted here. The timing figures are
        reported by the ``benchmark`` fixture and are not enforced by this test.
        """
        valid_input = {"topics": ["AI regulation", "climate change"], "max_articles_per_topic": 5}
        result = benchmark(validator.validate, valid_input)
        assert result[0] is True

    def test_formatter_performance(self, benchmark, formatter, sample_articles):
        """Benchmark digest formatting without LLM calls (stated target: 100 ms).

        Only a non-empty result is asserted; the timing is reported by the
        ``benchmark`` fixture and is not enforced by this test.
        """
        result = benchmark(
            formatter.format_digest,
            topic="AI News",
            articles=sample_articles,
            max_articles=5
        )
        assert len(result) > 0

    @pytest.mark.timeout(5)  # hard limit of 5 seconds
    def test_full_skill_completes_within_timeout(self, mocker_with_responses):
        """The whole Skill finishes in under 5 seconds with mocked tool calls."""
        # perf_counter is monotonic; time.time() can jump when the system
        # clock is adjusted and is not meant for measuring durations.
        start = time.perf_counter()
        skill = NewsDigestSkill(tool_handler=mocker_with_responses.handle_tool_call)
        result = skill.run({"topics": ["AI"]})
        elapsed = time.perf_counter() - start
        assert elapsed < 5.0
        assert result["status"] == "success"
class TestBoundaryConditions:
    """Boundary-value and hostile-input tests."""

    @pytest.mark.parametrize("topic_length", [2, 50, 100])
    def test_topic_at_length_boundary(self, validator, topic_length):
        """Topics of length 2, 50 and 100 are all accepted."""
        data = {"topics": ["A" * topic_length]}
        is_valid, _ = validator.validate(data)
        assert is_valid is True

    def test_topic_too_short_fails(self, validator):
        """A one-character topic is rejected; the minimum length is 2."""
        data = {"topics": ["A"]}
        is_valid, _ = validator.validate(data)
        assert is_valid is False

    def test_topic_too_long_fails(self, validator):
        """A 101-character topic is rejected; the maximum length is 100."""
        data = {"topics": ["A" * 101]}
        is_valid, _ = validator.validate(data)
        assert is_valid is False

    def test_unicode_topics(self, validator):
        """Non-ASCII topics (Chinese, Japanese, German) are accepted."""
        data = {"topics": ["人工智能监管", "気候変動", "Künstliche Intelligenz"]}
        is_valid, _ = validator.validate(data)
        assert is_valid is True

    def test_sql_injection_in_topic(self, mocker):
        """An SQL-injection-style topic is handled as an ordinary search term."""
        malicious_input = {"topics": ["'; DROP TABLE news; --"]}
        skill = NewsDigestSkill(tool_handler=mocker.handle_tool_call)
        # Must not raise: the string is just a search term to the Skill.
        result = skill.run(malicious_input)
        # Also require a well-formed result so the test asserts something
        # beyond "no exception was raised".
        assert isinstance(result, dict)
        assert "status" in result

    def test_extremely_long_topic(self, validator):
        """A 10000-character topic is rejected with at least one error message."""
        data = {"topics": ["A" * 10000]}  # far beyond the 100-character limit
        is_valid, errors = validator.validate(data)
        assert is_valid is False
        # A readable error is expected rather than a crash.
        assert len(errors) > 0
32.5 自动化回归测试 CI/CD 配置
GitHub Actions 完整配置
# .github/workflows/skill-tests.yml
name: Skill Test Suite

on:
  push:
    branches: [main, develop]
    # Tag pushes must trigger the workflow too; with only a branch filter the
    # tag-gated publish job below would never run.
    tags: ['v*']
  pull_request:
    branches: [main]
  schedule:
    - cron: '0 8 * * *'  # every day at 08:00 UTC (integration tests)

env:
  PYTHON_VERSION: '3.11'

jobs:
  # ===== Fast checks (run on every push) =====
  lint-and-validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          pip install ruff mypy hermes-cli
          pip install -r requirements.txt

      - name: Lint with ruff
        run: ruff check .

      - name: Type check with mypy
        run: mypy tools/ --ignore-missing-imports

      - name: Validate SKILL.md
        run: hermes skill validate

  # ===== Unit tests (run on every push) =====
  unit-tests:
    runs-on: ubuntu-latest
    needs: lint-and-validate
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest pytest-cov pytest-timeout pytest-benchmark

      - name: Run unit tests with coverage
        run: |
          pytest tests/unit/ \
            --cov=tools \
            --cov-report=xml \
            --cov-report=term-missing \
            --cov-fail-under=80 \
            -v

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        with:
          file: ./coverage.xml

  # ===== Boundary tests (run on every push) =====
  boundary-tests:
    runs-on: ubuntu-latest
    needs: unit-tests
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: pip install -r requirements.txt pytest hypothesis

      - name: Run boundary tests
        run: pytest tests/boundary/ -v --tb=short

  # ===== Performance tests (pull requests only) =====
  performance-tests:
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    needs: unit-tests
    steps:
      - uses: actions/checkout@v4

      # Pin the interpreter like every other job so benchmark numbers are
      # comparable between runs.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        # pytest-timeout is needed because the performance tests use
        # @pytest.mark.timeout.
        run: pip install -r requirements.txt pytest pytest-benchmark pytest-timeout

      - name: Run performance tests
        run: |
          pytest tests/performance/ \
            --benchmark-json=benchmark-results.json \
            -v

      - name: Check performance regression
        run: |
          python scripts/check_benchmark_regression.py \
            --current=benchmark-results.json \
            --threshold=20  # tolerate up to 20% slowdown

  # ===== Integration tests (daily schedule, needs real API keys) =====
  integration-tests:
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    needs: unit-tests
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        # pytest-timeout provides the --timeout option used below.
        run: pip install -r requirements.txt pytest pytest-timeout hermes-sdk

      - name: Run integration tests
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          SEARCH_API_KEY: ${{ secrets.SEARCH_API_KEY }}
        run: |
          pytest tests/integration/ \
            -v \
            --timeout=60 \
            -m "not slow"  # skip the especially slow tests

  # ===== Publish to ClawHub (on version tags) =====
  publish:
    runs-on: ubuntu-latest
    if: startsWith(github.ref, 'refs/tags/v')
    needs: [unit-tests, boundary-tests, lint-and-validate]
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install hermes-cli
        run: pip install hermes-cli

      - name: Publish to ClawHub
        env:
          CLAWHUB_TOKEN: ${{ secrets.CLAWHUB_TOKEN }}
        run: hermes skill publish --token "$CLAWHUB_TOKEN"
32.6 常见 Skill Bug 模式与预防
Bug 模式库
"""
常见 Skill Bug 模式与对应的预防测试
"""
# ===== Bug 模式 1:工具调用结果未检查 =====
# 症状:fetch_url 返回 None(超时),但代码继续处理 None.split()
# 预防:
def test_handles_none_fetch_result(mocker):
    """A ``None`` result from ``fetch_url`` must not surface as an internal error.

    The mocked search result carries a ``title`` and a ``snippet`` so that the
    expected fallback (use the snippet when the full article is unavailable)
    has data to fall back on.
    """
    mocker.register("web_search", response={"results": [
        {"title": "News", "url": "https://x.com", "snippet": "..."}
    ]})
    mocker.register("fetch_url", response=None)  # simulate a timed-out fetch returning None

    skill = NewsDigestSkill(tool_handler=mocker.handle_tool_call)
    result = skill.run({"topics": ["AI"]})

    # No crash is allowed; the Skill should degrade to the snippet.
    assert result["status"] in ["success", "partial_success"]
    assert not any(e["code"] == "INTERNAL_ERROR" for e in result.get("errors", []))
# ===== Bug 模式 2:状态跨调用泄漏 =====
# 症状:第一次调用的搜索结果"混入"第二次调用
# 预防:
def test_no_state_leak_between_calls(mocker_factory):
    """Run one Skill instance twice and check neither digest holds the other's titles."""

    def collect_titles(digest):
        """Flatten the article titles of every covered topic into one list."""
        found = []
        for topic in digest["topics_covered"]:
            for article in topic["articles"]:
                found.append(article["title"])
        return found

    first_payload = [{"title": "AI News Call 1", "url": "https://ai.com"}]
    second_payload = [{"title": "Climate News Call 2", "url": "https://climate.com"}]

    skill = NewsDigestSkill()

    # First run.
    first_mocker = mocker_factory(search_response=first_payload)
    first_digest = skill.run(
        {"topics": ["AI"]},
        tool_handler=first_mocker.handle_tool_call,
    )

    # Second run, on the same Skill instance.
    second_mocker = mocker_factory(search_response=second_payload)
    second_digest = skill.run(
        {"topics": ["Climate"]},
        tool_handler=second_mocker.handle_tool_call,
    )

    # The two digests must be fully independent of each other.
    first_titles = collect_titles(first_digest)
    second_titles = collect_titles(second_digest)
    assert "Call 1" not in str(second_titles)
    assert "Call 2" not in str(first_titles)
# ===== Bug 模式 3:遗忘错误聚合(部分成功场景)=====
# 症状:3 个 topic 中有 1 个失败,整个 Skill 失败而不是 partial_success
# 预防:
def test_partial_success_when_one_topic_fails(mocker):
    """One failing topic out of three yields ``partial_success``, not total failure."""
    search_calls = 0

    # The parameter is named ``tool_input`` so it does not shadow the builtin
    # ``input``; the mocker invokes side effects positionally.
    def search_side_effect(tool_input: dict) -> dict:
        """Fail on the second search call only; succeed on every other call."""
        nonlocal search_calls
        search_calls += 1
        if search_calls == 2:
            raise Exception("Rate limit for this query")
        return {"results": [{"title": "News", "url": "https://x.com"}]}

    mocker.register("web_search", side_effect=search_side_effect)
    mocker.register("fetch_url", response="Content...")

    skill = NewsDigestSkill(tool_handler=mocker.handle_tool_call)
    result = skill.run({"topics": ["AI", "Climate", "Tech"]})

    # Two topics succeed and one fails, so the overall status is partial.
    assert result["status"] == "partial_success"
    assert len(result["topics_covered"]) == 2  # only the two successful topics
    assert len(result["errors"]) == 1  # one recorded error
# ===== Bug 模式 4:Token 预算超限 =====
# 症状:文章内容太长,超出上下文窗口
# 预防:
def test_large_content_does_not_exceed_context(mocker):
    """Ten very long articles must still produce a result under 40,000 characters."""
    # 27 characters repeated 1000 times: 27,000 characters per article.
    article_body = "Very long article content. " * 1000

    search_hits = []
    for i in range(10):
        search_hits.append(
            {"url": f"https://example{i}.com", "title": f"Article {i}", "snippet": "..."}
        )
    mocker.register("web_search", response={"results": search_hits})
    mocker.register("fetch_url", response=article_body)

    skill = NewsDigestSkill(tool_handler=mocker.handle_tool_call)
    result = skill.run({"topics": ["AI"], "max_articles_per_topic": 10})

    # Rough budget: at about 0.25 tokens per character, 10,000 tokens
    # corresponds to about 40,000 characters of output.
    max_chars = 40_000
    rendered = str(result)
    assert len(rendered) < max_chars
测试覆盖率目标
| 模块 | 目标覆盖率 | 说明 |
|---|---|---|
| 核心逻辑(tools/) | > 90% | 必须高覆盖 |
| 输入验证 | > 95% | 关键安全层 |
| 错误处理路径 | > 85% | 确保降级策略有效 |
| 格式化输出 | > 80% | 多种格式都要测 |
| 整体目标 | > 80% | CI 强制检查 |
32.7 小结
完整的 Skill 测试体系是高质量 Skill 的基础保障:
- 四维测试:功能(70%)→ 性能/边界(20%)→ 集成(10%)的金字塔结构
- Mock 优先:工具调用使用 `HermesToolMocker` 进行 Mock,测试可离线运行、不消耗 API 额度
- 边界测试:Unicode、SQL 注入、None 返回、极长输入——每种边界都要有测试覆盖
- CI/CD 自动化:每次 push 跑单元测试,PR 跑性能对比,每日定时跑集成测试
- Bug 模式库:识别并预防 4 类常见 Skill Bug(None 检查、状态泄漏、错误聚合、Token 超限)
测试是 Skill 质量的护城河——投资测试就是投资 Skill 的长期可维护性。
思考题
- 本章的 Mock 工具测试方案能有效测试 Skill 的逻辑,但无法测试 SKILL.md 中描述的"行为"(如 LLM 是否真的按照 Usage 部分的步骤执行)。你会如何设计一个测试来验证 SKILL.md 的有效性?
- 集成测试需要真实 API 调用,成本高且不稳定。如何设计一个"录制-回放"(Record and Replay)机制,使集成测试既能验证真实行为,又能在 CI 中稳定运行?
- `test_partial_success_when_one_topic_fails` 测试验证了部分成功场景。但如果 3 个 topic 都失败了,应该返回 `failed` 而非 `partial_success`。如何在代码中优雅区分这两种情况?请设计相应的测试。
- 性能测试设定了"允许 20% 的性能退化"阈值。在什么情况下你会允许更大的性能退化?在什么情况下应该把阈值收紧到 5%?决策依据是什么?