Claude.ai Connector 开发:Remote MCP + OAuth 2.1 完整接入指南
第五十七章:Plugin 测试策略:单元测试、集成测试与沙箱环境
57.1 为什么 Plugin 测试至关重要
Plugin 是 Claude 能力的延伸触角。一个 Plugin 可以搜索网络、操作数据库、调用外部 API、修改文件系统。这意味着 Plugin 的错误不再局限于"输出一段错误文本",而可能造成真实的副作用:删除了不该删除的记录、发送了错误的邮件、触发了误操作的付款流程。
与普通函数不同,Plugin 的测试还需要面对几个特殊挑战:
- 外部依赖复杂:Plugin 往往调用第三方 API,测试时无法依赖真实服务的稳定性
- Claude 调用不可预测:同样的 prompt 不一定每次都触发同样的工具调用
- 工具参数需要验证:Claude 生成的 JSON 参数必须符合 schema,否则执行失败
- 副作用难以回滚:对数据库、文件系统的写操作在测试环境中需要隔离
一套完善的 Plugin 测试策略应当覆盖三个层次:单元测试(验证工具函数逻辑)、集成测试(验证 Claude 与工具的协作)、沙箱测试(在隔离环境中验证端到端流程)。
57.2 Plugin 架构回顾
在进入测试策略之前,先回顾 Plugin 的典型结构,这决定了我们在哪个层面切入测试。
# 典型的 Plugin 结构
import anthropic
import json
from typing import Any
# 工具函数(纯业务逻辑层)
def search_products(query: str, max_results: int = 5) -> list[dict]:
    """Search the product database."""
    # Database query logic (elided in this example).
    ...
def create_order(user_id: str, product_id: str, quantity: int) -> dict:
    """Create an order."""
    # Write-operation logic (elided in this example).
    ...
# 工具 Schema 定义层
# Tool definitions handed to the model: one entry per tool, each with a name,
# a description and a JSON Schema describing its input.
TOOLS = [
    # search_products: only "query" is required; max_results is optional.
    {
        "name": "search_products",
        "description": "搜索产品目录,返回匹配的产品列表",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "搜索关键词"},
                "max_results": {
                    "type": "integer",
                    "description": "最大返回数量",
                    "default": 5,
                },
            },
            "required": ["query"],
        },
    },
    # create_order: all three fields are required; quantity must be >= 1.
    {
        "name": "create_order",
        "description": "根据用户选择创建新订单",
        "input_schema": {
            "type": "object",
            "properties": {
                "user_id": {"type": "string"},
                "product_id": {"type": "string"},
                "quantity": {"type": "integer", "minimum": 1},
            },
            "required": ["user_id", "product_id", "quantity"],
        },
    },
]
# 工具调度层
def execute_tool(tool_name: str, tool_input: dict) -> Any:
    """Dispatch a tool call to the matching tool function.

    Args:
        tool_name: Name of the tool to run.
        tool_input: Keyword arguments, unpacked into the tool function.

    Returns:
        Whatever the selected tool function returns.

    Raises:
        ValueError: If ``tool_name`` is neither "search_products" nor
            "create_order".
    """
    # Reject unknown names up front so the happy path stays flat.
    if tool_name not in ("search_products", "create_order"):
        raise ValueError(f"未知工具: {tool_name}")

    handler = search_products if tool_name == "search_products" else create_order
    return handler(**tool_input)
这个结构清晰地分为三层:业务逻辑层、Schema 定义层、调度层。测试策略也围绕这三层展开。
57.3 单元测试:验证工具函数逻辑
单元测试针对纯粹的工具函数,不涉及 Claude API 调用。目标是验证每个工具函数在各种输入下的行为是否符合预期。
57.3.1 使用 pytest 构建工具函数测试套件
# tests/test_tools_unit.py
import pytest
from unittest.mock import patch, MagicMock
from plugins.ecommerce import search_products, create_order
class TestSearchProducts:
    """Unit tests for the search_products tool."""

    def test_basic_search_returns_results(self, mock_db):
        """A normal search returns the list of results."""
        rows = [
            {"id": "p001", "name": "蓝牙耳机", "price": 299},
            {"id": "p002", "name": "蓝牙音箱", "price": 199},
        ]
        mock_db.query.return_value = rows

        found = search_products("蓝牙")

        assert len(found) == 2
        assert found[0]["id"] == "p001"

    def test_max_results_limit(self, mock_db):
        """max_results caps the number of returned items."""
        mock_db.query.return_value = [{"id": f"p{n}"} for n in range(10)]

        found = search_products("耳机", max_results=3)

        assert len(found) == 3

    def test_empty_query_raises_error(self):
        """An empty query string must raise."""
        with pytest.raises(ValueError, match="查询字符串不能为空"):
            search_products("")

    def test_no_results_returns_empty_list(self, mock_db):
        """No match yields an empty list instead of an exception."""
        mock_db.query.return_value = []

        assert search_products("完全不存在的商品xyz") == []

    def test_sql_injection_sanitized(self, mock_db):
        """A SQL-injection string must be sanitized."""
        # The call itself must not raise a database exception.
        search_products("'; DROP TABLE products; --")

        mock_db.query.assert_called_once()
        # Whatever reached db.query must no longer contain the payload.
        # NOTE(review): if search_products binds the raw text as a query
        # parameter, the payload would legitimately appear here — confirm.
        recorded_call = mock_db.query.call_args
        assert "DROP TABLE" not in str(recorded_call)
class TestCreateOrder:
    """Unit tests for the create_order tool."""

    def test_successful_order_creation(self, mock_db):
        """A successful call returns the new order id."""
        mock_db.insert.return_value = "order_12345"

        order = create_order("user_001", "p001", 2)

        assert order["order_id"] == "order_12345"
        assert order["status"] == "created"

    def test_zero_quantity_raises_error(self):
        """A quantity of 0 must raise a validation error."""
        with pytest.raises(ValueError, match="数量必须大于 0"):
            create_order("user_001", "p001", 0)

    def test_negative_quantity_raises_error(self):
        """A negative quantity must raise a validation error."""
        with pytest.raises(ValueError):
            create_order("user_001", "p001", -1)

    def test_nonexistent_product_raises_error(self, mock_db):
        """An unknown product id must raise."""
        # The lookup reports "no such product".
        mock_db.get_product.return_value = None

        with pytest.raises(ValueError, match="产品不存在"):
            create_order("user_001", "nonexistent_product", 1)
@pytest.fixture
def mock_db():
    """Replace ``plugins.ecommerce.db`` with a mock for one test.

    NOTE(review): other test modules in this chapter also request mock_db;
    pytest only shares fixtures across modules from conftest.py — confirm
    where this fixture is meant to live.
    """
    patcher = patch("plugins.ecommerce.db")
    fake_db = patcher.start()
    try:
        yield fake_db
    finally:
        # Always undo the patch, even when the test body raised.
        patcher.stop()
57.3.2 测试工具 Schema 的合法性
Schema 定义错误会导致 Claude 无法正确调用工具,或者调用时传入不合法的参数。我们需要专门测试 schema 本身。
# tests/test_schema_validation.py
import pytest
import jsonschema
from plugins.ecommerce import TOOLS
def get_tool_schema(tool_name: str) -> dict:
    """Return the ``input_schema`` of the tool named ``tool_name``.

    Raises:
        KeyError: If no entry in ``TOOLS`` has that name.
    """
    not_found = object()
    matching = (entry["input_schema"] for entry in TOOLS if entry["name"] == tool_name)
    # The first match wins, like a linear scan with an early return.
    schema = next(matching, not_found)
    if schema is not_found:
        raise KeyError(f"工具 {tool_name} 不存在")
    return schema
class TestSearchProductsSchema:
    """Checks of the search_products input schema."""

    schema = None

    @pytest.fixture(autouse=True)
    def setup(self):
        # Runs before every test in this class and loads the schema under test.
        self.schema = get_tool_schema("search_products")

    def test_valid_minimal_input(self):
        """Only the required field is enough to validate."""
        payload = {"query": "耳机"}
        jsonschema.validate(instance=payload, schema=self.schema)

    def test_valid_full_input(self):
        """Supplying every field validates."""
        payload = {"query": "耳机", "max_results": 10}
        jsonschema.validate(instance=payload, schema=self.schema)

    def test_missing_required_field_fails(self):
        """Leaving out the required ``query`` field fails validation."""
        payload = {"max_results": 5}
        with pytest.raises(jsonschema.ValidationError):
            jsonschema.validate(instance=payload, schema=self.schema)

    def test_wrong_type_for_max_results(self):
        """A string for ``max_results`` fails validation."""
        payload = {"query": "耳机", "max_results": "five"}
        with pytest.raises(jsonschema.ValidationError):
            jsonschema.validate(instance=payload, schema=self.schema)
class TestCreateOrderSchema:
    """Checks of the create_order input schema."""

    def test_quantity_minimum_constraint(self):
        """A quantity below 1 fails validation."""
        order_schema = get_tool_schema("create_order")
        payload = {"user_id": "u1", "product_id": "p1", "quantity": 0}

        with pytest.raises(jsonschema.ValidationError):
            jsonschema.validate(instance=payload, schema=order_schema)
57.4 集成测试:验证 Claude 与工具的协作
集成测试的目标是验证 Claude 在接收到用户消息后,能否正确识别需要调用哪个工具、传入正确的参数,以及工具返回结果后 Claude 能否给出合理的最终回复。
57.4.1 使用真实 Claude API 的集成测试
# tests/test_integration_claude.py
import pytest
import anthropic
from plugins.ecommerce import TOOLS, execute_tool
@pytest.fixture
def claude_client():
    """Provide an ``anthropic.Anthropic`` client built with default arguments."""
    return anthropic.Anthropic()
def run_tool_use_loop(client, messages: list, tools: list, max_turns: int = 10) -> dict:
    """Run the tool-use loop to completion and summarize the conversation.

    Sends ``messages`` to the model, executes every ``tool_use`` block through
    ``execute_tool``, feeds the results back, and repeats until the model
    stops asking for tools.

    Args:
        client: Object exposing ``messages.create(...)``.
        messages: Initial conversation; the list itself is never mutated.
        tools: Tool definitions forwarded to the API.
        max_turns: Upper bound on API round trips, so a model that keeps
            requesting tools cannot loop forever.

    Returns:
        Dict with ``final_text`` (text of the closing ``end_turn`` response,
        or ``""`` when the loop ended for another reason), ``tool_calls``
        (name and input of every executed call, in order) and
        ``message_count``.

    Raises:
        RuntimeError: If the model is still requesting tools after
            ``max_turns`` round trips.
    """
    import json  # local import: the surrounding module does not import json

    tool_calls = []
    final_text = ""
    for _ in range(max_turns):
        response = client.messages.create(
            model="claude-opus-4-5",
            max_tokens=1024,
            tools=tools,
            messages=messages,
        )
        if response.stop_reason == "end_turn":
            # Join every text block; keeping only the last would drop earlier text.
            final_text = "".join(
                block.text for block in response.content if hasattr(block, "text")
            )
            break
        if response.stop_reason != "tool_use":
            # e.g. max_tokens: nothing left to execute, report what we have.
            break

        tool_results = []
        for block in response.content:
            if block.type != "tool_use":
                continue
            tool_calls.append({"name": block.name, "input": block.input})
            result = execute_tool(block.name, block.input)
            # Send JSON rather than a Python repr; default=str keeps values
            # that are not JSON-serializable (e.g. mocks) from raising here.
            if not isinstance(result, str):
                result = json.dumps(result, ensure_ascii=False, default=str)
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": result,
            })
        # Build a new list so the caller's messages stay untouched.
        messages = messages + [
            {"role": "assistant", "content": response.content},
            {"role": "user", "content": tool_results},
        ]
    else:
        raise RuntimeError(f"tool-use loop did not finish within {max_turns} turns")

    return {
        "final_text": final_text,
        "tool_calls": tool_calls,
        "message_count": len(messages),
    }
class TestClaudeToolIntegration:
    """Integration tests for Claude working together with the tools."""

    @pytest.mark.integration
    def test_search_triggered_for_product_query(self, claude_client, mock_db):
        """A product question should trigger the search_products tool."""
        mock_db.query.return_value = [
            {"id": "p001", "name": "索尼 WH-1000XM5", "price": 2499}
        ]

        result = run_tool_use_loop(
            claude_client,
            [{"role": "user", "content": "帮我搜索一下降噪耳机"}],
            TOOLS,
        )

        search_calls = [tc for tc in result["tool_calls"] if tc["name"] == "search_products"]
        assert search_calls
        # Inspect the search call itself instead of tool_calls[0]: the model
        # is free to call a different tool first.
        assert "降噪" in search_calls[0]["input"].get("query", "")

    @pytest.mark.integration
    def test_create_order_requires_confirmation(self, claude_client, mock_db):
        """Claude should ask for confirmation before creating an order."""
        result = run_tool_use_loop(
            claude_client,
            [{"role": "user", "content": "帮我直接下单购买产品 p001,用户 ID user_001,买 2 个"}],
            TOOLS,
        )

        # Claude usually asks for confirmation instead of calling create_order
        # right away; if it did call the tool, the arguments must be correct.
        # NOTE(review): nothing here asserts that a confirmation was actually
        # requested — the test passes vacuously when no order call was made.
        order_calls = [tc for tc in result["tool_calls"] if tc["name"] == "create_order"]
        if order_calls:
            assert order_calls[0]["input"]["quantity"] == 2
            assert order_calls[0]["input"]["product_id"] == "p001"

    @pytest.mark.integration
    def test_multi_turn_tool_use(self, claude_client, mock_db):
        """Tool calls keep their context across a multi-turn conversation."""
        mock_db.query.return_value = [
            {"id": "p001", "name": "耳机A", "price": 299}
        ]
        mock_db.insert.return_value = "order_789"
        messages = [
            {"role": "user", "content": "搜索一下蓝牙耳机"}
        ]

        # Turn 1: search.
        # NOTE(review): only the first turn is exercised in the visible code.
        result1 = run_tool_use_loop(claude_client, messages, TOOLS)
        assert any(tc["name"] == "search_products" for tc in result1["tool_calls"])
57.4.2 使用 Mock Claude 的轻量级集成测试
每次集成测试都调用真实 Claude API 既昂贵又缓慢。我们可以构建一个 Mock Claude 来模拟工具调用行为,用于 CI/CD 流水线中的快速验证。
# tests/mocks/mock_claude.py
import json
from dataclasses import dataclass, field
from types import SimpleNamespace
from typing import Any
@dataclass
class MockToolUseBlock:
    """Stand-in for a ``tool_use`` content block of a model response."""

    type: str = "tool_use"
    id: str = "mock_tool_001"
    name: str = ""
    # default_factory gives each block its own dict; a None default would
    # contradict the annotation and break callers that do ``block.input.get``.
    input: dict = field(default_factory=dict)


@dataclass
class MockTextBlock:
    """Stand-in for a ``text`` content block of a model response."""

    type: str = "text"
    text: str = ""


@dataclass
class MockResponse:
    """Stand-in for a response: a stop reason plus a list of content blocks."""

    stop_reason: str
    content: list
class MockClaudeClient:
    """Mock client that replays a scripted sequence of tool calls.

    Each call returns the next entry of ``tool_call_plan`` as a ``tool_use``
    response; once the plan is exhausted it returns a fixed ``end_turn`` text
    response. The content of the incoming messages is recorded but does not
    influence which tool is returned.
    """

    def __init__(self, tool_call_plan: list[dict]):
        """Store the plan.

        Args:
            tool_call_plan: Predefined tool-call sequence; every element is a
                dict with ``name`` and ``input`` describing one tool call.
        """
        self.tool_call_plan = tool_call_plan
        self.call_index = 0
        self.messages_received = []
        # Mirror the real client's ``client.messages.create(...)`` shape so
        # helpers written for the real client accept this mock unchanged.
        self.messages = SimpleNamespace(create=self.messages_create)

    def messages_create(self, **kwargs):
        """Return the next scripted response and record the ``messages`` kwarg."""
        self.messages_received.append(kwargs.get("messages", []))

        if self.call_index >= len(self.tool_call_plan):
            # Plan exhausted: finish the conversation with a canned answer.
            return MockResponse(
                stop_reason="end_turn",
                content=[MockTextBlock(text="根据搜索结果,我找到了以下产品...")],
            )

        plan = self.tool_call_plan[self.call_index]
        self.call_index += 1
        return MockResponse(
            stop_reason="tool_use",
            content=[
                MockToolUseBlock(
                    # The id is derived from the 1-based position in the plan.
                    id=f"mock_tool_{self.call_index:03d}",
                    name=plan["name"],
                    input=plan["input"],
                )
            ],
        )
# tests/test_integration_mock.py
from tests.mocks.mock_claude import MockClaudeClient
from plugins.ecommerce import TOOLS, execute_tool
class TestWithMockClaude:
    """Integration-style tests that run against the mock client."""

    def test_search_flow_with_mock(self, mock_db):
        """Exercise the search flow with the mock client, without a real API call."""
        mock_db.query.return_value = [{"id": "p001", "name": "耳机"}]
        mock_client = MockClaudeClient(tool_call_plan=[
            {"name": "search_products", "input": {"query": "耳机", "max_results": 5}}
        ])
        messages = [{"role": "user", "content": "帮我搜索耳机"}]

        # Drive one round of the tool loop: without actually asking the mock
        # for a response, call_index would still be 0 and the assertions
        # below could never pass.
        response = mock_client.messages_create(messages=messages, tools=TOOLS)
        for block in response.content:
            if block.type == "tool_use":
                execute_tool(block.name, block.input)

        assert response.stop_reason == "tool_use"
        assert mock_client.call_index == 1
        # NOTE(review): assumes search_products calls db.query exactly once.
        mock_db.query.assert_called_once()
57.5 沙箱环境:隔离副作用
对于有写操作的 Plugin(创建订单、发送邮件、修改数据库),必须在沙箱环境中测试,防止测试数据污染生产环境。
57.5.1 Docker 沙箱配置
# docker-compose.test.yml
# Sandbox stack: throwaway Postgres and Redis, a WireMock stand-in for
# external APIs, and the container that runs the sandbox-marked tests.
version: "3.8"

services:
  test-db:
    image: postgres:15-alpine
    environment:
      POSTGRES_DB: plugin_test
      POSTGRES_USER: testuser
      POSTGRES_PASSWORD: testpass
    ports:
      - "5433:5432"  # different host port, avoids clashing with the dev database
    volumes:
      - ./tests/fixtures/init.sql:/docker-entrypoint-initdb.d/init.sql
    # Lets dependants wait until Postgres really accepts connections.
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U testuser -d plugin_test"]
      interval: 5s
      timeout: 5s
      retries: 10

  test-redis:
    image: redis:7-alpine
    ports:
      - "6380:6379"
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 10

  wiremock:
    image: wiremock/wiremock:3.3.1
    ports:
      - "8080:8080"
    volumes:
      - ./tests/wiremock:/home/wiremock

  plugin-test-runner:
    build:
      context: .
      dockerfile: Dockerfile.test
    environment:
      DATABASE_URL: postgresql://testuser:testpass@test-db:5432/plugin_test
      REDIS_URL: redis://test-redis:6379
      EXTERNAL_API_BASE: http://wiremock:8080
    # A plain depends_on list only orders container start-up; waiting for
    # "healthy" keeps pytest from racing the database initialisation.
    depends_on:
      test-db:
        condition: service_healthy
      test-redis:
        condition: service_healthy
      wiremock:
        condition: service_started
    command: pytest tests/ -m "sandbox" -v
57.5.2 WireMock 模拟外部 API
// tests/wiremock/mappings/payment-api.json
{
  "mappings": [
    {
      "request": {
        "method": "POST",
        "url": "/v1/payments",
        "bodyPatterns": [
          {"matchesJsonPath": "$.amount"}
        ]
      },
      "response": {
        "status": 200,
        "headers": {"Content-Type": "application/json"},
        "jsonBody": {
          "payment_id": "pay_test_001",
          "status": "succeeded",
          "amount": "{{jsonPath request.body '$.amount'}}"
        },
        "transformers": ["response-template"]
      }
    },
    {
      "priority": 1,
      "request": {
        "method": "POST",
        "url": "/v1/payments",
        "bodyPatterns": [
          {"matchesJsonPath": "$[?(@.amount > 10000)]"}
        ]
      },
      "response": {
        "status": 402,
        "headers": {"Content-Type": "application/json"},
        "jsonBody": {
          "error": "payment_failed",
          "message": "金额超过限制"
        }
      }
    }
  ]
}
57.5.3 事务回滚策略
# tests/conftest.py
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
@pytest.fixture(scope="function")
def db_session():
    """Give each test its own transaction and roll it back afterwards.

    Opens a connection to ``DATABASE_URL``, begins a transaction, binds a
    session to that connection and installs it as
    ``plugins.ecommerce.db_session`` while the test runs.

    Yields:
        The session bound to the open transaction.
    """
    import os  # local import: os is not imported at the top of this module

    import plugins.ecommerce as plugin

    engine = create_engine(os.environ["DATABASE_URL"])
    connection = engine.connect()
    transaction = connection.begin()
    session = sessionmaker(bind=connection)()

    # Inject the session into the plugin module for the duration of the test.
    original_session = plugin.db_session
    plugin.db_session = session
    try:
        yield session
    finally:
        # Restore the plugin first so a failing cleanup step cannot leave the
        # test session installed for later tests.
        plugin.db_session = original_session
        # Roll back whether the test passed or failed.
        session.close()
        transaction.rollback()
        connection.close()
        # Release the pool; otherwise every test leaves an engine behind.
        engine.dispose()
@pytest.mark.sandbox
class TestCreateOrderSandbox:
    """End-to-end create_order tests that run inside the sandbox."""

    # NOTE(review): create_order, Order and PaymentError are not imported in
    # the visible snippet, and the mock_external_api fixture is not defined
    # in it — confirm where they come from.

    def test_order_persisted_to_database(self, db_session, mock_external_api):
        """A created order must be persisted to the database."""
        created = create_order("user_001", "p001", 2)

        # The row must really exist in the database.
        stored = db_session.query(Order).filter_by(id=created["order_id"]).first()
        assert stored is not None
        assert stored.quantity == 2
        # db_session rolls the transaction back afterwards, so other tests
        # are not affected.

    def test_payment_failure_rolls_back_order(self, db_session, mock_external_api):
        """A failed payment must roll the order back."""
        mock_external_api.set_payment_fail(amount=99999)

        with pytest.raises(PaymentError):
            create_order("user_001", "p001", 999)  # over the limit

        # No half-finished order may be left behind.
        leftovers = db_session.query(Order).filter_by(user_id="user_001").all()
        assert len(leftovers) == 0
57.6 测试覆盖率与 CI/CD 集成
57.6.1 pytest 配置与标记策略
# pytest.ini
[pytest]
# Markers used to select a test tier with `pytest -m <name>`.
markers =
    unit: 单元测试,无外部依赖
    integration: 集成测试,需要 Claude API
    sandbox: 沙箱测试,需要 Docker 环境
    slow: 执行时间超过 5 秒的测试
# Test discovery rules.
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
# Coverage configuration
# NOTE(review): addopts applies to every run, so --cov-fail-under=80 is also
# enforced when only one marker tier is selected — confirm that is intended.
addopts =
    --cov=plugins
    --cov-report=html:coverage_report
    --cov-report=term-missing
    --cov-fail-under=80
57.6.2 GitHub Actions CI 流水线
# .github/workflows/plugin-tests.yml
# Three tiers: unit tests on every trigger, integration tests (real API key)
# only for pushes to main, sandbox tests against a Postgres service container.
name: Plugin Test Suite

on:
  push:
    paths: ['plugins/**', 'tests/**']
  pull_request:

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install -r requirements-test.txt
      - run: pytest -m "unit" -v --tb=short
      - uses: codecov/codecov-action@v3

  integration-tests:
    runs-on: ubuntu-latest
    needs: unit-tests
    # Pushes to main only: these tests call the real API and cost money.
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install -r requirements-test.txt
      - run: pytest -m "integration" -v --tb=long
        timeout-minutes: 10

  sandbox-tests:
    runs-on: ubuntu-latest
    needs: unit-tests
    services:
      postgres:
        image: postgres:15
        env:
          POSTGRES_DB: plugin_test
          POSTGRES_USER: testuser
          POSTGRES_PASSWORD: testpass
        ports: ['5432:5432']
        # Without a health check the steps can start before Postgres accepts
        # connections, which makes the first tests fail intermittently.
        options: >-
          --health-cmd "pg_isready -U testuser -d plugin_test"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install -r requirements-test.txt
      - run: pytest -m "sandbox" -v
        env:
          DATABASE_URL: postgresql://testuser:testpass@localhost:5432/plugin_test
57.7 常见测试陷阱与解决方案
陷阱一:测试依赖 Claude 的非确定性行为
Claude 的工具选择在某些边界情况下是非确定性的。不要写出这样的测试:
# Anti-pattern: relies too heavily on Claude's exact behavior
def test_exact_tool_call_sequence(claude_client):
    result = run_tool_use_loop(claude_client, [...], TOOLS)
    # Claude may search first and then order, or may simply ask for more information
    assert result["tool_calls"][0]["name"] == "search_products"  # brittle!
    assert result["tool_calls"][1]["name"] == "create_order"  # brittle!
# Recommended: assert on properties of the outcome, not on an exact call sequence
def test_order_eventually_created(claude_client, mock_db):
    """An order is created at some point, whatever else the model calls first."""
    # A complete row: a dict literal containing a bare `...` placeholder is a
    # SyntaxError, so the example spells the fields out.
    mock_db.query.return_value = [{"id": "p001", "name": "耳机A", "price": 299}]
    mock_db.insert.return_value = "order_001"

    result = run_tool_use_loop(
        claude_client,
        [{"role": "user", "content": "帮我买 2 个 p001 产品,用户 ID user_001"}],
        TOOLS,
    )

    # Check the end result: create_order was called at least once.
    order_calls = [tc for tc in result["tool_calls"] if tc["name"] == "create_order"]
    assert len(order_calls) >= 1
陷阱二:忽略工具返回内容的格式
# A tool should return a string or JSON-serializable data, not complex objects
def search_products_bad(query: str) -> list:
    return db.query(Product).filter(...).all()  # SQLAlchemy objects cannot be serialized!
def search_products_good(query: str) -> list[dict]:
    """Return the matching products as plain dicts with id, name and price."""
    results = db.query(Product).filter(...).all()
    # Copy only the needed attributes into dicts so the result can be serialized.
    return [{"id": p.id, "name": p.name, "price": p.price} for p in results]
陷阱三:沙箱环境与生产环境配置不一致
使用环境变量严格区分环境,并在测试初始化阶段验证:
# tests/conftest.py
def pytest_configure(config):
    """Make sure the tests cannot accidentally run against a production database.

    Reads ``DATABASE_URL`` from the environment. The run is aborted when the
    URL contains "prod" (case-insensitive); a warning is emitted when the URL
    neither targets localhost / 127.0.0.1 nor names a database ending in
    ``_test`` or ``_dev``.

    Args:
        config: The pytest config object (unused).
    """
    # Local imports: the snippet that defines this hook imports none of these.
    import os
    import warnings
    from urllib.parse import urlsplit

    db_url = os.environ.get("DATABASE_URL", "")
    if "prod" in db_url.lower():
        import pytest

        pytest.exit("检测到生产数据库 URL,终止测试!", returncode=1)

    # Look at the parsed host and database name: a suffix check on the whole
    # URL never matches "localhost" once a port or database path follows it.
    try:
        parts = urlsplit(db_url)
        host = parts.hostname or ""
        db_name = parts.path.rsplit("/", 1)[-1]
    except ValueError:
        # Malformed URL: treat it as "not recognisably a test database".
        host, db_name = "", ""

    is_local = host in ("localhost", "127.0.0.1")
    if not (is_local or db_name.endswith(("_test", "_dev"))):
        warnings.warn(f"数据库 URL 可能不是测试环境: {db_url}")
小结
完善的 Plugin 测试策略遵循金字塔模型:大量的单元测试(快速、无依赖)作为基础,适量的集成测试(验证 Claude 协作)作为中层,少量的沙箱测试(验证端到端副作用)作为顶层。关键实践包括:Schema 验证测试防止参数错误、事务回滚隔离写操作副作用、Mock Claude 客户端加速 CI 流水线、环境变量守卫防止误操作生产环境。测试不是负担,而是 Plugin 从原型走向生产的必经之路。