第 57 章

Claude.ai Connector 开发:Remote MCP + OAuth 2.1 完整接入指南

第五十七章:Plugin 测试策略:单元测试、集成测试与沙箱环境

57.1 为什么 Plugin 测试至关重要

Plugin 是 Claude 能力的延伸触角。一个 Plugin 可以搜索网络、操作数据库、调用外部 API、修改文件系统。这意味着 Plugin 的错误不再局限于"输出一段错误文本",而可能造成真实的副作用:删除了不该删除的记录、发送了错误的邮件、触发了误操作的付款流程。

与普通函数不同,Plugin 的测试还需要面对几个特殊挑战:

  1. 外部依赖复杂:Plugin 往往调用第三方 API,测试时无法依赖真实服务的稳定性
  2. Claude 调用不可预测:同样的 prompt 不一定每次都触发同样的工具调用
  3. 工具参数需要验证:Claude 生成的 JSON 参数必须符合 schema,否则执行失败
  4. 副作用难以回滚:对数据库、文件系统的写操作在测试环境中需要隔离

一套完善的 Plugin 测试策略应当覆盖三个层次:单元测试(验证工具函数逻辑)、集成测试(验证 Claude 与工具的协作)、沙箱测试(在隔离环境中验证端到端流程)。

57.2 Plugin 架构回顾

在进入测试策略之前,先回顾 Plugin 的典型结构,这决定了我们在哪个层面切入测试。

# 典型的 Plugin 结构
import anthropic
import json
from typing import Any

# 工具函数(纯业务逻辑层)
def search_products(query: str, max_results: int = 5) -> list[dict]:
    """Search the product database.

    Illustrative stub: the body is elided (``...``), so as written the
    function returns ``None`` rather than the annotated list.
    """
    # Database query logic goes here
    ...

def create_order(user_id: str, product_id: str, quantity: int) -> dict:
    """Create an order.

    Illustrative stub: the body is elided (``...``), so as written the
    function returns ``None`` rather than the annotated dict.
    """
    # Write-operation logic goes here
    ...

# 工具 Schema 定义层
# Tool definitions handed to the model: one entry per tool, each with a
# name, a description and a JSON Schema (`input_schema`) for its arguments.
TOOLS = [
    {
        # Schema for search_products: only `query` is required.
        "name": "search_products",
        "description": "搜索产品目录,返回匹配的产品列表",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "搜索关键词"},
                "max_results": {"type": "integer", "description": "最大返回数量", "default": 5}
            },
            "required": ["query"]
        }
    },
    {
        # Schema for create_order: all three fields are required and
        # `quantity` must be at least 1.
        "name": "create_order",
        "description": "根据用户选择创建新订单",
        "input_schema": {
            "type": "object",
            "properties": {
                "user_id": {"type": "string"},
                "product_id": {"type": "string"},
                "quantity": {"type": "integer", "minimum": 1}
            },
            "required": ["user_id", "product_id", "quantity"]
        }
    }
]

# 工具调度层
def execute_tool(tool_name: str, tool_input: dict) -> Any:
    """Route a tool call to the function that implements it.

    Args:
        tool_name: Name of the tool to run ("search_products" or
            "create_order").
        tool_input: Keyword arguments forwarded to the tool function.

    Returns:
        Whatever the selected tool function returns.

    Raises:
        ValueError: If ``tool_name`` is not one of the known tools.
    """
    # Guard clause: reject anything outside the known tool set up front.
    if tool_name not in ("search_products", "create_order"):
        raise ValueError(f"未知工具: {tool_name}")

    # The handler is looked up only after the name has been validated.
    handler = search_products if tool_name == "search_products" else create_order
    return handler(**tool_input)

这个结构清晰地分为三层:业务逻辑层、Schema 定义层、调度层。测试策略也围绕这三层展开。

57.3 单元测试:验证工具函数逻辑

单元测试针对纯粹的工具函数,不涉及 Claude API 调用。目标是验证每个工具函数在各种输入下的行为是否符合预期。

57.3.1 使用 pytest 构建工具函数测试套件

# tests/test_tools_unit.py
import pytest
from unittest.mock import patch, MagicMock
from plugins.ecommerce import search_products, create_order

class TestSearchProducts:
    """Unit tests for the search_products tool."""

    def test_basic_search_returns_results(self, mock_db):
        """A normal search returns the rows supplied by the mocked db."""
        mock_db.query.return_value = [
            {"id": "p001", "name": "蓝牙耳机", "price": 299},
            {"id": "p002", "name": "蓝牙音箱", "price": 199}
        ]
        results = search_products("蓝牙")
        assert len(results) == 2
        assert results[0]["id"] == "p001"

    def test_max_results_limit(self, mock_db):
        """max_results caps the number of returned rows."""
        # The mock returns 10 rows; only 3 are expected back.
        mock_db.query.return_value = [{"id": f"p{i}"} for i in range(10)]
        results = search_products("耳机", max_results=3)
        assert len(results) == 3

    def test_empty_query_raises_error(self):
        """An empty query string raises ValueError."""
        with pytest.raises(ValueError, match="查询字符串不能为空"):
            search_products("")

    def test_no_results_returns_empty_list(self, mock_db):
        """No matches yields an empty list rather than an exception."""
        mock_db.query.return_value = []
        results = search_products("完全不存在的商品xyz")
        assert results == []

    def test_sql_injection_sanitized(self, mock_db):
        """A SQL-injection-style query string must be sanitized."""
        # Make sure no database exception is raised
        results = search_products("'; DROP TABLE products; --")
        mock_db.query.assert_called_once()
        # Check that the arguments passed to db.query do not contain the payload.
        # NOTE(review): if search_products uses bound parameters, the raw text
        # would legitimately appear in the call args and this assertion would
        # fail - confirm how the db layer is expected to receive the query.
        call_args = mock_db.query.call_args
        assert "DROP TABLE" not in str(call_args)

class TestCreateOrder:
    """Unit tests for the create_order tool."""

    def test_successful_order_creation(self, mock_db):
        """A successful call returns the order id and a "created" status."""
        mock_db.insert.return_value = "order_12345"
        result = create_order("user_001", "p001", 2)
        assert result["order_id"] == "order_12345"
        assert result["status"] == "created"

    def test_zero_quantity_raises_error(self):
        """A quantity of 0 raises a validation error."""
        with pytest.raises(ValueError, match="数量必须大于 0"):
            create_order("user_001", "p001", 0)

    def test_negative_quantity_raises_error(self):
        """A negative quantity raises a validation error."""
        with pytest.raises(ValueError):
            create_order("user_001", "p001", -1)

    def test_nonexistent_product_raises_error(self, mock_db):
        """An unknown product id raises an error."""
        # The mocked product lookup reports "not found".
        mock_db.get_product.return_value = None
        with pytest.raises(ValueError, match="产品不存在"):
            create_order("user_001", "nonexistent_product", 1)

@pytest.fixture
def mock_db():
    """Patch ``plugins.ecommerce.db`` for one test and yield the mock.

    The patch is undone when the ``with`` block exits after the test.
    """
    with patch("plugins.ecommerce.db") as mock:
        yield mock

57.3.2 测试工具 Schema 的合法性

Schema 定义错误会导致 Claude 无法正确调用工具,或者调用时传入不合法的参数。我们需要专门测试 schema 本身。

# tests/test_schema_validation.py
import pytest
import jsonschema
from plugins.ecommerce import TOOLS

def get_tool_schema(tool_name: str) -> dict:
    """Return the ``input_schema`` of the first tool named ``tool_name``.

    Raises:
        KeyError: If no entry in ``TOOLS`` has that name.
    """
    not_found = object()  # sentinel, so any schema value counts as a hit
    candidates = (
        entry["input_schema"] for entry in TOOLS if entry["name"] == tool_name
    )
    schema = next(candidates, not_found)
    if schema is not_found:
        raise KeyError(f"工具 {tool_name} 不存在")
    return schema

class TestSearchProductsSchema:
    """Validates sample payloads against the search_products input schema."""

    # Replaced per test by the autouse `setup` fixture below.
    schema = None

    @pytest.fixture(autouse=True)
    def setup(self):
        """Load the search_products schema before every test in this class."""
        self.schema = get_tool_schema("search_products")

    def test_valid_minimal_input(self):
        """Supplying only the required field passes validation."""
        jsonschema.validate({"query": "耳机"}, self.schema)

    def test_valid_full_input(self):
        """Supplying every field passes validation."""
        jsonschema.validate({"query": "耳机", "max_results": 10}, self.schema)

    def test_missing_required_field_fails(self):
        """Omitting the required `query` field fails validation."""
        with pytest.raises(jsonschema.ValidationError):
            jsonschema.validate({"max_results": 5}, self.schema)

    def test_wrong_type_for_max_results(self):
        """Passing a string for max_results fails validation."""
        with pytest.raises(jsonschema.ValidationError):
            jsonschema.validate({"query": "耳机", "max_results": "five"}, self.schema)

class TestCreateOrderSchema:
    """Validates sample payloads against the create_order input schema."""

    def test_quantity_minimum_constraint(self):
        """A quantity below 1 fails validation."""
        schema = get_tool_schema("create_order")
        with pytest.raises(jsonschema.ValidationError):
            jsonschema.validate(
                {"user_id": "u1", "product_id": "p1", "quantity": 0},
                schema
            )

57.4 集成测试:验证 Claude 与工具的协作

集成测试的目标是验证 Claude 在接收到用户消息后,能否正确识别需要调用哪个工具、传入正确的参数,以及工具返回结果后 Claude 能否给出合理的最终回复。

57.4.1 使用真实 Claude API 的集成测试

# tests/test_integration_claude.py
import pytest
import anthropic
from plugins.ecommerce import TOOLS, execute_tool

@pytest.fixture
def claude_client():
    """Return a default-constructed ``anthropic.Anthropic`` client.

    No credentials are passed explicitly.
    NOTE(review): presumably the SDK picks the key up from the environment
    (the CI workflow sets ANTHROPIC_API_KEY) - confirm against the SDK docs.
    """
    return anthropic.Anthropic()

def run_tool_use_loop(client, messages: list, tools: list, max_iterations: int = 10) -> dict:
    """Drive a tool-use conversation until the model stops requesting tools.

    Each round sends ``messages`` to the model. While the response stops with
    ``"tool_use"``, every requested tool is executed through ``execute_tool``
    and its result is appended to the conversation for the next round.

    Args:
        client: Object exposing ``messages.create(**kwargs)``.
        messages: Initial conversation. The caller's list is not mutated;
            a new list is built each round.
        tools: Tool definitions forwarded to the model.
        max_iterations: Upper bound on model round-trips, so a model (or
            mock) that keeps requesting tools cannot loop forever.

    Returns:
        A dict with ``final_text`` (text of the last text block in the final
        response, ``""`` if none), ``tool_calls`` (list of ``{"name",
        "input"}`` in call order) and ``message_count`` (length of the
        conversation at the end).

    Raises:
        RuntimeError: If the model is still requesting tools after
            ``max_iterations`` rounds.
    """
    import json  # local import: only needed to serialise tool results

    tool_calls = []
    final_text = ""

    for _ in range(max_iterations):
        response = client.messages.create(
            model="claude-opus-4-5",
            max_tokens=1024,
            tools=tools,
            messages=messages,
        )

        if response.stop_reason != "tool_use":
            # Terminal response (end_turn, max_tokens, ...): keep whatever
            # text was produced instead of dropping it for non-end_turn stops.
            for block in response.content:
                if hasattr(block, "text"):
                    final_text = block.text
            break

        # The model asked for one or more tools: run each and collect results.
        tool_results = []
        for block in response.content:
            if block.type != "tool_use":
                continue
            tool_calls.append({"name": block.name, "input": block.input})
            result = execute_tool(block.name, block.input)
            # Send strings as-is and everything else as JSON, so the model
            # does not receive a Python repr. default=str keeps the old
            # "never fails to serialise" behaviour for exotic values.
            if isinstance(result, str):
                content = result
            else:
                content = json.dumps(result, ensure_ascii=False, default=str)
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": content,
            })

        messages = messages + [
            {"role": "assistant", "content": response.content},
            {"role": "user", "content": tool_results},
        ]
    else:
        # The loop ran out of rounds without reaching a terminal response.
        raise RuntimeError(
            f"tool-use loop did not finish within {max_iterations} iterations"
        )

    return {
        "final_text": final_text,
        "tool_calls": tool_calls,
        "message_count": len(messages),
    }

class TestClaudeToolIntegration:
    """Integration tests for Claude working together with the tools."""

    @pytest.mark.integration
    def test_search_triggered_for_product_query(self, claude_client, mock_db):
        """Asking about a product should trigger the search_products tool."""
        mock_db.query.return_value = [
            {"id": "p001", "name": "索尼 WH-1000XM5", "price": 2499}
        ]

        result = run_tool_use_loop(
            claude_client,
            [{"role": "user", "content": "帮我搜索一下降噪耳机"}],
            TOOLS
        )

        # Inspect the search_products calls themselves: the first recorded
        # tool call is not guaranteed to be the search.
        search_calls = [
            tc for tc in result["tool_calls"] if tc["name"] == "search_products"
        ]
        assert search_calls
        assert any("降噪" in tc["input"].get("query", "") for tc in search_calls)

    @pytest.mark.integration
    def test_create_order_requires_confirmation(self, claude_client, mock_db):
        """If create_order is called at all, its arguments must be correct."""
        result = run_tool_use_loop(
            claude_client,
            [{"role": "user", "content": "帮我直接下单购买产品 p001,用户 ID user_001,买 2 个"}],
            TOOLS
        )
        # Claude will usually ask for confirmation instead of calling
        # create_order directly, so the test only validates the arguments
        # when a call actually happened.
        order_calls = [tc for tc in result["tool_calls"] if tc["name"] == "create_order"]
        if order_calls:
            assert order_calls[0]["input"]["quantity"] == 2
            assert order_calls[0]["input"]["product_id"] == "p001"

    @pytest.mark.integration
    def test_multi_turn_tool_use(self, claude_client, mock_db):
        """Tool use keeps its context across a multi-turn conversation."""
        mock_db.query.return_value = [
            {"id": "p001", "name": "耳机A", "price": 299}
        ]
        mock_db.insert.return_value = "order_789"

        messages = [
            {"role": "user", "content": "搜索一下蓝牙耳机"}
        ]

        # Round one: search.
        # NOTE(review): only the first round is exercised here; the follow-up
        # turn that would use `mock_db.insert` is not part of this test yet.
        result1 = run_tool_use_loop(claude_client, messages, TOOLS)
        assert any(tc["name"] == "search_products" for tc in result1["tool_calls"])

57.4.2 使用 Mock Claude 的轻量级集成测试

每次集成测试都调用真实 Claude API 既昂贵又缓慢。我们可以构建一个 Mock Claude 来模拟工具调用行为,用于 CI/CD 流水线中的快速验证。

# tests/mocks/mock_claude.py
from dataclasses import dataclass
from typing import Any
import json

@dataclass
class MockToolUseBlock:
    """Stand-in for a ``tool_use`` content block in a model response."""

    type: str = "tool_use"
    id: str = "mock_tool_001"
    name: str = ""
    # None means "no arguments supplied"; the annotation now says so.
    input: "dict | None" = None


@dataclass
class MockTextBlock:
    """Stand-in for a ``text`` content block in a model response."""

    type: str = "text"
    text: str = ""


@dataclass
class MockResponse:
    """Stand-in for a model response: a stop reason plus content blocks."""

    stop_reason: str
    content: list


class _MockMessagesAPI:
    """Adapter so the mock can be called as ``client.messages.create(...)``."""

    def __init__(self, client):
        self._client = client

    def create(self, **kwargs):
        """Delegate to ``MockClaudeClient.messages_create``."""
        return self._client.messages_create(**kwargs)


class MockClaudeClient:
    """Mock client that replays a scripted sequence of tool calls.

    Each call returns the next planned tool call as a ``tool_use`` response.
    Once the plan is exhausted, every further call returns a fixed
    ``end_turn`` text response. The content of the incoming messages is
    recorded but does not influence what is returned.

    The client can be called either as ``client.messages_create(...)`` or,
    mirroring how ``run_tool_use_loop`` calls its client, as
    ``client.messages.create(...)``.
    """

    def __init__(self, tool_call_plan: list[dict]):
        """Store the scripted plan.

        Args:
            tool_call_plan: Ordered tool calls to replay; each element is a
                dict with ``name`` and ``input`` keys.
        """
        self.tool_call_plan = tool_call_plan
        self.call_index = 0
        self.messages_received = []
        # Expose the same attribute path the loop helper uses on its client.
        self.messages = _MockMessagesAPI(self)

    def messages_create(self, **kwargs):
        """Return the next scripted response and record the messages sent.

        Returns:
            A ``MockResponse`` with ``stop_reason="tool_use"`` while planned
            calls remain, otherwise one with ``stop_reason="end_turn"``.
        """
        self.messages_received.append(kwargs.get("messages", []))

        if self.call_index < len(self.tool_call_plan):
            plan = self.tool_call_plan[self.call_index]
            # Incremented before building the id, so ids start at 001.
            self.call_index += 1
            return MockResponse(
                stop_reason="tool_use",
                content=[
                    MockToolUseBlock(
                        id=f"mock_tool_{self.call_index:03d}",
                        name=plan["name"],
                        input=plan["input"],
                    )
                ],
            )

        return MockResponse(
            stop_reason="end_turn",
            content=[MockTextBlock(text="根据搜索结果,我找到了以下产品...")],
        )

# tests/test_integration_mock.py
from tests.mocks.mock_claude import MockClaudeClient
from plugins.ecommerce import TOOLS, execute_tool

class TestWithMockClaude:
    """Tool-flow tests that replace the real API with MockClaudeClient."""

    def test_search_flow_with_mock(self, mock_db):
        """Run the search flow against the mock client, with no real API call."""
        mock_db.query.return_value = [{"id": "p001", "name": "耳机"}]

        mock_client = MockClaudeClient(tool_call_plan=[
            {"name": "search_products", "input": {"query": "耳机", "max_results": 5}}
        ])

        # Drive the tool loop against the mock client. Without actually
        # calling the client, the two assertions below could never hold.
        messages = [{"role": "user", "content": "帮我搜索耳机"}]
        response = mock_client.messages_create(messages=messages, tools=TOOLS)
        while response.stop_reason == "tool_use":
            tool_results = [
                {
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": str(execute_tool(block.name, block.input)),
                }
                for block in response.content
                if block.type == "tool_use"
            ]
            messages = messages + [
                {"role": "assistant", "content": response.content},
                {"role": "user", "content": tool_results},
            ]
            response = mock_client.messages_create(messages=messages, tools=TOOLS)

        # Exactly one planned tool call was consumed, and it reached the db.
        assert mock_client.call_index == 1
        mock_db.query.assert_called_once()

57.5 沙箱环境:隔离副作用

对于有写操作的 Plugin(创建订单、发送邮件、修改数据库),必须在沙箱环境中测试,防止测试数据污染生产环境。

57.5.1 Docker 沙箱配置

# docker-compose.test.yml
# Sandbox stack for plugin tests: a Postgres database, a Redis instance, a
# WireMock server standing in for external HTTP APIs, and the test runner.
version: "3.8"
services:
  test-db:
    image: postgres:15-alpine
    environment:
      POSTGRES_DB: plugin_test
      POSTGRES_USER: testuser
      POSTGRES_PASSWORD: testpass
    ports:
      - "5433:5432"  # use a different host port to avoid clashing with the dev database
    volumes:
      # Schema/seed script mounted into the image's init directory.
      - ./tests/fixtures/init.sql:/docker-entrypoint-initdb.d/init.sql

  test-redis:
    image: redis:7-alpine
    ports:
      - "6380:6379"

  wiremock:
    image: wiremock/wiremock:3.3.1
    ports:
      - "8080:8080"
    volumes:
      # Stub mappings live under ./tests/wiremock on the host.
      - ./tests/wiremock:/home/wiremock

  plugin-test-runner:
    build:
      context: .
      dockerfile: Dockerfile.test
    environment:
      # Inside the compose network, services are addressed by service name
      # and container port (not the remapped host ports above).
      DATABASE_URL: postgresql://testuser:testpass@test-db:5432/plugin_test
      REDIS_URL: redis://test-redis:6379
      EXTERNAL_API_BASE: http://wiremock:8080
    # NOTE(review): this short depends_on form presumably only orders
    # container start-up and does not wait for Postgres to accept
    # connections - consider a healthcheck plus `condition: service_healthy`.
    depends_on:
      - test-db
      - test-redis
      - wiremock
    # Runs only the tests marked "sandbox".
    command: pytest tests/ -m "sandbox" -v

57.5.2 WireMock 模拟外部 API

// tests/wiremock/mappings/payment-api.json
{
  "mappings": [
    {
      "priority": 5,
      "request": {
        "method": "POST",
        "url": "/v1/payments",
        "bodyPatterns": [
          {"matchesJsonPath": "$.amount"}
        ]
      },
      "response": {
        "status": 200,
        "headers": {"Content-Type": "application/json"},
        "jsonBody": {
          "payment_id": "pay_test_001",
          "status": "succeeded",
          "amount": "{{jsonPath request.body '$.amount'}}"
        },
        "transformers": ["response-template"]
      }
    },
    {
      "priority": 1,
      "request": {
        "method": "POST",
        "url": "/v1/payments",
        "bodyPatterns": [
          {"matchesJsonPath": "$[?(@.amount > 10000)]"}
        ]
      },
      "response": {
        "status": 402,
        "headers": {"Content-Type": "application/json"},
        "jsonBody": {
          "error": "payment_failed",
          "message": "金额超过限制"
        }
      }
    }
  ]
}

57.5.3 事务回滚策略

# tests/conftest.py
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

@pytest.fixture(scope="function")
def db_session():
    """Yield a session bound to a per-test transaction that is rolled back.

    A connection is opened from ``DATABASE_URL``, an outer transaction is
    started on it, and a session bound to that connection is installed as
    ``plugins.ecommerce.db_session`` for the duration of the test. On
    teardown the original module attribute is restored, the outer
    transaction is rolled back, and the connection and engine are released.

    NOTE(review): whether a ``session.commit()`` inside the code under test
    is still undone by the outer rollback depends on the SQLAlchemy version
    and session configuration - confirm for the version in use.

    Raises:
        KeyError: If ``DATABASE_URL`` is not set.
    """
    import os  # local import: this module's import block does not provide `os`
    import plugins.ecommerce as plugin

    engine = create_engine(os.environ["DATABASE_URL"])
    connection = engine.connect()
    try:
        transaction = connection.begin()
        session = sessionmaker(bind=connection)()

        # Inject the session into the plugin module.
        original_session = plugin.db_session
        plugin.db_session = session
        try:
            yield session
        finally:
            # Always restore the module and roll back, whether the test
            # passed or failed.
            plugin.db_session = original_session
            session.close()
            transaction.rollback()
    finally:
        connection.close()
        # One engine is created per test; dispose it so its connection pool
        # does not pile up across the test run.
        engine.dispose()

@pytest.mark.sandbox
class TestCreateOrderSandbox:
    """End-to-end tests for create_order inside the sandbox environment.

    NOTE(review): `create_order`, `Order`, `PaymentError` and the
    `mock_external_api` fixture are not defined or imported in the visible
    snippet - confirm where they come from.
    """

    def test_order_persisted_to_database(self, db_session, mock_external_api):
        """A created order should be persisted to the database."""
        result = create_order("user_001", "p001", 2)

        # Check that the row really exists in the database
        order = db_session.query(Order).filter_by(id=result["order_id"]).first()
        assert order is not None
        assert order.quantity == 2
        # The db_session fixture rolls the transaction back after the test,
        # so other tests are not affected.

    def test_payment_failure_rolls_back_order(self, db_session, mock_external_api):
        """A failed payment should leave no order behind."""
        mock_external_api.set_payment_fail(amount=99999)

        with pytest.raises(PaymentError):
            create_order("user_001", "p001", 999)  # exceeds the limit

        # Check that no half-created order is left in the database
        orders = db_session.query(Order).filter_by(user_id="user_001").all()
        assert len(orders) == 0

57.6 测试覆盖率与 CI/CD 集成

57.6.1 pytest 配置与标记策略

# pytest.ini
[pytest]
# Custom markers used to select test tiers with `-m`.
markers =
    unit: 单元测试,无外部依赖
    integration: 集成测试,需要 Claude API
    sandbox: 沙箱测试,需要 Docker 环境
    slow: 执行时间超过 5 秒的测试

# Test discovery rules.
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*

# Coverage configuration
# NOTE(review): the --cov* options presumably require the pytest-cov plugin,
# and --cov-fail-under=80 applies to every run that uses this file,
# including marker-filtered runs - confirm that is intended.
addopts = 
    --cov=plugins
    --cov-report=html:coverage_report
    --cov-report=term-missing
    --cov-fail-under=80

57.6.2 GitHub Actions CI 流水线

# .github/workflows/plugin-tests.yml
# CI workflow with three jobs: unit tests, then integration and sandbox
# tests, both of which wait for the unit-test job.
name: Plugin Test Suite

on:
  push:
    # Push builds run only when plugin or test files change.
    paths: ['plugins/**', 'tests/**']
  pull_request:

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install -r requirements-test.txt
      # NOTE(review): this selects tests marked "unit"; the test snippets
      # shown in this chapter carry no such marker - confirm they are marked.
      - run: pytest -m "unit" -v --tb=short
      - uses: codecov/codecov-action@v3

  integration-tests:
    runs-on: ubuntu-latest
    needs: unit-tests
    # Real-API tests run only for pushes to main.
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install -r requirements-test.txt
      - run: pytest -m "integration" -v --tb=long
        timeout-minutes: 10

  sandbox-tests:
    runs-on: ubuntu-latest
    needs: unit-tests
    services:
      # NOTE(review): no health-check options are set on this service, so
      # the test step may presumably start before Postgres is ready; only
      # Postgres is provided here, not the Redis/WireMock services - confirm.
      postgres:
        image: postgres:15
        env:
          POSTGRES_DB: plugin_test
          POSTGRES_USER: testuser
          POSTGRES_PASSWORD: testpass
        ports: ['5432:5432']
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install -r requirements-test.txt
      - run: pytest -m "sandbox" -v
        env:
          DATABASE_URL: postgresql://testuser:testpass@localhost:5432/plugin_test

57.7 常见测试陷阱与解决方案

陷阱一:测试依赖 Claude 的非确定性行为

Claude 的工具选择在某些边界情况下是非确定性的。不要写出这样的测试:

# 错误写法:过度依赖 Claude 的具体行为
def test_exact_tool_call_sequence(claude_client):
    """Anti-example: asserts an exact tool-call order, which is brittle."""
    result = run_tool_use_loop(claude_client, [...], TOOLS)
    # Claude may search and then order, or may ask for more information first
    assert result["tool_calls"][0]["name"] == "search_products"  # brittle!
    assert result["tool_calls"][1]["name"] == "create_order"     # brittle!
# 正确写法:测试结果的属性而非具体序列
def test_order_eventually_created(claude_client, mock_db):
    """Assert on the outcome (an order was created), not on the call order."""
    # A complete product row: a bare `...` inside a dict literal is a
    # SyntaxError, so the example spells the fields out.
    mock_db.query.return_value = [{"id": "p001", "name": "耳机A", "price": 299}]
    mock_db.insert.return_value = "order_001"

    result = run_tool_use_loop(
        claude_client,
        [{"role": "user", "content": "帮我买 2 个 p001 产品,用户 ID user_001"}],
        TOOLS
    )
    # Verify the end result: create_order was called at least once,
    # regardless of which other tools ran before it.
    order_calls = [tc for tc in result["tool_calls"] if tc["name"] == "create_order"]
    assert len(order_calls) >= 1

陷阱二:忽略工具返回内容的格式

# 工具应返回字符串或可序列化的 JSON,而非复杂对象
def search_products_bad(query: str) -> list:
    """Anti-example: returns the ORM query results directly."""
    return db.query(Product).filter(...).all()  # SQLAlchemy objects cannot be serialized!

def search_products_good(query: str) -> list[dict]:
    """Return each result as a plain dict with id, name and price."""
    results = db.query(Product).filter(...).all()
    # Copy only the needed attributes into plain dicts before returning.
    return [{"id": p.id, "name": p.name, "price": p.price} for p in results]

陷阱三:沙箱环境与生产环境配置不一致

使用环境变量严格区分环境,并在测试初始化阶段验证:

# tests/conftest.py
def pytest_configure(config):
    """Guard against running the test suite against a production database.

    Reads ``DATABASE_URL`` from the environment and:

    * aborts the run if the URL contains "prod" (case-insensitive);
    * otherwise warns unless the URL looks like a test target, meaning the
      host is local or the database name ends in ``_test`` / ``_dev``.

    Args:
        config: The pytest config object (unused).
    """
    import os
    import warnings
    from urllib.parse import urlsplit

    db_url = os.environ.get("DATABASE_URL", "")

    # Deliberately coarse substring match: "production" already contains
    # "prod", and a false positive is safer than touching production data.
    if "prod" in db_url.lower():
        pytest.exit("检测到生产数据库 URL,终止测试!", returncode=1)

    # Look at the parsed host and database name. A plain suffix test on the
    # whole URL can never match "localhost" once a port or database name
    # follows the host.
    try:
        parts = urlsplit(db_url)
        host = parts.hostname or ""
        db_name = parts.path.rstrip("/")
    except ValueError:
        # Unparseable URL: treat it as unknown and fall through to the warning.
        host, db_name = "", ""

    is_local_host = host in ("localhost", "127.0.0.1", "::1")
    is_test_db = db_name.endswith(("_test", "_dev"))
    # Keep accepting everything the original suffix check accepted.
    legacy_suffix_ok = db_url.endswith(("_test", "_dev", "localhost", "127.0.0.1"))

    if not (is_local_host or is_test_db or legacy_suffix_ok):
        warnings.warn(f"数据库 URL 可能不是测试环境: {db_url}")

小结

完善的 Plugin 测试策略遵循金字塔模型:大量的单元测试(快速、无依赖)作为基础,适量的集成测试(验证 Claude 协作)作为中层,少量的沙箱测试(验证端到端副作用)作为顶层。关键实践包括:Schema 验证测试防止参数错误、事务回滚隔离写操作副作用、Mock Claude 客户端加速 CI 流水线、环境变量守卫防止误操作生产环境。测试不是负担,而是 Plugin 从原型走向生产的必经之路。

本章评分
4.7  / 5  (3 评分)

💬 留言讨论