第 15 章

Extended Thinking 深度:budget_tokens / display 模式 / 多轮传递的完整机制

第十五章:Extended Thinking:深度推理的开启、预算控制与 Token 经济

15.1 Extended Thinking 是什么

Extended Thinking(扩展思考)是 Claude 的一项特殊能力:在生成最终回复之前,模型会在一个隐含的"思考空间"中进行详细的推理过程。这些思考内容通过 API 返回给开发者,但通常不直接展示给最终用户。

它的本质是让模型在给出答案前"先打草稿"——对复杂问题进行多角度分析、自我质疑、试错和逐步推导。这与人类面对难题时"先在纸上列提纲"的思维过程非常相似。

Extended Thinking 能显著提升的场景

不适合用 Extended Thinking 的场景

15.2 基础用法:开启 Extended Thinking

import anthropic

client = anthropic.Anthropic()

# Minimal request with Extended Thinking enabled.
response = client.messages.create(
    model="claude-opus-4-6",  # Extended Thinking gives the best results on opus
    max_tokens=16000,  # must be large enough: the thinking process itself consumes many tokens
    thinking={
        "type": "enabled",
        "budget_tokens": 10000  # maximum number of tokens thinking may use
    },
    messages=[
        {
            "role": "user",
            "content": "证明:对于任意正整数 n,1+2+3+...+n = n(n+1)/2"
        }
    ]
)

# The response contains two kinds of content blocks: print each with its own label
for block in response.content:
    if block.type == "thinking":
        print(f"[思考过程]\n{block.thinking}\n")
    elif block.type == "text":
        print(f"[最终回答]\n{block.text}")

max_tokens 与 budget_tokens 的关系

这是使用 Extended Thinking 时最容易出错的地方:

max_tokens >= budget_tokens + 预计输出 token 数(并且 budget_tokens 必须严格小于 max_tokens,否则请求会被直接拒绝)
# Common mistake: max_tokens is set too small
BAD_CONFIG = {
    "max_tokens": 1024,        # too small!
    "thinking": {"type": "enabled", "budget_tokens": 10000}  # 10000 thinking + output exceeds 1024
}

# Correct configuration
GOOD_CONFIG = {
    "max_tokens": 16000,       # enough room for thinking + output
    "thinking": {"type": "enabled", "budget_tokens": 10000}
}

15.3 budget_tokens 预算控制

预算对效果的影响

budget_tokens 不是一个"最小值",而是模型可以使用的"上限"。模型会根据任务复杂度决定实际使用多少思考 token:

def solve_with_variable_budget(problem: str, budget: int) -> dict:
    """Solve ``problem`` with a given thinking budget and report token usage.

    Args:
        problem: The user prompt sent to the model.
        budget: Value passed as ``thinking.budget_tokens``. ``max_tokens`` is
            set to ``budget + 4096`` so the final answer has room.

    Returns:
        A dict with:
        - ``budget``: the budget that was requested.
        - ``thinking_approx_tokens``: whitespace-separated word count summed
          over every thinking block. This is only a rough proxy, not a real
          token count, and it undercounts text written without spaces.
        - ``answer``: text of the last text block, or ``""`` if there is none.
        - ``total_tokens``: ``response.usage.output_tokens``.
    """

    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=budget + 4096,  # leave enough room for the final output
        thinking={
            "type": "enabled",
            "budget_tokens": budget
        },
        messages=[{"role": "user", "content": problem}]
    )

    thinking_tokens = 0
    answer = ""

    for block in response.content:
        if block.type == "thinking":
            # Accumulate: a response may carry more than one thinking block,
            # and plain assignment would keep only the last one.
            thinking_tokens += len(block.thinking.split())
        elif block.type == "text":
            answer = block.text

    return {
        "budget": budget,
        "thinking_approx_tokens": thinking_tokens,
        "answer": answer,
        "total_tokens": response.usage.output_tokens
    }

# Compare different budgets. The smallest value is 1024 because the API
# rejects a budget_tokens below that minimum.
problem = "如果在一个 8x8 棋盘上放 8 个皇后,使得任意两个皇后都不互相攻击,有多少种不同的放法?"

for budget in [1024, 5000, 10000, 20000]:
    result = solve_with_variable_budget(problem, budget)
    print(f"预算 {budget}: 思考约 {result['thinking_approx_tokens']} tokens, "
          f"总输出 {result['total_tokens']} tokens")

推荐预算设置

任务类型 推荐 budget_tokens max_tokens 建议
简单推理(几步) 1,024–3,000 4,000–6,000
中等复杂(数学、逻辑) 5,000–10,000 12,000–16,000
复杂分析(架构设计、策略) 10,000–20,000 20,000–30,000
极难问题(研究级别) 20,000–50,000 60,000+

重要:使用 betas=["output-128k-2025-02-19"] 可以将输出上限扩展到 128K tokens,适用于需要超长输出的场景:

# `betas` is accepted only by the SDK's beta namespace (client.beta.messages),
# not by client.messages.create. A request with this many output tokens is
# also streamed: the SDK refuses very large non-streaming requests because
# they can exceed its default timeout.
with client.beta.messages.stream(
    model="claude-opus-4-6",
    max_tokens=100000,
    betas=["output-128k-2025-02-19"],  # extends the output limit to 128K
    thinking={
        "type": "enabled",
        "budget_tokens": 80000
    },
    messages=[{"role": "user", "content": "对这个复杂的系统架构进行全面分析..."}]
) as stream:
    # Collect the stream back into a single message, so `response` has the
    # same shape as the result of a non-streaming call.
    response = stream.get_final_message()

15.4 Token 经济分析

思考 Token 的计费方式

Extended Thinking 中的思考内容(thinking blocks)同样按输出 token 计费。这意味着:

总成本 = 输入 token 费用 + (thinking tokens + text tokens) × 输出 token 单价

以 claude-opus-4-6 为例($15/MTok 输入,$75/MTok 输出):

def calculate_thinking_cost(
    input_tokens: int,
    thinking_tokens: int,
    output_tokens: int,
    model: str = "claude-opus-4-6"
) -> dict:
    """Compute the cost of an Extended Thinking request.

    Thinking tokens are priced at the output-token rate, exactly like the
    visible answer tokens.

    Args:
        input_tokens: Number of prompt tokens.
        thinking_tokens: Number of tokens spent in thinking blocks.
        output_tokens: Number of tokens in the final text answer.
        model: Key into the built-in price table. Unknown models fall back
            to the ``claude-opus-4-6`` prices.

    Returns:
        A dict with ``input_cost``, ``thinking_cost``, ``output_cost`` and
        ``total_cost`` (each rounded to 6 decimals), plus
        ``thinking_percentage``: the share of thinking in the generated-token
        cost, rounded to 1 decimal. It is ``0.0`` when nothing was generated.
    """

    # Prices per million tokens.
    PRICES = {
        "claude-opus-4-6": {"input": 15.0, "output": 75.0},
        "claude-sonnet-4-6": {"input": 3.0, "output": 15.0},
    }

    p = PRICES.get(model, PRICES["claude-opus-4-6"])

    input_cost = (input_tokens / 1_000_000) * p["input"]
    thinking_cost = (thinking_tokens / 1_000_000) * p["output"]
    output_cost = (output_tokens / 1_000_000) * p["output"]

    # Guard the ratio: with zero thinking and zero output tokens the
    # denominator is 0 and the division would raise ZeroDivisionError.
    generated_cost = thinking_cost + output_cost
    if generated_cost:
        thinking_percentage = round(thinking_cost / generated_cost * 100, 1)
    else:
        thinking_percentage = 0.0

    return {
        "input_cost": round(input_cost, 6),
        "thinking_cost": round(thinking_cost, 6),
        "output_cost": round(output_cost, 6),
        "total_cost": round(input_cost + thinking_cost + output_cost, 6),
        "thinking_percentage": thinking_percentage
    }

# Example: a request that spends 8000 tokens on thinking
cost = calculate_thinking_cost(
    input_tokens=500,
    thinking_tokens=8000,
    output_tokens=500
)
print(cost)
# {'input_cost': 0.0075, 'thinking_cost': 0.6, 'output_cost': 0.0375, 'total_cost': 0.645, 'thinking_percentage': 94.1}

成本控制策略

策略 1:按任务复杂度分级

def smart_thinking_call(
    client: anthropic.Anthropic,
    problem: str,
    complexity: str = "auto"
) -> str:
    """Answer ``problem`` using a model and thinking budget chosen by complexity.

    When ``complexity`` is ``"auto"``, a small haiku request classifies the
    problem first. Any label other than simple/medium/complex falls back to
    the medium tier. Returns the text of the first text block in the reply,
    or ``""`` if there is none.
    """

    medium_tier = {
        "model": "claude-sonnet-4-6",
        "thinking": {"type": "enabled", "budget_tokens": 5000},
        "max_tokens": 8000,
    }
    tiers = {
        "simple": {
            "model": "claude-haiku-4-5-20251001",
            "thinking": None,
            "max_tokens": 1024,
        },
        "medium": medium_tier,
        "complex": {
            "model": "claude-opus-4-6",
            "thinking": {"type": "enabled", "budget_tokens": 15000},
            "max_tokens": 20000,
        },
    }

    if complexity == "auto":
        # Cheap heuristic: let haiku label the problem in a single word.
        triage_prompt = f"这个问题的复杂度如何(simple/medium/complex)?只回答一个词。\n\n{problem}"
        triage = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=50,
            messages=[{"role": "user", "content": triage_prompt}],
        )
        complexity = triage.content[0].text.strip().lower()

    # Unrecognised labels are treated as medium.
    tier = tiers.get(complexity, medium_tier)

    request = {
        "model": tier["model"],
        "max_tokens": tier["max_tokens"],
        "messages": [{"role": "user", "content": problem}],
    }
    thinking_cfg = tier["thinking"]
    if thinking_cfg:
        request["thinking"] = thinking_cfg

    reply = client.messages.create(**request)

    # Only the final text answer is returned; thinking blocks are skipped.
    for part in reply.content:
        if part.type == "text":
            return part.text
    return ""

15.5 思考内容的访问与展示

思考过程的可见性

API 返回的 thinking 块内容是真实的模型推理过程,但要注意:

  1. 思考内容可能包含模型的"犹豫"、"错误尝试"和"自我纠正"
  2. 思考内容是只读的,不能被修改后重新发送给模型
  3. 在多轮对话中,可以将思考内容回传以保持上下文连贯性
def multi_turn_with_thinking(
    client: anthropic.Anthropic,
    initial_problem: str
) -> None:
    """Run a two-turn conversation that carries the thinking blocks forward.

    The first assistant reply is appended to the history unchanged, thinking
    blocks included, before the follow-up question is sent. The text blocks
    of the second reply are printed.
    """

    model_name = "claude-opus-4-6"
    history = [{"role": "user", "content": initial_problem}]

    first_reply = client.messages.create(
        model=model_name,
        max_tokens=16000,
        thinking={"type": "enabled", "budget_tokens": 10000},
        messages=history,
    )

    history.extend([
        # The full assistant turn: every content block, thinking included.
        {"role": "assistant", "content": first_reply.content},
        # The user's follow-up question.
        {"role": "user", "content": "根据你的分析,还有哪些潜在的边界情况需要考虑?"},
    ])

    follow_up = client.messages.create(
        model=model_name,
        max_tokens=16000,
        thinking={"type": "enabled", "budget_tokens": 8000},
        messages=history,
    )

    text_parts = (part.text for part in follow_up.content if part.type == "text")
    for text in text_parts:
        print(text)

向用户展示思考过程

def stream_with_thinking_display(
    client: anthropic.Anthropic,
    problem: str,
    show_thinking: bool = False
) -> None:
    """Stream a reply to stdout, optionally echoing the thinking as it arrives.

    Text deltas are always printed. Thinking headers and thinking deltas are
    printed only when ``show_thinking`` is true.
    """

    request = {
        "model": "claude-opus-4-6",
        "max_tokens": 16000,
        "thinking": {"type": "enabled", "budget_tokens": 10000},
        "messages": [{"role": "user", "content": problem}],
    }

    with client.messages.stream(**request) as events:
        for event in events:
            kind = event.type

            if kind == "content_block_start":
                # A new block begins: print the matching header.
                block_kind = event.content_block.type
                if block_kind == "text":
                    print("\n📝 回答:\n", end="", flush=True)
                elif block_kind == "thinking" and show_thinking:
                    print("\n💭 思考中...\n", end="", flush=True)
                continue

            if kind != "content_block_delta":
                continue

            delta = event.delta
            if delta.type == "text_delta":
                print(delta.text, end="", flush=True)
            elif delta.type == "thinking_delta" and show_thinking:
                print(delta.thinking, end="", flush=True)

15.6 Extended Thinking 与工具调用的结合

Extended Thinking 可以与工具调用结合,让模型在调用工具前先深度思考:

import anthropic
import json

client = anthropic.Anthropic()

# Tool definitions passed to the API via `tools=`: a single `search_database`
# tool that takes a required string `query` and an optional integer `limit`.
tools = [
    {
        "name": "search_database",
        "description": "搜索数据库获取信息",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "limit": {"type": "integer", "default": 10}
            },
            "required": ["query"]
        }
    }
]

def deep_research_agent(question: str, max_turns: int = 10) -> str:
    """Research agent combining Extended Thinking with tool calls.

    The model is called repeatedly. Each time it asks for a tool, the tool is
    executed and its result is appended to the conversation. The loop ends
    when the model stops asking for tools.

    Args:
        question: The user question that starts the conversation.
        max_turns: Upper bound on model calls, so a model that keeps
            requesting tools cannot loop (and bill) forever.

    Returns:
        The text of the first text block of the final reply, or
        ``"无法生成回答"`` when the final reply has no text block or the
        turn limit is exhausted.
    """

    fallback = "无法生成回答"
    messages = [{"role": "user", "content": question}]

    for _ in range(max_turns):
        response = client.messages.create(
            model="claude-opus-4-6",
            max_tokens=16000,
            thinking={"type": "enabled", "budget_tokens": 8000},
            tools=tools,
            messages=messages
        )

        # Does the model want to call a tool?
        has_tool_use = any(
            block.type == "tool_use" for block in response.content
        )

        if not has_tool_use or response.stop_reason == "end_turn":
            # Final answer: return its text.
            return next(
                (block.text for block in response.content if block.type == "text"),
                fallback
            )

        # Echo the whole assistant turn back (thinking blocks included) so the
        # tool results that follow can be matched to their tool_use blocks.
        messages.append({"role": "assistant", "content": response.content})

        tool_results = []
        for block in response.content:
            if block.type == "tool_use":
                # Execute the tool (search_database is simulated here).
                if block.name == "search_database":
                    result = {"results": [f"数据库搜索结果: {block.input['query']}"]}
                else:
                    result = {"error": "未知工具"}

                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": json.dumps(result, ensure_ascii=False)
                })

        messages.append({"role": "user", "content": tool_results})

    # Turn limit reached without a final answer.
    return fallback

15.7 常见错误与调试

错误 1:thinking 内容出现在 text 之后

Extended Thinking 的内容块有严格的顺序要求:thinking blocks 必须出现在 text blocks 之前。

# 错误:在多轮对话中,不能在 text 之后添加 thinking
# 如果之前的助手回复中 thinking 在 text 之前,这个顺序必须保持

# 检查内容块顺序
def validate_content_order(content_blocks: list) -> bool:
    """Return True when no thinking block comes after a text block.

    Blocks of any other type are ignored. An empty list is valid.
    """
    text_seen = False
    for item in content_blocks:
        kind = item.type
        if kind == "thinking" and text_seen:
            # A thinking block after text breaks the required order.
            return False
        if kind == "text":
            text_seen = True
    return True

错误 2:与 Prefill 混用

# 错误:启用 thinking 时不能使用 prefill
# Invalid: the conversation ends with a prefilled assistant turn while
# thinking is enabled.
BAD_REQUEST = {
    "model": "claude-opus-4-6",
    "max_tokens": 16000,
    "thinking": {"type": "enabled", "budget_tokens": 10000},
    "messages": [
        {"role": "user", "content": "解决这道数学题"},
        {"role": "assistant", "content": "答案是"}  # prefill is not allowed here!
    ]
}

# Correct: no prefill
GOOD_REQUEST = {
    "model": "claude-opus-4-6",
    "max_tokens": 16000,
    "thinking": {"type": "enabled", "budget_tokens": 10000},
    "messages": [
        {"role": "user", "content": "解决这道数学题"}
    ]
}

错误 3:temperature 设置不为 1

Extended Thinking 要求 temperature=1(默认值)。设置其他值会报错。

# Wrong: a custom temperature is not supported together with extended thinking
try:
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=16000,
        temperature=0.5,  # not allowed!
        thinking={"type": "enabled", "budget_tokens": 10000},
        messages=[{"role": "user", "content": "..."}]
    )
except anthropic.BadRequestError as e:
    print(f"错误: {e}")  # e.g. the API reports that temperature must be 1 when thinking is enabled

15.8 Extended Thinking 的效果评估

import anthropic
import time

client = anthropic.Anthropic()

def benchmark_thinking(
    problems: list[dict],  # [{"problem": str, "expected_answer": str}]
    budgets: "list[int] | None" = None
) -> dict:
    """Measure how the thinking budget affects accuracy, cost and latency.

    Every problem is sent once per budget. A budget of ``0`` means thinking
    is disabled. A reply counts as correct when ``expected_answer`` occurs in
    the answer text, compared case-insensitively.

    Args:
        problems: Items with a ``"problem"`` prompt and an ``"expected_answer"``.
        budgets: Thinking budgets to compare. Defaults to
            ``[0, 1024, 5000, 10000]``. Non-zero values must be at least
            1024, the API minimum for ``budget_tokens``.

    Returns:
        A dict mapping each budget to ``{"accuracy", "avg_cost", "avg_time"}``.
        All three are ``0.0`` when ``problems`` is empty.
    """

    if budgets is None:
        # Built per call to avoid a shared mutable default. 1024 is the
        # smallest budget the API accepts.
        budgets = [0, 1024, 5000, 10000]

    problem_count = len(problems)
    results = {}

    for budget in budgets:
        correct = 0
        total_cost = 0.0
        total_time = 0.0

        for item in problems:
            start = time.time()

            kwargs = {
                "model": "claude-opus-4-6",
                "max_tokens": budget + 2048 if budget > 0 else 2048,
                "messages": [{"role": "user", "content": item["problem"]}]
            }

            if budget > 0:
                kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget}

            response = client.messages.create(**kwargs)
            elapsed = time.time() - start

            # Text of the first text block; thinking blocks are skipped.
            answer = next(
                (b.text for b in response.content if b.type == "text"), ""
            )

            # Simple correctness check: substring match.
            if item["expected_answer"].lower() in answer.lower():
                correct += 1

            # Estimated cost at opus prices. Thinking tokens are already
            # included in usage.output_tokens.
            cost = (
                response.usage.input_tokens * 15 / 1_000_000 +
                response.usage.output_tokens * 75 / 1_000_000
            )
            total_cost += cost
            total_time += elapsed

        # With no problems there is nothing to average; report zeros rather
        # than dividing by zero.
        results[budget] = {
            "accuracy": correct / problem_count if problem_count else 0.0,
            "avg_cost": total_cost / problem_count if problem_count else 0.0,
            "avg_time": total_time / problem_count if problem_count else 0.0
        }

    return results

小结

Extended Thinking 是 Claude 在复杂推理任务上的核心竞争力。掌握要点:

  1. 必须设置足够大的 max_tokens:max_tokens >= budget_tokens + 预计输出
  2. thinking 块的 token 按输出价格计费,是最主要的成本来源
  3. 不能与 prefill、非默认 temperature 混用
  4. 多轮对话时保留 thinking 块有助于上下文连贯性
  5. 用分级策略降低成本:简单任务不使用 thinking,复杂任务使用高预算
  6. betas=["output-128k-2025-02-19"] 可将输出上限扩展到 128K,适合超长深度分析
本章评分
4.8  / 5  (26 评分)

💬 留言讨论