第 21 章
Server Tools 实战:web_search / web_fetch / code_execution 三剑客
第二十一章:Computer Use:桌面自动化与 GUI 操作的完整指南
21.1 Computer Use 是什么
Computer Use 是 Anthropic 为 Claude 开发的一项革命性能力,它允许模型通过截图观察屏幕状态,并通过模拟鼠标点击、键盘输入、滚动等操作控制计算机界面。这意味着 Claude 可以操作任何具有图形界面的软件,而不需要该软件提供 API。
与传统 RPA(机器人流程自动化)工具相比,Computer Use 的核心优势在于:
- 理解界面语义:Claude 能理解按钮的含义、表单的用途,而不仅仅是坐标
- 错误恢复能力:遇到意外弹窗或界面变化时,Claude 能推理如何处理
- 自然语言指令:不需要录制操作,直接用自然语言描述任务
启用 Computer Use 需要通过 beta 接口(client.beta.messages.create)发起请求,并传入 betas=["computer-use-2025-01-24"] 参数。
21.2 核心工具定义
Computer Use 提供三个内置工具类型:
ComputerTool(桌面操作工具)
# Definition of the desktop-control tool passed in the request's tool list.
computer_tool = dict(
    type="computer_20250124",
    name="computer",
    display_width_px=1920,
    display_height_px=1080,
    display_number=1,  # X11 display number (optional)
)
ComputerTool 支持的 action 类型:
| action | 说明 | 必需参数 |
|---|---|---|
| screenshot | 截取当前屏幕 | 无 |
| left_click | 左键单击 | coordinate: [x, y] |
| right_click | 右键单击 | coordinate: [x, y] |
| double_click | 双击 | coordinate: [x, y] |
| middle_click | 中键点击 | coordinate: [x, y] |
| left_click_drag | 拖拽 | coordinate: [x, y], start_coordinate: [x, y] |
| type | 输入文本 | text: str |
| key | 按键 | text: str(xdotool 格式) |
| scroll | 滚动 | coordinate: [x, y], direction: up/down/left/right, amount: int |
| mouse_move | 移动鼠标 | coordinate: [x, y] |
| cursor_position | 获取鼠标位置 | 无 |
TextEditorTool(文本编辑工具)
# Definition of the file viewing/editing tool passed in the request's tool list.
text_editor_tool = dict(
    type="text_editor_20250124",
    name="str_replace_editor",
)
TextEditorTool 支持的命令:
| command | 说明 |
|---|---|
| view | 查看文件内容 |
| create | 创建新文件 |
| str_replace | 替换文件中的字符串 |
| insert | 在指定行后插入内容 |
| undo_edit | 撤销最后一次编辑 |
BashTool(命令行工具)
# Definition of the shell-command tool passed in the request's tool list.
bash_tool = dict(
    type="bash_20250124",
    name="bash",
)
BashTool 在持久的 shell 会话中执行命令,支持状态保持(环境变量、当前目录等)。
21.3 完整的 Computer Use 实现
基础实现框架
import anthropic
import base64
from typing import Optional
import subprocess
# Shared Anthropic API client; run_computer_use_agent sends its requests through it.
client = anthropic.Anthropic()
# Complete definitions of the three tools, in the order they are sent to the API:
# desktop control, file editing, shell commands.
COMPUTER_USE_TOOLS = [
    dict(
        type="computer_20250124",
        name="computer",
        display_width_px=1920,
        display_height_px=1080,
    ),
    dict(type="text_editor_20250124", name="str_replace_editor"),
    dict(type="bash_20250124", name="bash"),
]
def take_screenshot() -> str:
    """Capture the screen with ``scrot`` (Linux) and return it as base64 PNG text.

    Every call writes into a fresh temporary directory instead of a fixed
    ``/tmp`` path. A fixed path is unsafe for two reasons: concurrent callers
    would clobber each other's file, and recent scrot releases do not
    overwrite an existing output file unless ``--overwrite`` is passed, in
    which case a second call would silently re-read the first screenshot.

    Returns:
        The PNG bytes encoded with standard base64, decoded to a ``str``.

    Raises:
        FileNotFoundError: if the ``scrot`` executable is not installed.
        subprocess.CalledProcessError: if ``scrot`` exits with a non-zero status.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as workdir:
        shot_path = os.path.join(workdir, "screenshot.png")
        subprocess.run(["scrot", shot_path], check=True)
        with open(shot_path, "rb") as f:
            return base64.standard_b64encode(f.read()).decode("utf-8")
def execute_computer_action(action: str, **kwargs) -> dict:
    """Execute one ``computer`` tool action on the desktop via pyautogui.

    Args:
        action: Action name, e.g. ``screenshot``, ``left_click``, ``type``,
            ``key``, ``scroll``, ``left_click_drag``, ``cursor_position``.
        **kwargs: Action parameters taken from the tool input:
            ``coordinate`` ([x, y]) for pointer actions, ``text`` for
            ``type``/``key``, ``start_coordinate`` for ``left_click_drag``.
            ``scroll`` accepts ``scroll_direction``/``scroll_amount`` as well
            as the shorter ``direction``/``amount`` spelling.

    Returns:
        For ``screenshot``: an image content block (base64 PNG).
        Otherwise ``{"result": <description>}`` on success or
        ``{"error": <description>}`` for an unknown action or scroll direction.

    Raises:
        KeyError: if a parameter required by the action is missing.
    """
    import pyautogui

    if action == "screenshot":
        return {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": take_screenshot(),
            },
        }

    if action == "cursor_position":
        x, y = pyautogui.position()
        return {"result": f"鼠标位置 ({x}, {y})"}

    # All simple click variants share the same shape: click at a coordinate.
    click_actions = {
        "left_click": (pyautogui.click, "左键点击"),
        "right_click": (pyautogui.rightClick, "右键点击"),
        "double_click": (pyautogui.doubleClick, "双击"),
        "middle_click": (pyautogui.middleClick, "中键点击"),
    }
    if action in click_actions:
        do_click, label = click_actions[action]
        x, y = kwargs["coordinate"]
        do_click(x, y)
        return {"result": f"{label} ({x}, {y})"}

    if action == "type":
        text = kwargs["text"]
        pyautogui.typewrite(text, interval=0.05)
        # Only show an ellipsis when the echoed text was actually truncated.
        suffix = "..." if len(text) > 50 else ""
        return {"result": f"输入文本: {text[:50]}{suffix}"}

    if action == "key":
        key = kwargs["text"]
        # Translate xdotool key names to pyautogui names. The mapping is applied
        # to each part of a combination, so "ctrl+Return" or "super+d" work too.
        key_map = {
            "Return": "enter",
            "Escape": "esc",
            "BackSpace": "backspace",
            "Tab": "tab",
            "Page_Up": "pageup",
            "Page_Down": "pagedown",
            "super": "win",
            "Super_L": "win",
        }
        parts = [key_map.get(part, part) for part in key.split("+")]
        pyautogui.hotkey(*parts)
        return {"result": f"按键: {key}"}

    if action == "scroll":
        x, y = kwargs["coordinate"]
        direction = kwargs.get("scroll_direction", kwargs.get("direction"))
        amount = kwargs.get("scroll_amount", kwargs.get("amount", 3))
        # (scroll function, signed amount); positive is up / right in pyautogui.
        scrollers = {
            "up": (pyautogui.scroll, amount),
            "down": (pyautogui.scroll, -amount),
            "right": (pyautogui.hscroll, amount),
            "left": (pyautogui.hscroll, -amount),
        }
        if direction not in scrollers:
            return {"error": f"未知滚动方向: {direction}"}
        pyautogui.moveTo(x, y)
        do_scroll, clicks = scrollers[direction]
        do_scroll(clicks)
        return {"result": f"在 ({x}, {y}) 向{direction}滚动 {amount} 格"}

    if action == "mouse_move":
        x, y = kwargs["coordinate"]
        pyautogui.moveTo(x, y)
        return {"result": f"鼠标移至 ({x}, {y})"}

    if action == "left_click_drag":
        start = kwargs["start_coordinate"]
        end = kwargs["coordinate"]
        # Move to the start point first: a relative drag would begin wherever
        # the pointer currently is and ignore start_coordinate.
        pyautogui.moveTo(start[0], start[1])
        pyautogui.dragTo(end[0], end[1], duration=0.5, button='left')
        return {"result": f"从 {start} 拖拽到 {end}"}

    return {"error": f"未知操作: {action}"}
def execute_bash_command(command: str, bash_session=None, timeout: float = 30) -> str:
    """Run a shell command and return its combined output as text.

    Each call starts a new shell, so environment variables and the working
    directory do not persist between calls; ``bash_session`` is accepted for
    interface compatibility but is not used.

    NOTE: ``shell=True`` is deliberate, since the tool's purpose is to run
    shell command lines. Only use this inside a sandboxed environment.

    Args:
        command: Shell command line to execute.
        bash_session: Unused placeholder.
        timeout: Seconds to wait before the command is killed.

    Returns:
        The command's stdout, followed by ``STDERR: ...`` when stderr is
        non-empty and ``EXIT CODE: n`` when the exit status is non-zero.
        If the command times out, an error message is returned instead of
        raising, so that one slow command does not abort the caller.
    """
    try:
        result = subprocess.run(
            command,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return f"错误:命令执行超过 {timeout} 秒,已被终止"

    output = result.stdout
    if result.stderr:
        output += f"\nSTDERR: {result.stderr}"
    if result.returncode != 0:
        # Without this a failing command that prints nothing looks like success.
        output += f"\nEXIT CODE: {result.returncode}"
    return output
def execute_text_editor(command: str, path: str,
                        old_str: str = None, new_str: str = None,
                        insert_line: int = None, new_str_content: str = None,
                        file_text: str = None) -> str:
    """Execute one text-editor command against the file at ``path``.

    Supported commands are ``view``, ``create``, ``str_replace`` and
    ``insert``; anything else (including ``undo_edit``) yields the
    "unknown command" message.

    Args:
        command: Editor command name.
        path: File to read or modify.
        old_str: Text to replace (``str_replace``); must occur exactly once.
        new_str: Replacement text (``str_replace``; omitted means delete),
            fallback inserted text (``insert``), or fallback file content
            (``create``).
        insert_line: Line number after which to insert (0 = top of file).
        new_str_content: Text to insert; takes precedence over ``new_str``.
        file_text: Content for ``create``; takes precedence over ``new_str``.

    Returns:
        The file content for ``view``; otherwise a status or error message.
        File-system errors are reported as an error message rather than raised.
    """
    try:
        if command == "view":
            with open(path, 'r', encoding='utf-8') as f:
                return f.read()

        if command == "create":
            content = file_text if file_text is not None else (new_str or "")
            with open(path, 'w', encoding='utf-8') as f:
                f.write(content)
            return f"文件已创建: {path}"

        if command == "str_replace":
            if not old_str:
                return "错误:缺少要替换的字符串 old_str"
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            occurrences = content.count(old_str)
            if occurrences == 0:
                return "错误:未找到要替换的字符串"
            if occurrences > 1:
                # Refuse ambiguous edits instead of silently changing the
                # first match, which may not be the one that was intended.
                return f"错误:要替换的字符串出现了 {occurrences} 次,请提供更多上下文使其唯一"
            with open(path, 'w', encoding='utf-8') as f:
                f.write(content.replace(old_str, new_str or "", 1))
            return "替换成功"

        if command == "insert":
            text = new_str_content if new_str_content is not None else new_str
            if insert_line is None or text is None:
                return "错误:insert 需要 insert_line 和要插入的文本"
            with open(path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            if not 0 <= insert_line <= len(lines):
                return f"错误:行号 {insert_line} 超出范围(0-{len(lines)})"
            # If the preceding line has no trailing newline (last line of the
            # file), add one so the inserted text starts on its own line.
            if insert_line > 0 and not lines[insert_line - 1].endswith('\n'):
                lines[insert_line - 1] += '\n'
            lines.insert(insert_line, text + '\n')
            with open(path, 'w', encoding='utf-8') as f:
                f.writelines(lines)
            return f"已在第 {insert_line} 行后插入内容"
    except OSError as exc:
        return f"错误:文件操作失败: {exc}"

    return f"未知命令: {command}"
完整的工具调用循环
def process_tool_call(tool_name: str, tool_input: dict) -> "str | list":
    """Dispatch one tool call to its executor and return the result content.

    Args:
        tool_name: ``computer``, ``bash`` or ``str_replace_editor``.
        tool_input: The input dict of the tool call.

    Returns:
        A one-element list holding an image block for a ``computer``
        screenshot; otherwise a text result. Unknown tool names yield
        "未知工具". Any exception raised while executing the tool is turned
        into an error string so that a single failing call does not abort
        the caller's loop.
    """
    try:
        if tool_name == "computer":
            action = tool_input["action"]
            params = {k: v for k, v in tool_input.items() if k != "action"}
            result = execute_computer_action(action, **params)
            if action == "screenshot":
                # Screenshots are returned as image content, not text.
                return [result]
            return result.get("result", result.get("error", "操作完成"))

        if tool_name == "bash":
            if tool_input.get("restart"):
                # Every command already runs in a fresh shell, so there is no
                # session state to reset; just acknowledge the request.
                return "bash 会话已重启"
            return execute_bash_command(tool_input["command"])

        if tool_name == "str_replace_editor":
            command = tool_input["command"]
            inserted = tool_input.get("new_str")
            # "create" carries the file content in file_text rather than
            # new_str; forward it through new_str, which the editor writes
            # out for create.
            new_text = tool_input.get("file_text") if command == "create" else inserted
            return execute_text_editor(
                command=command,
                path=tool_input.get("path", ""),
                old_str=tool_input.get("old_str"),
                new_str=new_text,
                insert_line=tool_input.get("insert_line"),
                new_str_content=inserted,
            )
    except Exception as exc:  # tool boundary: report the failure as a result
        return f"工具执行出错: {type(exc).__name__}: {exc}"

    return "未知工具"
def run_computer_use_agent(task: str, system_prompt: str = "",
                           max_iterations: int = 50) -> str:
    """Run the Computer Use agent loop until the model finishes the task.

    The task is sent to the model together with COMPUTER_USE_TOOLS; every
    tool call in a response is executed through process_tool_call and the
    results are sent back, until the model stops requesting tools.

    Args:
        task: Natural-language description of the task.
        system_prompt: Optional system prompt; a built-in default is used
            when empty.
        max_iterations: Upper bound on model round-trips, so a model that
            never stops calling tools cannot loop forever.

    Returns:
        The model's final text when it ends its turn; otherwise a message
        naming the unexpected stop reason or the exhausted iteration budget.
    """
    default_system = """你是一个能够使用计算机完成任务的 AI 助手。
你有以下工具可用:
- computer: 截图、点击、输入、滚动等桌面操作
- bash: 执行命令行命令
- str_replace_editor: 查看和编辑文件
操作建议:
1. 开始任务前先截图了解当前屏幕状态
2. 操作后再截图确认操作结果
3. 遇到错误时先理解错误原因再重试
4. 完成任务后截图确认最终状态"""
    messages = [{"role": "user", "content": task}]

    for _ in range(max_iterations):
        # The `betas` parameter is only accepted by the beta endpoint.
        response = client.beta.messages.create(
            model="claude-opus-4-5",
            max_tokens=4096,
            system=system_prompt or default_system,
            tools=COMPUTER_USE_TOOLS,
            messages=messages,
            betas=["computer-use-2025-01-24"]
        )

        if response.stop_reason == "end_turn":
            return ' '.join(b.text for b in response.content if b.type == "text")

        if response.stop_reason != "tool_use":
            # e.g. max_tokens: report it instead of claiming success.
            return f"任务提前结束(stop_reason={response.stop_reason})"

        tool_results = []
        for block in response.content:
            if block.type != "tool_use":
                continue
            print(f"[工具] {block.name}: {block.input.get('action', block.input)}")
            result_content = process_tool_call(block.name, block.input)
            # Screenshot results are lists (image content); others are text.
            if isinstance(result_content, list):
                content = result_content
            else:
                content = str(result_content)
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": content
            })

        messages.append({"role": "assistant", "content": response.content})
        messages.append({"role": "user", "content": tool_results})

    return f"已达到最大迭代次数({max_iterations}),任务未完成"
21.4 实战场景示例
场景一:自动填写网页表单
# Automatically fill in and submit a login form.
# The username is a placeholder address; the original literal had been
# replaced by an e-mail obfuscation artifact.
task = """
请在浏览器中完成以下操作:
1. 截图查看当前状态
2. 找到用户名输入框,点击它
3. 输入用户名 "user@example.com"
4. 点击密码输入框
5. 输入密码 "SecurePass123"
6. 点击登录按钮
7. 截图确认登录成功
"""
result = run_computer_use_agent(task)
场景二:批量处理文件
# Batch-rename files using the bash tool: the task text asks the agent to
# list the files first and only rename after confirming.
task = """
请帮我把 ~/Downloads 目录下所有 .jpg 文件重命名,
在文件名前加上今天的日期前缀(格式 YYYYMMDD_)。
先列出文件,确认后再执行重命名。
"""
result = run_computer_use_agent(task)
场景三:GUI 软件操作
# Operate a desktop application: the task text asks the agent to open the
# calculator, compute an expression and take a screenshot of the result.
task = """
请打开计算器应用,计算 (2024 × 365) + 100,
然后截图显示计算结果。
"""
result = run_computer_use_agent(task)
21.5 安全性考虑与最佳实践
沙盒环境设置
Computer Use 具有强大的系统控制能力,必须在受控环境中运行:
# Isolation in a Docker container is recommended.
# Example Dockerfile (based on Anthropic's official example), kept here as a
# bare string literal; it is not assigned to any name.
"""
FROM ubuntu:22.04
# 安装必要软件
RUN apt-get update && apt-get install -y \\
python3 python3-pip \\
xvfb x11vnc \\
firefox-esr \\
scrot \\
xdotool
# 创建受限用户(不使用 root)
RUN useradd -m -s /bin/bash sandboxuser
USER sandboxuser
# 设置虚拟显示
ENV DISPLAY=:1
"""
# 启动虚拟显示的 Python 代码
import subprocess
def start_virtual_display(width: int = 1920, height: int = 1080, display_num: int = 1):
    """Start an Xvfb virtual display and point ``DISPLAY`` at it.

    Args:
        width: Screen width in pixels.
        height: Screen height in pixels.
        display_num: X display number; the display is named ``:<display_num>``.

    Returns:
        The ``subprocess.Popen`` handle of the Xvfb process, so the caller
        can terminate it when done. The function does not wait for Xvfb to
        finish starting up.

    Raises:
        FileNotFoundError: if the ``Xvfb`` executable is not installed.
    """
    import os

    display = f":{display_num}"
    xvfb_process = subprocess.Popen([
        "Xvfb", display,
        "-screen", "0", f"{width}x{height}x24"
    ])
    os.environ["DISPLAY"] = display
    print(f"虚拟显示已启动::{display_num} ({width}x{height})")
    return xvfb_process
人工确认机制
对于高风险操作,应在执行前请求人工确认:
# Substrings that mark an action description as high-risk.
HIGH_RISK_ACTIONS = [
    "删除文件",
    "格式化",
    "关机",
    "发送邮件",
    "提交表单",
    "购买",
    "转账",
    # Command-level patterns. Descriptions built from raw tool input, such as
    # "bash: {'command': 'rm -rf x'}", never contain the phrases above, so
    # without these the confirmation gate would never trigger for them.
    "rm -",
    "rmdir",
    "mkfs",
    "dd if=",
    "shutdown",
    "reboot",
    "sudo ",
]


def confirm_before_risky_action(action_description: str) -> bool:
    """Ask the user to confirm when an action description looks high-risk.

    The description is high-risk when it contains any HIGH_RISK_ACTIONS
    keyword (compared case-insensitively).

    Args:
        action_description: Human-readable description of the pending action.

    Returns:
        True when the action is not high-risk or the user answers "y";
        False when the user declines or no answer can be read (EOF on stdin),
        so that a non-interactive run denies by default.
    """
    haystack = action_description.casefold()
    is_risky = any(keyword.casefold() in haystack for keyword in HIGH_RISK_ACTIONS)
    if not is_risky:
        return True

    print(f"\n[高风险操作] {action_description}")
    try:
        response = input("是否继续?(y/N): ")
    except EOFError:
        return False
    return response.strip().lower() == 'y'
class SafeComputerUseAgent:
    """Computer Use agent wrapper that logs tool calls and can gate them on confirmation."""

    def __init__(self, require_confirmation: bool = True):
        """Store the confirmation setting and start with an empty action log."""
        self.action_log = []
        self.require_confirmation = require_confirmation

    def process_action_safely(self, tool_name: str, tool_input: dict) -> str:
        """Log the tool call, optionally ask for confirmation, then execute it.

        Returns the cancellation message when confirmation is required and
        refused; otherwise whatever process_tool_call returns.
        """
        # Every call is recorded, including ones that end up cancelled.
        entry = {"tool": tool_name, "input": tool_input}
        self.action_log.append(entry)

        # The description is only built when confirmation is enabled.
        if self.require_confirmation and not confirm_before_risky_action(
            f"{tool_name}: {tool_input}"
        ):
            return "操作已被用户取消"

        return process_tool_call(tool_name, tool_input)
21.6 性能优化与调试技巧
截图压缩
高分辨率截图会消耗大量 token,压缩处理可以节省成本:
from PIL import Image
import io
def take_compressed_screenshot(max_size: tuple = (1280, 720)) -> str:
    """Capture the screen with ``scrot``, downscale it and return base64 JPEG text.

    The raw capture goes into a fresh temporary directory on every call:
    recent scrot releases do not overwrite an existing output file unless
    ``--overwrite`` is passed, so a fixed path could keep serving the first
    screenshot, and concurrent callers would clobber each other.

    NOTE(review): the result is JPEG, so it must be sent with media type
    ``image/jpeg``; and when the image is shrunk, positions in it no longer
    match screen pixels. Confirm callers account for both.

    Args:
        max_size: Maximum (width, height); the image is shrunk to fit within
            it, keeping its aspect ratio.

    Returns:
        The JPEG bytes (quality 85) encoded with standard base64 as ``str``.

    Raises:
        FileNotFoundError: if the ``scrot`` executable is not installed.
        subprocess.CalledProcessError: if ``scrot`` exits with a non-zero status.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as workdir:
        raw_path = os.path.join(workdir, "screenshot_raw.png")
        subprocess.run(["scrot", raw_path], check=True)
        with Image.open(raw_path) as img:
            # Shrink in place to fit max_size.
            img.thumbnail(max_size, Image.LANCZOS)
            # Re-encode as JPEG to reduce the payload size.
            buffer = io.BytesIO()
            img.convert("RGB").save(buffer, format="JPEG", quality=85)

    return base64.standard_b64encode(buffer.getvalue()).decode("utf-8")
操作间延迟
GUI 操作后需要等待界面响应:
import time
def click_and_wait(x: int, y: int, wait_seconds: float = 0.5):
    """Left-click at (x, y), then pause for ``wait_seconds`` so the UI can react."""
    import pyautogui as gui

    gui.click(x, y)
    # Give the interface time to respond before the next action.
    time.sleep(wait_seconds)
def type_and_wait(text: str, wait_seconds: float = 0.3):
    """Type ``text`` with a 0.05 s interval between keys, then pause for ``wait_seconds``."""
    import pyautogui as gui

    gui.typewrite(text, interval=0.05)
    # Give the interface time to respond before the next action.
    time.sleep(wait_seconds)
调试模式:保存所有截图
import os
from datetime import datetime
class DebugComputerUseAgent:
    """Agent helper that writes every screenshot to a directory for debugging."""

    def __init__(self, debug_dir: str = "/tmp/computer_use_debug"):
        """Remember the debug directory, create it if needed and reset the counter."""
        self.debug_dir = debug_dir
        os.makedirs(self.debug_dir, exist_ok=True)
        self.screenshot_count = 0

    def save_screenshot(self, screenshot_data: str, label: str = ""):
        """Decode a base64 screenshot and write it into the debug directory.

        The file name is ``<counter>_<HHMMSS>_<label>.png``, with the counter
        zero-padded to three digits. Returns the path of the written file.
        """
        self.screenshot_count += 1
        sequence = self.screenshot_count
        clock = datetime.now().strftime("%H%M%S")
        target = os.path.join(self.debug_dir, f"{sequence:03d}_{clock}_{label}.png")

        with open(target, "wb") as out:
            out.write(base64.standard_b64decode(screenshot_data))

        print(f"截图已保存: {target}")
        return target
21.7 常见问题与解决方案
问题一:坐标精确度
Claude 对坐标的判断基于截图分析,可能存在偏差:
解决方案:
1. 在系统提示中指定显示分辨率
2. 操作前先截图确认元素位置
3. 对于关键按钮,描述其视觉特征而非直接给坐标
问题二:动态内容加载
点击后页面未立即加载完成:
def wait_for_page_load(max_wait: float = 10.0) -> bool:
    """Wait until two consecutive screenshots, taken one second apart, are identical.

    Returns True as soon as a screenshot equals the previous one (the page is
    considered loaded), or False once ``max_wait`` seconds have elapsed.
    """
    import time

    baseline = take_screenshot()
    began = time.time()
    while True:
        if not (time.time() - began < max_wait):
            return False  # timed out
        time.sleep(1.0)
        latest = take_screenshot()
        if latest == baseline:
            return True  # the screen stopped changing
        baseline = latest
问题三:弹窗处理
意外弹窗会打断正常操作流程:
解决方案:
在系统提示中告知 Claude 如何处理常见弹窗:
- 关闭按钮通常在右上角
- "确定"/"取消"按钮的视觉位置
- Cookie 同意弹窗的处理方式
小结
Computer Use 将 Claude 从"文字工作者"升级为"真正的计算机操作者"。核心要点:
- 三个工具类型(computer、bash、str_replace_editor)覆盖 GUI 操作、命令行和文件编辑
- betas=["computer-use-2025-01-24"] 是启用此功能的必要参数
- 安全性至关重要:沙盒环境、人工确认机制、操作日志缺一不可
- 截图压缩和合理的操作延迟是生产使用的关键优化
下一章将探索 Tool Use 与 Extended Thinking 的结合——这是目前 Claude 最强大的推理与行动组合。