Your First Computer Use Agent
Let's build a simple agent that opens a browser, navigates to a website, and extracts information.
The Complete Agent
import base64
import pathlib
import subprocess

import anthropic

client = anthropic.Anthropic()


def take_screenshot():
    """Capture screen and return base64 encoded image."""
    path = "/tmp/screen.png"
    # Some scrot versions refuse to overwrite an existing file, so clear it first.
    pathlib.Path(path).unlink(missing_ok=True)
    subprocess.run(["scrot", path], check=True)
    with open(path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode()

def execute_action(action):
    """Execute the computer action."""
    # The tool input names the action to perform in its "action" field.
    kind = action["action"]
    if kind == "mouse_move":
        subprocess.run(["xdotool", "mousemove",
                        str(action["coordinate"][0]),
                        str(action["coordinate"][1])])
    elif kind == "left_click":
        # left_click carries its own coordinate, so move there before clicking.
        if "coordinate" in action:
            subprocess.run(["xdotool", "mousemove",
                            str(action["coordinate"][0]),
                            str(action["coordinate"][1])])
        subprocess.run(["xdotool", "click", "1"])
    elif kind == "type":
        subprocess.run(["xdotool", "type", "--", action["text"]])
    elif kind == "key":
        # Key presses arrive in the "text" field, e.g. "Return" or "ctrl+l".
        subprocess.run(["xdotool", "key", action["text"]])
    elif kind == "screenshot":
        pass  # A fresh screenshot is captured in the loop anyway

def run_agent(task: str):
    """Run the agentic loop."""
    messages = []
    screenshot = take_screenshot()

    # Initial message with task and screenshot
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": task},
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": screenshot
                }
            }
        ]
    })

    while True:
        response = client.beta.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=4096,
            tools=[{
                "type": "computer_20250124",
                "name": "computer",
                "display_width_px": 1024,
                "display_height_px": 768,
                "display_number": 0
            }],
            betas=["computer-use-2025-01-24"],
            messages=messages
        )

        # Add assistant response
        messages.append({
            "role": "assistant",
            "content": response.content
        })

        # Check if done
        if response.stop_reason == "end_turn":
            # Extract final text response
            for block in response.content:
                if hasattr(block, "text"):
                    return block.text
            return "Task completed"

        # Process tool calls
        tool_results = []
        for block in response.content:
            if block.type == "tool_use":
                execute_action(block.input)
                screenshot = take_screenshot()
                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": [{
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": screenshot
                        }
                    }]
                })

        messages.append({
            "role": "user",
            "content": tool_results
        })

# Run it!
result = run_agent(
"Open Firefox, go to example.com, and tell me what the page says."
)
print(result)
Key Components Explained
1. Screenshot Capture
We use scrot to capture screenshots. The image is base64 encoded for the API.
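The screenshots you send should match the display_width_px and display_height_px declared in the tool definition; otherwise the coordinates Claude returns will not line up with the real screen. If your virtual display is larger than 1024x768, one option is to downscale each capture and scale Claude's coordinates back up. A minimal sketch, assuming Pillow is installed and a hypothetical 1920x1080 display:

from PIL import Image  # assumption: Pillow is available in the container

REAL_W, REAL_H = 1920, 1080   # hypothetical actual display size
TOOL_W, TOOL_H = 1024, 768    # what we declared to the computer tool

def downscale_screenshot(path="/tmp/screen.png"):
    """Shrink a capture to the resolution the tool was declared with."""
    img = Image.open(path)
    if img.size != (TOOL_W, TOOL_H):
        img.resize((TOOL_W, TOOL_H)).save(path)

def scale_coordinate(coord):
    """Map a coordinate from tool space back to the real display."""
    x, y = coord
    return int(x * REAL_W / TOOL_W), int(y * REAL_H / TOOL_H)

With this in place, execute_action() would run coordinates through scale_coordinate() before handing them to xdotool. The simpler route is to run the virtual display at exactly 1024x768 so no scaling is needed at all.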
2. Action Execution
xdotool injects synthetic mouse and keyboard events into the X11 display on Linux; it will not work under Wayland.
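The computer_20250124 tool can emit more action types than the handler above covers, right_click, double_click, and scroll among them. A minimal sketch of how a few of them might map onto xdotool, assuming the scroll_direction and scroll_amount fields documented for this tool version:

import subprocess

def execute_extra_action(action):
    """Sketch: map a few additional computer tool actions onto xdotool."""
    if "coordinate" in action:
        # Move to the target first so the click or scroll lands there.
        subprocess.run(["xdotool", "mousemove",
                        str(action["coordinate"][0]),
                        str(action["coordinate"][1])])
    kind = action["action"]
    if kind == "right_click":
        subprocess.run(["xdotool", "click", "3"])
    elif kind == "middle_click":
        subprocess.run(["xdotool", "click", "2"])
    elif kind == "double_click":
        subprocess.run(["xdotool", "click", "--repeat", "2", "--delay", "100", "1"])
    elif kind == "scroll":
        # X11 exposes scrolling as buttons 4 (up), 5 (down), 6 (left), 7 (right).
        button = {"up": "4", "down": "5", "left": "6", "right": "7"}
        clicks = str(action.get("scroll_amount", 3))
        subprocess.run(["xdotool", "click", "--repeat", clicks,
                        button[action["scroll_direction"]]])

In practice you would fold these branches into execute_action() rather than keep a second function.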
3. The Agentic Loop
The loop keeps calling the API until Claude finishes with stop_reason == "end_turn". Until then, each response carries tool calls whose results feed the next request.
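One safeguard the listing leaves out: while True has no upper bound, so a stuck task loops (and bills) forever. A minimal sketch of a turn cap, where run_one_turn is a hypothetical callable standing in for one iteration of the loop above:

MAX_TURNS = 30  # assumption: generous for a short browsing task

def run_capped(run_one_turn, max_turns=MAX_TURNS):
    """Call run_one_turn() until it returns a result or the cap is hit.

    run_one_turn() should return the final text once Claude stops with
    end_turn, and None while there is still work to do.
    """
    for _ in range(max_turns):
        result = run_one_turn()
        if result is not None:
            return result
    raise RuntimeError(f"Agent did not finish within {max_turns} turns")

The same effect can be had by replacing while True with for _ in range(MAX_TURNS) directly inside run_agent.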
4. Tool Results
After each action, we send a new screenshot so Claude can see the result.
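A screenshot taken the instant an action finishes often shows the UI mid-transition (a page still loading, a menu still opening). A small settle delay before capturing helps; the 1-second pause below is an arbitrary choice, and take_screenshot() is the function from the listing:

import time

def take_screenshot_settled(delay=1.0):
    """Give the UI a moment to settle, then capture the screen."""
    time.sleep(delay)
    return take_screenshot()

In the loop, swap take_screenshot() for take_screenshot_settled() after execute_action().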
Running the Agent
# Inside Docker container
python first_agent.py
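If the script dies immediately, the cause is usually the environment rather than the agent. A quick preflight you could run first (a sketch; the binary names match the tools this agent shells out to):

import os
import shutil

def preflight():
    """Fail fast if the environment is not ready for the agent."""
    assert os.environ.get("ANTHROPIC_API_KEY"), "ANTHROPIC_API_KEY is not set"
    assert os.environ.get("DISPLAY"), "DISPLAY is not set (is X/VNC running?)"
    for binary in ("scrot", "xdotool", "firefox"):
        assert shutil.which(binary), f"{binary} not found on PATH"

preflight()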
You should see:
- Firefox open
- The URL bar receive the address
- The page load
- Claude read the page and report what it says
Tip: Watch the VNC window to see Claude controlling the desktop in real time.
In the next module, we'll build more sophisticated desktop automation workflows.