Your First Computer Use Agent

Let's build a simple agent that opens a browser, navigates to a website, and extracts information.

The Complete Agent

import anthropic
import base64
import subprocess

client = anthropic.Anthropic()

def take_screenshot():
    """Capture screen and return base64 encoded image."""
    subprocess.run(["scrot", "/tmp/screen.png"], check=True)
    with open("/tmp/screen.png", "rb") as f:
        return base64.standard_b64encode(f.read()).decode()

def execute_action(action):
    """Execute the computer action."""
    if action["type"] == "mouse_move":
        subprocess.run(["xdotool", "mousemove",
            str(action["coordinate"][0]),
            str(action["coordinate"][1])])

    elif action["type"] == "left_click":
        subprocess.run(["xdotool", "click", "1"])

    elif action["type"] == "type":
        subprocess.run(["xdotool", "type", "--", action["text"]])

    elif action["type"] == "key":
        subprocess.run(["xdotool", "key", action["key"]])

    elif action["type"] == "screenshot":
        pass  # Will be captured in loop

def run_agent(task: str):
    """Run the agentic loop."""
    messages = []
    screenshot = take_screenshot()

    # Initial message with task and screenshot
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": task},
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": screenshot
                }
            }
        ]
    })

    while True:
        response = client.beta.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=4096,
            tools=[{
                "type": "computer_20250124",
                "name": "computer",
                "display_width_px": 1024,   # should match the screenshots you send
                "display_height_px": 768,
                "display_number": 0
            }],
            betas=["computer-use-2025-01-24"],
            messages=messages
        )

        # Add assistant response
        messages.append({
            "role": "assistant",
            "content": response.content
        })

        # Check if done
        if response.stop_reason == "end_turn":
            # Extract final text response
            for block in response.content:
                if hasattr(block, "text"):
                    return block.text
            return "Task completed"

        # Process tool calls
        tool_results = []
        for block in response.content:
            if block.type == "tool_use":
                execute_action(block.input)
                screenshot = take_screenshot()

                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": [{
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": screenshot
                        }
                    }]
                })

        if not tool_results:
            # No tool calls and no end_turn (e.g. stop_reason == "max_tokens");
            # bail out rather than sending an empty user message
            return f"Stopped early: {response.stop_reason}"

        messages.append({
            "role": "user",
            "content": tool_results
        })

# Run it!
result = run_agent(
    "Open Firefox, go to example.com, and tell me what the page says."
)
print(result)

Key Components Explained

1. Screenshot Capture

We use scrot to write the current screen to a PNG file, then base64-encode the bytes, because the Messages API expects inline images as base64-encoded data.
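
If your real framebuffer is larger than the 1024x768 you advertise to the tool, it can help to downscale screenshots before sending them and to map Claude's coordinates back to real pixels before executing actions. Here is a minimal sketch, assuming a hypothetical 1920x1080 display and the Pillow library; neither is part of the script above.

import base64
import subprocess
from PIL import Image

REAL_W, REAL_H = 1920, 1080   # assumed real screen size
TOOL_W, TOOL_H = 1024, 768    # size reported to the computer tool

def take_screenshot_scaled():
    """Capture the screen, downscale to the tool resolution, return base64."""
    subprocess.run(["scrot", "/tmp/screen.png"], check=True)
    img = Image.open("/tmp/screen.png").resize((TOOL_W, TOOL_H))
    img.save("/tmp/screen_small.png")
    with open("/tmp/screen_small.png", "rb") as f:
        return base64.standard_b64encode(f.read()).decode()

def to_real_coords(x, y):
    """Map a coordinate from tool space back to real screen pixels."""
    return int(x * REAL_W / TOOL_W), int(y * REAL_H / TOOL_H)

If you do this, run the coordinates Claude returns for mouse_move and clicks through to_real_coords before handing them to xdotool.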

2. Action Execution

xdotool synthesizes mouse and keyboard input on the X server, so each action Claude requests becomes a short xdotool command.
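
The execute_action handler above only covers the actions this walkthrough needs. The computer_20250124 tool can also emit right clicks, double clicks, and scrolls, all of which xdotool can synthesize. The sketch below shows how those might be handled; the scroll field names follow my reading of the current tool schema, so check the API reference before relying on them.

import subprocess

def execute_more_actions(action):
    """Hypothetical extension of execute_action for a few more action types."""
    if action["type"] == "right_click":
        subprocess.run(["xdotool", "click", "3"])

    elif action["type"] == "double_click":
        subprocess.run(["xdotool", "click", "--repeat", "2", "--delay", "100", "1"])

    elif action["type"] == "scroll":
        # X11 maps scrolling to buttons 4 (up), 5 (down), 6 (left), 7 (right)
        buttons = {"up": "4", "down": "5", "left": "6", "right": "7"}
        button = buttons[action["scroll_direction"]]
        for _ in range(int(action.get("scroll_amount", 1))):
            subprocess.run(["xdotool", "click", button])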

3. The Agentic Loop

Each iteration sends the full conversation back to the API. While Claude is still issuing actions, stop_reason is "tool_use"; the loop exits once Claude returns stop_reason == "end_turn".
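
A while True loop has no upper bound, so a confused model could keep clicking indefinitely. One option is to cap the number of turns. Here is a minimal sketch, assuming you factor the body of the loop into a run_one_turn callable that returns the final text when Claude finishes and None when it wants another turn; both names are hypothetical.

MAX_STEPS = 25  # arbitrary safety limit, not something the API prescribes

def run_with_cap(run_one_turn, max_steps=MAX_STEPS):
    """Call run_one_turn() until it returns a result or the cap is reached."""
    for _ in range(max_steps):
        result = run_one_turn()
        if result is not None:
            return result
    return "Stopped: hit the step limit before Claude finished the task"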

4. Tool Results

Every tool_use block must be answered with a tool_result block carrying the matching tool_use_id. Here the result is a fresh screenshot, so Claude can see the effect of its last action before choosing the next one.
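
When an xdotool call fails, it is usually better to tell Claude than to crash the loop; tool_result blocks accept an is_error flag for exactly this. A sketch of a helper (the function name and error wording are my own, not from the script above):

def make_tool_result(tool_use_id, screenshot=None, error=None):
    """Build a tool_result block: an error message if the action failed, else a screenshot."""
    if error is not None:
        return {
            "type": "tool_result",
            "tool_use_id": tool_use_id,
            "is_error": True,
            "content": [{"type": "text", "text": f"Action failed: {error}"}],
        }
    return {
        "type": "tool_result",
        "tool_use_id": tool_use_id,
        "content": [{
            "type": "image",
            "source": {"type": "base64", "media_type": "image/png", "data": screenshot},
        }],
    }

In the loop, you would wrap execute_action and take_screenshot in a try/except and pass the exception text as error.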

Running the Agent

# Inside Docker container
python first_agent.py

You should see:

  1. Firefox opens
  2. URL bar receives input
  3. Page loads
  4. Claude reads and reports content

Tip: Watch the VNC window to see Claude controlling the desktop in real-time.

In the next module, we'll build more sophisticated desktop automation workflows.
