أول وكيل استخدام حاسوب خاص بك

دعنا نبني وكيلاً بسيطاً يفتح متصفحاً، وينتقل إلى موقع ويب، ويستخرج معلومات.

الوكيل الكامل

import base64
import os
import subprocess
import tempfile

import anthropic

# Module-level API client, constructed with no explicit arguments and used by
# run_agent() for every model request.
client = anthropic.Anthropic()

def take_screenshot():
    """Capture the screen with ``scrot`` and return the PNG as base64 text.

    Each capture is written into its own temporary directory, so the target
    path never exists beforehand. Reusing one fixed file is unsafe: recent
    scrot releases do not overwrite an existing output file unless
    ``--overwrite`` is given, which would leave this function re-reading the
    first capture forever.

    Returns:
        The screenshot's PNG bytes, base64-encoded and decoded to ``str``.

    Raises:
        subprocess.CalledProcessError: if ``scrot`` exits with a non-zero
            status.
        FileNotFoundError: if ``scrot`` is not installed.
    """
    # The directory and the capture inside it are removed when the block exits.
    with tempfile.TemporaryDirectory() as capture_dir:
        png_path = os.path.join(capture_dir, "screen.png")
        subprocess.run(["scrot", png_path], check=True)
        with open(png_path, "rb") as png_file:
            return base64.standard_b64encode(png_file.read()).decode()

def execute_action(action):
    """Perform one computer-tool action on the desktop through ``xdotool``.

    Args:
        action: The ``input`` dict of a ``tool_use`` block. The operation name
            is read from ``"action"`` (the key the computer tool sends) and,
            failing that, from ``"type"`` (the key this function originally
            read). Supported operations:

            * ``mouse_move`` - requires ``"coordinate"`` as ``[x, y]``.
            * ``left_click`` - clicks button 1; when ``"coordinate"`` is
              present the pointer is moved there first.
            * ``type`` - types ``"text"``.
            * ``key`` - presses the key or combo in ``"text"`` (falling back
              to the legacy ``"key"`` field).
            * ``screenshot`` - no-op; the caller captures the screen itself.

            Any other operation is ignored.

    Raises:
        KeyError: if a field required by the operation is missing.
    """
    operation = action.get("action", action.get("type"))

    if operation == "mouse_move":
        target = action["coordinate"]
        subprocess.run(["xdotool", "mousemove", str(target[0]), str(target[1])])

    elif operation == "left_click":
        # The tool may attach the click position to the click itself instead
        # of sending a separate mouse_move first.
        target = action.get("coordinate")
        if target:
            subprocess.run(
                ["xdotool", "mousemove", str(target[0]), str(target[1])]
            )
        subprocess.run(["xdotool", "click", "1"])

    elif operation == "type":
        # "--" stops option parsing so text starting with "-" is typed as-is.
        subprocess.run(["xdotool", "type", "--", action["text"]])

    elif operation == "key":
        keys = action["text"] if "text" in action else action["key"]
        subprocess.run(["xdotool", "key", keys])

    elif operation == "screenshot":
        pass  # the agent loop takes the screenshot after every action

def run_agent(task: str, model: str = "claude-sonnet-4-5-20250929"):
    """Run the screenshot -> model -> action loop until the model is done.

    The conversation starts with the task text plus a screenshot. Every
    ``tool_use`` block in a reply is executed with ``execute_action`` and
    answered with a fresh screenshot. The loop ends as soon as a reply
    contains no tool calls.

    Args:
        task: Natural-language instruction for the agent.
        model: Model ID to request. NOTE(review): the previous hard-coded ID
            ("claude-sonnet-4-5-20250514") does not appear to be a published
            snapshot; confirm the default against the current models list.

    Returns:
        The text of the first text block in the final reply, or the fallback
        completion message when the final reply has no text block.
    """
    def image_block(png_b64):
        # Content block carrying one base64 PNG screenshot.
        return {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": png_b64,
            },
        }

    # Initial message: the task plus the current state of the screen.
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": task},
            image_block(take_screenshot()),
        ],
    }]

    while True:
        # The `betas` argument is only accepted by the beta Messages API;
        # client.messages.create() rejects it.
        response = client.beta.messages.create(
            model=model,
            max_tokens=4096,
            tools=[{
                "type": "computer_20250124",
                "name": "computer",
                "display_width_px": 1024,
                "display_height_px": 768,
                "display_number": 0,
            }],
            betas=["computer-use-2025-01-24"],
            messages=messages,
        )

        messages.append({
            "role": "assistant",
            "content": response.content,
        })

        tool_calls = [
            block for block in response.content if block.type == "tool_use"
        ]

        # Stop on end_turn, and also whenever there is nothing to execute
        # (e.g. a max_tokens stop): an empty tool_result message would be
        # sent on the next request otherwise.
        if response.stop_reason == "end_turn" or not tool_calls:
            for block in response.content:
                if hasattr(block, "text"):
                    return block.text
            return "اكتملت المهمة"

        # Execute each requested action and report the resulting screen.
        tool_results = []
        for call in tool_calls:
            execute_action(call.input)
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": call.id,
                "content": [image_block(take_screenshot())],
            })

        messages.append({
            "role": "user",
            "content": tool_results,
        })

# Run it: hand the agent a task and print its final answer.
task = "افتح Firefox، اذهب إلى example.com، وأخبرني ماذا تقول الصفحة."
result = run_agent(task)
print(result)

شرح المكونات الرئيسية

1. التقاط لقطة الشاشة

نستخدم scrot لالتقاط لقطات الشاشة. تُرمَّز الصورة بصيغة base64 قبل إرسالها إلى واجهة API.

2. تنفيذ الإجراء

يتعامل xdotool مع إجراءات الماوس ولوحة المفاتيح في Linux.

3. حلقة الوكيل

تستمر الحلقة حتى يُرجع كلود stop_reason == "end_turn".

4. نتائج الأداة

بعد كل إجراء، نرسل لقطة شاشة جديدة حتى يتمكن كلود من رؤية النتيجة.

تشغيل الوكيل

# داخل حاوية Docker
python first_agent.py

يجب أن ترى:

Firefox يفتح
شريط URL يستقبل الإدخال
الصفحة تُحمّل
كلود يقرأ ويُبلغ عن المحتوى

نصيحة: شاهد نافذة VNC لترى كلود يتحكم في سطح المكتب في الوقت الفعلي.

في الوحدة التالية، سنبني سير عمل أتمتة سطح مكتب أكثر تطوراً.