Error Handling & Recovery
Debugging Agents
Agent debugging is uniquely challenging due to non-deterministic behavior and complex tool chains. Here are proven strategies for finding and fixing issues.
Comprehensive Logging
Log everything the agent does:
import logging
from datetime import datetime

class AgentLogger:
    def __init__(self, agent_id):
        self.agent_id = agent_id
        self.logger = logging.getLogger(f"agent.{agent_id}")
        self.trace = []

    def log_step(self, step_type, data):
        entry = {
            "timestamp": datetime.now().isoformat(),
            "agent_id": self.agent_id,
            "step_type": step_type,
            "data": data
        }
        self.trace.append(entry)
        self.logger.info(f"[{step_type}] {data}")

    def log_llm_call(self, prompt, response, tokens_used):
        self.log_step("llm_call", {
            "prompt_preview": prompt[:500],
            "response_preview": response[:500],
            "tokens": tokens_used
        })

    def log_tool_call(self, tool_name, params, result, duration_ms):
        self.log_step("tool_call", {
            "tool": tool_name,
            "params": params,
            "result": str(result)[:500],
            "duration_ms": duration_ms
        })

    def log_error(self, error_type, details, stack_trace):
        self.log_step("error", {
            "type": error_type,
            "details": details,
            "stack": stack_trace
        })

    def get_trace(self):
        return self.trace
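A minimal usage sketch, assuming a hypothetical `call_llm` client and `run_tool` dispatcher (both are placeholders, not part of the logger):

import time
import traceback

logger = AgentLogger("support-agent-01")   # hypothetical agent id

def call_llm(prompt):
    # Placeholder for your LLM client; returns (response_text, tokens_used).
    return "I'll look that order up.", 412

def run_tool(name, params):
    # Placeholder for your tool dispatcher.
    return {"status": "shipped"}

prompt = "Where is order #1234?"
response, tokens = call_llm(prompt)
logger.log_llm_call(prompt, response, tokens)

start = time.time()
try:
    result = run_tool("order_lookup", {"order_id": "1234"})
    logger.log_tool_call("order_lookup", {"order_id": "1234"}, result,
                         int((time.time() - start) * 1000))
except Exception as exc:
    logger.log_error(type(exc).__name__, str(exc), traceback.format_exc())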
Trace Visualization
Create readable execution traces:
def visualize_trace(trace):
    """Generate a human-readable execution trace."""
    output = []
    for entry in trace:
        step_type = entry["step_type"]
        data = entry["data"]
        if step_type == "llm_call":
            output.append(f"🤖 LLM Call ({data['tokens']} tokens)")
            output.append(f"   Input: {data['prompt_preview'][:100]}...")
            output.append(f"   Output: {data['response_preview'][:100]}...")
        elif step_type == "tool_call":
            output.append(f"🔧 Tool: {data['tool']} ({data['duration_ms']}ms)")
            output.append(f"   Params: {data['params']}")
            output.append(f"   Result: {data['result'][:100]}...")
        elif step_type == "error":
            output.append(f"❌ Error: {data['type']}")
            output.append(f"   {data['details']}")
    return "\n".join(output)
Replay and Reproduce
Non-deterministic failures are hard to reproduce on demand, so record sessions and replay them against the agent later:
import json

class SessionRecorder:
    def __init__(self, session_id):
        self.session_id = session_id
        self.events = []

    def record(self, event_type, data):
        self.events.append({
            "event_type": event_type,
            "data": data,
            "timestamp": datetime.now().isoformat()
        })

    def save(self, path):
        with open(path, 'w') as f:
            json.dump({
                "session_id": self.session_id,
                "events": self.events
            }, f)

    @classmethod
    def replay(cls, path, agent):
        """Replay a recorded session against an agent"""
        with open(path) as f:
            session = json.load(f)
        for event in session["events"]:
            if event["event_type"] == "user_input":
                response = agent.process(event["data"]["message"])
                print(f"Original: {event['data'].get('original_response')}")
                print(f"Replayed: {response}")
Common Debug Scenarios
1. Agent Loops
def diagnose_loop(trace):
    """Detect repeated action sequences in a trace"""
    actions = [e["data"]["tool"] for e in trace if e["step_type"] == "tool_call"]
    # Look for a window of actions immediately followed by the same window
    for window_size in range(2, len(actions) // 2 + 1):
        for i in range(len(actions) - window_size * 2 + 1):
            window1 = actions[i:i + window_size]
            window2 = actions[i + window_size:i + window_size * 2]
            if window1 == window2:
                return f"Loop detected: {window1} repeated at position {i}"
    return None
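A quick sanity check with a hand-built trace (entries trimmed to the fields the detector reads):

fake_trace = [
    {"step_type": "tool_call", "data": {"tool": t}}
    for t in ["search", "fetch_page", "search", "fetch_page", "summarize"]
]
print(diagnose_loop(fake_trace))
# Loop detected: ['search', 'fetch_page'] repeated at position 0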
2. Tool Failures
def analyze_tool_failures(trace):
    """Summarize tool failure patterns"""
    # Assumes error entries include a "tool" field, e.g. logged via
    # log_step("error", {..., "tool": tool_name}).
    failures = {}
    for entry in trace:
        if entry["step_type"] == "error" and "tool" in entry["data"]:
            tool = entry["data"]["tool"]
            error = entry["data"]["type"]
            if tool not in failures:
                failures[tool] = {}
            if error not in failures[tool]:
                failures[tool][error] = 0
            failures[tool][error] += 1
    return failures
3. Context Overflow
def check_context_usage(trace):
    """Track token usage over time"""
    total_tokens = 0
    max_tokens = 128000  # Adjust per model
    for entry in trace:
        if entry["step_type"] == "llm_call":
            # Each llm_call logs the tokens used for that call, which
            # approximates the current context size.
            total_tokens = entry["data"]["tokens"]
            if total_tokens > max_tokens * 0.9:
                return {
                    "warning": "Context nearly full",
                    "usage": f"{total_tokens}/{max_tokens}",
                    "timestamp": entry["timestamp"]
                }
    return {"status": "ok", "final_usage": total_tokens}
Debugging Tools Integration
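Observability platforms such as LangSmith and Phoenix can capture and visualize these traces automatically, with minimal setup: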
# LangSmith tracing for LangChain (also requires LANGCHAIN_API_KEY to be set)
import os
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "my-agent-debug"

# Phoenix for observability (the exact import path varies by Phoenix version;
# check the Arize Phoenix docs for your release)
from phoenix.trace import langchain as phoenix_langchain
phoenix_langchain.instrument()
Debug Checklist
| Check | Tool | When |
|---|---|---|
| Execution trace | Logger | Every run |
| Token usage | Token counter | Before deploy |
| Tool success rate | Failure analyzer | Weekly |
| Response quality | Evaluation suite | After changes |
| Latency | Performance monitor | Continuous |
Best Practices
- Log prompts and responses (with size limits)
- Record all tool calls with timing
- Enable replay for production issues
- Set up alerts for error spikes (see the sketch after this list)
- Use observability platforms (LangSmith, Phoenix, Arize)
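As a sketch of the alerting idea, assuming errors are logged through the `AgentLogger` above (the 50-step window and 20% threshold are illustrative, not prescriptive):

def error_rate(trace, window=50):
    """Fraction of the most recent `window` steps that were errors."""
    recent = trace[-window:]
    if not recent:
        return 0.0
    return sum(1 for e in recent if e["step_type"] == "error") / len(recent)

rate = error_rate(logger.get_trace())
if rate > 0.2:
    print(f"ALERT: error rate {rate:.0%} over the last 50 steps")
    # In production, route this to your alerting channel instead of stdout.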
Test your error handling knowledge in the module quiz!