Lesson 16 of 20

Error Handling & Recovery

Debugging Agents

3 min read

Agent debugging is uniquely challenging due to non-deterministic behavior and complex tool chains. Here are proven strategies for finding and fixing issues.

Comprehensive Logging

Log everything the agent does:

import logging
from datetime import datetime

class AgentLogger:
    """Collect a structured, in-memory trace of an agent's steps.

    Every step is appended to ``self.trace`` as a dict with the keys
    ``timestamp``, ``agent_id``, ``step_type`` and ``data``, and is also
    sent at INFO level to the ``logging`` logger named ``agent.<agent_id>``.
    """

    # Long text fields are cut to this many characters before being stored.
    PREVIEW_LIMIT = 500

    def __init__(self, agent_id):
        """Create a logger with an empty trace for the given agent id."""
        self.agent_id = agent_id
        self.logger = logging.getLogger(f"agent.{agent_id}")
        self.trace = []

    def log_step(self, step_type, data):
        """Append one trace entry and mirror it to the logging logger.

        Args:
            step_type: Short label for the step (e.g. ``"llm_call"``).
            data: Payload stored under the entry's ``"data"`` key.
        """
        entry = {
            "timestamp": datetime.now().isoformat(),
            "agent_id": self.agent_id,
            "step_type": step_type,
            "data": data,
        }
        self.trace.append(entry)
        # Lazy %-style arguments: the message is only built if INFO is enabled.
        self.logger.info("[%s] %s", step_type, data)

    def log_llm_call(self, prompt, response, tokens_used):
        """Record an LLM call, storing truncated prompt/response previews."""
        self.log_step("llm_call", {
            "prompt_preview": prompt[:self.PREVIEW_LIMIT],
            "response_preview": response[:self.PREVIEW_LIMIT],
            "tokens": tokens_used,
        })

    def log_tool_call(self, tool_name, params, result, duration_ms):
        """Record a tool call; the result is stringified and truncated."""
        self.log_step("tool_call", {
            "tool": tool_name,
            "params": params,
            "result": str(result)[:self.PREVIEW_LIMIT],
            "duration_ms": duration_ms,
        })

    def log_error(self, error_type, details, stack_trace, tool=None):
        """Record an error, optionally tagged with the tool that failed.

        Args:
            error_type: Stored under ``"type"``.
            details: Stored under ``"details"``.
            stack_trace: Stored under ``"stack"``.
            tool: Optional name of the failing tool. When given it is stored
                under ``"tool"``, which lets per-tool failure analysis match
                the entry. When omitted the entry has no ``"tool"`` key, as
                before.
        """
        data = {
            "type": error_type,
            "details": details,
            "stack": stack_trace,
        }
        if tool is not None:
            data["tool"] = tool
        self.log_step("error", data)

    def get_trace(self):
        """Return the list of recorded entries (the live list, not a copy)."""
        return self.trace

Trace Visualization

Create readable execution traces:

def visualize_trace(trace):
    """Render a trace as plain text, one headline plus detail lines per step.

    Handles ``llm_call``, ``tool_call`` and ``error`` entries; entries with
    any other ``step_type`` produce no output. Previews are cut to 100
    characters and always followed by ``...``.

    Returns:
        The rendered lines joined with newlines (empty string for no output).
    """
    depth = 0  # nesting level; it never changes, so the prefix stays empty
    pad = "  " * depth
    rendered = []

    for record in trace:
        kind = record["step_type"]
        payload = record["data"]

        if kind == "llm_call":
            rendered += [
                f"{pad}🤖 LLM Call ({payload['tokens']} tokens)",
                f"{pad}   Input: {payload['prompt_preview'][:100]}...",
                f"{pad}   Output: {payload['response_preview'][:100]}...",
            ]
        elif kind == "tool_call":
            rendered += [
                f"{pad}🔧 Tool: {payload['tool']} ({payload['duration_ms']}ms)",
                f"{pad}   Params: {payload['params']}",
                f"{pad}   Result: {payload['result'][:100]}...",
            ]
        elif kind == "error":
            rendered += [
                f"{pad}❌ Error: {payload['type']}",
                f"{pad}   {payload['details']}",
            ]

    return "\n".join(rendered)

Replay and Reproduce

Save sessions for replay:

class SessionRecorder:
    """Record session events so a run can be saved to disk and replayed."""

    def __init__(self, session_id):
        """Start an empty recording for the given session id."""
        self.session_id = session_id
        self.events = []

    def record(self, event_type, data):
        """Append one timestamped event.

        Args:
            event_type: Label for the event (``replay`` acts on
                ``"user_input"`` events).
            data: Event payload; it must be JSON-serializable for ``save``
                to succeed.
        """
        self.events.append({
            "event_type": event_type,
            "data": data,
            "timestamp": datetime.now().isoformat()
        })

    def save(self, path):
        """Write the session id and all recorded events to ``path`` as JSON."""
        # Imported here so the class works without a module-level `import json`.
        import json

        with open(path, 'w', encoding='utf-8') as f:
            json.dump({
                "session_id": self.session_id,
                "events": self.events
            }, f)

    @classmethod
    def replay(cls, path, agent):
        """Replay a recorded session against ``agent``.

        Each ``"user_input"`` event's ``data["message"]`` is passed to
        ``agent.process``; the recorded ``original_response`` (if any) and
        the new response are printed for comparison. Other events are
        skipped.
        """
        # Imported here so the class works without a module-level `import json`.
        import json

        with open(path, encoding='utf-8') as f:
            session = json.load(f)

        for event in session["events"]:
            if event["event_type"] == "user_input":
                response = agent.process(event["data"]["message"])
                print(f"Original: {event['data'].get('original_response')}")
                print(f"Replayed: {response}")

Common Debug Scenarios

1. Agent Loops

def diagnose_loop(trace):
    """Detect a back-to-back repeated sequence of tool calls in a trace.

    Only ``tool_call`` entries are considered. For every window size from 2
    up to half the number of tool calls, each position is checked for a
    sequence of tools immediately followed by the same sequence.

    Returns:
        A message describing the first repetition found (smallest window
        first, then earliest position), or ``None`` if there is none.
    """
    actions = [e["data"]["tool"] for e in trace if e["step_type"] == "tool_call"]
    total = len(actions)

    # A window of size w needs 2*w actions, so total // 2 is the largest
    # usable size and must be included in the range.
    for window_size in range(2, total // 2 + 1):
        # The last valid start index is total - 2*w, hence the +1 on the bound.
        for i in range(total - window_size * 2 + 1):
            window1 = actions[i:i + window_size]
            window2 = actions[i + window_size:i + window_size * 2]
            if window1 == window2:
                return f"Loop detected: {window1} repeated at position {i}"

    return None

2. Tool Failures

def analyze_tool_failures(trace):
    """Count errors per tool and per error type.

    Only ``error`` entries whose data contains a ``"tool"`` key are counted.

    Returns:
        A nested dict ``{tool: {error_type: count}}``.
    """
    summary = {}

    for record in trace:
        if record["step_type"] != "error":
            continue
        payload = record["data"]
        if "tool" not in payload:
            continue

        # One counter dict per tool, created the first time the tool is seen.
        per_tool = summary.setdefault(payload["tool"], {})
        kind = payload["type"]
        per_tool[kind] = per_tool.get(kind, 0) + 1

    return summary

3. Context Overflow

def check_context_usage(trace, max_tokens=128000, threshold=0.9):
    """Warn when an LLM call's token count nears the context limit.

    Each ``llm_call`` entry's ``data["tokens"]`` is compared on its own
    against ``max_tokens * threshold``; counts are not summed across calls.

    Args:
        trace: Trace entries with ``step_type``, ``data`` and ``timestamp``.
        max_tokens: Context limit to compare against; adjust per model.
        threshold: Fraction of ``max_tokens`` above which a warning is
            returned.

    Returns:
        A warning dict (``warning``, ``usage``, ``timestamp``) for the first
        call above the limit, otherwise ``{"status": "ok", "final_usage": n}``
        where ``n`` is the last call's token count (0 if there were no calls).
    """
    current_tokens = 0
    limit = max_tokens * threshold

    for entry in trace:
        if entry["step_type"] != "llm_call":
            continue

        current_tokens = entry["data"]["tokens"]
        if current_tokens > limit:
            return {
                "warning": "Context nearly full",
                "usage": f"{current_tokens}/{max_tokens}",
                "timestamp": entry["timestamp"],
            }

    return {"status": "ok", "final_usage": current_tokens}

Debugging Tools Integration

# LangSmith integration for LangChain.
# Both settings are written into this process's environment variables.
# Presumably they must be set before LangChain starts running — TODO confirm.
import os
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Presumably the project name under which traces are grouped — verify against the LangSmith docs.
os.environ["LANGCHAIN_PROJECT"] = "my-agent-debug"

# Phoenix for observability
# NOTE(review): confirm that the installed Phoenix version exposes a
# module-level `instrument()` in `phoenix.trace.langchain`; this snippet
# assumes it does.
from phoenix.trace import langchain as phoenix_langchain
phoenix_langchain.instrument()

Debug Checklist

| Check | Tool | When |
| --- | --- | --- |
| Execution trace | Logger | Every run |
| Token usage | Token counter | Before deploy |
| Tool success rate | Failure analyzer | Weekly |
| Response quality | Evaluation suite | After changes |
| Latency | Performance monitor | Continuous |

Best Practices

  • Log prompts and responses (with size limits)
  • Record all tool calls with timing
  • Enable replay for production issues
  • Set up alerts for error spikes
  • Use observability platforms (LangSmith, Phoenix, Arize)

Test your error handling knowledge in the module quiz!

Quick check: how does this lesson land for you?

Quiz

Module 4: Error Handling & Recovery

Take Quiz
FREE WEEKLY NEWSLETTER

Stay on the Nerd Track

One email per week — courses, deep dives, tools, and AI experiments.

No spam. Unsubscribe anytime.