Production & Reliability

Monitoring & Observability

AI systems require specialized monitoring beyond traditional application metrics. This lesson covers what to measure, how to trace LLM calls, and how to build effective dashboards.

The Three Pillars

1. Metrics

Quantitative measurements over time:

from prometheus_client import Counter, Histogram, Gauge
import time

# LLM-specific metrics
llm_requests = Counter(
    "llm_requests_total",
    "Total LLM API requests",
    ["model", "endpoint", "status"]
)

llm_latency = Histogram(
    "llm_latency_seconds",
    "LLM request latency",
    ["model"],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0]
)

llm_tokens = Counter(
    "llm_tokens_total",
    "Total tokens used",
    ["model", "type"]  # type: input/output
)

active_sessions = Gauge(
    "active_agent_sessions",
    "Currently active agent sessions"
)

class MetricsMiddleware:
    def __init__(self, llm_client):
        self.client = llm_client

    async def complete(self, messages, model="gpt-4", **kwargs):
        start = time.time()
        status = "success"

        try:
            response = await self.client.complete(
                messages=messages,
                model=model,
                **kwargs
            )

            # Record token usage
            llm_tokens.labels(model=model, type="input").inc(
                response.usage.prompt_tokens
            )
            llm_tokens.labels(model=model, type="output").inc(
                response.usage.completion_tokens
            )

            return response

        except Exception:
            status = "error"
            raise
        finally:
            # Record request metrics
            llm_requests.labels(
                model=model,
                endpoint="completion",
                status=status
            ).inc()

            llm_latency.labels(model=model).observe(
                time.time() - start
            )
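
The middleware records metrics, but nothing exposes them yet. A minimal wiring sketch, assuming the process can serve a metrics port directly and that `base_client` stands in for whichever LLM SDK you wrap:

from prometheus_client import start_http_server

def instrument(base_client) -> MetricsMiddleware:
    # Expose /metrics on port 8000 so Prometheus can scrape this process;
    # call once at startup, not per request
    start_http_server(8000)
    # Wrap the underlying client so every call is counted and timed
    return MetricsMiddleware(base_client)

Once this runs, queries like rate(llm_requests_total[5m]) in the dashboard section below will return data.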

2. Logs

Structured event records:

import structlog

logger = structlog.get_logger()

class LLMLogger:
    def __init__(self):
        self.logger = structlog.get_logger()

    def log_request(
        self,
        request_id: str,
        model: str,
        messages: list,
        tools: list | None = None
    ):
        self.logger.info(
            "llm_request",
            request_id=request_id,
            model=model,
            message_count=len(messages),
            tool_count=len(tools) if tools else 0,
            # Don't log full messages in production (PII concerns)
            first_message_role=messages[0]["role"] if messages else None
        )

    def log_response(
        self,
        request_id: str,
        model: str,
        latency_ms: float,
        tokens: dict,
        tool_calls: list | None = None
    ):
        self.logger.info(
            "llm_response",
            request_id=request_id,
            model=model,
            latency_ms=latency_ms,
            input_tokens=tokens.get("input", 0),
            output_tokens=tokens.get("output", 0),
            tool_call_count=len(tool_calls) if tool_calls else 0
        )

    def log_error(
        self,
        request_id: str,
        error_type: str,
        error_message: str,
        model: str
    ):
        self.logger.error(
            "llm_error",
            request_id=request_id,
            error_type=error_type,
            error_message=error_message,
            model=model
        )
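
structlog emits whatever its configured processors produce; for log aggregation you usually want one JSON object per line. A minimal configuration sketch (this particular processor chain is an assumption, adjust to your pipeline), followed by example usage:

import uuid
import structlog

# One JSON object per line, with log level and ISO timestamp attached
structlog.configure(
    processors=[
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ]
)

llm_logger = LLMLogger()
request_id = str(uuid.uuid4())  # correlate request, response, and error events

llm_logger.log_request(
    request_id=request_id,
    model="gpt-4",
    messages=[{"role": "user", "content": "Summarize this ticket"}],
)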

3. Traces

Distributed request tracking:

from opentelemetry import trace
from opentelemetry.trace import SpanKind
import uuid

tracer = trace.get_tracer(__name__)

class TracedAgent:
    def __init__(self, llm, tools):
        self.llm = llm
        self.tools = tools

    async def run(self, task: str) -> str:
        # Create root span for entire agent run
        with tracer.start_as_current_span(
            "agent_run",
            kind=SpanKind.SERVER
        ) as root_span:
            request_id = str(uuid.uuid4())
            root_span.set_attribute("request_id", request_id)
            root_span.set_attribute("task_length", len(task))

            messages = [{"role": "user", "content": task}]
            iteration = 0

            while iteration < 10:
                iteration += 1

                # Span for LLM call
                with tracer.start_as_current_span("llm_call") as llm_span:
                    llm_span.set_attribute("iteration", iteration)
                    llm_span.set_attribute("message_count", len(messages))

                    response = await self.llm.complete(messages)

                    llm_span.set_attribute(
                        "has_tool_calls",
                        bool(response.tool_calls)
                    )

                # Record the assistant turn so later iterations see the full
                # conversation (format assumes an OpenAI-style message history)
                messages.append({
                    "role": "assistant",
                    "content": response.content,
                    "tool_calls": response.tool_calls
                })

                if response.tool_calls:
                    for call in response.tool_calls:
                        # Span for each tool execution
                        with tracer.start_as_current_span(
                            f"tool_{call.name}"
                        ) as tool_span:
                            tool_span.set_attribute("tool_name", call.name)

                            result = await self.tools[call.name].execute(
                                call.args
                            )

                            tool_span.set_attribute(
                                "result_length",
                                len(str(result))
                            )

                        messages.append({
                            "role": "tool",
                            "content": result,
                            "tool_call_id": call.id
                        })
                else:
                    root_span.set_attribute("total_iterations", iteration)
                    return response.content

            root_span.set_attribute("timeout", True)
            return "Max iterations reached"

Key Metrics to Track

| Category   | Metric                       | Why It Matters     |
|------------|------------------------------|--------------------|
| Latency    | P50, P95, P99 response time  | User experience    |
| Throughput | Requests per second          | Capacity planning  |
| Tokens     | Input/output per request     | Cost tracking      |
| Errors     | Rate by error type           | Reliability        |
| Tool Usage | Calls per tool               | Optimization       |
| Cache      | Hit rate, miss rate          | Efficiency         |
| Quality    | User ratings, corrections    | Model performance  |
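
The snippets earlier in this lesson only instrument latency, throughput, and tokens. A sketch of how the remaining rows could be captured with the same prometheus_client primitives (the metric names are illustrative, not a standard):

from prometheus_client import Counter, Histogram

# Tool usage: which tools agents call and whether each call succeeded
tool_calls = Counter(
    "agent_tool_calls_total",
    "Tool invocations by agents",
    ["tool", "status"]
)

# Cache efficiency: hit rate = hits / (hits + misses) in PromQL
cache_events = Counter(
    "llm_cache_events_total",
    "Prompt/response cache lookups",
    ["result"]  # result: hit/miss
)

# Quality signal: explicit user ratings of responses on a 1-5 scale
response_ratings = Histogram(
    "llm_response_rating",
    "User ratings of model responses",
    buckets=[1, 2, 3, 4, 5]
)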

Building Dashboards

# Dashboard configuration example
dashboard_config = {
    "title": "AI System Health",
    "panels": [
        {
            "title": "Request Rate",
            "query": "rate(llm_requests_total[5m])",
            "type": "graph"
        },
        {
            "title": "Latency P95",
            "query": "histogram_quantile(0.95, llm_latency_seconds_bucket)",
            "type": "graph"
        },
        {
            "title": "Error Rate",
            "query": "rate(llm_requests_total{status='error'}[5m]) / rate(llm_requests_total[5m])",
            "type": "stat",
            "thresholds": {"warning": 0.01, "critical": 0.05}
        },
        {
            "title": "Cost (Hourly)",
            "query": "sum(increase(llm_tokens_total[1h])) * 0.00001",
            "type": "stat"
        },
        {
            "title": "Active Sessions",
            "query": "active_agent_sessions",
            "type": "gauge"
        }
    ]
}
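
The hourly-cost panel multiplies every token by one flat price, which is only a rough proxy. A sketch of a more faithful estimate that prices input and output tokens separately per model (the rates below are placeholders; use your provider's current pricing):

# Placeholder prices in USD per 1K tokens, not real rates
PRICE_PER_1K = {
    "gpt-4": {"input": 0.03, "output": 0.06},
}

def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimate the cost of a single request in USD."""
    prices = PRICE_PER_1K.get(model, {"input": 0.0, "output": 0.0})
    return (
        input_tokens / 1000 * prices["input"]
        + output_tokens / 1000 * prices["output"]
    )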

Interview Tip

When discussing monitoring:

  1. Business metrics - cover cost and user satisfaction, not just technical health
  2. Alerting strategy - what triggers a page versus a ticket? (see the sketch after this list)
  3. Data retention - how long do you keep traces and logs?
  4. Privacy - never log user PII from prompts
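
As one way to separate pages from tickets, here is a sketch of alert definitions in the same config-as-dict style as the dashboard above (thresholds, durations, and severities are illustrative):

alert_rules = [
    {
        # Page someone: users are actively affected
        "name": "HighErrorRate",
        "query": "sum(rate(llm_requests_total{status='error'}[5m])) / sum(rate(llm_requests_total[5m])) > 0.05",
        "for": "5m",
        "severity": "page"
    },
    {
        # Ticket: worth investigating, not an emergency
        "name": "LatencyP95Degraded",
        "query": "histogram_quantile(0.95, sum(rate(llm_latency_seconds_bucket[5m])) by (le)) > 10",
        "for": "15m",
        "severity": "ticket"
    },
    {
        # Ticket: token spend doubled versus the same hour yesterday
        "name": "TokenSpendSpike",
        "query": "sum(increase(llm_tokens_total[1h])) > 2 * sum(increase(llm_tokens_total[1h] offset 1d))",
        "for": "1h",
        "severity": "ticket"
    }
]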

Next, we'll cover error handling and fallback strategies.
