Production & Reliability

Monitoring & Observability

AI systems require specialized monitoring beyond traditional application metrics. This lesson covers what to measure, how to trace LLM calls, and how to build effective dashboards.

The Three Pillars

1. Metrics

Quantitative measurements over time:

from prometheus_client import Counter, Histogram, Gauge
import time

# LLM-specific metrics
llm_requests = Counter(
    "llm_requests_total",
    "Total LLM API requests",
    ["model", "endpoint", "status"]
)

llm_latency = Histogram(
    "llm_latency_seconds",
    "LLM request latency",
    ["model"],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0]
)

llm_tokens = Counter(
    "llm_tokens_total",
    "Total tokens used",
    ["model", "type"]  # type: input/output
)

active_sessions = Gauge(
    "active_agent_sessions",
    "Currently active agent sessions"
)

class MetricsMiddleware:
    def __init__(self, llm_client):
        self.client = llm_client

    async def complete(self, messages, model="gpt-4", **kwargs):
        start = time.time()
        status = "success"

        try:
            response = await self.client.complete(
                messages=messages,
                model=model,
                **kwargs
            )

            # Record token usage
            llm_tokens.labels(model=model, type="input").inc(
                response.usage.prompt_tokens
            )
            llm_tokens.labels(model=model, type="output").inc(
                response.usage.completion_tokens
            )

            return response

        except Exception:
            status = "error"
            raise
        finally:
            # Record request metrics
            llm_requests.labels(
                model=model,
                endpoint="completion",
                status=status
            ).inc()

            llm_latency.labels(model=model).observe(
                time.time() - start
            )
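
The middleware records metrics, but nothing exposes them yet. A minimal wiring sketch, assuming the process can serve a metrics port directly and that `base_client` stands in for whichever LLM SDK you wrap:

from prometheus_client import start_http_server

def instrument(base_client) -> MetricsMiddleware:
    # Expose /metrics on port 8000 so Prometheus can scrape this process;
    # call once at startup, not per request
    start_http_server(8000)
    # Wrap the underlying client so every call is counted and timed
    return MetricsMiddleware(base_client)

Once this runs, queries like rate(llm_requests_total[5m]) in the dashboard section below will return data.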

2. Logs

Structured event records:

import structlog

logger = structlog.get_logger()

class LLMLogger:
    def __init__(self):
        self.logger = structlog.get_logger()

    def log_request(
        self,
        request_id: str,
        model: str,
        messages: list,
        tools: list | None = None
    ):
        self.logger.info(
            "llm_request",
            request_id=request_id,
            model=model,
            message_count=len(messages),
            tool_count=len(tools) if tools else 0,
            # Don't log full messages in production (PII concerns)
            first_message_role=messages[0]["role"] if messages else None
        )

    def log_response(
        self,
        request_id: str,
        model: str,
        latency_ms: float,
        tokens: dict,
        tool_calls: list | None = None
    ):
        self.logger.info(
            "llm_response",
            request_id=request_id,
            model=model,
            latency_ms=latency_ms,
            input_tokens=tokens.get("input", 0),
            output_tokens=tokens.get("output", 0),
            tool_call_count=len(tool_calls) if tool_calls else 0
        )

    def log_error(
        self,
        request_id: str,
        error_type: str,
        error_message: str,
        model: str
    ):
        self.logger.error(
            "llm_error",
            request_id=request_id,
            error_type=error_type,
            error_message=error_message,
            model=model
        )
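
structlog emits whatever its configured processors produce; for log aggregation you usually want one JSON object per line. A minimal configuration sketch (this particular processor chain is an assumption, adjust to your pipeline), followed by example usage:

import uuid
import structlog

# One JSON object per line, with log level and ISO timestamp attached
structlog.configure(
    processors=[
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ]
)

llm_logger = LLMLogger()
request_id = str(uuid.uuid4())  # correlate request, response, and error events

llm_logger.log_request(
    request_id=request_id,
    model="gpt-4",
    messages=[{"role": "user", "content": "Summarize this ticket"}],
)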

3. Traces

Distributed request tracking:

from opentelemetry import trace
from opentelemetry.trace import SpanKind
import uuid

tracer = trace.get_tracer(__name__)

class TracedAgent:
    def __init__(self, llm, tools):
        self.llm = llm
        self.tools = tools

    async def run(self, task: str) -> str:
        # Create root span for entire agent run
        with tracer.start_as_current_span(
            "agent_run",
            kind=SpanKind.SERVER
        ) as root_span:
            request_id = str(uuid.uuid4())
            root_span.set_attribute("request_id", request_id)
            root_span.set_attribute("task_length", len(task))

            messages = [{"role": "user", "content": task}]
            iteration = 0

            while iteration < 10:
                iteration += 1

                # Span for LLM call
                with tracer.start_as_current_span("llm_call") as llm_span:
                    llm_span.set_attribute("iteration", iteration)
                    llm_span.set_attribute("message_count", len(messages))

                    response = await self.llm.complete(messages)

                    llm_span.set_attribute(
                        "has_tool_calls",
                        bool(response.tool_calls)
                    )

                # Record the assistant turn so later iterations see the full
                # conversation (format assumes an OpenAI-style message history)
                messages.append({
                    "role": "assistant",
                    "content": response.content,
                    "tool_calls": response.tool_calls
                })

                if response.tool_calls:
                    for call in response.tool_calls:
                        # Span for each tool execution
                        with tracer.start_as_current_span(
                            f"tool_{call.name}"
                        ) as tool_span:
                            tool_span.set_attribute("tool_name", call.name)

                            result = await self.tools[call.name].execute(
                                call.args
                            )

                            tool_span.set_attribute(
                                "result_length",
                                len(str(result))
                            )

                        messages.append({
                            "role": "tool",
                            "content": result,
                            "tool_call_id": call.id
                        })
                else:
                    root_span.set_attribute("total_iterations", iteration)
                    return response.content

            root_span.set_attribute("timeout", True)
            return "Max iterations reached"

Key Metrics to Track

| Category   | Metric                       | Why It Matters     |
|------------|------------------------------|--------------------|
| Latency    | P50, P95, P99 response time  | User experience    |
| Throughput | Requests per second          | Capacity planning  |
| Tokens     | Input/output per request     | Cost tracking      |
| Errors     | Rate by error type           | Reliability        |
| Tool Usage | Calls per tool               | Optimization       |
| Cache      | Hit rate, miss rate          | Efficiency         |
| Quality    | User ratings, corrections    | Model performance  |
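
The snippets earlier in this lesson only instrument latency, throughput, and tokens. A sketch of how the remaining rows could be captured with the same prometheus_client primitives (the metric names are illustrative, not a standard):

from prometheus_client import Counter, Histogram

# Tool usage: which tools agents call and whether each call succeeded
tool_calls = Counter(
    "agent_tool_calls_total",
    "Tool invocations by agents",
    ["tool", "status"]
)

# Cache efficiency: hit rate = hits / (hits + misses) in PromQL
cache_events = Counter(
    "llm_cache_events_total",
    "Prompt/response cache lookups",
    ["result"]  # result: hit/miss
)

# Quality signal: explicit user ratings of responses on a 1-5 scale
response_ratings = Histogram(
    "llm_response_rating",
    "User ratings of model responses",
    buckets=[1, 2, 3, 4, 5]
)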

Building Dashboards

# Dashboard configuration example
dashboard_config = {
    "title": "AI System Health",
    "panels": [
        {
            "title": "Request Rate",
            "query": "rate(llm_requests_total[5m])",
            "type": "graph"
        },
        {
            "title": "Latency P95",
            "query": "histogram_quantile(0.95, llm_latency_seconds_bucket)",
            "type": "graph"
        },
        {
            "title": "Error Rate",
            "query": "rate(llm_requests_total{status='error'}[5m]) / rate(llm_requests_total[5m])",
            "type": "stat",
            "thresholds": {"warning": 0.01, "critical": 0.05}
        },
        {
            "title": "Cost (Hourly)",
            "query": "sum(increase(llm_tokens_total[1h])) * 0.00001",
            "type": "stat"
        },
        {
            "title": "Active Sessions",
            "query": "active_agent_sessions",
            "type": "gauge"
        }
    ]
}
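
The hourly-cost panel multiplies every token by one flat price, which is only a rough proxy. A sketch of a more faithful estimate that prices input and output tokens separately per model (the rates below are placeholders; use your provider's current pricing):

# Placeholder prices in USD per 1K tokens, not real rates
PRICE_PER_1K = {
    "gpt-4": {"input": 0.03, "output": 0.06},
}

def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimate the cost of a single request in USD."""
    prices = PRICE_PER_1K.get(model, {"input": 0.0, "output": 0.0})
    return (
        input_tokens / 1000 * prices["input"]
        + output_tokens / 1000 * prices["output"]
    )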

Interview Tip

When discussing monitoring:

  1. Business metrics - cover cost and user satisfaction, not just technical health
  2. Alerting strategy - what triggers a page versus a ticket? (see the sketch after this list)
  3. Data retention - how long do you keep traces and logs?
  4. Privacy - never log user PII from prompts
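
As one way to separate pages from tickets, here is a sketch of alert definitions in the same config-as-dict style as the dashboard above (thresholds, durations, and severities are illustrative):

alert_rules = [
    {
        # Page someone: users are actively affected
        "name": "HighErrorRate",
        "query": "sum(rate(llm_requests_total{status='error'}[5m])) / sum(rate(llm_requests_total[5m])) > 0.05",
        "for": "5m",
        "severity": "page"
    },
    {
        # Ticket: worth investigating, not an emergency
        "name": "LatencyP95Degraded",
        "query": "histogram_quantile(0.95, sum(rate(llm_latency_seconds_bucket[5m])) by (le)) > 10",
        "for": "15m",
        "severity": "ticket"
    },
    {
        # Ticket: token spend doubled versus the same hour yesterday
        "name": "TokenSpendSpike",
        "query": "sum(increase(llm_tokens_total[1h])) > 2 * sum(increase(llm_tokens_total[1h] offset 1d))",
        "for": "1h",
        "severity": "ticket"
    }
]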

Next, we'll cover error handling and fallback strategies.
