Monitoring & Observability
AI systems require specialized monitoring beyond traditional application metrics. This lesson covers what to measure, how to trace LLM calls, and how to build effective dashboards.
The Three Pillars
1. Metrics
Quantitative measurements over time:
from prometheus_client import Counter, Histogram, Gauge
import time
# LLM-specific metrics
llm_requests = Counter(
    "llm_requests_total",
    "Total LLM API requests",
    ["model", "endpoint", "status"]
)

llm_latency = Histogram(
    "llm_latency_seconds",
    "LLM request latency",
    ["model"],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0]
)

llm_tokens = Counter(
    "llm_tokens_total",
    "Total tokens used",
    ["model", "type"]  # the "type" label is either input or output
)

active_sessions = Gauge(
    "active_agent_sessions",
    "Currently active agent sessions"
)
class MetricsMiddleware:
    def __init__(self, llm_client):
        self.client = llm_client

    async def complete(self, messages, model="gpt-4", **kwargs):
        start = time.time()
        status = "success"

        try:
            response = await self.client.complete(
                messages=messages,
                model=model,
                **kwargs
            )

            # Record token usage
            llm_tokens.labels(model=model, type="input").inc(
                response.usage.prompt_tokens
            )
            llm_tokens.labels(model=model, type="output").inc(
                response.usage.completion_tokens
            )

            return response
        except Exception:
            status = "error"
            raise
        finally:
            # Record request metrics regardless of success or failure
            llm_requests.labels(
                model=model,
                endpoint="completion",
                status=status
            ).inc()
            llm_latency.labels(model=model).observe(
                time.time() - start
            )
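To make these counters visible to Prometheus, the process also needs to expose a /metrics endpoint. Here is a minimal usage sketch using prometheus_client's built-in HTTP exporter; `OpenAIClient` is a hypothetical stand-in for whatever async client you actually wrap:

import asyncio
from prometheus_client import start_http_server

async def main():
    # Expose /metrics on port 9090 for Prometheus to scrape
    start_http_server(9090)

    # OpenAIClient is a placeholder for your real async LLM client
    client = MetricsMiddleware(OpenAIClient(api_key="..."))
    response = await client.complete(
        messages=[{"role": "user", "content": "Summarize this report."}]
    )
    print(response.content)

asyncio.run(main())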
2. Logs
Structured event records:
import structlog
class LLMLogger:
    def __init__(self):
        self.logger = structlog.get_logger()

    def log_request(
        self,
        request_id: str,
        model: str,
        messages: list,
        tools: list | None = None
    ):
        self.logger.info(
            "llm_request",
            request_id=request_id,
            model=model,
            message_count=len(messages),
            tool_count=len(tools) if tools else 0,
            # Don't log full messages in production (PII concerns)
            first_message_role=messages[0]["role"] if messages else None
        )

    def log_response(
        self,
        request_id: str,
        model: str,
        latency_ms: float,
        tokens: dict,
        tool_calls: list | None = None
    ):
        self.logger.info(
            "llm_response",
            request_id=request_id,
            model=model,
            latency_ms=latency_ms,
            input_tokens=tokens.get("input", 0),
            output_tokens=tokens.get("output", 0),
            tool_call_count=len(tool_calls) if tool_calls else 0
        )

    def log_error(
        self,
        request_id: str,
        error_type: str,
        error_message: str,
        model: str
    ):
        self.logger.error(
            "llm_error",
            request_id=request_id,
            error_type=error_type,
            error_message=error_message,
            model=model
        )
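Structured logging only pays off if the output is machine-parseable. A minimal structlog configuration sketch that renders every event as JSON with an ISO timestamp and log level (adjust the processor chain to fit your logging pipeline):

import structlog

# Render every log event as a JSON object with timestamp and level attached
structlog.configure(
    processors=[
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ]
)

log = LLMLogger()
log.log_request(
    request_id="req-123",
    model="gpt-4",
    messages=[{"role": "user", "content": "hello"}]
)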
3. Traces
Distributed request tracking:
from opentelemetry import trace
from opentelemetry.trace import SpanKind
import uuid
tracer = trace.get_tracer(__name__)
class TracedAgent:
    def __init__(self, llm, tools):
        self.llm = llm
        self.tools = tools

    async def run(self, task: str) -> str:
        # Create root span for entire agent run
        with tracer.start_as_current_span(
            "agent_run",
            kind=SpanKind.SERVER
        ) as root_span:
            request_id = str(uuid.uuid4())
            root_span.set_attribute("request_id", request_id)
            root_span.set_attribute("task_length", len(task))

            messages = [{"role": "user", "content": task}]
            iteration = 0

            while iteration < 10:
                iteration += 1

                # Span for LLM call
                with tracer.start_as_current_span("llm_call") as llm_span:
                    llm_span.set_attribute("iteration", iteration)
                    llm_span.set_attribute("message_count", len(messages))

                    response = await self.llm.complete(messages)

                    llm_span.set_attribute(
                        "has_tool_calls",
                        bool(response.tool_calls)
                    )

                if response.tool_calls:
                    # Keep the assistant turn in history so tool results have context
                    messages.append({
                        "role": "assistant",
                        "content": response.content,
                        "tool_calls": response.tool_calls
                    })
                    for call in response.tool_calls:
                        # Span for each tool execution
                        with tracer.start_as_current_span(
                            f"tool_{call.name}"
                        ) as tool_span:
                            tool_span.set_attribute("tool_name", call.name)

                            result = await self.tools[call.name].execute(
                                call.args
                            )
                            tool_span.set_attribute(
                                "result_length",
                                len(str(result))
                            )

                            messages.append({
                                "role": "tool",
                                "content": str(result),
                                "tool_call_id": call.id
                            })
                else:
                    root_span.set_attribute("total_iterations", iteration)
                    return response.content

            root_span.set_attribute("max_iterations_reached", True)
            return "Max iterations reached"
Key Metrics to Track
| Category | Metric | Why It Matters |
|---|---|---|
| Latency | P50, P95, P99 response time | User experience |
| Throughput | Requests per second | Capacity planning |
| Tokens | Input/output per request | Cost tracking |
| Errors | Rate by error type | Reliability |
| Tool Usage | Calls per tool | Optimization |
| Cache | Hit rate, miss rate | Efficiency |
| Quality | User ratings, corrections | Model performance |
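The token counters above feed directly into cost tracking. A rough cost-estimation helper; the per-million-token prices are placeholders and should be replaced with your provider's current rates:

# Placeholder prices per million tokens; substitute your provider's actual rates
PRICE_PER_MILLION_TOKENS = {
    "gpt-4": {"input": 30.00, "output": 60.00},
}

def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Approximate dollar cost of a single request."""
    prices = PRICE_PER_MILLION_TOKENS[model]
    return (
        input_tokens / 1_000_000 * prices["input"]
        + output_tokens / 1_000_000 * prices["output"]
    )

# Example: 1,200 input tokens and 400 output tokens
print(estimate_cost("gpt-4", 1_200, 400))  # ~0.06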
Building Dashboards
# Dashboard configuration example
dashboard_config = {
    "title": "AI System Health",
    "panels": [
        {
            "title": "Request Rate",
            "query": "rate(llm_requests_total[5m])",
            "type": "graph"
        },
        {
            "title": "Latency P95",
            "query": "histogram_quantile(0.95, rate(llm_latency_seconds_bucket[5m]))",
            "type": "graph"
        },
        {
            "title": "Error Rate",
            "query": "rate(llm_requests_total{status='error'}[5m]) / rate(llm_requests_total[5m])",
            "type": "stat",
            "thresholds": {"warning": 0.01, "critical": 0.05}
        },
        {
            "title": "Cost (Hourly)",
            "query": "sum(increase(llm_tokens_total[1h])) * 0.00001",
            "type": "stat"
        },
        {
            "title": "Active Sessions",
            "query": "active_agent_sessions",
            "type": "gauge"
        }
    ]
}
Interview Tip
When discussing monitoring:
- Business metrics - Track cost and user satisfaction, not just technical health
- Alerting strategy - What triggers a page vs. a ticket? (see the sketch below)
- Data retention - How long to keep traces and logs?
- Privacy - Don't log user PII from prompts
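One way to make the page-vs-ticket distinction concrete is to keep two severity tiers, as in this illustrative sketch (the PromQL expressions reuse the metrics defined earlier; the thresholds are arbitrary):

# Illustrative severity tiers: page on user-facing breakage, ticket on slow burn
alert_rules = {
    "page": {
        "llm_error_rate_high": {
            "expr": "rate(llm_requests_total{status='error'}[5m]) / rate(llm_requests_total[5m]) > 0.05",
            "for": "5m"
        }
    },
    "ticket": {
        "llm_token_spend_doubled": {
            "expr": "sum(increase(llm_tokens_total[6h])) > 2 * sum(increase(llm_tokens_total[6h] offset 1d))",
            "for": "1h"
        }
    }
}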
Next, we'll cover error handling and fallback strategies.