Production Deployment & Observability

Monitoring and Metrics

Production guardrails require comprehensive monitoring to track safety, performance, and business impact. This lesson covers essential metrics and monitoring strategies.

Key Guardrail Metrics

Safety Metrics

| Metric | Description | Target |
| --- | --- | --- |
| Block Rate | % of requests blocked | < 5% in production |
| False Positive Rate | % of safe content incorrectly blocked | < 1% |
| False Negative Rate | % of unsafe content that passes | < 0.1% |
| Category Distribution | Breakdown of violations by type | Varies |
| Escalation Rate | % of requests requiring human review | < 2% |
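
These targets are only actionable if every team computes the rates the same way. A minimal sketch of the arithmetic over a labeled evaluation set (the SafetyMetrics dataclass and compute_safety_metrics helper are illustrative names, not from any library):

from dataclasses import dataclass

@dataclass
class SafetyMetrics:
    block_rate: float           # share of all requests blocked
    false_positive_rate: float  # share of safe requests blocked
    false_negative_rate: float  # share of unsafe requests passed

def compute_safety_metrics(blocked: list[bool], unsafe: list[bool]) -> SafetyMetrics:
    """Derive safety rates from per-request decisions and ground-truth labels."""
    safe_blocked = [b for b, u in zip(blocked, unsafe) if not u]
    unsafe_passed = [not b for b, u in zip(blocked, unsafe) if u]
    return SafetyMetrics(
        block_rate=sum(blocked) / max(len(blocked), 1),
        false_positive_rate=sum(safe_blocked) / max(len(safe_blocked), 1),
        false_negative_rate=sum(unsafe_passed) / max(len(unsafe_passed), 1)
    )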

Performance Metrics

| Metric | Description | Target |
| --- | --- | --- |
| P50 Latency | Median guardrail processing time | < 50 ms |
| P99 Latency | 99th-percentile processing time | < 200 ms |
| Throughput | Requests handled per second | Varies |
| Error Rate | Guardrail check failures | < 0.1% |
| Timeout Rate | Checks exceeding the time budget | < 0.01% |
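
The latency and timeout targets above imply a hard time budget for each check. One way to enforce it is asyncio.wait_for, sketched below; the 200 ms budget and the fail-open fallback are illustrative policy choices, not requirements:

import asyncio

async def check_with_budget(check, text: str, budget_s: float = 0.2):
    """Run a guardrail check under a hard time budget."""
    try:
        return await asyncio.wait_for(check(text), timeout=budget_s)
    except asyncio.TimeoutError:
        # Count this toward the timeout-rate metric, then fail open
        # (returning None lets the caller decide to pass or escalate)
        return None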

Prometheus Metrics Implementation

from prometheus_client import Counter, Histogram, Gauge
import time
from functools import wraps

# Define metrics
GUARDRAIL_REQUESTS = Counter(
    'guardrail_requests_total',
    'Total guardrail requests',
    ['rail_type', 'result']
)

GUARDRAIL_LATENCY = Histogram(
    'guardrail_latency_seconds',
    'Guardrail processing latency',
    ['rail_type'],
    buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]
)

GUARDRAIL_BLOCKS = Counter(
    'guardrail_blocks_total',
    'Total blocked requests',
    ['rail_type', 'category']
)

ACTIVE_GUARDRAIL_CHECKS = Gauge(
    'guardrail_active_checks',
    'Currently running guardrail checks',
    ['rail_type']
)

class MetricsMiddleware:
    """Middleware for tracking guardrail metrics."""

    def __init__(self, rail_type: str):
        self.rail_type = rail_type

    def __call__(self, func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            ACTIVE_GUARDRAIL_CHECKS.labels(rail_type=self.rail_type).inc()
            start_time = time.time()

            try:
                result = await func(*args, **kwargs)

                # Track outcome; assumes the wrapped check returns a result
                # object exposing `blocked` and `categories` attributes
                outcome = "blocked" if result.blocked else "passed"
                GUARDRAIL_REQUESTS.labels(
                    rail_type=self.rail_type,
                    result=outcome
                ).inc()

                if result.blocked:
                    for category in result.categories:
                        GUARDRAIL_BLOCKS.labels(
                            rail_type=self.rail_type,
                            category=category
                        ).inc()

                return result

            except Exception:
                GUARDRAIL_REQUESTS.labels(
                    rail_type=self.rail_type,
                    result="error"
                ).inc()
                raise

            finally:
                duration = time.time() - start_time
                GUARDRAIL_LATENCY.labels(
                    rail_type=self.rail_type
                ).observe(duration)
                ACTIVE_GUARDRAIL_CHECKS.labels(rail_type=self.rail_type).dec()

        return wrapper

# Usage: the decorated check must return an object exposing
# `blocked` and `categories`, e.g. a small dataclass
from dataclasses import dataclass, field

@dataclass
class GuardrailResult:
    blocked: bool = False
    categories: list = field(default_factory=list)

@MetricsMiddleware(rail_type="toxicity")
async def check_toxicity(text: str) -> GuardrailResult:
    # Real toxicity-detection logic goes here
    return GuardrailResult()
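
For Prometheus to scrape these counters and histograms, the process has to expose them over HTTP. The prometheus_client library ships a small exporter for exactly this; port 8000 below is an arbitrary choice:

from prometheus_client import start_http_server

# Expose every registered metric at http://localhost:8000/metrics
start_http_server(8000)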

Structured Logging

import structlog
from typing import Any, Dict, Optional

# Configure structured logging
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer()
    ],
    wrapper_class=structlog.stdlib.BoundLogger,
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
)

logger = structlog.get_logger()

class GuardrailLogger:
    """Structured logging for guardrail events."""

    def log_check(
        self,
        request_id: str,
        rail_type: str,
        input_text: str,
        result: Dict[str, Any],
        latency_ms: float
    ):
        """Log a guardrail check result."""
        logger.info(
            "guardrail_check",
            request_id=request_id,
            rail_type=rail_type,
            input_length=len(input_text),
            result=result.get("decision"),
            categories=result.get("categories", []),
            confidence=result.get("confidence"),
            # the TimeStamper processor already adds an ISO timestamp
            latency_ms=latency_ms
        )

    def log_block(
        self,
        request_id: str,
        rail_type: str,
        reason: str,
        categories: list,
        input_preview: Optional[str] = None
    ):
        """Log a blocked request."""
        logger.warning(
            "guardrail_block",
            request_id=request_id,
            rail_type=rail_type,
            reason=reason,
            categories=categories,
            input_preview=input_preview[:100] if input_preview else None
        )

    def log_error(
        self,
        request_id: str,
        rail_type: str,
        error: Exception,
        fallback_action: str
    ):
        """Log a guardrail error."""
        logger.error(
            "guardrail_error",
            request_id=request_id,
            rail_type=rail_type,
            error_type=type(error).__name__,
            error_message=str(error),
            fallback_action=fallback_action
        )
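
A typical call site wires these methods around a single check; the request ID and result payload below are illustrative values:

guardrail_logger = GuardrailLogger()

guardrail_logger.log_check(
    request_id="req-123",
    rail_type="toxicity",
    input_text="example user message",
    result={"decision": "blocked", "categories": ["harassment"], "confidence": 0.92},
    latency_ms=41.7
)
guardrail_logger.log_block(
    request_id="req-123",
    rail_type="toxicity",
    reason="toxicity score above threshold",
    categories=["harassment"],
    input_preview="example user message"
)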

Real-time Dashboard

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from collections import defaultdict

@dataclass
class MetricWindow:
    """Rolling window for metric aggregation."""
    window_seconds: int = 60
    # default_factory avoids the mutable-default pitfall and keeps any
    # caller-supplied buckets instead of silently discarding them
    buckets: dict = field(default_factory=lambda: defaultdict(list))

    def record(self, key: str, value: float):
        now = datetime.now()
        self.buckets[key].append((now, value))
        self._cleanup(key)

    def _cleanup(self, key: str):
        cutoff = datetime.now() - timedelta(seconds=self.window_seconds)
        self.buckets[key] = [
            (ts, v) for ts, v in self.buckets[key]
            if ts > cutoff
        ]

    def get_stats(self, key: str) -> dict:
        values = [v for _, v in self.buckets[key]]
        if not values:
            # include every key the dashboard reads to avoid KeyErrors
            return {"count": 0, "avg": 0, "p50": 0, "p99": 0}

        values.sort()
        return {
            "count": len(values),
            "avg": sum(values) / len(values),
            "p50": values[len(values) // 2],
            "p99": values[int(len(values) * 0.99)]
        }

class RealTimeDashboard:
    """Real-time metrics dashboard."""

    def __init__(self):
        self.latency = MetricWindow(window_seconds=60)
        self.block_rates = MetricWindow(window_seconds=300)
        self.error_counts = defaultdict(int)

    def record_check(
        self,
        rail_type: str,
        latency_ms: float,
        blocked: bool
    ):
        self.latency.record(rail_type, latency_ms)
        self.block_rates.record(rail_type, 1 if blocked else 0)

    def record_error(self, rail_type: str):
        self.error_counts[rail_type] += 1

    def get_dashboard_data(self) -> dict:
        """Get current dashboard metrics."""
        data = {}
        for rail_type in self.latency.buckets:
            latency_stats = self.latency.get_stats(rail_type)
            block_stats = self.block_rates.get_stats(rail_type)

            data[rail_type] = {
                "latency_ms": {
                    "avg": latency_stats["avg"],
                    "p50": latency_stats["p50"],
                    "p99": latency_stats["p99"]
                },
                "requests_per_minute": latency_stats["count"],
                "block_rate": block_stats["avg"] * 100,
                "errors": self.error_counts[rail_type]
            }
        return data
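
Feeding the dashboard is one record_check call per guardrail run; the values below are illustrative:

dashboard = RealTimeDashboard()

# After each guardrail check:
dashboard.record_check(rail_type="toxicity", latency_ms=42.0, blocked=False)
dashboard.record_check(rail_type="toxicity", latency_ms=180.0, blocked=True)

# Serve this from an internal endpoint or print it for debugging
print(dashboard.get_dashboard_data())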

Alerting Rules

# prometheus_alerts.yml
groups:
  - name: guardrails
    rules:
      - alert: HighGuardrailLatency
        expr: histogram_quantile(0.99, sum by (le, rail_type) (rate(guardrail_latency_seconds_bucket[5m]))) > 0.2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Guardrail P99 latency above 200ms"

      - alert: HighBlockRate
        expr: sum by (rail_type) (rate(guardrail_blocks_total[5m])) / sum by (rail_type) (rate(guardrail_requests_total[5m])) > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Block rate above 10%"

      - alert: GuardrailErrors
        expr: sum(rate(guardrail_requests_total{result="error"}[5m])) / sum(rate(guardrail_requests_total[5m])) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Guardrail error rate above 1%"

      - alert: UnusualViolationSpike
        expr: rate(guardrail_blocks_total{category="hate_speech"}[5m]) > 2 * avg_over_time(rate(guardrail_blocks_total{category="hate_speech"}[5m])[1h:5m])
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Unusual spike in hate speech violations"

Monitoring Tip: Track both safety and performance metrics. A guardrail that's 100% accurate but adds 5 seconds of latency is unusable. Balance thoroughness with user experience.

Next: A/B testing guardrail configurations.
