Production Deployment & Observability
A/B Testing Guardrails
3 min read
A/B testing guardrail configurations lets you balance safety against user experience with production data rather than guesswork. This lesson covers how to run controlled experiments on guardrail changes.
Why A/B Test Guardrails?
- Threshold Optimization: Find optimal toxicity thresholds
- Latency Impact: Measure new classifier latency in production
- False Positive Reduction: Test relaxed rules on safe traffic segments
- New Model Rollout: Gradually deploy new safety classifiers
Experiment Framework
from dataclasses import dataclass
from typing import Optional
import hashlib
import random

@dataclass
class Experiment:
    """A/B test experiment configuration."""
    name: str
    control_weight: float = 0.5  # Fraction of traffic assigned to control
    treatment_config: Optional[dict] = None
    control_config: Optional[dict] = None
    user_sticky: bool = True  # Same user always gets the same variant

class ExperimentManager:
    """Manage guardrail A/B experiments."""

    def __init__(self):
        self.experiments: dict[str, Experiment] = {}

    def register_experiment(self, experiment: Experiment):
        self.experiments[experiment.name] = experiment

    def get_variant(
        self,
        experiment_name: str,
        user_id: Optional[str] = None,
        request_id: Optional[str] = None
    ) -> tuple[str, dict]:
        """
        Get the experiment variant for a request.

        Returns:
            tuple of (variant_name, config)
        """
        experiment = self.experiments.get(experiment_name)
        if not experiment:
            return ("control", {})

        # Determine assignment
        if experiment.user_sticky and user_id:
            # Hash user_id for a consistent, sticky assignment
            hash_input = f"{experiment_name}:{user_id}"
            hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
            assignment = (hash_value % 100) / 100
        else:
            assignment = random.random()

        if assignment < experiment.control_weight:
            return ("control", experiment.control_config or {})
        else:
            return ("treatment", experiment.treatment_config or {})
# Usage
manager = ExperimentManager()

# Register experiment
manager.register_experiment(Experiment(
    name="toxicity_threshold_test",
    control_weight=0.5,
    control_config={"threshold": 0.8},
    treatment_config={"threshold": 0.6}
))

# Get variant
variant, config = manager.get_variant(
    "toxicity_threshold_test",
    user_id="user_123"
)
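Because assignment hashes the experiment name together with the user ID, a user keeps the same variant across requests, and different experiments bucket users independently. A quick sanity check (hypothetical user IDs):

# Sticky assignment: repeated lookups for the same user never flip variants
v1, _ = manager.get_variant("toxicity_threshold_test", user_id="user_123")
v2, _ = manager.get_variant("toxicity_threshold_test", user_id="user_123")
assert v1 == v2

# Rough check of the 50/50 split across many hypothetical users
from collections import Counter
counts = Counter(
    manager.get_variant("toxicity_threshold_test", user_id=f"user_{i}")[0]
    for i in range(10_000)
)
print(counts)  # Expect roughly even control/treatment buckets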
Guardrail Experiment Wrapper
from typing import Any, Callable, Dict, Optional
import time

from structlog import get_logger

class ExperimentalGuardrail:
    """Guardrail with A/B testing support."""

    def __init__(
        self,
        base_guardrail: Callable,
        experiment_manager: ExperimentManager,
        experiment_name: str
    ):
        self.base_guardrail = base_guardrail
        self.experiment_manager = experiment_manager
        self.experiment_name = experiment_name

    async def check(
        self,
        content: str,
        user_id: Optional[str] = None,
        request_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run the guardrail with the experiment's variant configuration."""
        # Get experiment variant
        variant, config = self.experiment_manager.get_variant(
            self.experiment_name,
            user_id=user_id,
            request_id=request_id
        )

        start_time = time.perf_counter()  # Monotonic clock for latency measurement

        # Run guardrail with variant config
        result = await self.base_guardrail(content, **config)

        latency_ms = (time.perf_counter() - start_time) * 1000

        # Log experiment data
        await self._log_experiment_result(
            variant=variant,
            config=config,
            result=result,
            latency_ms=latency_ms,
            user_id=user_id,
            request_id=request_id
        )

        return {
            **result,
            "experiment_variant": variant,
            "experiment_name": self.experiment_name
        }

    async def _log_experiment_result(
        self,
        variant: str,
        config: dict,
        result: dict,
        latency_ms: float,
        user_id: Optional[str],
        request_id: Optional[str]
    ):
        """Log experiment result for analysis."""
        logger = get_logger()
        logger.info(
            "experiment_result",
            experiment_name=self.experiment_name,
            variant=variant,
            config=config,
            blocked=result.get("blocked", False),
            categories=result.get("categories", []),
            confidence=result.get("confidence"),
            latency_ms=latency_ms,
            user_id=user_id,
            request_id=request_id
        )
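Wiring it together: the scorer below is a hypothetical stand-in so the example runs end to end; in production the base guardrail would call a real toxicity classifier.

import asyncio

# Hypothetical async base guardrail; a real one would call a toxicity
# classifier. It accepts the experiment-supplied threshold as a kwarg.
async def toxicity_guardrail(content: str, threshold: float = 0.8) -> dict:
    score = (len(content) % 10) / 10  # Dummy score, for illustration only
    return {"blocked": score > threshold, "confidence": score, "categories": []}

guardrail = ExperimentalGuardrail(
    base_guardrail=toxicity_guardrail,
    experiment_manager=manager,
    experiment_name="toxicity_threshold_test"
)

result = asyncio.run(guardrail.check("some user content", user_id="user_123"))
print(result["experiment_variant"], result["blocked"])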
Statistical Analysis
from dataclasses import dataclass

from scipy import stats
import numpy as np

@dataclass
class ExperimentResults:
    control_blocked: int
    control_total: int
    treatment_blocked: int
    treatment_total: int
    control_latency: list[float]
    treatment_latency: list[float]

class ExperimentAnalyzer:
    """Analyze A/B test results."""

    def analyze(self, results: ExperimentResults) -> dict:
        """Perform statistical analysis on experiment results."""
        # Block rate comparison
        control_rate = results.control_blocked / results.control_total
        treatment_rate = results.treatment_blocked / results.treatment_total

        # Chi-squared test for block rates
        contingency = [
            [results.control_blocked, results.control_total - results.control_blocked],
            [results.treatment_blocked, results.treatment_total - results.treatment_blocked]
        ]
        chi2, p_value_blocks, _, _ = stats.chi2_contingency(contingency)

        # T-test for latency
        t_stat, p_value_latency = stats.ttest_ind(
            results.control_latency,
            results.treatment_latency
        )

        # Effect size (Cohen's d for latency)
        cohens_d = self._cohens_d(
            results.control_latency,
            results.treatment_latency
        )

        return {
            "block_rate": {
                "control": control_rate,
                "treatment": treatment_rate,
                "relative_change": (treatment_rate - control_rate) / control_rate * 100,
                "p_value": p_value_blocks,
                "significant": p_value_blocks < 0.05
            },
            "latency": {
                "control_mean": np.mean(results.control_latency),
                "treatment_mean": np.mean(results.treatment_latency),
                "control_p99": np.percentile(results.control_latency, 99),
                "treatment_p99": np.percentile(results.treatment_latency, 99),
                "p_value": p_value_latency,
                "cohens_d": cohens_d,
                "significant": p_value_latency < 0.05
            },
            "recommendation": self._get_recommendation(
                control_rate, treatment_rate,
                np.mean(results.control_latency),
                np.mean(results.treatment_latency),
                p_value_blocks
            )
        }

    def _cohens_d(self, group1: list, group2: list) -> float:
        """Calculate Cohen's d effect size using pooled sample variance."""
        n1, n2 = len(group1), len(group2)
        # ddof=1 gives the sample variance the pooled-variance formula expects
        var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
        pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
        return (np.mean(group1) - np.mean(group2)) / pooled_std

    def _get_recommendation(
        self,
        control_rate: float,
        treatment_rate: float,
        control_latency: float,
        treatment_latency: float,
        p_value: float
    ) -> str:
        """Generate experiment recommendation."""
        if p_value >= 0.05:
            return "No significant difference. Continue experiment or maintain control."

        rate_change = (treatment_rate - control_rate) / control_rate * 100
        latency_change = (treatment_latency - control_latency) / control_latency * 100

        if rate_change < -10 and latency_change < 10:
            return "SHIP: Treatment reduces blocks without latency impact."
        elif rate_change > 20:
            return "REJECT: Treatment blocks too much content."
        elif latency_change > 20:
            return "REJECT: Treatment adds too much latency."
        else:
            return "INVESTIGATE: Mixed results, review qualitative data."
Gradual Rollout
from datetime import datetime
import hashlib

class GradualRollout:
    """Gradually roll out guardrail changes."""

    def __init__(
        self,
        start_date: datetime,
        end_date: datetime,
        start_percentage: float = 5.0,
        end_percentage: float = 100.0
    ):
        self.start_date = start_date
        self.end_date = end_date
        self.start_percentage = start_percentage
        self.end_percentage = end_percentage

    def get_rollout_percentage(self) -> float:
        """Get the current rollout percentage, interpolated linearly over time."""
        now = datetime.now()

        if now < self.start_date:
            return 0.0
        if now > self.end_date:
            return self.end_percentage

        total_duration = (self.end_date - self.start_date).total_seconds()
        elapsed = (now - self.start_date).total_seconds()
        progress = elapsed / total_duration

        return self.start_percentage + (
            self.end_percentage - self.start_percentage
        ) * progress

    def should_use_new_guardrail(self, user_id: str) -> bool:
        """Determine if a user should get the new guardrail."""
        percentage = self.get_rollout_percentage()

        # Consistent hash: a user, once included, stays included
        # because the rollout percentage only grows
        hash_value = int(hashlib.md5(user_id.encode()).hexdigest(), 16)
        user_bucket = hash_value % 100

        return user_bucket < percentage
# Usage
rollout = GradualRollout(
    start_date=datetime(2026, 1, 15),
    end_date=datetime(2026, 1, 22),
    start_percentage=5.0,
    end_percentage=100.0
)

# Check if user should get new guardrail
if rollout.should_use_new_guardrail(user_id="user_123"):
    result = await new_guardrail.check(content)
else:
    result = await old_guardrail.check(content)
A/B Testing Tip: Always have a rollback plan. If the treatment shows significantly worse safety (more false negatives), stop the experiment immediately and revert all traffic to control.
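A minimal sketch of such a rollback trigger, assuming a hypothetical get_recent_metrics() helper that returns per-variant totals and downstream-caught misses (substitute your own metrics store):

# Hypothetical kill switch: stop the experiment if the treatment's miss
# rate (unsafe content it failed to block, caught downstream) regresses.
def should_rollback(metrics: dict, max_relative_miss_increase: float = 0.2) -> bool:
    control_miss = metrics["control_misses"] / max(metrics["control_total"], 1)
    treatment_miss = metrics["treatment_misses"] / max(metrics["treatment_total"], 1)
    if control_miss == 0:
        return treatment_miss > 0  # Any regression from a clean baseline
    return (treatment_miss - control_miss) / control_miss > max_relative_miss_increase

# Run on a schedule; unregistering routes all traffic back to control,
# since get_variant() returns ("control", {}) for unknown experiments.
# if should_rollback(get_recent_metrics("toxicity_threshold_test")):
#     manager.experiments.pop("toxicity_threshold_test", None)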
Next: Logging, auditing, and compliance.