Safety Classifiers Deep Dive
Classifier Benchmarks and Comparison
Choosing the right safety classifier requires understanding real-world performance trade-offs. This lesson provides benchmark comparisons to guide production decisions.
Benchmark Overview
| Classifier | F1 Score | Precision | Recall | Latency (GPU) | Memory |
|---|---|---|---|---|---|
| LlamaGuard 3 1B | 0.81 | 0.79 | 0.83 | 50-100ms | 2GB |
| LlamaGuard 3 8B | 0.87 | 0.86 | 0.88 | 200-400ms | 16GB |
| ShieldGemma 2B | 0.81 | 0.82 | 0.80 | 30-60ms | 4GB |
| ShieldGemma 9B | 0.84 | 0.85 | 0.83 | 80-150ms | 18GB |
| ShieldGemma 27B | 0.88 | 0.89 | 0.87 | 200-400ms | 54GB |
| toxic-bert | 0.85 | 0.83 | 0.87 | 15-30ms | 0.4GB |
| Detoxify (original) | 0.89 | 0.87 | 0.91 | 100-200ms | 1.3GB |
Source: Benchmarks derived from official model releases and community evaluations (2024-2025).
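To compare options programmatically, a minimal sketch like the one below encodes the table as plain data and shortlists classifiers that fit a latency and memory budget. The BENCHMARKS dict and shortlist helper are illustrative names (not part of any library), and latencies use the midpoint of each GPU range.
# Benchmark figures copied from the table above (midpoint of the GPU latency range)
BENCHMARKS = {
    "LlamaGuard 3 1B": {"f1": 0.81, "latency_ms": 75,  "memory_gb": 2},
    "LlamaGuard 3 8B": {"f1": 0.87, "latency_ms": 300, "memory_gb": 16},
    "ShieldGemma 2B":  {"f1": 0.81, "latency_ms": 45,  "memory_gb": 4},
    "ShieldGemma 9B":  {"f1": 0.84, "latency_ms": 115, "memory_gb": 18},
    "ShieldGemma 27B": {"f1": 0.88, "latency_ms": 300, "memory_gb": 54},
    "toxic-bert":      {"f1": 0.85, "latency_ms": 22,  "memory_gb": 0.4},
    "Detoxify":        {"f1": 0.89, "latency_ms": 150, "memory_gb": 1.3},
}

def shortlist(max_latency_ms: float, max_memory_gb: float) -> list:
    """Return classifiers that fit the budget, best F1 first."""
    candidates = [
        name for name, stats in BENCHMARKS.items()
        if stats["latency_ms"] <= max_latency_ms and stats["memory_gb"] <= max_memory_gb
    ]
    return sorted(candidates, key=lambda name: BENCHMARKS[name]["f1"], reverse=True)

# Example: what fits on a small GPU with a 100 ms budget?
print(shortlist(max_latency_ms=100, max_memory_gb=8))
# ['toxic-bert', 'LlamaGuard 3 1B', 'ShieldGemma 2B']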
Performance by Harm Category
Different classifiers excel at different harm types:
CATEGORY_PERFORMANCE = {
"hate_speech": {
"best": "ShieldGemma 27B",
"runner_up": "LlamaGuard 3 8B",
"notes": "ShieldGemma trained on broader hate speech data"
},
"violence": {
"best": "LlamaGuard 3 8B",
"runner_up": "ShieldGemma 27B",
"notes": "LlamaGuard has granular violence categories"
},
"sexual_content": {
"best": "LlamaGuard 3 8B",
"runner_up": "ShieldGemma 9B",
"notes": "LlamaGuard distinguishes explicit vs suggestive"
},
"self_harm": {
"best": "Detoxify",
"runner_up": "LlamaGuard 3 8B",
"notes": "Detoxify trained on suicide prevention data"
},
"prompt_injection": {
"best": "Custom classifier",
"runner_up": "LlamaGuard 3 8B",
"notes": "Generic classifiers miss encoded attacks"
}
}
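As a small usage sketch (the recommend_for_category helper is illustrative, not a library function), this mapping can back a simple lookup with a general-purpose fallback for categories it does not cover:
def recommend_for_category(category: str, default: str = "LlamaGuard 3 8B") -> str:
    """Return the best-performing classifier for a harm category, or a default."""
    entry = CATEGORY_PERFORMANCE.get(category)
    return entry["best"] if entry else default

print(recommend_for_category("self_harm"))        # Detoxify
print(recommend_for_category("financial_fraud"))  # LlamaGuard 3 8B (fallback)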
Latency vs Accuracy Trade-off
import matplotlib.pyplot as plt
def plot_tradeoff():
"""Visualize the latency vs accuracy trade-off."""
classifiers = [
("toxic-bert", 22, 0.85),
("ShieldGemma 2B", 45, 0.81),
("LlamaGuard 3 1B", 75, 0.81),
("ShieldGemma 9B", 115, 0.84),
("Detoxify", 150, 0.89),
("LlamaGuard 3 8B", 300, 0.87),
("ShieldGemma 27B", 300, 0.88),
]
names, latencies, accuracies = zip(*classifiers)
plt.figure(figsize=(10, 6))
plt.scatter(latencies, accuracies, s=100)
for name, lat, acc in classifiers:
plt.annotate(name, (lat, acc), textcoords="offset points",
xytext=(5, 5), fontsize=9)
plt.xlabel("Latency (ms)")
plt.ylabel("F1 Score")
plt.title("Safety Classifier Trade-offs")
plt.grid(True, alpha=0.3)
    # Highlight the "sweet spot": F1 of at least 0.85 and latency under 100 ms
    plt.axhline(y=0.85, color='g', linestyle='--', alpha=0.5, label='F1 = 0.85')
    plt.axvline(x=100, color='r', linestyle='--', alpha=0.5, label='100 ms latency')
plt.legend()
return plt
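To render the chart, call the function and save or display the result; this is standard matplotlib usage, and the output filename is arbitrary:
chart = plot_tradeoff()  # returns the pyplot module with the figure still active
chart.savefig("classifier_tradeoffs.png", dpi=150, bbox_inches="tight")
chart.show()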
Tiered Classification Strategy
Based on benchmarks, implement a tiered approach:
from enum import Enum
from dataclasses import dataclass
from typing import Optional
import time
class ClassifierTier(Enum):
FAST = "fast" # < 50ms
BALANCED = "balanced" # 50-200ms
THOROUGH = "thorough" # > 200ms
@dataclass
class TieredResult:
decision: str
confidence: float
tier_used: ClassifierTier
latency_ms: float
escalated: bool = False
class TieredClassifier:
"""Production tiered classification system."""
def __init__(self):
# Lazy loading for memory efficiency
self._fast = None
self._balanced = None
self._thorough = None
@property
def fast(self):
if self._fast is None:
from transformers import pipeline
self._fast = pipeline(
"text-classification",
model="martin-ha/toxic-comment-model"
)
return self._fast
    @property
    def balanced(self):
        if self._balanced is None:
            # Project-specific ShieldGemma wrapper (not a library class), assumed defined earlier
            self._balanced = ShieldGemmaClassifier(model_size="9b")
        return self._balanced
    @property
    def thorough(self):
        if self._thorough is None:
            # Project-specific LlamaGuard 3 wrapper (not a library class), assumed defined earlier
            self._thorough = LlamaGuard3Classifier(model_size="8B")
        return self._thorough
def classify(
self,
content: str,
confidence_threshold: float = 0.85
) -> TieredResult:
"""
Classify with automatic tier escalation.
Strategy:
1. Fast tier first (toxic-bert)
2. If uncertain (0.15 < score < 0.85), escalate to balanced
3. If still uncertain, escalate to thorough
"""
start = time.time()
# Tier 1: Fast
fast_result = self.fast(content)[0]
fast_score = fast_result["score"]
fast_label = fast_result["label"]
        # The pipeline score is the confidence of the predicted label, so a single
        # threshold check covers both the toxic and non-toxic cases
        if fast_score > confidence_threshold:
            return TieredResult(
                decision="unsafe" if fast_label == "toxic" else "safe",
                confidence=fast_score,
                tier_used=ClassifierTier.FAST,
                latency_ms=(time.time() - start) * 1000
            )
# Tier 2: Balanced (ShieldGemma)
balanced_result = self.balanced.classify_all(content)
max_harmful = max(
r["probability"] for r in balanced_result.values()
)
if max_harmful > confidence_threshold or max_harmful < (1 - confidence_threshold):
return TieredResult(
decision="unsafe" if max_harmful > 0.5 else "safe",
confidence=max_harmful if max_harmful > 0.5 else 1 - max_harmful,
tier_used=ClassifierTier.BALANCED,
latency_ms=(time.time() - start) * 1000,
escalated=True
)
# Tier 3: Thorough (LlamaGuard 8B)
decision, categories = self.thorough.classify(content)
return TieredResult(
decision=decision,
confidence=0.9, # LlamaGuard doesn't return confidence
tier_used=ClassifierTier.THOROUGH,
latency_ms=(time.time() - start) * 1000,
escalated=True
)
# Usage
classifier = TieredClassifier()
result = classifier.classify("How do I learn Python programming?")
print(f"Decision: {result.decision}, Tier: {result.tier_used.value}")
Choosing the Right Classifier
| Scenario | Recommended Classifier | Reason |
|---|---|---|
| High volume API | toxic-bert + LlamaGuard 3 1B | Speed is critical; escalate only uncertain cases |
| Moderation queue | LlamaGuard 3 8B | Accuracy matters most ahead of human review |
| Real-time chat | ShieldGemma 2B | Low latency with probability scores for threshold tuning |
| Enterprise compliance | LlamaGuard 3 8B | Granular category logging |
| Edge/mobile | ShieldGemma 2B | Small footprint with competitive accuracy |
| Maximum accuracy | ShieldGemma 27B | Best benchmark scores |
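If the classifier is chosen from configuration at deploy time, a thin lookup keyed on these scenarios keeps the decision in one place. The SCENARIO_DEFAULTS name and SAFETY_SCENARIO environment variable below are illustrative conventions, not an established API:
import os

# Defaults taken from the scenario table above
SCENARIO_DEFAULTS = {
    "high_volume_api": "toxic-bert + LlamaGuard 3 1B (tiered)",
    "moderation_queue": "LlamaGuard 3 8B",
    "realtime_chat": "ShieldGemma 2B",
    "enterprise_compliance": "LlamaGuard 3 8B",
    "edge_mobile": "ShieldGemma 2B",
    "max_accuracy": "ShieldGemma 27B",
}

def classifier_for_deployment(scenario: str = "realtime_chat") -> str:
    """Pick a classifier name for a deployment scenario; an env var can override it."""
    scenario = os.environ.get("SAFETY_SCENARIO", scenario)
    return SCENARIO_DEFAULTS.get(scenario, "LlamaGuard 3 8B")

print(classifier_for_deployment("moderation_queue"))  # LlamaGuard 3 8B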
Cost Analysis
CLASSIFIER_COSTS = {
    # Approximate on-demand GPU hosting assumptions: hourly_cost in USD,
    # throughput_per_hour in requests, cost_per_1k in USD per 1,000 requests
"toxic-bert": {
"gpu": "T4",
"hourly_cost": 0.35,
"throughput_per_hour": 120000,
"cost_per_1k": 0.003
},
"LlamaGuard 3 1B": {
"gpu": "T4",
"hourly_cost": 0.35,
"throughput_per_hour": 36000,
"cost_per_1k": 0.01
},
"LlamaGuard 3 8B": {
"gpu": "A10G",
"hourly_cost": 1.00,
"throughput_per_hour": 9000,
"cost_per_1k": 0.11
},
"ShieldGemma 27B": {
"gpu": "A100",
"hourly_cost": 3.00,
"throughput_per_hour": 9000,
"cost_per_1k": 0.33
}
}
def calculate_monthly_cost(
daily_requests: int,
classifier: str,
escalation_rate: float = 0.15
) -> dict:
"""Calculate monthly classification costs with tiered approach."""
costs = CLASSIFIER_COSTS[classifier]
monthly_requests = daily_requests * 30
base_cost = (monthly_requests / 1000) * costs["cost_per_1k"]
    # Escalated requests are re-classified by the thorough tier (LlamaGuard 3 8B)
    escalation_cost = 0.0
    if escalation_rate > 0:
        escalated = monthly_requests * escalation_rate
        escalation_cost = (escalated / 1000) * CLASSIFIER_COSTS["LlamaGuard 3 8B"]["cost_per_1k"]
    return {
        "base_cost": base_cost,
        "escalation_cost": escalation_cost,
        "total_monthly": base_cost + escalation_cost
    }
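As a worked example with hypothetical traffic of 100,000 requests per day, toxic-bert as the base tier, and roughly 30% of requests escalating to LlamaGuard 3 8B (all unit costs from CLASSIFIER_COSTS above):
tiered = calculate_monthly_cost(
    daily_requests=100_000,
    classifier="toxic-bert",
    escalation_rate=0.30,
)
# base_cost:       3,000,000 requests / 1,000 * $0.003 = $9
# escalation_cost:   900,000 requests / 1,000 * $0.11  = $99
# total_monthly:   ~$108

# Baseline: run LlamaGuard 3 8B on every request
baseline = calculate_monthly_cost(
    daily_requests=100_000,
    classifier="LlamaGuard 3 8B",
    escalation_rate=0.0,
)
# total_monthly: 3,000,000 / 1,000 * $0.11 = $330

savings = 1 - tiered["total_monthly"] / baseline["total_monthly"]
print(f"Tiered approach saves about {savings:.0%}")  # ~67%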
Production Insight: The tiered approach typically reduces costs by 60-70% compared to running the most accurate classifier on every request, while maintaining 95%+ of the accuracy.
Next: Customizing safety taxonomies for your specific use case.