Safety Classifiers Deep Dive

Classifier Benchmarks and Comparison

Choosing the right safety classifier requires understanding real-world performance trade-offs. This lesson provides benchmark comparisons to guide production decisions.

Benchmark Overview

| Classifier | F1 Score | Precision | Recall | Latency (GPU) | Memory |
|---|---|---|---|---|---|
| LlamaGuard 3 1B | 0.81 | 0.79 | 0.83 | 50-100ms | 2GB |
| LlamaGuard 3 8B | 0.87 | 0.86 | 0.88 | 200-400ms | 16GB |
| ShieldGemma 2B | 0.81 | 0.82 | 0.80 | 30-60ms | 4GB |
| ShieldGemma 9B | 0.84 | 0.85 | 0.83 | 80-150ms | 18GB |
| ShieldGemma 27B | 0.88 | 0.89 | 0.87 | 200-400ms | 54GB |
| toxic-bert | 0.85 | 0.83 | 0.87 | 15-30ms | 0.4GB |
| Detoxify (original) | 0.89 | 0.87 | 0.91 | 100-200ms | 1.3GB |

Source: Benchmarks derived from official model releases and community evaluations (2024-2025).

Performance by Harm Category

Different classifiers excel at different harm types:

CATEGORY_PERFORMANCE = {
    "hate_speech": {
        "best": "ShieldGemma 27B",
        "runner_up": "LlamaGuard 3 8B",
        "notes": "ShieldGemma trained on broader hate speech data"
    },
    "violence": {
        "best": "LlamaGuard 3 8B",
        "runner_up": "ShieldGemma 27B",
        "notes": "LlamaGuard has granular violence categories"
    },
    "sexual_content": {
        "best": "LlamaGuard 3 8B",
        "runner_up": "ShieldGemma 9B",
        "notes": "LlamaGuard distinguishes explicit vs suggestive"
    },
    "self_harm": {
        "best": "Detoxify",
        "runner_up": "LlamaGuard 3 8B",
        "notes": "Detoxify trained on suicide prevention data"
    },
    "prompt_injection": {
        "best": "Custom classifier",
        "runner_up": "LlamaGuard 3 8B",
        "notes": "Generic classifiers miss encoded attacks"
    }
}
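
When a router already knows which harm type it cares about, this mapping can drive model selection directly. A minimal sketch, assuming a routing setup like the one above; best_classifier_for and its fallback default are illustrative, not part of any library:

def best_classifier_for(category: str, default: str = "LlamaGuard 3 8B") -> str:
    """Look up the top-ranked classifier for a harm category."""
    entry = CATEGORY_PERFORMANCE.get(category)
    # Fall back to a strong general-purpose model for unmapped categories
    # (the choice of default is an assumption, not a benchmark result)
    return entry["best"] if entry else default

print(best_classifier_for("self_harm"))        # Detoxify
print(best_classifier_for("financial_fraud"))  # LlamaGuard 3 8B (fallback)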

Latency vs Accuracy Trade-off

import matplotlib.pyplot as plt

def plot_tradeoff():
    """Visualize the latency vs accuracy (F1) trade-off for the benchmarked classifiers."""
    # (name, latency midpoint in ms, F1 score) from the benchmark table above
    classifiers = [
        ("toxic-bert", 22, 0.85),
        ("ShieldGemma 2B", 45, 0.81),
        ("LlamaGuard 3 1B", 75, 0.81),
        ("ShieldGemma 9B", 115, 0.84),
        ("Detoxify", 150, 0.89),
        ("LlamaGuard 3 8B", 300, 0.87),
        ("ShieldGemma 27B", 300, 0.88),
    ]

    names, latencies, f1_scores = zip(*classifiers)

    plt.figure(figsize=(10, 6))
    plt.scatter(latencies, f1_scores, s=100)

    for name, lat, acc in classifiers:
        plt.annotate(name, (lat, acc), textcoords="offset points",
                     xytext=(5, 5), fontsize=9)

    plt.xlabel("Latency (ms)")
    plt.ylabel("F1 Score")
    plt.title("Safety Classifier Trade-offs")
    plt.grid(True, alpha=0.3)

    # Highlight optimal zone
    plt.axhline(y=0.85, color='g', linestyle='--', alpha=0.5, label='Good accuracy')
    plt.axvline(x=100, color='r', linestyle='--', alpha=0.5, label='Fast threshold')

    plt.legend()
    return plt
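
Since the function returns the matplotlib module, displaying or saving the chart is one more line:

plot_tradeoff().show()  # or plot_tradeoff().savefig("tradeoff.png") on a headless server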

Tiered Classification Strategy

Based on benchmarks, implement a tiered approach:

from enum import Enum
from dataclasses import dataclass
from typing import Optional
import time

class ClassifierTier(Enum):
    FAST = "fast"        # < 50ms
    BALANCED = "balanced" # 50-200ms
    THOROUGH = "thorough" # > 200ms

@dataclass
class TieredResult:
    decision: str
    confidence: float
    tier_used: ClassifierTier
    latency_ms: float
    escalated: bool = False

class TieredClassifier:
    """Production tiered classification system."""

    def __init__(self):
        # Lazy loading for memory efficiency
        self._fast = None
        self._balanced = None
        self._thorough = None

    @property
    def fast(self):
        if self._fast is None:
            from transformers import pipeline
            self._fast = pipeline(
                "text-classification",
                model="martin-ha/toxic-comment-model"
            )
        return self._fast

    @property
    def balanced(self):
        if self._balanced is None:
            # Assumes the ShieldGemmaClassifier wrapper defined earlier in this module
            self._balanced = ShieldGemmaClassifier(model_size="9b")
        return self._balanced

    @property
    def thorough(self):
        if self._thorough is None:
            # Assumes the LlamaGuard3Classifier wrapper defined earlier in this module
            self._thorough = LlamaGuard3Classifier(model_size="8B")
        return self._thorough

    def classify(
        self,
        content: str,
        confidence_threshold: float = 0.85
    ) -> TieredResult:
        """
        Classify with automatic tier escalation.

        Strategy:
        1. Fast tier first (toxic-bert)
        2. If uncertain (0.15 < score < 0.85), escalate to balanced
        3. If still uncertain, escalate to thorough
        """
        start = time.time()

        # Tier 1: Fast
        fast_result = self.fast(content)[0]
        fast_score = fast_result["score"]
        fast_label = fast_result["label"]

        # High confidence either way (toxic or non-toxic) - return without escalating
        if fast_score > confidence_threshold:
            return TieredResult(
                decision="unsafe" if fast_label == "toxic" else "safe",
                confidence=fast_score,
                tier_used=ClassifierTier.FAST,
                latency_ms=(time.time() - start) * 1000
            )

        # Tier 2: Balanced (ShieldGemma)
        balanced_result = self.balanced.classify_all(content)
        max_harmful = max(
            r["probability"] for r in balanced_result.values()
        )

        # Confident either way (clearly harmful or clearly benign) - return here
        if max_harmful > confidence_threshold or max_harmful < (1 - confidence_threshold):
            return TieredResult(
                decision="unsafe" if max_harmful > 0.5 else "safe",
                confidence=max_harmful if max_harmful > 0.5 else 1 - max_harmful,
                tier_used=ClassifierTier.BALANCED,
                latency_ms=(time.time() - start) * 1000,
                escalated=True
            )

        # Tier 3: Thorough (LlamaGuard 8B)
        decision, categories = self.thorough.classify(content)

        return TieredResult(
            decision=decision,
            confidence=0.9,  # LlamaGuard doesn't return confidence
            tier_used=ClassifierTier.THOROUGH,
            latency_ms=(time.time() - start) * 1000,
            escalated=True
        )

# Usage
classifier = TieredClassifier()
result = classifier.classify("How do I learn Python programming?")
print(f"Decision: {result.decision}, Tier: {result.tier_used.value}")
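
Because the heavier models are lazily loaded, a deployment whose traffic rarely escalates pays only the fast tier's memory footprint (roughly 0.4GB for toxic-bert, per the benchmark table) until the first uncertain request triggers a one-time load of the larger model.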

Choosing the Right Classifier

| Scenario | Recommended Classifier | Reason |
|---|---|---|
| High-volume API | toxic-bert + LlamaGuard 3 1B | Speed critical; escalate uncertain cases |
| Moderation queue | LlamaGuard 3 8B | Accuracy for human review |
| Real-time chat | ShieldGemma 2B | Low latency, good probability output |
| Enterprise compliance | LlamaGuard 3 8B | Granular category logging |
| Edge/mobile | ShieldGemma 2B | Smallest accurate model |
| Maximum accuracy | ShieldGemma 27B | Best benchmark scores |

Cost Analysis

CLASSIFIER_COSTS = {
    # Assuming GPU hosting costs
    "toxic-bert": {
        "gpu": "T4",
        "hourly_cost": 0.35,
        "throughput_per_hour": 120000,
        "cost_per_1k": 0.003
    },
    "LlamaGuard 3 1B": {
        "gpu": "T4",
        "hourly_cost": 0.35,
        "throughput_per_hour": 36000,
        "cost_per_1k": 0.01
    },
    "LlamaGuard 3 8B": {
        "gpu": "A10G",
        "hourly_cost": 1.00,
        "throughput_per_hour": 9000,
        "cost_per_1k": 0.11
    },
    "ShieldGemma 27B": {
        "gpu": "A100",
        "hourly_cost": 3.00,
        "throughput_per_hour": 9000,
        "cost_per_1k": 0.33
    }
}

def calculate_monthly_cost(
    daily_requests: int,
    classifier: str,
    escalation_rate: float = 0.15
) -> dict:
    """Calculate monthly classification costs with tiered approach."""
    costs = CLASSIFIER_COSTS[classifier]
    monthly_requests = daily_requests * 30

    base_cost = (monthly_requests / 1000) * costs["cost_per_1k"]

    # Escalated traffic is re-checked by the thorough tier (LlamaGuard 3 8B)
    escalation_cost = 0.0
    if escalation_rate > 0:
        escalated = monthly_requests * escalation_rate
        escalation_cost = (escalated / 1000) * CLASSIFIER_COSTS["LlamaGuard 3 8B"]["cost_per_1k"]

    return {
        "base_cost": base_cost,
        "escalation_cost": escalation_cost,
        "total_monthly": base_cost + escalation_cost
    }
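
As a worked example, using the illustrative cost figures above (actual rates vary by cloud provider): a service handling 1M requests/day with toxic-bert as the base tier and 15% escalation to LlamaGuard 3 8B:

costs = calculate_monthly_cost(daily_requests=1_000_000, classifier="toxic-bert")
print(costs)  # base ≈ $90, escalation ≈ $495, total ≈ $585/month

# Running LlamaGuard 3 8B on every request instead:
all_thorough = calculate_monthly_cost(1_000_000, "LlamaGuard 3 8B", escalation_rate=0)
print(all_thorough["total_monthly"])  # ≈ $3,300/month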

Production Insight: The tiered approach typically reduces costs by 60-70% compared to running the most accurate classifier on every request, while maintaining 95%+ of the accuracy.

Next: Customizing safety taxonomies for your specific use case.
