Input Filtering at Scale

Fast Toxicity Classifiers


Fast toxicity detection is essential for the first layer of input filtering. This lesson covers implementing lightweight classifiers that can process inputs in under 30ms while maintaining acceptable accuracy.

Toxicity Classification Models

| Model | Size | Latency (CPU) | Accuracy | Use Case |
|---|---|---|---|---|
| toxic-bert (distilled) | 66MB | 15-30ms | ~85% | Fast first pass |
| unitary/toxic-bert | 420MB | 50-100ms | ~90% | Balanced |
| HateBERT | 420MB | 50-100ms | ~88% | Hate speech focus |
| detoxify | 1.3GB | 100-200ms | ~93% | High accuracy |

Implementing DistilBERT Toxicity

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from typing import Dict, List
import torch

class FastToxicityClassifier:
    """Lightweight toxicity classifier for production input filtering."""

    def __init__(self, model_name: str = "martin-ha/toxic-comment-model"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    @torch.no_grad()
    def classify(self, text: str) -> Dict[str, float]:
        """Classify single text for toxicity."""
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(self.device)

        outputs = self.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)

        # Index 1 is assumed to be the "toxic" class for this model; verify the
        # mapping via self.model.config.id2label before relying on it.
        return {
            "toxic": float(probs[0][1]),
            "non_toxic": float(probs[0][0])
        }

    @torch.no_grad()
    def batch_classify(self, texts: List[str], batch_size: int = 32) -> List[Dict]:
        """Batch classification for throughput."""
        results = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            inputs = self.tokenizer(
                batch,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(self.device)

            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)

            for j in range(len(batch)):
                results.append({
                    "toxic": float(probs[j][1]),
                    "non_toxic": float(probs[j][0])
                })

        return results

# Usage
classifier = FastToxicityClassifier()
result = classifier.classify("This is a test message")
print(f"Toxicity score: {result['toxic']:.2%}")

Multi-Category Toxicity with Detoxify

from typing import Dict, Optional

from detoxify import Detoxify

class DetailedToxicityClassifier:
    """Multi-category toxicity detection."""

    def __init__(self, model_type: str = "original"):
        # model_type: "original", "unbiased", "multilingual"
        self.model = Detoxify(model_type)
        self.categories = [
            "toxicity", "severe_toxicity", "obscene",
            "threat", "insult", "identity_attack"
        ]

    def classify(self, text: str) -> Dict[str, float]:
        """Get scores for all toxicity categories."""
        return self.model.predict(text)

    def check(self, text: str, thresholds: Optional[Dict[str, float]] = None) -> bool:
        """Check if text exceeds any threshold."""
        default_thresholds = {
            "toxicity": 0.8,
            "severe_toxicity": 0.5,
            "threat": 0.5,
            "identity_attack": 0.5,
        }
        thresholds = thresholds or default_thresholds

        scores = self.classify(text)
        for category, threshold in thresholds.items():
            if scores.get(category, 0) > threshold:
                return True  # Toxic
        return False  # Safe

# Usage
classifier = DetailedToxicityClassifier()
scores = classifier.classify("Sample text to analyze")
# Returns: {'toxicity': 0.02, 'severe_toxicity': 0.001, ...}
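
The threshold-based gate can be exercised directly as well; the override values below are purely illustrative, not tuned recommendations:

blocked = classifier.check(
    "Sample text to analyze",
    thresholds={"toxicity": 0.7, "threat": 0.4}  # illustrative overrides
)
print("blocked" if blocked else "allowed")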

ONNX Optimization for Production

Converting models to ONNX Runtime typically yields a 2-3x speedup for CPU inference:

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification
import numpy as np

class ONNXToxicityClassifier:
    """ONNX-optimized toxicity classifier for production."""

    def __init__(self, model_name: str = "martin-ha/toxic-comment-model"):
        # Export and load as ONNX
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = ORTModelForSequenceClassification.from_pretrained(
            model_name,
            export=True  # Auto-export to ONNX
        )

    def classify(self, text: str) -> float:
        """Ultra-fast classification with ONNX."""
        inputs = self.tokenizer(
            text,
            return_tensors="np",
            truncation=True,
            max_length=512
        )

        outputs = self.model(**inputs)
        # Depending on the optimum version, logits may come back as a torch
        # tensor rather than a numpy array; convert with .numpy() if needed.
        probs = self._softmax(outputs.logits[0])
        return float(probs[1])  # Toxic class probability

    def _softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

# Benchmark comparison
import time

def benchmark_classifiers():
    text = "This is a test message for toxicity classification"

    # Standard PyTorch
    torch_classifier = FastToxicityClassifier()
    start = time.time()
    for _ in range(100):
        torch_classifier.classify(text)
    torch_time = (time.time() - start) / 100 * 1000

    # ONNX
    onnx_classifier = ONNXToxicityClassifier()
    start = time.time()
    for _ in range(100):
        onnx_classifier.classify(text)
    onnx_time = (time.time() - start) / 100 * 1000

    print(f"PyTorch: {torch_time:.2f}ms per inference")
    print(f"ONNX: {onnx_time:.2f}ms per inference")
    print(f"Speedup: {torch_time/onnx_time:.2f}x")

Integration with Guardrails Pipeline

from dataclasses import dataclass
from enum import Enum
from typing import Dict

class ToxicityDecision(Enum):
    SAFE = "safe"
    FLAGGED = "flagged"
    BLOCKED = "blocked"

@dataclass
class ToxicityResult:
    decision: ToxicityDecision
    score: float
    categories: Dict[str, float]

# Load the classifier once at startup; constructing it inside the request
# path would dominate the latency budget with model loading.
toxicity_classifier = FastToxicityClassifier()

async def toxicity_filter_layer(
    user_input: str,
    block_threshold: float = 0.9,
    flag_threshold: float = 0.5
) -> ToxicityResult:
    """Production toxicity filter layer."""
    # Fast classification
    result = toxicity_classifier.classify(user_input)
    toxic_score = result["toxic"]

    # Decision logic
    if toxic_score > block_threshold:
        return ToxicityResult(
            decision=ToxicityDecision.BLOCKED,
            score=toxic_score,
            categories=result
        )
    elif toxic_score > flag_threshold:
        return ToxicityResult(
            decision=ToxicityDecision.FLAGGED,
            score=toxic_score,
            categories=result
        )

    return ToxicityResult(
        decision=ToxicityDecision.SAFE,
        score=toxic_score,
        categories=result
    )
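
A quick way to exercise the layer outside of a serving framework:

import asyncio

result = asyncio.run(toxicity_filter_layer("Sample user input"))
print(result.decision.value, f"score={result.score:.2f}")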

Production Tip: Use ONNX-optimized models for CPU deployment. For GPU, batch multiple requests together for higher throughput. Target < 30ms latency for the first-pass toxicity check.
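
A minimal sketch of the request-batching idea, built on the FastToxicityClassifier defined earlier. The helper names (queue_and_classify, _flush) and the 10ms window are illustrative, and a production scheduler would run the blocking inference off the event loop:

import asyncio
from typing import Dict, List, Tuple

_pending: List[Tuple[str, asyncio.Future]] = []
BATCH_WINDOW_S = 0.01  # collect requests for ~10ms before scoring

async def queue_and_classify(classifier, text: str) -> Dict[str, float]:
    """Queue a request and resolve it when the current micro-batch is scored."""
    loop = asyncio.get_running_loop()
    future: asyncio.Future = loop.create_future()
    _pending.append((text, future))
    if len(_pending) == 1:
        # First request in this window schedules the flush
        loop.call_later(BATCH_WINDOW_S, _flush, classifier)
    return await future

def _flush(classifier) -> None:
    """Score everything collected in the window with one batched forward pass."""
    batch, _pending[:] = _pending[:], []
    results = classifier.batch_classify([text for text, _ in batch])
    for (_, future), scores in zip(batch, results):
        future.set_result(scores)

Request handlers would then await queue_and_classify(toxicity_classifier, text) instead of calling classify directly.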

Next: Building custom input validators for domain-specific requirements.
