Input Filtering at Scale
Fast Toxicity Classifiers
Fast toxicity detection is essential for the first layer of input filtering. This lesson covers implementing lightweight classifiers that can process inputs in under 30ms while maintaining acceptable accuracy.
Toxicity Classification Models
| Model | Size | Latency (CPU) | Accuracy | Use Case |
|---|---|---|---|---|
| toxic-bert (distilled) | 66MB | 15-30ms | ~85% | Fast first pass |
| unitary/toxic-bert | 420MB | 50-100ms | ~90% | Balanced |
| HateBERT | 420MB | 50-100ms | ~88% | Hate speech focus |
| detoxify | 1.3GB | 100-200ms | ~93% | High accuracy |
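A simple way to act on the table above is to choose the checkpoint from a configured latency budget. The sketch below is illustrative: the model IDs come from the table and code in this lesson, and the latency figures are assumptions that should be re-benchmarked on your own hardware.

```python
# Hypothetical model registry: (expected CPU latency in ms, Hugging Face model ID).
MODEL_TIERS = [
    (30, "martin-ha/toxic-comment-model"),  # distilled, fast first pass
    (100, "unitary/toxic-bert"),            # balanced accuracy/latency
]

def pick_model(latency_budget_ms: int) -> str:
    """Return the heaviest model whose expected latency fits the budget."""
    chosen = MODEL_TIERS[0][1]
    for expected_latency_ms, model_name in MODEL_TIERS:
        if expected_latency_ms <= latency_budget_ms:
            chosen = model_name
    return chosen

print(pick_model(50))   # fast first-pass model under a 50ms budget
print(pick_model(150))  # the larger checkpoint fits a 150ms budget
```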
Implementing a DistilBERT Toxicity Classifier
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from typing import Dict, List
import torch
class FastToxicityClassifier:
"""Lightweight toxicity classifier for production input filtering."""
def __init__(self, model_name: str = "martin-ha/toxic-comment-model"):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
self.model.to(self.device)
self.model.eval()
@torch.no_grad()
def classify(self, text: str) -> Dict[str, float]:
"""Classify single text for toxicity."""
inputs = self.tokenizer(
text,
return_tensors="pt",
truncation=True,
max_length=512,
padding=True
).to(self.device)
outputs = self.model(**inputs)
probs = torch.softmax(outputs.logits, dim=-1)
return {
"toxic": float(probs[0][1]),
"non_toxic": float(probs[0][0])
}
@torch.no_grad()
def batch_classify(self, texts: List[str], batch_size: int = 32) -> List[Dict]:
"""Batch classification for throughput."""
results = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
inputs = self.tokenizer(
batch,
return_tensors="pt",
truncation=True,
max_length=512,
padding=True
).to(self.device)
outputs = self.model(**inputs)
probs = torch.softmax(outputs.logits, dim=-1)
for j in range(len(batch)):
results.append({
"toxic": float(probs[j][1]),
"non_toxic": float(probs[j][0])
})
return results
# Usage
classifier = FastToxicityClassifier()
result = classifier.classify("This is a test message")
print(f"Toxicity score: {result['toxic']:.2%}")
Multi-Category Toxicity with Detoxify
from typing import Dict
from detoxify import Detoxify
class DetailedToxicityClassifier:
"""Multi-category toxicity detection."""
def __init__(self, model_type: str = "original"):
# model_type: "original", "unbiased", "multilingual"
self.model = Detoxify(model_type)
self.categories = [
"toxicity", "severe_toxicity", "obscene",
"threat", "insult", "identity_attack"
]
def classify(self, text: str) -> Dict[str, float]:
"""Get scores for all toxicity categories."""
return self.model.predict(text)
def check(self, text: str, thresholds: Dict[str, float] = None) -> bool:
"""Check if text exceeds any threshold."""
default_thresholds = {
"toxicity": 0.8,
"severe_toxicity": 0.5,
"threat": 0.5,
"identity_attack": 0.5,
}
thresholds = thresholds or default_thresholds
scores = self.classify(text)
for category, threshold in thresholds.items():
if scores.get(category, 0) > threshold:
return True # Toxic
return False # Safe
# Usage
classifier = DetailedToxicityClassifier()
scores = classifier.classify("Sample text to analyze")
# Returns: {'toxicity': 0.02, 'severe_toxicity': 0.001, ...}
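Detoxify's `predict` also accepts a list of texts and returns one list of scores per category, which is convenient for offline scoring or batched traffic. A short sketch (output layout assumed from the library's documented list-input behavior):

```python
from detoxify import Detoxify

# Batch scoring with Detoxify: passing a list returns {category: [score, ...]}.
texts = [
    "Sample text to analyze",
    "Another message from the moderation queue",
]
batch_scores = Detoxify("original").predict(texts)

for i, text in enumerate(texts):
    toxicity = float(batch_scores["toxicity"][i])
    print(f"{text[:40]!r}: toxicity={toxicity:.3f}")
```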
ONNX Optimization for Production
Converting the model to ONNX Runtime typically yields a 2-3x speedup on CPU:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification
import numpy as np
class ONNXToxicityClassifier:
"""ONNX-optimized toxicity classifier for production."""
def __init__(self, model_name: str = "martin-ha/toxic-comment-model"):
# Export and load as ONNX
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = ORTModelForSequenceClassification.from_pretrained(
model_name,
export=True # Auto-export to ONNX
)
def classify(self, text: str) -> float:
"""Ultra-fast classification with ONNX."""
inputs = self.tokenizer(
text,
return_tensors="np",
truncation=True,
max_length=512
)
outputs = self.model(**inputs)
probs = self._softmax(outputs.logits[0])
return float(probs[1]) # Toxic probability
def _softmax(self, x):
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
# Benchmark comparison
import time
def benchmark_classifiers():
text = "This is a test message for toxicity classification"
# Standard PyTorch
torch_classifier = FastToxicityClassifier()
start = time.time()
for _ in range(100):
torch_classifier.classify(text)
torch_time = (time.time() - start) / 100 * 1000
# ONNX
onnx_classifier = ONNXToxicityClassifier()
start = time.time()
for _ in range(100):
onnx_classifier.classify(text)
onnx_time = (time.time() - start) / 100 * 1000
print(f"PyTorch: {torch_time:.2f}ms per inference")
print(f"ONNX: {onnx_time:.2f}ms per inference")
print(f"Speedup: {torch_time/onnx_time:.2f}x")
Integration with Guardrails Pipeline
from dataclasses import dataclass
from enum import Enum
from typing import Dict
class ToxicityDecision(Enum):
SAFE = "safe"
FLAGGED = "flagged"
BLOCKED = "blocked"
@dataclass
class ToxicityResult:
decision: ToxicityDecision
score: float
categories: Dict[str, float]
async def toxicity_filter_layer(
user_input: str,
block_threshold: float = 0.9,
flag_threshold: float = 0.5
) -> ToxicityResult:
"""Production toxicity filter layer."""
    # In production, construct the classifier once at startup and reuse it;
    # reloading the model on every request would dominate the latency budget.
    classifier = FastToxicityClassifier()
# Fast classification
result = classifier.classify(user_input)
toxic_score = result["toxic"]
# Decision logic
if toxic_score > block_threshold:
return ToxicityResult(
decision=ToxicityDecision.BLOCKED,
score=toxic_score,
categories=result
)
elif toxic_score > flag_threshold:
return ToxicityResult(
decision=ToxicityDecision.FLAGGED,
score=toxic_score,
categories=result
)
return ToxicityResult(
decision=ToxicityDecision.SAFE,
score=toxic_score,
categories=result
)
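A quick usage example of the filter layer inside an async request handler (the handler itself is hypothetical; only `toxicity_filter_layer`, `ToxicityDecision`, and `ToxicityResult` come from the code above):

```python
import asyncio

async def handle_request(user_input: str) -> str:
    # Hypothetical handler wrapping the filter layer defined above.
    result = await toxicity_filter_layer(user_input)
    if result.decision == ToxicityDecision.BLOCKED:
        return "Your message was blocked by our content policy."
    if result.decision == ToxicityDecision.FLAGGED:
        # e.g. log for human review, then continue with stricter downstream checks
        pass
    return f"Forwarding to the model (toxicity score: {result.score:.2%})"

print(asyncio.run(handle_request("This is a test message")))
```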
Production Tip: Use ONNX-optimized models for CPU deployment. For GPU, batch multiple requests together for higher throughput. Target < 30ms latency for the first-pass toxicity check.
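To apply the GPU batching advice, a small request micro-batcher can sit in front of `batch_classify`, collecting requests for a few milliseconds and scoring them together. The sketch below is a hedged illustration under asyncio assumptions (the class name, defaults, and wiring are not from this lesson):

```python
import asyncio
from typing import Dict, Tuple

class MicroBatcher:
    """Hypothetical micro-batcher: queue requests briefly, then score them
    together with batch_classify so the GPU sees larger batches."""

    def __init__(self, classifier: FastToxicityClassifier,
                 max_batch_size: int = 32, max_wait_ms: float = 5.0):
        self.classifier = classifier
        self.max_batch_size = max_batch_size
        self.max_wait_s = max_wait_ms / 1000
        self.queue: "asyncio.Queue[Tuple[str, asyncio.Future]]" = asyncio.Queue()

    async def classify(self, text: str) -> Dict[str, float]:
        """Enqueue one text and await its score from the next batch."""
        future = asyncio.get_running_loop().create_future()
        await self.queue.put((text, future))
        return await future

    async def run_forever(self) -> None:
        """Background task: drain the queue into batches and score them."""
        while True:
            text, future = await self.queue.get()
            batch = [(text, future)]
            loop = asyncio.get_running_loop()
            deadline = loop.time() + self.max_wait_s
            while len(batch) < self.max_batch_size and loop.time() < deadline:
                try:
                    item = await asyncio.wait_for(
                        self.queue.get(), timeout=deadline - loop.time())
                    batch.append(item)
                except asyncio.TimeoutError:
                    break
            texts = [t for t, _ in batch]
            # batch_classify blocks, so keep it off the event loop thread.
            results = await asyncio.to_thread(self.classifier.batch_classify, texts)
            for (_, fut), result in zip(batch, results):
                fut.set_result(result)
```

At startup you would create one MicroBatcher around a shared classifier and launch `run_forever()` as a background task; request handlers then simply `await batcher.classify(text)`.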
Next: Building custom input validators for domain-specific requirements.