Building Input/Output Guardrails
Output Sanitization
3 min read
Even with perfect input validation, LLM outputs can contain harmful content. Output sanitization ensures responses are safe before reaching users or downstream systems.
Why Output Sanitization Matters
┌─────────────────────────────────────────────────────────────┐
│ Output Risk Scenarios │
│ │
│ 1. XSS: LLM generates <script>alert('xss')</script> │
│ 2. Data leak: Model outputs training data (PII) │
│ 3. Harmful content: Violence, hate speech │
│ 4. Hallucination: False information presented as fact │
│ 5. Instruction leak: System prompt exposed │
└─────────────────────────────────────────────────────────────┘
HTML/XSS Sanitization
When LLM output is rendered in web contexts:
import html
import re
from typing import Optional
def sanitize_for_html(llm_output: str) -> str:
"""Sanitize LLM output for safe HTML rendering."""
# Escape HTML special characters
sanitized = html.escape(llm_output)
# Remove any remaining script-like patterns
sanitized = re.sub(
r'javascript:',
'[removed]:',
sanitized,
flags=re.IGNORECASE
)
sanitized = re.sub(
r'on\w+\s*=',
'[removed]=',
sanitized,
flags=re.IGNORECASE
)
return sanitized
def sanitize_for_markdown(llm_output: str) -> str:
"""Sanitize for markdown rendering (allows formatting)."""
# Allow safe markdown but escape HTML
sanitized = html.escape(llm_output)
# Re-enable safe markdown patterns
# Bold: **text** or __text__
sanitized = re.sub(r'<b>(.+?)</b>', r'**\1**', sanitized)
return sanitized
# Example
raw_output = "Here's code: <script>steal_cookies()</script>"
safe_output = sanitize_for_html(raw_output)
print(safe_output)
# Output: Here's code: <script>steal_cookies()</script>
PII Detection and Redaction
import re
from dataclasses import dataclass
from typing import List, Tuple
@dataclass
class PIIMatch:
type: str
value: str
start: int
end: int
class PIIRedactor:
"""Detect and redact personally identifiable information."""
PATTERNS = {
"email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
"phone_us": r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
"ssn": r'\b\d{3}-\d{2}-\d{4}\b',
"credit_card": r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
"ip_address": r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
}
def detect(self, text: str) -> List[PIIMatch]:
"""Find all PII in text."""
matches = []
for pii_type, pattern in self.PATTERNS.items():
for match in re.finditer(pattern, text):
matches.append(PIIMatch(
type=pii_type,
value=match.group(),
start=match.start(),
end=match.end()
))
return matches
def redact(self, text: str) -> Tuple[str, List[PIIMatch]]:
"""Redact all PII from text."""
matches = self.detect(text)
redacted = text
# Sort by position (reverse) to maintain indices
for match in sorted(matches, key=lambda m: m.start, reverse=True):
placeholder = f"[REDACTED_{match.type.upper()}]"
redacted = redacted[:match.start] + placeholder + redacted[match.end:]
return redacted, matches
# Usage
redactor = PIIRedactor()
llm_output = "Contact John at john@example.com or 555-123-4567"
safe_output, detected = redactor.redact(llm_output)
print(safe_output)
# Output: Contact John at [REDACTED_EMAIL] or [REDACTED_PHONE_US]
Content Moderation
from enum import Enum
from typing import Dict, List
class ContentCategory(Enum):
SAFE = "safe"
VIOLENCE = "violence"
HATE_SPEECH = "hate_speech"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
DANGEROUS = "dangerous"
class OutputModerator:
"""Moderate LLM outputs for harmful content."""
def __init__(self):
# In production, use a proper moderation model
self.blocked_phrases = self._load_blocked_phrases()
def _load_blocked_phrases(self) -> Dict[ContentCategory, List[str]]:
"""Load blocked phrase patterns."""
return {
ContentCategory.VIOLENCE: [
"how to make a bomb",
"how to kill",
"attack instructions",
],
ContentCategory.DANGEROUS: [
"synthesize drugs",
"create malware",
"hack into",
],
}
def check(self, text: str) -> Tuple[bool, List[ContentCategory]]:
"""Check if output contains harmful content."""
text_lower = text.lower()
detected_categories = []
for category, phrases in self.blocked_phrases.items():
for phrase in phrases:
if phrase in text_lower:
detected_categories.append(category)
break
is_safe = len(detected_categories) == 0
return is_safe, detected_categories
def filter(self, text: str) -> str:
"""Filter harmful content from output."""
is_safe, categories = self.check(text)
if not is_safe:
return (
"I can't provide that information. "
"Please ask something else."
)
return text
# For production, use API-based moderation:
# - OpenAI Moderation API
# - Google Perspective API
# - Azure Content Safety
System Prompt Leak Detection
def detect_system_prompt_leak(
llm_output: str,
system_prompt: str,
threshold: float = 0.5
) -> Tuple[bool, float]:
"""Detect if output contains system prompt content."""
# Simple overlap detection
system_words = set(system_prompt.lower().split())
output_words = set(llm_output.lower().split())
if not system_words:
return False, 0.0
overlap = len(system_words & output_words)
overlap_ratio = overlap / len(system_words)
is_leaked = overlap_ratio > threshold
return is_leaked, overlap_ratio
def sanitize_system_prompt_leak(
llm_output: str,
system_prompt: str
) -> str:
"""Remove potential system prompt leaks from output."""
is_leaked, ratio = detect_system_prompt_leak(llm_output, system_prompt)
if is_leaked:
return (
"I can't share that information. "
"How else can I help you?"
)
return llm_output
Complete Output Sanitizer
from dataclasses import dataclass
from typing import Optional
@dataclass
class SanitizationResult:
original: str
sanitized: str
was_modified: bool
blocked: bool
reason: Optional[str] = None
class OutputSanitizer:
"""Complete output sanitization pipeline."""
def __init__(self, system_prompt: str = ""):
self.pii_redactor = PIIRedactor()
self.moderator = OutputModerator()
self.system_prompt = system_prompt
def sanitize(
self,
llm_output: str,
context: str = "html"
) -> SanitizationResult:
"""Run complete sanitization pipeline."""
original = llm_output
current = llm_output
# Step 1: Check for harmful content
is_safe, categories = self.moderator.check(current)
if not is_safe:
return SanitizationResult(
original=original,
sanitized="I can't provide that response.",
was_modified=True,
blocked=True,
reason=f"Blocked categories: {categories}"
)
# Step 2: Check for system prompt leak
if self.system_prompt:
is_leaked, _ = detect_system_prompt_leak(current, self.system_prompt)
if is_leaked:
return SanitizationResult(
original=original,
sanitized="I can't share that information.",
was_modified=True,
blocked=True,
reason="System prompt leak detected"
)
# Step 3: Redact PII
current, pii_matches = self.pii_redactor.redact(current)
# Step 4: Context-specific sanitization
if context == "html":
current = sanitize_for_html(current)
elif context == "markdown":
current = sanitize_for_markdown(current)
was_modified = current != original
return SanitizationResult(
original=original,
sanitized=current,
was_modified=was_modified,
blocked=False
)
# Usage
sanitizer = OutputSanitizer(system_prompt="You are a helpful assistant...")
result = sanitizer.sanitize(llm_response, context="html")
if not result.blocked:
display_to_user(result.sanitized)
Key Takeaway: Output sanitization is your last line of defense. Always sanitize for the specific output context (HTML, markdown, JSON) and check for PII leaks, harmful content, and system prompt exposure. :::