Building Input/Output Guardrails

Output Sanitization

Even with perfect input validation, LLM outputs can contain harmful content. Output sanitization ensures responses are safe before reaching users or downstream systems.

Why Output Sanitization Matters

┌─────────────────────────────────────────────────────────────┐
│                    Output Risk Scenarios                     │
│                                                             │
│   1. XSS: LLM generates <script>alert('xss')</script>       │
│   2. Data leak: Model outputs training data (PII)           │
│   3. Harmful content: Violence, hate speech                 │
│   4. Hallucination: False information presented as fact     │
│   5. Instruction leak: System prompt exposed                │
└─────────────────────────────────────────────────────────────┘

HTML/XSS Sanitization

When LLM output is rendered in web contexts:

import html
import re
from typing import Optional

def sanitize_for_html(llm_output: str) -> str:
    """Sanitize LLM output for safe HTML rendering."""
    # Escape HTML special characters
    sanitized = html.escape(llm_output)

    # Remove any remaining script-like patterns
    sanitized = re.sub(
        r'javascript:',
        '[removed]:',
        sanitized,
        flags=re.IGNORECASE
    )
    sanitized = re.sub(
        r'on\w+\s*=',
        '[removed]=',
        sanitized,
        flags=re.IGNORECASE
    )

    return sanitized

def sanitize_for_markdown(llm_output: str) -> str:
    """Sanitize for markdown rendering (allows formatting)."""
    # Allow safe markdown but escape HTML
    sanitized = html.escape(llm_output)

    # Markdown syntax such as **text** or __text__ survives html.escape untouched;
    # convert escaped <b> tags back into markdown bold so basic formatting is kept
    sanitized = re.sub(r'&lt;b&gt;(.+?)&lt;/b&gt;', r'**\1**', sanitized)

    return sanitized

# Example
raw_output = "Here's code: <script>steal_cookies()</script>"
safe_output = sanitize_for_html(raw_output)
print(safe_output)
# Output: Here&#x27;s code: &lt;script&gt;steal_cookies()&lt;/script&gt;
# (html.escape also escapes the apostrophe by default)
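
For production web rendering, regex-based stripping like the above is easy to bypass. A vetted HTML sanitizer with an explicit allow-list is usually the safer default. Here is a minimal sketch assuming the third-party bleach package is installed; the tag and attribute lists are illustrative assumptions, not a recommendation:

import bleach

# Illustrative allow-list; tune it to what your renderer actually needs
ALLOWED_TAGS = ["a", "b", "i", "em", "strong", "code", "pre", "p", "ul", "ol", "li"]
ALLOWED_ATTRIBUTES = {"a": ["href", "title"]}

def sanitize_with_allowlist(llm_output: str) -> str:
    """Strip everything outside an explicit allow-list of tags and attributes."""
    return bleach.clean(
        llm_output,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        strip=True,  # drop disallowed tags rather than escaping them
    )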

PII Detection and Redaction

import re
from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class PIIMatch:
    type: str
    value: str
    start: int
    end: int

class PIIRedactor:
    """Detect and redact personally identifiable information."""

    PATTERNS = {
        "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
        "phone_us": r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
        "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
        "credit_card": r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
        "ip_address": r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
    }

    def detect(self, text: str) -> List[PIIMatch]:
        """Find all PII in text."""
        matches = []
        for pii_type, pattern in self.PATTERNS.items():
            for match in re.finditer(pattern, text):
                matches.append(PIIMatch(
                    type=pii_type,
                    value=match.group(),
                    start=match.start(),
                    end=match.end()
                ))
        return matches

    def redact(self, text: str) -> Tuple[str, List[PIIMatch]]:
        """Redact all PII from text."""
        matches = self.detect(text)
        redacted = text

        # Sort by position (reverse) to maintain indices
        for match in sorted(matches, key=lambda m: m.start, reverse=True):
            placeholder = f"[REDACTED_{match.type.upper()}]"
            redacted = redacted[:match.start] + placeholder + redacted[match.end:]

        return redacted, matches

# Usage
redactor = PIIRedactor()
llm_output = "Contact John at john@example.com or 555-123-4567"
safe_output, detected = redactor.redact(llm_output)
print(safe_output)
# Output: Contact John at [REDACTED_EMAIL] or [REDACTED_PHONE_US]
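
The credit_card pattern above flags any 16-digit sequence, so false positives are likely; a Luhn checksum is a common second pass. A minimal sketch (luhn_valid is our own helper name):

def luhn_valid(number: str) -> bool:
    """Return True if the digits in `number` pass the Luhn checksum."""
    digits = [int(ch) for ch in number if ch.isdigit()]
    if len(digits) < 13:
        return False
    checksum = 0
    for i, digit in enumerate(reversed(digits)):
        if i % 2 == 1:          # double every second digit from the right
            digit *= 2
            if digit > 9:
                digit -= 9
        checksum += digit
    return checksum % 10 == 0

# Keep only credit_card matches that also pass the checksum
matches = [
    m for m in redactor.detect("Card: 4111-1111-1111-1111")
    if m.type != "credit_card" or luhn_valid(m.value)
]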

Content Moderation

from enum import Enum
from typing import Dict, List, Tuple

class ContentCategory(Enum):
    SAFE = "safe"
    VIOLENCE = "violence"
    HATE_SPEECH = "hate_speech"
    SELF_HARM = "self_harm"
    SEXUAL = "sexual"
    DANGEROUS = "dangerous"

class OutputModerator:
    """Moderate LLM outputs for harmful content."""

    def __init__(self):
        # In production, use a proper moderation model
        self.blocked_phrases = self._load_blocked_phrases()

    def _load_blocked_phrases(self) -> Dict[ContentCategory, List[str]]:
        """Load blocked phrase patterns."""
        return {
            ContentCategory.VIOLENCE: [
                "how to make a bomb",
                "how to kill",
                "attack instructions",
            ],
            ContentCategory.DANGEROUS: [
                "synthesize drugs",
                "create malware",
                "hack into",
            ],
        }

    def check(self, text: str) -> Tuple[bool, List[ContentCategory]]:
        """Check if output contains harmful content."""
        text_lower = text.lower()
        detected_categories = []

        for category, phrases in self.blocked_phrases.items():
            for phrase in phrases:
                if phrase in text_lower:
                    detected_categories.append(category)
                    break

        is_safe = len(detected_categories) == 0
        return is_safe, detected_categories

    def filter(self, text: str) -> str:
        """Filter harmful content from output."""
        is_safe, categories = self.check(text)

        if not is_safe:
            return (
                "I can't provide that information. "
                "Please ask something else."
            )
        return text

# For production, use API-based moderation:
# - OpenAI Moderation API
# - Google Perspective API
# - Azure Content Safety
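
As a sketch of the API-based route, a check against the OpenAI Moderation endpoint might look like the following, assuming the openai Python package (v1.x) is installed and OPENAI_API_KEY is set in the environment:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def moderate_with_api(text: str) -> bool:
    """Return True when the moderation endpoint does not flag the text."""
    response = client.moderations.create(input=text)
    return not response.results[0].flagged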

System Prompt Leak Detection

def detect_system_prompt_leak(
    llm_output: str,
    system_prompt: str,
    threshold: float = 0.5
) -> Tuple[bool, float]:
    """Detect if output contains system prompt content."""
    # Simple overlap detection
    system_words = set(system_prompt.lower().split())
    output_words = set(llm_output.lower().split())

    if not system_words:
        return False, 0.0

    overlap = len(system_words & output_words)
    overlap_ratio = overlap / len(system_words)

    is_leaked = overlap_ratio > threshold
    return is_leaked, overlap_ratio

def sanitize_system_prompt_leak(
    llm_output: str,
    system_prompt: str
) -> str:
    """Remove potential system prompt leaks from output."""
    is_leaked, ratio = detect_system_prompt_leak(llm_output, system_prompt)

    if is_leaked:
        return (
            "I can't share that information. "
            "How else can I help you?"
        )
    return llm_output
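
The word-overlap heuristic above ignores word order and can be inflated by common words. A fuzzy match on the raw strings is a somewhat stronger, still-stdlib check; a minimal sketch using difflib, with an illustrative threshold:

import difflib
from typing import Tuple

def detect_leak_fuzzy(
    llm_output: str,
    system_prompt: str,
    threshold: float = 0.6
) -> Tuple[bool, float]:
    """Flag outputs that reproduce a long verbatim run of the system prompt."""
    prompt = system_prompt.lower()
    output = llm_output.lower()
    matcher = difflib.SequenceMatcher(None, prompt, output)
    longest = matcher.find_longest_match(0, len(prompt), 0, len(output))
    # Fraction of the system prompt reproduced as one contiguous run
    ratio = longest.size / max(len(prompt), 1)
    return ratio > threshold, ratio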

Complete Output Sanitizer

from dataclasses import dataclass
from typing import Optional

@dataclass
class SanitizationResult:
    original: str
    sanitized: str
    was_modified: bool
    blocked: bool
    reason: Optional[str] = None

class OutputSanitizer:
    """Complete output sanitization pipeline."""

    def __init__(self, system_prompt: str = ""):
        self.pii_redactor = PIIRedactor()
        self.moderator = OutputModerator()
        self.system_prompt = system_prompt

    def sanitize(
        self,
        llm_output: str,
        context: str = "html"
    ) -> SanitizationResult:
        """Run complete sanitization pipeline."""
        original = llm_output
        current = llm_output

        # Step 1: Check for harmful content
        is_safe, categories = self.moderator.check(current)
        if not is_safe:
            return SanitizationResult(
                original=original,
                sanitized="I can't provide that response.",
                was_modified=True,
                blocked=True,
                reason=f"Blocked categories: {categories}"
            )

        # Step 2: Check for system prompt leak
        if self.system_prompt:
            is_leaked, _ = detect_system_prompt_leak(current, self.system_prompt)
            if is_leaked:
                return SanitizationResult(
                    original=original,
                    sanitized="I can't share that information.",
                    was_modified=True,
                    blocked=True,
                    reason="System prompt leak detected"
                )

        # Step 3: Redact PII
        current, pii_matches = self.pii_redactor.redact(current)

        # Step 4: Context-specific sanitization
        if context == "html":
            current = sanitize_for_html(current)
        elif context == "markdown":
            current = sanitize_for_markdown(current)

        was_modified = current != original
        return SanitizationResult(
            original=original,
            sanitized=current,
            was_modified=was_modified,
            blocked=False
        )

# Usage
sanitizer = OutputSanitizer(system_prompt="You are a helpful assistant...")
result = sanitizer.sanitize(llm_response, context="html")
if not result.blocked:
    display_to_user(result.sanitized)
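
The pipeline above only branches on html and markdown. When the response is embedded in a JSON payload, the third context the takeaway mentions, letting the standard library do the escaping is usually enough; a minimal sketch (sanitize_for_json is our own helper name):

import json

def sanitize_for_json(llm_output: str) -> str:
    """Return the output as a JSON string literal, escaping quotes and control characters."""
    return json.dumps(llm_output)

In sanitize(), this would slot in as an elif context == "json" branch alongside the html and markdown cases.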

Key Takeaway: Output sanitization is your last line of defense. Always sanitize for the specific output context (HTML, markdown, JSON) and check for PII leaks, harmful content, and system prompt exposure.
