Input Filtering at Scale

Presidio PII Detection & Masking

3 min read

Microsoft Presidio is an open-source framework for detecting and anonymizing sensitive data in text. This lesson covers integrating Presidio into your LLM guardrails pipeline for production PII protection.

Why Presidio for LLM Applications

LLM applications face two critical PII risks:

  1. Input leakage: Users accidentally share sensitive data that shouldn't be processed
  2. Output leakage: LLMs may hallucinate or leak training data containing PII

Presidio addresses both with fast, customizable entity detection.

Installation and Setup

# Install Presidio components
# pip install presidio-analyzer presidio-anonymizer
# The default analyzer also needs a spaCy English model:
# python -m spacy download en_core_web_lg

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# Initialize engines: the analyzer detects PII, the anonymizer rewrites it
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Test basic detection on a sample sentence containing a name, an email
# address, and a phone number
text = "Contact John Smith at john.smith@company.com or 555-123-4567"
results = analyzer.analyze(text=text, language="en")

# Each result exposes the entity type, the character span (start/end) into
# the analyzed text, and a confidence score
for result in results:
    print(f"{result.entity_type}: {text[result.start:result.end]} (score: {result.score:.2f})")
# Example output (exact scores may vary — confirm against your installation):
# PERSON: John Smith (score: 0.85)
# EMAIL_ADDRESS: john.smith@company.com (score: 1.00)
# PHONE_NUMBER: 555-123-4567 (score: 0.75)

Entity Types for LLM Applications

Presidio supports many entity types. Configure based on your requirements:

# Common entities for different industries.
# Each list is meant to be passed as the `entities` argument of
# `analyzer.analyze` to restrict detection to those entity types.
HEALTHCARE_ENTITIES = [
    "PERSON", "DATE_TIME", "PHONE_NUMBER", "EMAIL_ADDRESS",
    "MEDICAL_LICENSE", "US_SSN", "US_DRIVER_LICENSE"
]

FINANCE_ENTITIES = [
    "PERSON", "CREDIT_CARD", "US_BANK_NUMBER", "IBAN_CODE",
    "US_SSN", "EMAIL_ADDRESS", "PHONE_NUMBER"
]

GENERAL_ENTITIES = [
    "PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER",
    "CREDIT_CARD", "IP_ADDRESS", "LOCATION"
]

# Analyze with specific entities.
# NOTE: `user_input` is not defined in this snippet; it stands for the raw
# text received from the user and must be supplied by the caller.
results = analyzer.analyze(
    text=user_input,
    entities=GENERAL_ENTITIES,
    language="en"
)

Production Anonymization Strategies

Different use cases require different anonymization approaches:

from presidio_anonymizer.entities import OperatorConfig

class PIIAnonymizer:
    """Production PII anonymization with multiple strategies.

    Every public method first detects PII in English text with the analyzer
    engine and then rewrites the detected spans with a different anonymizer
    operator (replace, hash, or encrypt).
    """

    def __init__(self):
        """Create the analyzer and anonymizer engines used by every strategy."""
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def _anonymize(self, text: str, operators: dict) -> str:
        """Detect PII in ``text`` and apply ``operators`` to the detected spans."""
        results = self.analyzer.analyze(text=text, language="en")
        return self.anonymizer.anonymize(
            text=text,
            analyzer_results=results,
            operators=operators,
        ).text

    def mask(self, text: str) -> str:
        """Replace PII with fixed placeholder labels.

        Emails, phone numbers, and credit cards get a type-specific label;
        every other detected entity becomes ``[REDACTED]``.

        Args:
            text: The text to sanitize.

        Returns:
            The text with each detected entity replaced by its placeholder.
        """
        return self._anonymize(
            text,
            {
                "DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
                "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "[EMAIL]"}),
                "PHONE_NUMBER": OperatorConfig("replace", {"new_value": "[PHONE]"}),
                "CREDIT_CARD": OperatorConfig("replace", {"new_value": "[CARD]"}),
            },
        )

    def hash_pii(self, text: str) -> str:
        """Replace PII with SHA-256 hashes.

        Hashing is one-way: the original values cannot be recovered from the
        output. Use ``encrypt_pii`` when the values must be restorable.

        Args:
            text: The text to sanitize.

        Returns:
            The text with each detected entity replaced by its hash.
        """
        return self._anonymize(
            text,
            {"DEFAULT": OperatorConfig("hash", {"hash_type": "sha256"})},
        )

    def encrypt_pii(self, text: str, key: str) -> str:
        """Encrypt PII with a caller-supplied key (reversible with the same key).

        Args:
            text: The text to sanitize.
            key: Encryption key passed to Presidio's ``encrypt`` operator.
                Presidio documents this as an AES key of 128, 192, or 256
                bits in string form (i.e. 16, 24, or 32 characters).

        Returns:
            The text with each detected entity replaced by its ciphertext.
        """
        return self._anonymize(
            text,
            {"DEFAULT": OperatorConfig("encrypt", {"key": key})},
        )

# Usage
# NOTE: this rebinds the module-level name `anonymizer`, which earlier held
# the AnonymizerEngine instance, to a PIIAnonymizer.
anonymizer = PIIAnonymizer()

original = "Email me at jane@corp.com, my SSN is 123-45-6789"
masked = anonymizer.mask(original)
# Output: "Email me at [EMAIL], my SSN is [REDACTED]"
# NOTE(review): Presidio's US_SSN recognizer may reject well-known sample
# numbers such as 123-45-6789 — confirm this output against your installed
# version, or use a different example value.

Async Integration for Production

Presidio isn't natively async, but we can wrap it efficiently:

import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional

class AsyncPresidio:
    """Async wrapper that runs Presidio's blocking calls on a thread pool.

    Presidio's engines are synchronous, so every call is handed to a
    ``ThreadPoolExecutor`` and awaited, keeping the event loop free while
    detection and anonymization run.
    """

    def __init__(self, max_workers: int = 4):
        """Create the Presidio engines and the worker pool.

        Args:
            max_workers: Maximum number of threads used for Presidio calls.
        """
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()
        self.executor = ThreadPoolExecutor(max_workers=max_workers)

    def close(self) -> None:
        """Shut down the worker pool once the wrapper is no longer needed."""
        self.executor.shutdown(wait=True)

    async def analyze(
        self, text: str, entities: Optional[List[str]] = None
    ) -> List[Dict]:
        """Detect PII entities in ``text`` without blocking the event loop.

        Args:
            text: The text to scan.
            entities: Entity types to look for; ``None`` is passed through to
                the analyzer unchanged.

        Returns:
            One dict per detection with the keys ``entity_type``, ``start``,
            ``end``, ``score``, and ``text`` (the matched substring).
        """
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()

        def _analyze():
            return self.analyzer.analyze(
                text=text,
                entities=entities,
                language="en"
            )

        results = await loop.run_in_executor(self.executor, _analyze)
        return [
            {
                "entity_type": r.entity_type,
                "start": r.start,
                "end": r.end,
                "score": r.score,
                "text": text[r.start:r.end]
            }
            for r in results
        ]

    def _operators_for(self, strategy: str):
        """Translate a strategy name into Presidio operator configuration.

        Returns ``None`` for ``"mask"``/``"replace"`` so the anonymizer falls
        back to its own default operator, which is what this wrapper did
        before ``strategy`` was honoured.

        Raises:
            ValueError: If ``strategy`` is not a supported name.
        """
        if strategy in ("mask", "replace"):
            return None
        if strategy == "redact":
            return {"DEFAULT": OperatorConfig("redact", {})}
        if strategy == "hash":
            return {"DEFAULT": OperatorConfig("hash", {"hash_type": "sha256"})}
        raise ValueError(
            f"Unknown anonymization strategy {strategy!r}; "
            "expected one of: 'mask', 'replace', 'redact', 'hash'"
        )

    async def anonymize(self, text: str, strategy: str = "mask") -> str:
        """Detect and anonymize PII in ``text`` without blocking the event loop.

        Args:
            text: The text to sanitize.
            strategy: ``"mask"`` (default) or ``"replace"`` use the
                anonymizer's default operator, ``"redact"`` removes the
                entity text, and ``"hash"`` substitutes a SHA-256 hash.

        Returns:
            The anonymized text.

        Raises:
            ValueError: If ``strategy`` is not a supported name.
        """
        # Validate before dispatching so a bad strategy fails fast instead of
        # silently falling back to the default behaviour.
        operators = self._operators_for(strategy)
        loop = asyncio.get_running_loop()

        def _anonymize():
            results = self.analyzer.analyze(text=text, language="en")
            return self.anonymizer.anonymize(
                text=text,
                analyzer_results=results,
                operators=operators
            ).text

        return await loop.run_in_executor(self.executor, _anonymize)

# Usage in guardrails pipeline
async def pii_filter_layer(
    user_input: str,
    presidio: Optional["AsyncPresidio"] = None,
) -> tuple[str, Dict]:
    """Detect and anonymize PII in ``user_input`` before it reaches the LLM.

    Args:
        user_input: Raw text received from the user.
        presidio: Wrapper to use for detection and anonymization. When
            omitted, a single shared ``AsyncPresidio`` is created on first
            use and reused by later calls.

    Returns:
        A ``(text, metadata)`` pair. When nothing is detected the text is
        returned unchanged with ``{"pii_detected": False}``; otherwise the
        anonymized text is returned together with the number of detections
        and the sorted list of distinct entity types.
    """
    if presidio is None:
        # Constructing AsyncPresidio builds both Presidio engines and a thread
        # pool, so do it once and reuse the instance instead of paying that
        # cost (and leaking a pool) on every request.
        presidio = getattr(pii_filter_layer, "_shared_presidio", None)
        if presidio is None:
            presidio = AsyncPresidio()
            pii_filter_layer._shared_presidio = presidio

    # Detect entities
    entities = await presidio.analyze(user_input)

    if not entities:
        return user_input, {"pii_detected": False}

    # Anonymize
    cleaned = await presidio.anonymize(user_input)

    return cleaned, {
        "pii_detected": True,
        "entities_found": len(entities),
        # Sorted so the metadata is deterministic across runs.
        "entity_types": sorted({e["entity_type"] for e in entities}),
    }

Custom Recognizers for Domain-Specific PII

from presidio_analyzer import PatternRecognizer, Pattern

# Custom recognizer for internal employee IDs ("EMP-" followed by six digits)
employee_id_recognizer = PatternRecognizer(
    supported_entity="EMPLOYEE_ID",
    patterns=[
        Pattern(
            name="employee_id",
            # Word boundaries stop the pattern from matching inside longer
            # tokens such as "TEMP-1234567".
            regex=r"\bEMP-\d{6}\b",
            score=0.9
        )
    ]
)

# Add to a registry that also contains Presidio's predefined recognizers
registry = RecognizerRegistry()
registry.load_predefined_recognizers()
registry.add_recognizer(employee_id_recognizer)

# Create analyzer with custom registry
custom_analyzer = AnalyzerEngine(registry=registry)

# Now detects custom entities
text = "Contact EMP-123456 for assistance"
results = custom_analyzer.analyze(text=text, language="en")
# Detects: EMPLOYEE_ID: EMP-123456

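Production Tip: Presidio typically processes short inputs in about 20-50 ms, making it fast enough for the pre-LLM filtering layer; latency grows with text length and depends on the NLP model in use, so measure it against your own traffic. For batch processing, use the ThreadPoolExecutor pattern shown above.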

Next: Building prompt injection detection pipelines.

Quiz

Module 2: Input Filtering at Scale

Take Quiz