Input Filtering at Scale
Presidio PII Detection & Masking
Microsoft Presidio is an open-source framework for detecting and anonymizing sensitive data in text. This lesson covers integrating Presidio into your LLM guardrails pipeline for production PII protection.
Why Presidio for LLM Applications
LLM applications face two critical PII risks:
- Input leakage: Users accidentally share sensitive data that shouldn't be processed
- Output leakage: LLMs may hallucinate or leak training data containing PII
Presidio addresses both with fast, customizable entity detection.
Installation and Setup
# Install Presidio components
# pip install presidio-analyzer presidio-anonymizer
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
# Build the two Presidio engines: one detects entities, one rewrites them.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Smoke test: analyze an English sentence and print one line per detected
# entity, showing its type, the matched substring and the confidence score.
text = "Contact John Smith at john.smith@company.com or 555-123-4567"
results = analyzer.analyze(text=text, language="en")
for result in results:
    matched = text[result.start:result.end]
    print(f"{result.entity_type}: {matched} (score: {result.score:.2f})")
# Example output (exact scores presumably vary with the installed NLP model):
# PERSON: John Smith (score: 0.85)
# EMAIL_ADDRESS: john.smith@company.com (score: 1.00)
# PHONE_NUMBER: 555-123-4567 (score: 0.75)
Entity Types for LLM Applications
Presidio supports many entity types. Restrict detection to the ones your domain actually needs by passing an explicit entity list:
# Entity presets for different industries. Each list is meant to be passed as
# the `entities` argument of `analyzer.analyze(...)`, as shown below.
HEALTHCARE_ENTITIES = [
    "PERSON", "DATE_TIME", "PHONE_NUMBER", "EMAIL_ADDRESS",
    "MEDICAL_LICENSE", "US_SSN", "US_DRIVER_LICENSE",
]
FINANCE_ENTITIES = [
    "PERSON", "CREDIT_CARD", "US_BANK_NUMBER", "IBAN_CODE",
    "US_SSN", "EMAIL_ADDRESS", "PHONE_NUMBER",
]
GENERAL_ENTITIES = [
    "PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER",
    "CREDIT_CARD", "IP_ADDRESS", "LOCATION",
]

# Sample input so this snippet runs on its own; in a real pipeline this is
# the raw user message.
user_input = "I'm Ana from Seattle, reach me at ana@example.com or 10.0.0.7"

# Analyze with specific entities
results = analyzer.analyze(
    text=user_input,
    entities=GENERAL_ENTITIES,
    language="en",
)
Production Anonymization Strategies
Different use cases require different anonymization approaches:
from presidio_anonymizer.entities import OperatorConfig
class PIIAnonymizer:
    """Production PII anonymization with multiple strategies.

    Every strategy performs the same two steps: detect entities in the text
    with the analyzer engine (language fixed to English), then rewrite the
    detected spans with the anonymizer engine using a strategy-specific
    operator mapping.
    """

    def __init__(self):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def _apply(self, text: str, operators: dict) -> str:
        """Detect PII in ``text`` and return the text rewritten by ``operators``."""
        results = self.analyzer.analyze(text=text, language="en")
        return self.anonymizer.anonymize(
            text=text,
            analyzer_results=results,
            operators=operators,
        ).text

    def mask(self, text: str) -> str:
        """Replace PII with fixed placeholder labels.

        Emails, phone numbers and credit cards get their own label; every
        other detected entity type falls back to ``[REDACTED]``.
        """
        return self._apply(
            text,
            {
                "DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
                "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "[EMAIL]"}),
                "PHONE_NUMBER": OperatorConfig("replace", {"new_value": "[PHONE]"}),
                "CREDIT_CARD": OperatorConfig("replace", {"new_value": "[CARD]"}),
            },
        )

    def hash_pii(self, text: str) -> str:
        """Replace PII with SHA-256 hashes.

        Hashing is one-way: the original values cannot be recovered from the
        output, and no key is involved. Use ``encrypt_pii`` when the values
        must be restorable.
        """
        return self._apply(
            text,
            {"DEFAULT": OperatorConfig("hash", {"hash_type": "sha256"})},
        )

    def encrypt_pii(self, text: str, key: str) -> str:
        """Encrypt PII so it can later be decrypted with the same ``key``.

        ``key`` is handed unchanged to Presidio's ``encrypt`` operator.
        NOTE(review): Presidio documents this operator as AES and expects a
        128-, 192- or 256-bit key -- confirm against the installed version.
        """
        return self._apply(
            text,
            {"DEFAULT": OperatorConfig("encrypt", {"key": key})},
        )
# Usage
# Named `pii_anonymizer` so the module-level `anonymizer` engine created in
# the setup snippet is not rebound to an object of a different type.
pii_anonymizer = PIIAnonymizer()
# NOTE(review): Presidio's US_SSN recognizer appears to reject well-known
# sample numbers such as 123-45-6789, so a different made-up SSN is used here
# -- confirm against the installed presidio-analyzer version.
original = "Email me at jane@corp.com, my SSN is 489-36-8350"
masked = pii_anonymizer.mask(original)
# Expected output: "Email me at [EMAIL], my SSN is [REDACTED]"
Async Integration for Production
Presidio's engines are synchronous, so calling them directly from a coroutine would block the event loop. Run them in a thread pool instead:
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional
class AsyncPresidio:
    """Async wrapper that runs Presidio's synchronous engines in a thread pool.

    One analyzer engine, one anonymizer engine and one ``ThreadPoolExecutor``
    are created per instance; every coroutine below hands its blocking work
    to that executor so the event loop is not blocked while Presidio runs.
    """

    def __init__(self, max_workers: int = 4):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()
        self.executor = ThreadPoolExecutor(max_workers=max_workers)

    def close(self) -> None:
        """Shut down the worker pool; do not use the instance afterwards."""
        self.executor.shutdown(wait=True)

    @staticmethod
    def _operators_for(strategy: str):
        """Map a strategy name to the ``operators`` argument for the anonymizer.

        Raises:
            ValueError: if ``strategy`` is not one of "mask", "hash", "redact".
        """
        if strategy == "mask":
            # No explicit operators: the anonymizer engine applies its default.
            return None
        if strategy == "hash":
            return {"DEFAULT": OperatorConfig("hash", {"hash_type": "sha256"})}
        if strategy == "redact":
            return {"DEFAULT": OperatorConfig("redact", {})}
        raise ValueError(
            f"Unknown anonymization strategy {strategy!r}; "
            "expected 'mask', 'hash' or 'redact'"
        )

    async def analyze(self, text: str, entities: Optional[List[str]] = None) -> List[Dict]:
        """Detect entities in ``text`` without blocking the event loop.

        Args:
            text: The text to scan (analyzed as English).
            entities: Entity type names to restrict detection to; ``None`` is
                forwarded to the analyzer unchanged.

        Returns:
            One dict per detection with keys ``entity_type``, ``start``,
            ``end``, ``score`` and ``text`` (the matched substring).
        """
        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(
            self.executor,
            lambda: self.analyzer.analyze(
                text=text,
                entities=entities,
                language="en",
            ),
        )
        return [
            {
                "entity_type": r.entity_type,
                "start": r.start,
                "end": r.end,
                "score": r.score,
                "text": text[r.start:r.end],
            }
            for r in results
        ]

    async def anonymize(self, text: str, strategy: str = "mask") -> str:
        """Detect and anonymize PII in ``text`` without blocking the event loop.

        Args:
            text: The text to clean (analyzed as English).
            strategy: "mask" (default) uses the anonymizer engine's default
                operator, "hash" replaces each entity with its SHA-256 hash,
                "redact" removes each entity.

        Returns:
            The anonymized text.

        Raises:
            ValueError: if ``strategy`` is not a supported name.
        """
        # Resolve the strategy up front so a bad name fails before any work
        # is scheduled on the pool.
        operators = self._operators_for(strategy)
        loop = asyncio.get_running_loop()

        def _anonymize():
            results = self.analyzer.analyze(text=text, language="en")
            return self.anonymizer.anonymize(
                text=text,
                analyzer_results=results,
                operators=operators,
            ).text

        return await loop.run_in_executor(self.executor, _anonymize)
# Usage in guardrails pipeline
_shared_presidio = None  # created on first use by _get_shared_presidio()


def _get_shared_presidio():
    """Return the module-wide AsyncPresidio, creating it on the first call."""
    global _shared_presidio
    if _shared_presidio is None:
        _shared_presidio = AsyncPresidio()
    return _shared_presidio


async def pii_filter_layer(
    user_input: str,
    presidio: Optional["AsyncPresidio"] = None,
) -> tuple[str, Dict]:
    """Detect PII in ``user_input`` and return the cleaned text plus metadata.

    Args:
        user_input: Raw text to filter.
        presidio: Wrapper to use. When omitted, one shared instance is
            created lazily and reused, so the engines and thread pool are
            not rebuilt on every request.

    Returns:
        ``(text, info)``. When nothing is detected, ``text`` is the input
        unchanged and ``info`` is ``{"pii_detected": False}``. Otherwise
        ``text`` is the anonymized input and ``info`` reports the number of
        detections and the sorted, de-duplicated entity types.
    """
    if presidio is None:
        presidio = _get_shared_presidio()

    # Detect entities
    entities = await presidio.analyze(user_input)
    if not entities:
        return user_input, {"pii_detected": False}

    # Anonymize
    cleaned = await presidio.anonymize(user_input)
    return cleaned, {
        "pii_detected": True,
        "entities_found": len(entities),
        # Sorted so the metadata is deterministic across runs.
        "entity_types": sorted({e["entity_type"] for e in entities}),
    }
Custom Recognizers for Domain-Specific PII
from presidio_analyzer import PatternRecognizer, Pattern
# Custom recognizer for internal employee IDs
employee_id_recognizer = PatternRecognizer(
    supported_entity="EMPLOYEE_ID",
    patterns=[
        Pattern(
            name="employee_id",
            # The \b anchors stop the pattern from firing inside longer
            # tokens such as "TEMP-123456" or "EMP-1234567".
            regex=r"\bEMP-\d{6}\b",
            score=0.9,
        )
    ],
)

# Add to registry: start from Presidio's predefined recognizers, then
# register the custom one alongside them.
registry = RecognizerRegistry()
registry.load_predefined_recognizers()
registry.add_recognizer(employee_id_recognizer)

# Create analyzer with custom registry
custom_analyzer = AnalyzerEngine(registry=registry)

# Now detects custom entities
text = "Contact EMP-123456 for assistance"
results = custom_analyzer.analyze(text=text, language="en")
# Detects: EMPLOYEE_ID: EMP-123456
Production Tip: Presidio typically processes text in 20-50ms, making it fast enough for the pre-LLM filtering layer. For batch processing, use the ThreadPoolExecutor pattern shown above.
Next: Building prompt injection detection pipelines.