Advanced Chunking Strategies
Optimal Chunk Design
3 min read
Chunk size, overlap, and format significantly impact retrieval quality. Here's how to optimize these parameters for your use case.
Chunk Size Guidelines
| Size (tokens) | Use Case | Trade-offs |
|---|---|---|
| 128-256 | Q&A, precise answers | High precision, may miss context |
| 256-512 | General RAG (recommended) | Balanced precision/context |
| 512-1024 | Long-form, complex topics | More context, lower precision |
| 1024+ | Full document context | Risk of noise, expensive |
Finding Optimal Size
Empirically test different sizes:
def evaluate_chunk_size(documents, queries, ground_truth,
                        sizes=(256, 512, 768, 1024)):
    """Evaluate retrieval quality across candidate chunk sizes.

    Args:
        documents: Source documents to split and index.
        queries: Evaluation queries.
        ground_truth: Expected relevant results per query, as consumed by
            ``calculate_recall`` / ``calculate_precision``.
        sizes: Chunk sizes (in tokens) to compare. A tuple default avoids
            the mutable-default-argument pitfall.

    Returns:
        Dict mapping each size to its recall, precision, f1, and chunk count.
    """
    results = {}
    for size in sizes:
        # Re-chunk with ~10% overlap — the rule of thumb used elsewhere
        # in this guide.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=int(size * 0.1),
        )
        chunks = splitter.split_documents(documents)
        # Build a fresh index per size so results are comparable.
        # NOTE(review): `embeddings` is assumed to be a module-level
        # embedding function — confirm against the surrounding module.
        vectorstore = Chroma.from_documents(chunks, embeddings)
        recall = calculate_recall(vectorstore, queries, ground_truth)
        precision = calculate_precision(vectorstore, queries, ground_truth)
        # Guard the harmonic mean against 0/0 when both metrics are zero.
        denom = recall + precision
        results[size] = {
            "recall": recall,
            "precision": precision,
            "f1": 2 * (recall * precision) / denom if denom else 0.0,
            "num_chunks": len(chunks),
        }
    return results
# Example output:
# 256: {"recall": 0.82, "precision": 0.71, "f1": 0.76}
# 512: {"recall": 0.78, "precision": 0.79, "f1": 0.78} <- Best F1
# 768: {"recall": 0.74, "precision": 0.82, "f1": 0.78}
# 1024: {"recall": 0.68, "precision": 0.85, "f1": 0.75}
Overlap Strategy
Overlap prevents information loss at chunk boundaries:
# Rule of thumb: 10-20% overlap between consecutive chunks
chunk_size = 512
chunk_overlap = 50  # ~10% of chunk_size
# For technical content with cross-references, use the high end of the range
chunk_overlap = 100  # ~20% of chunk_size
# Visual representation of how consecutive chunks share a boundary region:
# Chunk 1: [===================]
# Chunk 2:            [===================]
# Chunk 3:                       [===================]
#                     ^overlap^
When to increase overlap:
- Technical documentation with cross-references
- Legal text with dependent clauses
- Code with multi-line statements
Format Preservation
Maintain document structure in chunks:
class FormatPreservingChunker:
    """Chunk a document while preserving its native structure.

    Dispatches on detected format: markdown sections keep their headers,
    code keeps whole function/class definitions, and anything else falls
    back to paragraph chunks. Fix: the original referenced
    ``_is_markdown``, ``_is_code``, and ``_chunk_plain`` without defining
    them, so any call to ``chunk`` raised AttributeError; they are now
    implemented.
    """

    def chunk(self, document: str) -> list[dict]:
        """Split *document* into format-aware chunk dicts."""
        if self._is_markdown(document):
            return self._chunk_markdown(document)
        if self._is_code(document):
            return self._chunk_code(document)
        return self._chunk_plain(document)

    def _is_markdown(self, document: str) -> bool:
        """Heuristic: a line starting with 1-3 '#' marks markdown."""
        return bool(re.search(r'^#{1,3}\s', document, re.MULTILINE))

    def _is_code(self, document: str) -> bool:
        """Heuristic: a top-of-line def/class marks Python source."""
        return bool(re.search(r'^(?:def|class)\s+\w+', document, re.MULTILINE))

    def _chunk_markdown(self, document: str) -> list[dict]:
        """Keep markdown headers with their content."""
        # Capturing split keeps the header lines as their own list items.
        sections = re.split(r'(^#{1,3}\s.+$)', document, flags=re.MULTILINE)
        chunks = []
        current_header = ""
        for section in sections:
            if re.match(r'^#{1,3}\s', section):
                # Remember the header; it is prepended to the next body so
                # each chunk stays self-contained.
                current_header = section.strip()
            elif section.strip():
                chunks.append({
                    "content": f"{current_header}\n\n{section.strip()}",
                    "header": current_header,
                    "type": "markdown"
                })
        return chunks

    def _chunk_code(self, document: str) -> list[dict]:
        """Split code by functions/classes (Python only)."""
        # Lazily match from each def/class up to the next top-level
        # def/class or end of string.
        functions = re.findall(
            r'((?:def|class)\s+\w+.*?(?=\n(?:def|class)|\Z))',
            document,
            re.DOTALL
        )
        return [
            {"content": func.strip(), "type": "code", "language": "python"}
            for func in functions
        ]

    def _chunk_plain(self, document: str) -> list[dict]:
        """Fallback: one chunk per blank-line-separated paragraph."""
        return [
            {"content": para.strip(), "type": "plain"}
            for para in re.split(r'\n\s*\n', document)
            if para.strip()
        ]
Content-Specific Strategies
Tables
Keep tables intact:
def preserve_tables(document: str) -> tuple[str, list[str]]:
    """Replace markdown tables with placeholders so a splitter cannot cut them.

    Args:
        document: Text possibly containing markdown pipe tables.

    Returns:
        A tuple ``(marked_document, tables)`` where every table in
        ``marked_document`` has been replaced by ``[[TABLE_i]]`` and
        ``tables[i]`` holds the original table text. (The original
        annotation claimed ``-> str`` but the function has always
        returned this tuple.)
    """
    # A table is one or more consecutive |...| rows.
    table_pattern = r'(\|.+\|(?:\n\|.+\|)+)'
    tables = re.findall(table_pattern, document)
    for i, table in enumerate(tables):
        # count=1: if the same table text occurs twice, each occurrence
        # gets its own placeholder instead of the first pass consuming all.
        document = document.replace(table, f"[[TABLE_{i}]]", 1)
    return document, tables
def restore_tables(chunks: list[str], tables: list[str]) -> list[str]:
    """Substitute ``[[TABLE_i]]`` placeholders back with the original tables.

    Inverse of ``preserve_tables``: each chunk has every placeholder
    replaced by the table text at the matching index.
    """
    def _expand(text: str) -> str:
        # Apply every placeholder substitution to this chunk.
        for idx, table in enumerate(tables):
            text = text.replace(f"[[TABLE_{idx}]]", table)
        return text

    return [_expand(chunk) for chunk in chunks]
Code Blocks
Never split code blocks:
def chunk_with_code_preservation(document: str, chunk_size: int) -> list[str]:
    """Chunk text while keeping fenced ``` code blocks intact.

    Fenced blocks are swapped for stable placeholders before splitting,
    then swapped back, so the splitter can never cut through one.
    """
    # Extract every fenced block (non-greedy, spanning newlines).
    fences = re.findall(r'```[\s\S]*?```', document)

    # Replace each fence with a placeholder token.
    for idx, fence in enumerate(fences):
        document = document.replace(fence, f"[[CODE_{idx}]]")

    # Split the placeholder-bearing text as usual.
    pieces = RecursiveCharacterTextSplitter(chunk_size=chunk_size).split_text(document)

    def _put_back(text: str) -> str:
        # Re-insert the original fences wherever their tokens survived.
        for idx, fence in enumerate(fences):
            text = text.replace(f"[[CODE_{idx}]]", fence)
        return text

    return [_put_back(piece) for piece in pieces]
Quality Checklist
| Check | Goal | Test |
|---|---|---|
| Completeness | No truncated sentences | Check chunk endings |
| Context | Chunks are self-contained | Can you understand without surrounding text? |
| Relevance | Focused content | Single topic per chunk |
| Format | Structure preserved | Tables, code, lists intact |
Diagnostic Tool
def diagnose_chunks(chunks: list[str]) -> dict:
    """Analyze chunk quality and report likely defects.

    Args:
        chunks: The chunk texts to inspect.

    Returns:
        Dict with ``total_chunks``, ``avg_length``, and an ``issues`` list
        of human-readable findings.
    """
    # Guard: the original divided by len(chunks) and indexed chunk[-1],
    # crashing on an empty list or an empty chunk string.
    if not chunks:
        return {"total_chunks": 0, "avg_length": 0.0, "issues": []}
    diagnostics = {
        "total_chunks": len(chunks),
        "avg_length": sum(len(c) for c in chunks) / len(chunks),
        "issues": []
    }
    # IGNORECASE: sentence-initial "This"/"It" is the common case and the
    # original lowercase-only pattern never matched it.
    pronoun_re = re.compile(r'\b(it|this|these|that)\b', re.IGNORECASE)
    for i, chunk in enumerate(chunks):
        if not chunk:
            diagnostics["issues"].append(f"Chunk {i}: Empty chunk")
            continue
        # Check for truncated sentences
        if chunk[-1] not in '.!?"\'':
            diagnostics["issues"].append(f"Chunk {i}: May be truncated")
        # Check for orphaned references near the start of the chunk
        if pronoun_re.search(chunk[:50]):
            diagnostics["issues"].append(f"Chunk {i}: Starts with pronoun reference")
        # Check for code block integrity (unmatched ``` fences)
        if chunk.count('```') % 2 != 0:
            diagnostics["issues"].append(f"Chunk {i}: Broken code block")
        # Check length variance against the corpus average
        if len(chunk) < diagnostics["avg_length"] * 0.3:
            diagnostics["issues"].append(f"Chunk {i}: Very short ({len(chunk)} chars)")
    return diagnostics
Recommended Defaults
# General purpose RAG: the balanced 512/~10% baseline recommended above.
DEFAULT_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 50,
    # Try coarse separators first, falling back to finer ones.
    "separators": ["\n\n", "\n", ". ", " ", ""],
    "preserve_code_blocks": True,
    "preserve_tables": True
}
# Q&A / FAQ: smaller chunks for precise answers; "? " keeps each
# question with its answer.
QA_CONFIG = {
    "chunk_size": 256,
    "chunk_overlap": 25,
    "separators": ["\n\n", "\n", "? ", ". ", " "],
    "keep_qa_pairs": True
}
# Technical documentation: larger chunks, heavier overlap for
# cross-references, and markdown header separators so sections stay whole.
TECHNICAL_CONFIG = {
    "chunk_size": 768,
    "chunk_overlap": 100,
    "separators": ["\n## ", "\n### ", "\n\n", "\n", ". "],
    "preserve_code_blocks": True,
    "include_headers": True
}
Key Principle: The optimal chunk design depends on your queries, not your documents. Design chunks to match how users will search, not how content is organized.
In the next module, we'll explore hybrid search and reranking to maximize retrieval quality.