Advanced Chunking Strategies
Hierarchical & Contextual Chunking
3 min read
Advanced chunking strategies that preserve document structure and add context to improve retrieval accuracy.
The Context Problem
Standard chunks lose their surrounding context:
Document: "Chapter 3: Security Best Practices
...
Use environment variables for secrets.
Never hardcode API keys.
..."
Chunk: "Never hardcode API keys."
Query: "How should I handle API keys?"
Problem: The chunk matches the query, but it lacks context about WHY the rule exists,
and it doesn't mention that it comes from the security chapter.
Hierarchical Chunking
Create parent-child relationships between chunks:
from langchain.text_splitter import RecursiveCharacterTextSplitter
class HierarchicalChunker:
    """Split a document into large parent chunks and small child chunks.

    Children are what gets embedded and searched; each child record carries
    its parent's id and full text so retrieval can surface the wider context.
    """

    def __init__(self):
        # Parent chunks: large windows that preserve surrounding context.
        self.parent_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200
        )
        # Child chunks: small windows that embed precisely for retrieval.
        self.child_splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=50
        )

    def chunk(self, document: str) -> list[dict]:
        """Return one record per child chunk, linked to its parent chunk.

        Each record has: id, content (child text), parent_id,
        parent_content (full parent text), and metadata.
        """
        records = []
        for p_idx, parent_text in enumerate(self.parent_splitter.split_text(document)):
            parent_key = f"p{p_idx}"
            for c_idx, child_text in enumerate(self.child_splitter.split_text(parent_text)):
                records.append({
                    "id": f"p{p_idx}_c{c_idx}",
                    "content": child_text,
                    "parent_id": parent_key,
                    "parent_content": parent_text,
                    "metadata": {
                        "parent_id": parent_key,
                        "child_index": c_idx
                    }
                })
        return records
Parent Document Retrieval
Retrieve by matching the small child chunk, but return its parent chunk for full context:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

# Store for parent documents (full text); child chunks go to the vector store.
docstore = InMemoryStore()

# Create retriever.
# NOTE(review): `vectorstore`, `documents`, `child_splitter` and
# `parent_splitter` are assumed to be defined earlier in the pipeline --
# this snippet is illustrative, not standalone.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)
# Add documents - stores parents in docstore, children in vectorstore
retriever.add_documents(documents)
# Search - finds relevant children, returns parent documents
# NOTE(review): newer LangChain versions prefer `retriever.invoke(query)`;
# confirm against the installed version.
results = retriever.get_relevant_documents("API key handling")
# Returns full parent chunks with complete context
How it works:
- Index small child chunks for precise retrieval
- Link each child to its parent
- On retrieval, return the parent (larger context)
Contextual Chunking
Add context to each chunk before embedding:
class ContextualChunker:
    """Add surrounding context to each chunk before embedding.

    Each chunk is prefixed with a short LLM-generated blurb that situates it
    within the whole document, so the embedded text carries context the raw
    chunk lacks. Costs one LLM call per chunk plus one summary call.
    """

    def __init__(self, llm):
        # llm: any model exposing .invoke(prompt) -> response with .content
        self.llm = llm
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=512,
            chunk_overlap=50
        )

    def chunk_with_context(self, document: str) -> list[dict]:
        """Split *document* and contextualize each chunk.

        Returns a list of dicts with keys "original", "contextualized"
        (context + blank line + chunk) and "context".
        """
        chunks = self.splitter.split_text(document)
        contextualized = []
        # Summarize only the head of the document to bound prompt size.
        doc_summary = self._summarize(document[:3000])
        for i, chunk in enumerate(chunks):
            # Neighbouring chunks give local context; the summary gives global.
            context = self._generate_context(
                chunk=chunk,
                prev_chunk=chunks[i-1] if i > 0 else None,
                next_chunk=chunks[i+1] if i < len(chunks)-1 else None,
                doc_summary=doc_summary
            )
            contextualized.append({
                "original": chunk,
                "contextualized": f"{context}\n\n{chunk}",
                "context": context
            })
        return contextualized

    def _summarize(self, text: str) -> str:
        """Return a short LLM summary of *text*.

        FIX: this method was missing -- chunk_with_context called it,
        raising AttributeError on every invocation.
        """
        prompt = f"""Summarize the following text in 2-3 sentences:
{text}
Summary:"""
        return self.llm.invoke(prompt).content

    def _generate_context(self, chunk, prev_chunk, next_chunk, doc_summary):
        """Ask the LLM for 1-2 sentences situating *chunk* in the document."""
        prompt = f"""Given this chunk from a document, write a brief context
(1-2 sentences) that situates this chunk within the document.
Document summary: {doc_summary}
Previous chunk: {prev_chunk[:200] if prev_chunk else 'N/A'}
Current chunk: {chunk}
Next chunk: {next_chunk[:200] if next_chunk else 'N/A'}
Context for this chunk:"""
        return self.llm.invoke(prompt).content
Example output:
Original chunk:
"Never hardcode API keys in your source code."
Contextualized chunk:
"This section discusses security best practices for handling
sensitive credentials in production applications.
Never hardcode API keys in your source code."
Document Hierarchy
For structured documents, preserve the hierarchy:
class DocumentHierarchyChunker:
    """Preserve document structure in chunks.

    Walks a markdown-style document line by line, tracks the current header
    path (h1 > h2 > h3), and tags every content line with that path before
    merging consecutive lines under the same path into one chunk.
    """

    def chunk_with_hierarchy(self, document: str) -> list[dict]:
        """Return merged chunks, each with "content", "hierarchy" (list of
        header titles) and "breadcrumb" (titles joined by " > ")."""
        chunks = []
        current_hierarchy = []
        for line in document.split('\n'):
            # Detect headers; a higher-level header truncates the path
            # before appending its own title.
            if line.startswith('# '):
                current_hierarchy = [line[2:]]
            elif line.startswith('## '):
                current_hierarchy = current_hierarchy[:1] + [line[3:]]
            elif line.startswith('### '):
                current_hierarchy = current_hierarchy[:2] + [line[4:]]
            # Add hierarchy to chunk metadata
            if self._is_content(line):
                chunks.append({
                    "content": line,
                    "hierarchy": current_hierarchy.copy(),
                    "breadcrumb": " > ".join(current_hierarchy)
                })
        return self._merge_chunks(chunks)

    def _is_content(self, line: str) -> bool:
        """True for non-blank, non-header lines.

        FIX: this method was missing -- chunk_with_hierarchy called it,
        raising AttributeError on every invocation.
        """
        stripped = line.strip()
        return bool(stripped) and not stripped.startswith('#')

    def _merge_chunks(self, chunks: list[dict]) -> list[dict]:
        """Merge consecutive chunks that share the same breadcrumb."""
        merged = []
        current = None
        for chunk in chunks:
            if current and chunk["breadcrumb"] == current["breadcrumb"]:
                current["content"] += "\n" + chunk["content"]
            else:
                if current:
                    merged.append(current)
                current = chunk.copy()
        if current:
            merged.append(current)
        return merged
Multi-Vector Retrieval
Store multiple representations per chunk:
class MultiVectorChunker:
    """Create multiple embedding representations per chunk.

    Stores three views of each chunk: the raw text embedding, a summary
    embedding (for abstract queries), and embeddings of LLM-generated
    questions the chunk answers (for direct question matching).
    """

    def __init__(self, embed_model, llm):
        # embed_model: exposes .encode(text) -> vector
        # llm: exposes .invoke(prompt) -> response with .content
        self.embed_model = embed_model
        self.llm = llm

    def create_multi_vectors(self, chunk: str) -> dict:
        """Return the chunk plus its three vector representations."""
        # Original embedding
        original_emb = self.embed_model.encode(chunk)
        # Summary embedding (for abstract queries)
        summary = self._summarize(chunk)
        summary_emb = self.embed_model.encode(summary)
        # Question embedding (what questions does this answer?)
        questions = self._generate_questions(chunk)
        question_embs = [self.embed_model.encode(q) for q in questions]
        return {
            "chunk": chunk,
            "vectors": {
                "original": original_emb,
                "summary": summary_emb,
                "questions": question_embs
            }
        }

    def _summarize(self, chunk: str) -> str:
        """Return a short LLM summary of *chunk*.

        FIX: this method was missing -- create_multi_vectors called it,
        raising AttributeError on every invocation.
        """
        prompt = f"""Summarize this text in 1-2 sentences:
Text: {chunk}
Summary:"""
        return self.llm.invoke(prompt).content

    def _generate_questions(self, chunk: str) -> list[str]:
        """Ask the LLM for questions the chunk answers, one per line."""
        prompt = f"""Generate 3 questions that this text answers:
Text: {chunk}
Questions:"""
        response = self.llm.invoke(prompt)
        # Drop blank lines and stray whitespace the LLM may emit around answers.
        return [q.strip() for q in response.content.strip().split('\n') if q.strip()]
Benefits:
- Original: Matches similar content
- Summary: Matches abstract queries
- Questions: Matches user questions directly
Comparison
| Strategy | Context Quality | Retrieval Precision | Complexity |
|---|---|---|---|
| Standard | Low | Medium | Low |
| Hierarchical | High | High | Medium |
| Contextual | High | High | High |
| Multi-vector | Highest | Highest | Highest |
Implementation Tip: Start with hierarchical chunking (parent-child) for immediate quality gains. Add contextual chunking when you have specific retrieval failures that lack context.
Next, let's design optimal chunk parameters for your specific use case.