Advanced Chunking Strategies
Optimal Chunk Design
3 min read
Chunk size, overlap, and format significantly impact retrieval quality. Here's how to optimize these parameters for your use case.
Chunk Size Guidelines
| Size (tokens) | Use Case | Trade-offs |
|---|---|---|
| 128-256 | Q&A, precise answers | High precision, may miss context |
| 256-512 | General RAG (recommended) | Balanced precision/context |
| 512-1024 | Long-form, complex topics | More context, lower precision |
| 1024+ | Full document context | Risk of noise, expensive |
Finding Optimal Size
Empirically test different sizes:
def evaluate_chunk_size(documents, queries, ground_truth,
                        sizes=(256, 512, 768, 1024)):
    """Evaluate retrieval quality across candidate chunk sizes.

    Args:
        documents: Source documents to split and index.
        queries: Evaluation queries.
        ground_truth: Expected relevant results per query, as consumed by
            ``calculate_recall`` / ``calculate_precision``.
        sizes: Chunk sizes (in tokens) to compare. A tuple default avoids
            the mutable-default-argument pitfall.

    Returns:
        Dict mapping each size to its recall, precision, f1, and chunk count.
    """
    results = {}
    for size in sizes:
        # Re-chunk with ~10% overlap — the rule of thumb used elsewhere
        # in this guide.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=int(size * 0.1),
        )
        chunks = splitter.split_documents(documents)
        # Build a fresh index per size so results are comparable.
        # NOTE(review): `embeddings` is assumed to be a module-level
        # embedding function — confirm against the surrounding module.
        vectorstore = Chroma.from_documents(chunks, embeddings)
        recall = calculate_recall(vectorstore, queries, ground_truth)
        precision = calculate_precision(vectorstore, queries, ground_truth)
        # Guard the harmonic mean against 0/0 when both metrics are zero.
        denom = recall + precision
        results[size] = {
            "recall": recall,
            "precision": precision,
            "f1": 2 * (recall * precision) / denom if denom else 0.0,
            "num_chunks": len(chunks),
        }
    return results
# Example output:
# 256: {"recall": 0.82, "precision": 0.71, "f1": 0.76}
# 512: {"recall": 0.78, "precision": 0.79, "f1": 0.78} <- Best F1
# 768: {"recall": 0.74, "precision": 0.82, "f1": 0.78}
# 1024: {"recall": 0.68, "precision": 0.85, "f1": 0.75}
Overlap Strategy
Overlap prevents information loss at chunk boundaries:
# Rule of thumb: 10-20% overlap between consecutive chunks
chunk_size = 512
chunk_overlap = 50  # ~10% of chunk_size
# For technical content with cross-references, use the high end of the range
chunk_overlap = 100  # ~20% of chunk_size
# Visual representation of how consecutive chunks share a boundary region:
# Chunk 1: [===================]
# Chunk 2:            [===================]
# Chunk 3:                       [===================]
#                     ^overlap^
When to increase overlap:
- Technical documentation with cross-references
- Legal text with dependent clauses
- Code with multi-line statements
Format Preservation
Maintain document structure in chunks:
class FormatPreservingChunker:
    """Chunk a document while preserving its native structure.

    Dispatches on detected format: markdown sections keep their headers,
    code keeps whole function/class definitions, and anything else falls
    back to paragraph chunks. Fix: the original referenced
    ``_is_markdown``, ``_is_code``, and ``_chunk_plain`` without defining
    them, so any call to ``chunk`` raised AttributeError; they are now
    implemented.
    """

    def chunk(self, document: str) -> list[dict]:
        """Split *document* into format-aware chunk dicts."""
        if self._is_markdown(document):
            return self._chunk_markdown(document)
        if self._is_code(document):
            return self._chunk_code(document)
        return self._chunk_plain(document)

    def _is_markdown(self, document: str) -> bool:
        """Heuristic: a line starting with 1-3 '#' marks markdown."""
        return bool(re.search(r'^#{1,3}\s', document, re.MULTILINE))

    def _is_code(self, document: str) -> bool:
        """Heuristic: a top-of-line def/class marks Python source."""
        return bool(re.search(r'^(?:def|class)\s+\w+', document, re.MULTILINE))

    def _chunk_markdown(self, document: str) -> list[dict]:
        """Keep markdown headers with their content."""
        # Capturing split keeps the header lines as their own list items.
        sections = re.split(r'(^#{1,3}\s.+$)', document, flags=re.MULTILINE)
        chunks = []
        current_header = ""
        for section in sections:
            if re.match(r'^#{1,3}\s', section):
                # Remember the header; it is prepended to the next body so
                # each chunk stays self-contained.
                current_header = section.strip()
            elif section.strip():
                chunks.append({
                    "content": f"{current_header}\n\n{section.strip()}",
                    "header": current_header,
                    "type": "markdown"
                })
        return chunks

    def _chunk_code(self, document: str) -> list[dict]:
        """Split code by functions/classes (Python only)."""
        # Lazily match from each def/class up to the next top-level
        # def/class or end of string.
        functions = re.findall(
            r'((?:def|class)\s+\w+.*?(?=\n(?:def|class)|\Z))',
            document,
            re.DOTALL
        )
        return [
            {"content": func.strip(), "type": "code", "language": "python"}
            for func in functions
        ]

    def _chunk_plain(self, document: str) -> list[dict]:
        """Fallback: one chunk per blank-line-separated paragraph."""
        return [
            {"content": para.strip(), "type": "plain"}
            for para in re.split(r'\n\s*\n', document)
            if para.strip()
        ]
Content-Specific Strategies
Tables
Keep tables intact:
def preserve_tables(document: str) -> tuple[str, list[str]]:
    """Replace markdown tables with placeholders so a splitter cannot cut them.

    Args:
        document: Text possibly containing markdown pipe tables.

    Returns:
        A tuple ``(marked_document, tables)`` where every table in
        ``marked_document`` has been replaced by ``[[TABLE_i]]`` and
        ``tables[i]`` holds the original table text. (The original
        annotation claimed ``-> str`` but the function has always
        returned this tuple.)
    """
    # A table is one or more consecutive |...| rows.
    table_pattern = r'(\|.+\|(?:\n\|.+\|)+)'
    tables = re.findall(table_pattern, document)
    for i, table in enumerate(tables):
        # count=1: if the same table text occurs twice, each occurrence
        # gets its own placeholder instead of the first pass consuming all.
        document = document.replace(table, f"[[TABLE_{i}]]", 1)
    return document, tables
def restore_tables(chunks: list[str], tables: list[str]) -> list[str]:
    """Substitute ``[[TABLE_i]]`` placeholders back with the original tables.

    Inverse of ``preserve_tables``: each chunk has every placeholder
    replaced by the table text at the matching index.
    """
    def _expand(text: str) -> str:
        # Apply every placeholder substitution to this chunk.
        for idx, table in enumerate(tables):
            text = text.replace(f"[[TABLE_{idx}]]", table)
        return text

    return [_expand(chunk) for chunk in chunks]
Code Blocks
Never split code blocks:
def chunk_with_code_preservation(document: str, chunk_size: int) -> list[str]:
    """Chunk text while keeping fenced ``` code blocks intact.

    Fenced blocks are swapped for stable placeholders before splitting,
    then swapped back, so the splitter can never cut through one.
    """
    # Extract every fenced block (non-greedy, spanning newlines).
    fences = re.findall(r'```[\s\S]*?```', document)

    # Replace each fence with a placeholder token.
    for idx, fence in enumerate(fences):
        document = document.replace(fence, f"[[CODE_{idx}]]")

    # Split the placeholder-bearing text as usual.
    pieces = RecursiveCharacterTextSplitter(chunk_size=chunk_size).split_text(document)

    def _put_back(text: str) -> str:
        # Re-insert the original fences wherever their tokens survived.
        for idx, fence in enumerate(fences):
            text = text.replace(f"[[CODE_{idx}]]", fence)
        return text

    return [_put_back(piece) for piece in pieces]
Quality Checklist
| Check | Goal | Test |
|---|---|---|
| Completeness | No truncated sentences | Check chunk endings |
| Context | Chunks are self-contained | Can you understand without surrounding text? |
| Relevance | Focused content | Single topic per chunk |
| Format | Structure preserved | Tables, code, lists intact |
Diagnostic Tool
def diagnose_chunks(chunks: list[str]) -> dict:
    """Analyze chunk quality and report likely defects.

    Args:
        chunks: The chunk texts to inspect.

    Returns:
        Dict with ``total_chunks``, ``avg_length``, and an ``issues`` list
        of human-readable findings.
    """
    # Guard: the original divided by len(chunks) and indexed chunk[-1],
    # crashing on an empty list or an empty chunk string.
    if not chunks:
        return {"total_chunks": 0, "avg_length": 0.0, "issues": []}
    diagnostics = {
        "total_chunks": len(chunks),
        "avg_length": sum(len(c) for c in chunks) / len(chunks),
        "issues": []
    }
    # IGNORECASE: sentence-initial "This"/"It" is the common case and the
    # original lowercase-only pattern never matched it.
    pronoun_re = re.compile(r'\b(it|this|these|that)\b', re.IGNORECASE)
    for i, chunk in enumerate(chunks):
        if not chunk:
            diagnostics["issues"].append(f"Chunk {i}: Empty chunk")
            continue
        # Check for truncated sentences
        if chunk[-1] not in '.!?"\'':
            diagnostics["issues"].append(f"Chunk {i}: May be truncated")
        # Check for orphaned references near the start of the chunk
        if pronoun_re.search(chunk[:50]):
            diagnostics["issues"].append(f"Chunk {i}: Starts with pronoun reference")
        # Check for code block integrity (unmatched ``` fences)
        if chunk.count('```') % 2 != 0:
            diagnostics["issues"].append(f"Chunk {i}: Broken code block")
        # Check length variance against the corpus average
        if len(chunk) < diagnostics["avg_length"] * 0.3:
            diagnostics["issues"].append(f"Chunk {i}: Very short ({len(chunk)} chars)")
    return diagnostics
Recommended Defaults
# General purpose RAG: the balanced 512/~10% baseline recommended above.
DEFAULT_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 50,
    # Try coarse separators first, falling back to finer ones.
    "separators": ["\n\n", "\n", ". ", " ", ""],
    "preserve_code_blocks": True,
    "preserve_tables": True
}
# Q&A / FAQ: smaller chunks for precise answers; "? " keeps each
# question with its answer.
QA_CONFIG = {
    "chunk_size": 256,
    "chunk_overlap": 25,
    "separators": ["\n\n", "\n", "? ", ". ", " "],
    "keep_qa_pairs": True
}
# Technical documentation: larger chunks, heavier overlap for
# cross-references, and markdown header separators so sections stay whole.
TECHNICAL_CONFIG = {
    "chunk_size": 768,
    "chunk_overlap": 100,
    "separators": ["\n## ", "\n### ", "\n\n", "\n", ". "],
    "preserve_code_blocks": True,
    "include_headers": True
}
Key Principle: The optimal chunk design depends on your queries, not your documents. Design chunks to match how users will search, not how content is organized.
In the next module, we'll explore hybrid search and reranking to maximize retrieval quality.