Lesson 7 of 24

Dataset Preparation

Data Quality and Cleaning

3 min read

Poor data quality is the #1 cause of fine-tuning failures. Let's learn how to identify and fix common data quality issues.

Common Data Quality Issues

1. Duplicates

Duplicate examples cause the model to overfit on repeated patterns.

from datasets import Dataset

def remove_duplicates(dataset):
    """Drop examples whose instruction was already seen.

    Instructions are compared case- and whitespace-insensitively;
    the first occurrence of each instruction is kept.
    """
    kept = []
    seen_fingerprints = set()

    for ex in dataset:
        # Normalize before hashing so trivial variants collapse together.
        fingerprint = hash(ex["instruction"].strip().lower())
        if fingerprint in seen_fingerprints:
            continue
        seen_fingerprints.add(fingerprint)
        kept.append(ex)

    print(f"Removed {len(dataset) - len(kept)} duplicates")
    return Dataset.from_list(kept)

2. Empty or Malformed Examples

def validate_example(example):
    """Check if an example is valid.

    Args:
        example: dict expected to carry "instruction" and "output" strings.

    Returns:
        Tuple of (is_valid, reason) — a bool plus a human-readable
        explanation of why the example was rejected (or "Valid").
    """
    # Strip once up front; missing keys are treated as empty fields.
    instruction = example.get("instruction", "").strip()
    output = example.get("output", "").strip()

    # Check for empty fields
    if not instruction:
        return False, "Empty instruction"

    if not output:
        return False, "Empty output"

    # Check minimum length on the stripped text so surrounding whitespace
    # cannot let a too-short example slip through.
    if len(instruction) < 10:
        return False, "Instruction too short"

    if len(output) < 5:
        return False, "Output too short"

    return True, "Valid"

def clean_dataset(dataset):
    """Split a dataset into valid examples plus a list of issue strings."""
    kept = []
    problems = []

    for idx, ex in enumerate(dataset):
        ok, why = validate_example(ex)
        if not ok:
            problems.append(f"Example {idx}: {why}")
            continue
        kept.append(ex)

    print(f"Kept {len(kept)}/{len(dataset)} examples")
    return Dataset.from_list(kept), problems

3. Inconsistent Formatting

def normalize_example(example):
    """Normalize formatting across examples."""
    # Collapse all runs of whitespace in the optional "input" field.
    collapsed_input = " ".join(example.get("input", "").split())

    return {
        "instruction": example["instruction"].strip(),
        "output": example["output"].strip(),
        "input": collapsed_input,
    }

# Apply the normalization to every example in the dataset.
dataset = dataset.map(normalize_example)

Quality Metrics

Diversity Score

Measure how diverse your instructions are:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_diversity(instructions):
    """Calculate diversity score (0-1, higher is more diverse).

    Computes 1 minus the average pairwise TF-IDF cosine similarity
    over all instructions.

    Args:
        instructions: list of instruction strings.

    Returns:
        Diversity score as a float. Returns 1.0 for fewer than two
        instructions, since no pair exists to be similar.
    """
    n = len(instructions)
    # Guard: the pairwise average below divides by n * (n - 1),
    # which would raise ZeroDivisionError for 0 or 1 instructions.
    if n < 2:
        return 1.0

    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(instructions)

    # Calculate average pairwise similarity
    similarities = cosine_similarity(tfidf_matrix)

    # Exclude self-similarity (diagonal)
    np.fill_diagonal(similarities, 0)
    avg_similarity = similarities.sum() / (n * (n - 1))

    # Diversity = 1 - similarity
    diversity = 1 - avg_similarity
    return diversity

# Measure diversity across the full set of instructions.
instructions = [ex["instruction"] for ex in dataset]
print(f"Diversity score: {calculate_diversity(instructions):.3f}")

Length Distribution

Check for balanced response lengths:

import matplotlib.pyplot as plt

def analyze_lengths(dataset):
    """Print mean ± std of instruction/output character lengths."""
    inst_lens = [len(ex["instruction"]) for ex in dataset]
    out_lens = [len(ex["output"]) for ex in dataset]

    print(f"Instruction length: {np.mean(inst_lens):.0f} ± {np.std(inst_lens):.0f}")
    print(f"Output length: {np.mean(out_lens):.0f} ± {np.std(out_lens):.0f}")

    # Near-identical output lengths often indicate templated answers.
    if np.std(out_lens) < 50:
        print("Warning: Output lengths are very uniform - may lack diversity")

Filtering Strategies

Quality Filter

def quality_filter(example):
    """Filter out low-quality examples."""
    instruction = example["instruction"]
    output = example["output"]

    # Enforce output length bounds (minimum 50, maximum 4000 characters).
    if not 50 <= len(output) <= 4000:
        return False

    # Reject refusals and apologies.
    if output.startswith(("I cannot", "I'm sorry")):
        return False

    # If the instruction contains a code fence, the output should too.
    if "```" in instruction and "```" not in output:
        return False

    return True

# Keep only the examples that pass every quality criterion.
dataset = dataset.filter(quality_filter)

Semantic Deduplication

Remove examples that are semantically similar, even if not exact duplicates:

from sentence_transformers import SentenceTransformer

def semantic_dedup(dataset, threshold=0.95):
    """Remove semantically similar examples.

    Greedily keeps the first of any group of near-duplicate instructions,
    where "near" means embedding cosine similarity above ``threshold``.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    texts = [ex["instruction"] for ex in dataset]
    vectors = encoder.encode(texts)

    # Pairwise similarity between all instruction embeddings.
    sims = cosine_similarity(vectors)

    survivors = []
    for candidate in range(len(dataset)):
        # Keep the candidate only if no already-kept example is too close.
        if all(sims[candidate][kept] <= threshold for kept in survivors):
            survivors.append(candidate)

    return dataset.select(survivors)

Data Validation Pipeline

def validate_dataset(dataset):
    """Complete validation pipeline."""
    banner = "=" * 50
    print(banner)
    print("Dataset Validation Report")
    print(banner)

    # 1. Basic stats
    print(f"\nTotal examples: {len(dataset)}")

    # 2. Remove duplicates
    dataset = remove_duplicates(dataset)

    # 3. Validate and clean
    dataset, issues = clean_dataset(dataset)

    # 4. Normalize formatting
    dataset = dataset.map(normalize_example)

    # 5. Quality filter
    before = len(dataset)
    dataset = dataset.filter(quality_filter)
    print(f"Quality filter removed: {before - len(dataset)} examples")

    # 6. Analyze diversity
    diversity = calculate_diversity([ex["instruction"] for ex in dataset])
    print(f"Diversity score: {diversity:.3f}")
    if diversity < 0.5:
        print("Warning: Low diversity - consider adding more varied examples")

    # 7. Analyze lengths
    analyze_lengths(dataset)

    print("\n" + banner)
    print(f"Final dataset size: {len(dataset)}")
    print(banner)

    return dataset

# Run validation.
# NOTE: bind the result to a fresh name — assigning it to `clean_dataset`
# would shadow the clean_dataset() function defined above, breaking any
# later call to it.
cleaned_dataset = validate_dataset(raw_dataset)

Best Practices Summary

| Issue | Solution |
| --- | --- |
| Duplicates | Hash-based + semantic deduplication |
| Empty fields | Validation filters |
| Low diversity | Add varied examples, check coverage |
| Inconsistent format | Normalization pipeline |
| Too short/long | Length-based filtering |
| Low quality outputs | Manual review + LLM scoring |

Next, we'll learn how to create preference data for DPO training.

Quiz

Module 2: Dataset Preparation

Take Quiz