Dataset Preparation
Data Quality and Cleaning
Poor data quality is one of the most common causes of fine-tuning failures. Let's learn how to identify and fix common data quality issues.
Common Data Quality Issues
1. Duplicates
Duplicate examples cause the model to overfit on repeated patterns.
from datasets import Dataset

def remove_duplicates(dataset):
    seen = set()
    unique_examples = []
    for example in dataset:
        # Create a hash of the instruction
        key = hash(example["instruction"].strip().lower())
        if key not in seen:
            seen.add(key)
            unique_examples.append(example)
    print(f"Removed {len(dataset) - len(unique_examples)} duplicates")
    return Dataset.from_list(unique_examples)
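As a quick check that the helper behaves as expected, here is a minimal sketch with made-up examples; two instructions that differ only in capitalization collapse to one:

toy = Dataset.from_list([
    {"instruction": "Explain overfitting.", "output": "Overfitting is..."},
    {"instruction": "explain overfitting.", "output": "It happens when..."},  # duplicate after normalization
    {"instruction": "Explain regularization.", "output": "Regularization is..."},
])
deduped = remove_duplicates(toy)  # prints "Removed 1 duplicates"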
2. Empty or Malformed Examples
def validate_example(example):
    """Check if an example is valid."""
    # Check for empty fields
    if not example.get("instruction", "").strip():
        return False, "Empty instruction"
    if not example.get("output", "").strip():
        return False, "Empty output"
    # Check minimum length
    if len(example["instruction"]) < 10:
        return False, "Instruction too short"
    if len(example["output"]) < 5:
        return False, "Output too short"
    return True, "Valid"
def clean_dataset(dataset):
    valid_examples = []
    issues = []
    for i, example in enumerate(dataset):
        is_valid, reason = validate_example(example)
        if is_valid:
            valid_examples.append(example)
        else:
            issues.append(f"Example {i}: {reason}")
    print(f"Kept {len(valid_examples)}/{len(dataset)} examples")
    return Dataset.from_list(valid_examples), issues
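A short usage sketch (variable names are illustrative); reviewing a sample of the rejected examples often reveals systematic problems in how the data was collected:

dataset, issues = clean_dataset(dataset)
for issue in issues[:10]:
    print(issue)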
3. Inconsistent Formatting
def normalize_example(example):
    """Normalize formatting across examples."""
    return {
        "instruction": example["instruction"].strip(),
        "output": example["output"].strip(),
        # Remove excessive whitespace
        "input": " ".join(example.get("input", "").split()),
    }

dataset = dataset.map(normalize_example)
Quality Metrics
Diversity Score
Measure how diverse your instructions are:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_diversity(instructions):
    """Calculate diversity score (0-1, higher is more diverse)."""
    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(instructions)
    # Calculate average pairwise similarity
    similarities = cosine_similarity(tfidf_matrix)
    # Exclude self-similarity (diagonal)
    np.fill_diagonal(similarities, 0)
    avg_similarity = similarities.sum() / (len(instructions) * (len(instructions) - 1))
    # Diversity = 1 - similarity
    diversity = 1 - avg_similarity
    return diversity

instructions = [ex["instruction"] for ex in dataset]
print(f"Diversity score: {calculate_diversity(instructions):.3f}")
Length Distribution
Check for balanced response lengths:
import matplotlib.pyplot as plt

def analyze_lengths(dataset):
    instruction_lengths = [len(ex["instruction"]) for ex in dataset]
    output_lengths = [len(ex["output"]) for ex in dataset]
    print(f"Instruction length: {np.mean(instruction_lengths):.0f} ± {np.std(instruction_lengths):.0f}")
    print(f"Output length: {np.mean(output_lengths):.0f} ± {np.std(output_lengths):.0f}")
    # Flag potential issues
    if np.std(output_lengths) < 50:
        print("Warning: Output lengths are very uniform - may lack diversity")
    # Plot the output length distribution
    plt.hist(output_lengths, bins=50)
    plt.xlabel("Output length (characters)")
    plt.ylabel("Count")
    plt.show()
Filtering Strategies
Quality Filter
def quality_filter(example):
    """Filter out low-quality examples."""
    instruction = example["instruction"]
    output = example["output"]
    # Filter criteria
    filters = [
        len(output) >= 50,  # Minimum output length
        len(output) <= 4000,  # Maximum output length
        not output.startswith("I cannot"),  # Not a refusal
        not output.startswith("I'm sorry"),  # Not an apology
        "```" not in instruction or "```" in output,  # Code in output if asked
    ]
    return all(filters)

dataset = dataset.filter(quality_filter)
Semantic Deduplication
Remove examples that are semantically similar, even if not exact duplicates:
from sentence_transformers import SentenceTransformer

def semantic_dedup(dataset, threshold=0.95):
    """Remove semantically similar examples."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    instructions = [ex["instruction"] for ex in dataset]
    embeddings = model.encode(instructions)
    # Find similar pairs
    similarity_matrix = cosine_similarity(embeddings)
    keep_indices = []
    for i in range(len(dataset)):
        # Check if this example is too similar to any kept example
        is_unique = True
        for j in keep_indices:
            if similarity_matrix[i][j] > threshold:
                is_unique = False
                break
        if is_unique:
            keep_indices.append(i)
    return dataset.select(keep_indices)
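Usage is a single call. The default threshold of 0.95 is a reasonable starting point; inspect a few of the pairs it merges before loosening it. Also note that computing the full similarity matrix is quadratic in dataset size, so for very large datasets you would typically batch the comparison or use an approximate nearest-neighbor index instead.

before = len(dataset)
dataset = semantic_dedup(dataset, threshold=0.95)
print(f"Semantic deduplication removed {before - len(dataset)} examples")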
Data Validation Pipeline
def validate_dataset(dataset):
    """Complete validation pipeline."""
    print("=" * 50)
    print("Dataset Validation Report")
    print("=" * 50)
    # 1. Basic stats
    print(f"\nTotal examples: {len(dataset)}")
    # 2. Remove duplicates
    dataset = remove_duplicates(dataset)
    # 3. Validate and clean
    dataset, issues = clean_dataset(dataset)
    # 4. Normalize formatting
    dataset = dataset.map(normalize_example)
    # 5. Quality filter
    before = len(dataset)
    dataset = dataset.filter(quality_filter)
    print(f"Quality filter removed: {before - len(dataset)} examples")
    # 6. Analyze diversity
    instructions = [ex["instruction"] for ex in dataset]
    diversity = calculate_diversity(instructions)
    print(f"Diversity score: {diversity:.3f}")
    if diversity < 0.5:
        print("Warning: Low diversity - consider adding more varied examples")
    # 7. Analyze lengths
    analyze_lengths(dataset)
    print("\n" + "=" * 50)
    print(f"Final dataset size: {len(dataset)}")
    print("=" * 50)
    return dataset

# Run validation (stored under a new name so it does not shadow the
# clean_dataset() helper defined above)
cleaned_dataset = validate_dataset(raw_dataset)
Best Practices Summary
| Issue | Solution |
|---|---|
| Duplicates | Hash-based + semantic deduplication |
| Empty fields | Validation filters |
| Low diversity | Add varied examples, check coverage |
| Inconsistent format | Normalization pipeline |
| Too short/long | Length-based filtering |
| Low quality outputs | Manual review + LLM scoring |
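The last row of the table mentions LLM scoring. Below is a minimal sketch of that idea, assuming you have an OpenAI-compatible client configured; the prompt, model name, and score threshold are illustrative, not prescriptive:

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def llm_quality_score(example):
    """Ask a judge model to rate a response from 1 (poor) to 5 (excellent)."""
    prompt = (
        "Rate the following response to the instruction on a scale of 1-5 for "
        "correctness, completeness, and clarity. Reply with a single digit.\n\n"
        f"Instruction: {example['instruction']}\n\nResponse: {example['output']}"
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # example judge model
        messages=[{"role": "user", "content": prompt}],
    )
    try:
        return int(response.choices[0].message.content.strip()[0])
    except (ValueError, IndexError):
        return 0  # treat unparseable replies as lowest quality

# Keep only examples the judge rates 4 or above (the threshold is a judgment call)
dataset = dataset.filter(lambda ex: llm_quality_score(ex) >= 4)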
Next, we'll learn how to create preference data for DPO training.