Alignment with DPO
Preparing DPO Data
DPO requires preference data with chosen and rejected response pairs. Let's learn how to structure and prepare this data.
DPO Dataset Format
The standard format for DPO:
{
  "prompt": "What is the capital of France?",
  "chosen": "The capital of France is Paris. It's known for the Eiffel Tower, the Louvre Museum, and its rich cultural heritage.",
  "rejected": "idk maybe paris or something"
}
Loading Existing Datasets
Many preference datasets are available on the Hugging Face Hub:
from datasets import load_dataset
# UltraFeedback - High quality preference data
dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized")
# Anthropic HH-RLHF
dataset = load_dataset("Anthropic/hh-rlhf")
# Intel Orca DPO Pairs
dataset = load_dataset("Intel/orca_dpo_pairs")
Formatting for DPOTrainer
TRL's DPOTrainer expects specific column names:
from datasets import load_dataset

dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="train_prefs")

# Check columns
print(dataset.column_names)
# Should include: prompt, chosen, rejected

# If your dataset uses different column names, map it to the expected format.
# Here, "question"/"response_a"/"response_b" stand in for your own columns.
def format_dpo(example):
    return {
        "prompt": example["question"],
        "chosen": example["response_a"],
        "rejected": example["response_b"],
    }

dataset = dataset.map(format_dpo)
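It can also help to drop columns the trainer does not need. A minimal sketch (the set of extra columns depends on your dataset):

keep = {"prompt", "chosen", "rejected"}
extra = [col for col in dataset.column_names if col not in keep]
dataset = dataset.remove_columns(extra)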
Using Chat Templates
For instruction-tuned models, apply the tokenizer's chat template so the training text matches the format the model sees at inference:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

def format_with_template(example):
    # The prompt is a single user turn
    prompt_messages = [
        {"role": "user", "content": example["prompt"]}
    ]

    # Full conversations ending in the chosen / rejected assistant replies
    chosen_messages = prompt_messages + [
        {"role": "assistant", "content": example["chosen"]}
    ]
    rejected_messages = prompt_messages + [
        {"role": "assistant", "content": example["rejected"]}
    ]

    prompt_text = tokenizer.apply_chat_template(prompt_messages, tokenize=False)

    # Strip the prompt prefix so chosen/rejected contain only the assistant turn;
    # DPOTrainer concatenates prompt + chosen/rejected itself, so leaving the
    # prompt in would duplicate it
    return {
        "prompt": prompt_text,
        "chosen": tokenizer.apply_chat_template(chosen_messages, tokenize=False)[len(prompt_text):],
        "rejected": tokenizer.apply_chat_template(rejected_messages, tokenize=False)[len(prompt_text):],
    }

dataset = dataset.map(format_with_template)
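It is worth printing one formatted example to confirm the template was applied and that chosen/rejected do not repeat the prompt. A quick sanity check:

sample = dataset[0]
print(repr(sample["prompt"]))
print(repr(sample["chosen"]))
print(repr(sample["rejected"]))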
Creating Custom Preference Data
From Model Outputs
Generate multiple responses and rank them:
from transformers import pipeline

generator = pipeline("text-generation", model="meta-llama/Llama-3.2-3B-Instruct")

def create_preference_pair(prompt):
    # Generate multiple responses at different temperatures
    responses = []
    for temp in [0.3, 0.7, 1.0, 1.2]:
        response = generator(prompt, do_sample=True, temperature=temp,
                             max_new_tokens=200, return_full_text=False)
        responses.append(response[0]["generated_text"])

    # Have humans or an LLM/reward model rank the responses,
    # then select the best and worst (placeholder selection below)
    ranked = responses  # replace with a real ranking step
    chosen = ranked[0]     # best ranked
    rejected = ranked[-1]  # worst ranked

    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
    }
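One way to automate the ranking step is with an off-the-shelf reward model. A hedged sketch using OpenAssistant/reward-model-deberta-v3-large-v2; any reward model that scores a prompt/response pair would work similarly:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_name)
reward_tokenizer = AutoTokenizer.from_pretrained(reward_name)

def rank_responses(prompt, responses):
    # Score each response with the reward model and sort best-to-worst
    scores = []
    for response in responses:
        inputs = reward_tokenizer(prompt, response, return_tensors="pt")
        with torch.no_grad():
            scores.append(reward_model(**inputs).logits[0].item())
    ranked = sorted(zip(scores, responses), key=lambda pair: pair[0], reverse=True)
    return [response for _, response in ranked]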
From Existing Conversations
Convert feedback data to preference pairs:
def convert_feedback_to_dpo(feedback_data):
    """
    Input: list of {"prompt": ..., "response": ..., "rating": 1-5}
    Output: DPO pairs
    """
    # Group responses by prompt
    prompt_groups = {}
    for item in feedback_data:
        prompt = item["prompt"]
        if prompt not in prompt_groups:
            prompt_groups[prompt] = []
        prompt_groups[prompt].append(item)

    # Pair the highest- and lowest-rated responses for each prompt
    dpo_data = []
    for prompt, responses in prompt_groups.items():
        sorted_responses = sorted(responses, key=lambda x: x["rating"], reverse=True)
        # Require at least two responses with genuinely different ratings
        if len(sorted_responses) >= 2 and sorted_responses[0]["rating"] > sorted_responses[-1]["rating"]:
            dpo_data.append({
                "prompt": prompt,
                "chosen": sorted_responses[0]["response"],
                "rejected": sorted_responses[-1]["response"],
            })
    return dpo_data
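A quick usage example with toy feedback records (the values are illustrative):

feedback = [
    {"prompt": "Explain photosynthesis", "response": "Photosynthesis converts sunlight, water, and CO2 into glucose and oxygen.", "rating": 5},
    {"prompt": "Explain photosynthesis", "response": "Plants make food from sun.", "rating": 2},
]

pairs = convert_feedback_to_dpo(feedback)
print(pairs[0]["chosen"], "|", pairs[0]["rejected"])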
Data Quality Guidelines
Good Preference Pairs
# Clear quality difference
{
    "prompt": "Explain photosynthesis",
    "chosen": "Photosynthesis is the process by which plants convert sunlight, water, and carbon dioxide into glucose and oxygen. It occurs in chloroplasts and is essential for life on Earth.",
    "rejected": "Plants make food from sun."
}
Avoid
# Too similar (the model can't learn a preference)
{
    "prompt": "What is 2+2?",
    "chosen": "2+2 equals 4.",
    "rejected": "The answer is 4."
}

# Same quality, different style (a stylistic difference, not a preference)
{
    "prompt": "Tell me about dogs",
    "chosen": "Dogs are loyal companions...",
    "rejected": "Canines have been domesticated..."
}
Validation
Check your dataset before training:
def validate_dpo_dataset(dataset):
    issues = []
    for i, example in enumerate(dataset):
        # Check required fields
        if not example.get("prompt"):
            issues.append(f"Example {i}: Missing prompt")
        if not example.get("chosen"):
            issues.append(f"Example {i}: Missing chosen")
        if not example.get("rejected"):
            issues.append(f"Example {i}: Missing rejected")
        # Check that chosen and rejected differ
        if example.get("chosen") == example.get("rejected"):
            issues.append(f"Example {i}: Chosen equals rejected")
        # Check lengths
        if len(example.get("chosen", "")) < 10:
            issues.append(f"Example {i}: Chosen too short")
    print(f"Found {len(issues)} issues")
    return issues

issues = validate_dpo_dataset(dataset)
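Once you know what to look for, the same checks can be applied as a filter rather than just a report. A minimal sketch:

def is_valid(example):
    return (
        bool(example.get("prompt"))
        and bool(example.get("chosen"))
        and bool(example.get("rejected"))
        and example["chosen"] != example["rejected"]
        and len(example["chosen"]) >= 10
    )

dataset = dataset.filter(is_valid)
print(f"{len(dataset)} examples remain after filtering")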
Dataset Size Recommendations
| Dataset size (preference pairs) | Expected results |
|---|---|
| 100-500 | Minimal effect |
| 500-2,000 | Noticeable improvement |
| 2,000-10,000 | Good alignment |
| 10,000+ | Strong alignment |
Tip: Quality matters more than quantity. 1,000 carefully curated pairs outperform 10,000 noisy ones.
Next, let's implement DPO training with TRL's DPOTrainer.