Alignment with DPO
Combining SFT + DPO
3 min read
One of the most effective fine-tuning recipes combines SFT (to teach skills) with DPO (to improve response quality). Let's build a complete two-stage pipeline.
The Two-Stage Pipeline
Stage 1: SFT
├── Input: Base model + instruction dataset
├── Output: Model that can follow instructions
└── Goal: Teach task-specific skills
Stage 2: DPO
├── Input: SFT model + preference dataset
├── Output: Aligned model with better responses
└── Goal: Improve response quality
Complete Pipeline Script
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig
from datasets import load_dataset
import torch
# ============================================
# Configuration
# ============================================
model_name = "unsloth/Llama-3.2-3B-Instruct"
max_seq_length = 2048
# ============================================
# STAGE 1: Supervised Fine-Tuning
# ============================================
print("=" * 50)
print("STAGE 1: SFT Training")
print("=" * 50)
# Load model for SFT
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_seq_length,
load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
model,
r=16,
lora_alpha=16,
lora_dropout=0,
target_modules="all-linear",
use_gradient_checkpointing="unsloth",
)
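# r=16 with lora_alpha=16 gives an adapter scaling of lora_alpha / r = 1.0;
# target_modules="all-linear" attaches LoRA to every linear projection layer.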
# Load SFT dataset
sft_dataset = load_dataset("tatsu-lab/alpaca", split="train")
sft_dataset = sft_dataset.select(range(5000)) # Subset
def format_sft(example):
if example.get("input", ""):
text = f"""### Instruction:
{example['instruction']}
### Input:
{example['input']}
### Response:
{example['output']}<|eot_id|>"""
else:
text = f"""### Instruction:
{example['instruction']}
### Response:
{example['output']}<|eot_id|>"""
return {"text": text}
sft_dataset = sft_dataset.map(format_sft)
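# Sanity check: inspect one formatted example before training
print(sft_dataset[0]["text"][:300])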
# SFT training
sft_args = SFTConfig(
    output_dir="./outputs/stage1-sft",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    logging_steps=20,
    save_steps=500,
    bf16=True,
    optim="adamw_8bit",
    dataset_text_field="text",
    max_seq_length=max_seq_length,  # newer TRL releases name this max_length
)
sft_trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=sft_dataset,
    processing_class=tokenizer,
)
sft_trainer.train()
sft_trainer.save_model("./outputs/stage1-sft/final")
print("SFT training complete!")
# ============================================
# STAGE 2: DPO Alignment
# ============================================
print("=" * 50)
print("STAGE 2: DPO Training")
print("=" * 50)
# Reload the Stage 1 checkpoint for DPO
# (Unsloth detects the saved LoRA adapter and attaches it to the base model)
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="./outputs/stage1-sft/final",
max_seq_length=max_seq_length,
load_in_4bit=True,
)
# The Stage 1 LoRA adapter is already attached, so get_peft_model is not called
# again; DPO simply continues training the same adapter. With a PEFT model and
# no explicit ref_model, DPOTrainer computes reference log-probs by temporarily
# disabling the adapter, so no separate reference model needs to be loaded.
# Load DPO dataset
dpo_dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="train_prefs")
dpo_dataset = dpo_dataset.select(range(2000)) # Subset
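# Note: "chosen" and "rejected" in ultrafeedback_binarized are lists of chat
# messages. Recent TRL releases accept this conversational format directly; if
# your version expects plain strings, an illustrative mapping like this works:
# def to_plain(example):
#     return {
#         "prompt": example["prompt"],
#         "chosen": example["chosen"][-1]["content"],
#         "rejected": example["rejected"][-1]["content"],
#     }
# dpo_dataset = dpo_dataset.map(to_plain)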
# DPO training
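# beta controls how strongly the policy is tethered to the reference model:
# higher values mean more conservative updates, lower values push harder on
# the preferences. 0.1 is a common starting point.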
dpo_config = DPOConfig(
output_dir="./outputs/stage2-dpo",
beta=0.1,
learning_rate=5e-6,
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
num_train_epochs=1,
warmup_ratio=0.1,
logging_steps=20,
save_steps=500,
bf16=True,
optim="adamw_8bit",
max_length=1024,
max_prompt_length=512,
)
dpo_trainer = DPOTrainer(
model=model,
args=dpo_config,
train_dataset=dpo_dataset,
processing_class=tokenizer,
)
dpo_trainer.train()
dpo_trainer.save_model("./outputs/stage2-dpo/final")
print("DPO training complete!")
print("Full pipeline finished!")
Best Practices
Stage 1: SFT
# Higher learning rate for learning new skills
learning_rate = 2e-4
# 1 epoch is typical; use up to 3 for complex tasks
num_train_epochs = 1
# Focus on instruction-following
dataset = "instruction-response pairs"
Stage 2: DPO
# Much lower learning rate (fine refinement)
learning_rate = 5e-6
# Usually 1 epoch is enough
num_train_epochs = 1
# Focus on quality improvement
dataset = "preference pairs (chosen/rejected)"
Alternative: Single-Stage Training
Some newer methods fold supervised fine-tuning and preference optimization into a single training stage:
# ORPO (Odds Ratio Preference Optimization)
from trl import ORPOTrainer, ORPOConfig
orpo_config = ORPOConfig(
output_dir="./outputs/orpo",
beta=0.1,
learning_rate=8e-6,
per_device_train_batch_size=2,
num_train_epochs=1,
)
# ORPO uses preference data but includes SFT loss
trainer = ORPOTrainer(
model=model,
args=orpo_config,
train_dataset=preference_dataset,
processing_class=tokenizer,
)
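# Train and save exactly as in the two-stage pipeline
trainer.train()
trainer.save_model("./outputs/orpo/final")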
Comparing Approaches
| Approach | Stages | Data Needed | Complexity |
|---|---|---|---|
| SFT only | 1 | Instruction pairs | Low |
| SFT + DPO | 2 | Both types | Medium |
| ORPO | 1 | Preference pairs | Low |
| SFT + RLHF | 3+ | All types + reward model | High |
When to Use Each Stage
SFT First When:
- Teaching new domain knowledge
- Model doesn't understand your task
- Need specific output formats
- Training from base model
DPO After When:
- SFT model works but quality varies
- Want to reduce poor responses
- Need better instruction following
- Aligning with specific preferences
Evaluation Between Stages
Check quality after each stage:
def evaluate_model(model, tokenizer, test_prompts):
"""Quick evaluation of model responses."""
model.eval()
results = []
for prompt in test_prompts:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
        # Decode only the newly generated tokens, not the prompt
        response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
results.append({"prompt": prompt, "response": response})
return results
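# If the model was loaded with Unsloth, calling FastLanguageModel.for_inference(model)
# beforehand enables its faster generation path.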
# Test prompts
test_prompts = [
"Explain machine learning in simple terms.",
"Write a short poem about coding.",
"What are the benefits of exercise?"
]
# Evaluate after SFT
sft_results = evaluate_model(sft_model, tokenizer, test_prompts)
# Evaluate after DPO
dpo_results = evaluate_model(dpo_model, tokenizer, test_prompts)
# Compare responses
for sft, dpo in zip(sft_results, dpo_results):
print(f"Prompt: {sft['prompt']}")
print(f"SFT: {sft['response'][:200]}...")
print(f"DPO: {dpo['response'][:200]}...")
print("-" * 50)
Tip: The SFT + DPO pipeline is the go-to recipe today: SFT teaches capabilities, DPO refines quality. Always evaluate between stages to confirm that each one is actually improving the model.
In the next module, we'll learn how to evaluate and deploy your fine-tuned model.