Lesson 20 of 24

Alignment with DPO

Combining SFT + DPO

The most effective fine-tuning approach combines SFT (to teach skills) with DPO (to improve quality). Let's build a complete two-stage pipeline.

The Two-Stage Pipeline

Stage 1: SFT
├── Input: Base model + instruction dataset
├── Output: Model that can follow instructions
└── Goal: Teach task-specific skills

Stage 2: DPO
├── Input: SFT model + preference dataset
├── Output: Aligned model with better responses
└── Goal: Improve response quality

Complete Pipeline Script

from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig
from datasets import load_dataset

# ============================================
# Configuration
# ============================================
model_name = "unsloth/Llama-3.2-3B-Instruct"
max_seq_length = 2048

# ============================================
# STAGE 1: Supervised Fine-Tuning
# ============================================
print("=" * 50)
print("STAGE 1: SFT Training")
print("=" * 50)

# Load model for SFT
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,               # LoRA rank
    lora_alpha=16,      # LoRA scaling factor
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    use_gradient_checkpointing="unsloth",
)

# Load SFT dataset
sft_dataset = load_dataset("tatsu-lab/alpaca", split="train")
sft_dataset = sft_dataset.select(range(5000))  # Subset

def format_sft(example):
    if example.get("input", ""):
        text = f"""### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
{example['output']}<|eot_id|>"""
    else:
        text = f"""### Instruction:
{example['instruction']}

### Response:
{example['output']}<|eot_id|>"""
    return {"text": text}

sft_dataset = sft_dataset.map(format_sft)
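
# Sanity-check one formatted example before training.
print(sft_dataset[0]["text"][:300])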

# SFT training
sft_args = SFTConfig(
    output_dir="./outputs/stage1-sft",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    logging_steps=20,
    save_steps=500,
    bf16=True,
    optim="adamw_8bit",
    max_seq_length=max_seq_length,
    dataset_text_field="text",
)

sft_trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=sft_dataset,
    processing_class=tokenizer,
)

sft_trainer.train()
sft_trainer.save_model("./outputs/stage1-sft/final")
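# Note: because this is a LoRA/PEFT model, save_model writes the adapter
# weights and adapter_config.json rather than a full merged model;
# Stage 2 reloads the base model together with this adapter.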

print("SFT training complete!")

# ============================================
# STAGE 2: DPO Alignment
# ============================================
print("=" * 50)
print("STAGE 2: DPO Training")
print("=" * 50)

# Reload the SFT checkpoint for DPO
# (with no explicit ref_model, DPOTrainer derives its frozen
#  reference model from this loaded model)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./outputs/stage1-sft/final",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    use_gradient_checkpointing="unsloth",
)

# Load DPO dataset
dpo_dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="train_prefs")
dpo_dataset = dpo_dataset.select(range(2000))  # Subset
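# Each preference example pairs a "prompt" with a "chosen" and a "rejected"
# response (stored as chat-message lists in this dataset); DPOTrainer
# contrasts the two completions for the same prompt.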

# DPO training
dpo_config = DPOConfig(
    output_dir="./outputs/stage2-dpo",
    beta=0.1,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    warmup_ratio=0.1,
    logging_steps=20,
    save_steps=500,
    bf16=True,
    optim="adamw_8bit",
    max_length=1024,
    max_prompt_length=512,
)

dpo_trainer = DPOTrainer(
    model=model,
    args=dpo_config,
    train_dataset=dpo_dataset,
    processing_class=tokenizer,
)

dpo_trainer.train()
dpo_trainer.save_model("./outputs/stage2-dpo/final")

print("DPO training complete!")
print("Full pipeline finished!")

Best Practices

Stage 1: SFT

# Higher learning rate for learning new skills
learning_rate = 2e-4

# More epochs (up to ~3) if teaching complex tasks
num_train_epochs = 1

# Focus on instruction-following
dataset = "instruction-response pairs"

Stage 2: DPO

# Much lower learning rate (fine refinement)
learning_rate = 5e-6

# Usually 1 epoch is enough
num_train_epochs = 1

# Focus on quality improvement
dataset = "preference pairs (chosen/rejected)"

Alternative: Single-Stage Training

Some recent approaches combine SFT and DPO:

# ORPO (Odds Ratio Preference Optimization)
from trl import ORPOTrainer, ORPOConfig

orpo_config = ORPOConfig(
    output_dir="./outputs/orpo",
    beta=0.1,
    learning_rate=8e-6,
    per_device_train_batch_size=2,
    num_train_epochs=1,
)

# ORPO uses preference data but includes SFT loss
trainer = ORPOTrainer(
    model=model,
    args=orpo_config,
    train_dataset=preference_dataset,
    processing_class=tokenizer,
)
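
Training and saving then work exactly as in the two-stage pipeline (the output path below is illustrative):

trainer.train()
trainer.save_model("./outputs/orpo/final")  # illustrative output path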

Comparing Approaches

Approach   | Stages | Data Needed              | Complexity
SFT only   | 1      | Instruction pairs        | Low
SFT + DPO  | 2      | Both types               | Medium
ORPO       | 1      | Preference pairs         | Low
SFT + RLHF | 3+     | All types + reward model | High

When to Use Each Stage

SFT First When:

  • Teaching new domain knowledge
  • Model doesn't understand your task
  • Need specific output formats
  • Training from base model

DPO After When:

  • SFT model works but quality varies
  • Want to reduce poor responses
  • Need better instruction following
  • Aligning with specific preferences

Evaluation Between Stages

Check quality after each stage:

def evaluate_model(model, tokenizer, test_prompts):
    """Quick evaluation of model responses."""
    model.eval()
    results = []

    for prompt in test_prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append({"prompt": prompt, "response": response})

    return results

# Test prompts
test_prompts = [
    "Explain machine learning in simple terms.",
    "Write a short poem about coding.",
    "What are the benefits of exercise?"
]

# Evaluate after SFT (sft_model = the model loaded from ./outputs/stage1-sft/final)
sft_results = evaluate_model(sft_model, tokenizer, test_prompts)

# Evaluate after DPO (dpo_model = the model loaded from ./outputs/stage2-dpo/final)
dpo_results = evaluate_model(dpo_model, tokenizer, test_prompts)

# Compare responses
for sft, dpo in zip(sft_results, dpo_results):
    print(f"Prompt: {sft['prompt']}")
    print(f"SFT: {sft['response'][:200]}...")
    print(f"DPO: {dpo['response'][:200]}...")
    print("-" * 50)

Tip: The SFT + DPO pipeline is the gold standard in 2025: SFT teaches capabilities, DPO refines quality. Always evaluate between stages to confirm each one actually improves the model.

In the next module, we'll learn how to evaluate and deploy your fine-tuned model.
