# Your First Fine-tune
Let's put everything together and run a complete fine-tuning job using `SFTTrainer` from the TRL library.
## Complete Training Script
```python
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset

# ============================================
# 1. Configuration
# ============================================
model_name = "meta-llama/Llama-3.2-3B-Instruct"
output_dir = "./outputs/my-first-finetune"
max_seq_length = 2048

# ============================================
# 2. Load and Prepare Dataset
# ============================================
dataset = load_dataset("tatsu-lab/alpaca", split="train")

# Optional: take a subset for testing
dataset = dataset.select(range(1000))

def format_instruction(example):
    """Format a dataset row into Alpaca-style instruction text."""
    if example.get("input", ""):
        text = f"""### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
{example['output']}"""
    else:
        text = f"""### Instruction:
{example['instruction']}

### Response:
{example['output']}"""
    return {"text": text}

dataset = dataset.map(format_instruction)

# Split into train/validation
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(f"Train size: {len(train_dataset)}")
print(f"Eval size: {len(eval_dataset)}")

# ============================================
# 3. Load Model with QLoRA
# ============================================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

# ============================================
# 4. Configure LoRA
# ============================================
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,          # scaling factor = alpha / r = 1.0
    target_modules="all-linear",
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ============================================
# 5. Load Tokenizer
# ============================================
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ============================================
# 6. Training Arguments
# ============================================
# SFTConfig extends TrainingArguments with the SFT-specific options
# (max_seq_length, dataset_text_field, packing).
training_args = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_total_limit=3,
    bf16=True,
    max_grad_norm=0.3,
    group_by_length=True,
    report_to="none",  # or "wandb" for experiment tracking
    max_seq_length=max_seq_length,  # renamed to max_length in newer TRL releases
    dataset_text_field="text",
    packing=False,
)

# ============================================
# 7. Initialize Trainer
# ============================================
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
)

# ============================================
# 8. Train!
# ============================================
print("Starting training...")
trainer.train()

# ============================================
# 9. Save the Model
# ============================================
trainer.save_model(f"{output_dir}/final")
tokenizer.save_pretrained(f"{output_dir}/final")
print(f"Training complete! Model saved to {output_dir}/final")
```
## Understanding Training Arguments
### Batch Size and Accumulation
```python
# Effective batch size = per_device * devices * accumulation
# Example: 4 * 1 * 4 = 16 effective batch size
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
```
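These two knobs, together with the dataset size, determine how long the run is. A quick back-of-the-envelope sketch, assuming the 900-example train split produced by the script above:

```python
import math

train_size = 900        # 1000 examples minus the 10% validation split
per_device_batch = 4
num_devices = 1
grad_accum = 4
epochs = 3

effective_batch = per_device_batch * num_devices * grad_accum  # 16
steps_per_epoch = math.ceil(train_size / effective_batch)      # 57
total_steps = steps_per_epoch * epochs                         # 171
print(f"{effective_batch=}, {steps_per_epoch=}, {total_steps=}")
```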
### Learning Rate Schedule
```python
learning_rate=2e-4,          # Peak learning rate
warmup_ratio=0.03,           # 3% of steps for warmup
lr_scheduler_type="cosine",  # Cosine decay after warmup
```
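To see the exact schedule the trainer will follow, you can reproduce it with `get_cosine_schedule_with_warmup` from transformers, which is what `lr_scheduler_type="cosine"` selects under the hood. A standalone sketch using a dummy optimizer and the ~171-step run estimated above:

```python
import torch
from transformers import get_cosine_schedule_with_warmup

# Dummy model/optimizer just to drive the scheduler
dummy = torch.nn.Linear(1, 1)
optimizer = torch.optim.AdamW(dummy.parameters(), lr=2e-4)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=5,      # ~3% of 171 steps
    num_training_steps=171,
)

for step in range(171):
    optimizer.step()
    scheduler.step()
    if step in (0, 5, 85, 170):  # roughly: start, end of warmup, midpoint, end
        print(f"step {step}: lr = {scheduler.get_last_lr()[0]:.2e}")
```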
### Memory Optimization
```python
gradient_checkpointing=True,  # Trade compute for memory
bf16=True,                    # Use BFloat16 precision
max_grad_norm=0.3,            # Gradient clipping
```
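To verify what these flags buy you, measure peak VRAM around the training call. A minimal sketch, assuming the `trainer` from the script above and a single CUDA device:

```python
import torch

# Reset the peak-memory counter, train, then read the high-water mark
torch.cuda.reset_peak_memory_stats()
trainer.train()
peak_gb = torch.cuda.max_memory_allocated() / 1024**3
print(f"Peak VRAM: {peak_gb:.1f} GB")
```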
## Testing Your Fine-tuned Model
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load base model and adapter
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    base_model,
    "./outputs/my-first-finetune/final",
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# Test generation (same prompt format as training)
prompt = """### Instruction:
Write a short poem about machine learning.

### Response:
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True,
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```
## Merging Adapters (Optional)
For deployment, you can merge the LoRA adapters into the base model:
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load base model in half precision (unquantized) for merging
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
)

# Load and merge adapter
model = PeftModel.from_pretrained(base_model, "./outputs/my-first-finetune/final")
merged_model = model.merge_and_unload()

# Save merged model (plus tokenizer, so the directory is self-contained)
merged_model.save_pretrained("./outputs/merged-model")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.save_pretrained("./outputs/merged-model")
```
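Once merged, the checkpoint loads like any ordinary Transformers model, with no peft dependency at inference time. A minimal sketch (paths match the merge step above):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "./outputs/merged-model",
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("./outputs/merged-model")
```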
## Training Tips
### Monitor Loss Curves
Good training typically shows:
- Decreasing training loss
- Validation loss that plateaus rather than increases (a rising eval loss means overfitting)
- No sudden spikes in either curve
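If you set `report_to="none"`, you can still inspect these curves after the fact: the trainer keeps every logged metric in `trainer.state.log_history`. A minimal sketch, assuming the `trainer` from the script above:

```python
# Each logging/eval event appends a dict to trainer.state.log_history
history = trainer.state.log_history
train_loss = [(e["step"], e["loss"]) for e in history if "loss" in e]
eval_loss = [(e["step"], e["eval_loss"]) for e in history if "eval_loss" in e]

print("last train losses:", train_loss[-3:])
print("last eval losses:", eval_loss[-3:])
```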
### Checkpointing
```python
# Save checkpoints frequently
save_steps=100,
save_total_limit=3,  # Keep only the last 3 checkpoints
```
### Resume Training
```python
# Resume from a specific checkpoint
trainer.train(resume_from_checkpoint="./outputs/my-first-finetune/checkpoint-500")
```
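If you'd rather not hard-code a path, passing `True` tells the trainer to pick up the most recent checkpoint in `output_dir`:

```python
# Resume from the latest checkpoint found in output_dir
trainer.train(resume_from_checkpoint=True)
```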
## Expected Results
For a 1000-example fine-tune on Llama 3.2 3B:
| Metric | Expected |
|---|---|
| Training time | 30-60 minutes (A100) |
| Final loss | 0.5-1.5 |
| VRAM usage | 8-12 GB |
| Checkpoint size | ~100 MB (LoRA only) |
:::tip
Start with a small dataset (100-1000 examples) to verify everything works, then scale up.
:::

In the next module, we'll learn how Unsloth can make this 2x faster with 70% less memory.