Training with Unsloth
3 min read
Now let's run a complete training job using Unsloth with TRL's SFTTrainer. You'll see how similar it is to standard training, but much faster.
Complete Training Script
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
# ----------------------------------------------------------------------------
# 1. Base model — Unsloth's patched loader returns the model and its tokenizer.
# ----------------------------------------------------------------------------
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,  # QLoRA-style 4-bit base weights
    dtype=None,         # None lets Unsloth auto-select bf16/fp16 for the GPU
)

# ----------------------------------------------------------------------------
# 2. LoRA adapters on every attention and MLP projection.
# ----------------------------------------------------------------------------
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's memory-optimized variant
    random_state=42,
)

# ----------------------------------------------------------------------------
# 3. Raw training data (formatted into "text" further below).
# ----------------------------------------------------------------------------
dataset = load_dataset("tatsu-lab/alpaca", split="train")
def format_prompt(example):
    """Format one Alpaca record into a single prompt/response training string.

    Args:
        example: A dataset row with "instruction", "output", and an optional
            (possibly empty) "input" field.

    Returns:
        A dict with a single "text" key, matching the trainer's
        ``dataset_text_field="text"`` setting.
    """
    # Build the prompt section by section so the optional "### Input:" part
    # does not force two near-duplicate f-string branches.
    parts = ["### Instruction:", example["instruction"]]
    optional_input = example.get("input", "")
    if optional_input:
        parts += ["### Input:", optional_input]
    # <|eot_id|> is the Llama-3 end-of-turn token; appending it teaches the
    # model where the response should stop.
    parts += ["### Response:", f"{example['output']}<|eot_id|>"]
    return {"text": "\n".join(parts)}
# ----------------------------------------------------------------------------
# Apply the formatter so every row carries a ready-to-train "text" column.
# ----------------------------------------------------------------------------
dataset = dataset.map(format_prompt)

# ----------------------------------------------------------------------------
# 4. Tokenizer padding: pad with EOS, on the right, as is usual for causal SFT.
# ----------------------------------------------------------------------------
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ----------------------------------------------------------------------------
# 5. Hyperparameters.
# ----------------------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./outputs/unsloth-finetune",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size = 4 * 4 = 16
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=10,
    lr_scheduler_type="linear",
    logging_steps=10,
    save_steps=500,
    bf16=True,
    optim="adamw_8bit",  # 8-bit optimizer states for memory savings
    seed=42,
)

# ----------------------------------------------------------------------------
# 6. Trainer wiring.
# ----------------------------------------------------------------------------
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    max_seq_length=2048,
    dataset_text_field="text",
    packing=False,  # one example per sequence; no sequence packing
)

# ----------------------------------------------------------------------------
# 7. Run, then persist the final adapter weights.
# ----------------------------------------------------------------------------
print("Starting Unsloth training...")
trainer.train()
trainer.save_model("./outputs/unsloth-finetune/final")
print("Training complete!")
Key Differences from Standard Training
1. Model Loading
# Standard
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(...)
# Unsloth
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(...)
2. Adding LoRA
# Standard
from peft import get_peft_model, LoraConfig
model = get_peft_model(model, LoraConfig(...))
# Unsloth
model = FastLanguageModel.get_peft_model(model, ...)
3. Gradient Checkpointing
# Standard
model.gradient_checkpointing_enable()
# Unsloth (optimized version)
use_gradient_checkpointing="unsloth"
Optimized Training Arguments
For Unsloth, these arguments work well:
training_args = TrainingArguments(
# Core settings
num_train_epochs=3,
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
# Optimizer (8-bit saves memory)
optim="adamw_8bit",
learning_rate=2e-4,
weight_decay=0.01,
# Scheduler
warmup_ratio=0.03,
lr_scheduler_type="linear",
# Precision
bf16=True,
# Logging
logging_steps=10,
save_steps=500,
)
Using Chat Templates
For instruct models, use the tokenizer's chat template:
def format_chat(example):
    """Render one record through the tokenizer's own chat template."""
    conversation = [
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": example["output"]},
    ]
    # tokenize=False returns the templated string rather than token ids,
    # so the result can be stored as the dataset's "text" column.
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return {"text": rendered}

dataset = dataset.map(format_chat)
Monitoring Training
With Weights & Biases
import wandb
wandb.login()
training_args = TrainingArguments(
...
report_to="wandb",
run_name="unsloth-llama-finetune",
)
Manual Logging
# Training will show progress like:
# Step 10: loss=2.345
# Step 20: loss=1.987
# Step 30: loss=1.654
# ...
Batch Size Optimization
Find the optimal batch size for your GPU:
# Start small and increase
batch_sizes = [1, 2, 4, 8]
for bs in batch_sizes:
try:
# Test with one step
trainer = SFTTrainer(
model=model,
args=TrainingArguments(
per_device_train_batch_size=bs,
max_steps=1,
...
),
...
)
trainer.train()
print(f"Batch size {bs}: OK")
except RuntimeError as e:
if "out of memory" in str(e):
print(f"Batch size {bs}: OOM")
break
Expected Performance
Training Llama 3.2 3B on 10K examples:
| Metric | Standard | Unsloth |
|---|---|---|
| Time per epoch | 60 min | 30 min |
| VRAM usage | 12 GB | 4 GB |
| Steps/second | 2.5 | 5.0 |
Troubleshooting
Loss Not Decreasing
# Check learning rate
learning_rate = 2e-4 # Try 1e-4 or 5e-5
# Add warmup
warmup_steps = 100
Out of Memory
# Reduce batch size
per_device_train_batch_size = 1
gradient_accumulation_steps = 16
# Use 8-bit optimizer
optim = "adamw_8bit"
Slow Training
# Disable evaluation during training
eval_strategy = "no"
# Reduce logging frequency
logging_steps = 50
Tip: With Unsloth, you can train the same model in half the time on the same hardware. This means more experiments and faster iteration!
Next, let's learn how to export and convert our fine-tuned model for deployment.