Lesson 15 of 24

Training with Unsloth

3 min read

Now let's run a complete training job using Unsloth with TRL's SFTTrainer. You'll see how similar it is to standard training, but much faster.

Complete Training Script

from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

# ============================================
# 1. Load Model with Unsloth
# ============================================
# dtype=None lets Unsloth pick the precision automatically; load_in_4bit
# loads quantized weights to reduce VRAM (QLoRA-style setup).
# NOTE(review): presumably dtype auto-detects bf16 vs fp16 per GPU — confirm
# against the Unsloth docs for the installed version.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None,
)

# ============================================
# 2. Add LoRA Adapters
# ============================================
# r=16 / lora_alpha=16 gives a scaling factor of alpha/r = 1.0.
# target_modules covers all attention and MLP projections of the
# Llama architecture, so every linear layer gets an adapter.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,  # 0 is the Unsloth-optimized fast path
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    # "unsloth" selects Unsloth's memory-optimized gradient checkpointing
    # rather than the stock transformers implementation.
    use_gradient_checkpointing="unsloth",
    random_state=42,  # seeds LoRA weight init for reproducibility
)

# ============================================
# 3. Prepare Dataset
# ============================================
# 52K instruction/input/output examples; only the train split exists.
dataset = load_dataset("tatsu-lab/alpaca", split="train")

def format_prompt(example):
    """Format one Alpaca example into a single prompt string.

    Builds an Alpaca-style prompt from the ``instruction``, optional
    ``input``, and ``output`` fields, terminated with the Llama-3
    ``<|eot_id|>`` end-of-turn token.

    Args:
        example: Mapping with ``instruction`` and ``output`` keys and an
            optional ``input`` key (empty string means "no input").

    Returns:
        dict with a single ``"text"`` key, as expected by
        ``dataset.map`` + SFTTrainer's ``dataset_text_field``.
    """
    # Shared prefix; the Input section is inserted only when present,
    # which removes the duplicated Response block of the naive version.
    text = f"### Instruction:\n{example['instruction']}\n\n"
    if example.get("input", ""):
        text += f"### Input:\n{example['input']}\n\n"
    text += f"### Response:\n{example['output']}<|eot_id|>"
    return {"text": text}

# Adds a "text" column alongside the original columns.
dataset = dataset.map(format_prompt)

# ============================================
# 4. Setup Tokenizer
# ============================================
# Llama tokenizers typically ship without a pad token, so reuse EOS;
# right padding is the convention for causal-LM training.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ============================================
# 5. Training Arguments
# ============================================
# Effective batch size = 4 * 4 = 16 sequences per optimizer step.
training_args = TrainingArguments(
    output_dir="./outputs/unsloth-finetune",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,  # typical LoRA LR, ~10-100x full-finetune rates
    weight_decay=0.01,
    warmup_steps=10,
    lr_scheduler_type="linear",
    logging_steps=10,
    save_steps=500,
    bf16=True,  # requires Ampere or newer GPU
    optim="adamw_8bit",  # 8-bit optimizer for memory savings
    seed=42,
)

# ============================================
# 6. Initialize Trainer
# ============================================
# NOTE(review): this mixes TRL API generations — `processing_class` is the
# newer (>=0.12) argument name, while passing `max_seq_length`,
# `dataset_text_field`, and `packing` directly (instead of via SFTConfig)
# is the older API. Unsloth patches SFTTrainer, but verify against the
# pinned TRL version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    max_seq_length=2048,
    dataset_text_field="text",  # column produced by format_prompt
    packing=False,  # one example per sequence; no concatenation
)

# ============================================
# 7. Train!
# ============================================
print("Starting Unsloth training...")
trainer.train()

# ============================================
# 8. Save
# ============================================
# NOTE(review): with a PEFT model this presumably saves only the LoRA
# adapter weights, not the full base model — confirm before deploying.
trainer.save_model("./outputs/unsloth-finetune/final")
print("Training complete!")

Key Differences from Standard Training

1. Model Loading

# Standard
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(...)

# Unsloth
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(...)

2. Adding LoRA

# Standard
from peft import get_peft_model, LoraConfig
model = get_peft_model(model, LoraConfig(...))

# Unsloth
model = FastLanguageModel.get_peft_model(model, ...)

3. Gradient Checkpointing

# Standard
model.gradient_checkpointing_enable()

# Unsloth (optimized version)
use_gradient_checkpointing="unsloth"

Optimized Training Arguments

For Unsloth, these arguments work well:

# Reference hyperparameters for Unsloth LoRA runs.
training_args = TrainingArguments(
    # Core settings
    # Effective batch size = 4 * 4 = 16 sequences per optimizer step.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,

    # Optimizer (8-bit saves memory)
    optim="adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,

    # Scheduler
    # warmup_ratio scales with run length, unlike the fixed warmup_steps
    # used in the full script above.
    warmup_ratio=0.03,
    lr_scheduler_type="linear",

    # Precision
    bf16=True,  # requires Ampere or newer GPU

    # Logging
    logging_steps=10,
    save_steps=500,
)

Using Chat Templates

For instruct models, use the tokenizer's chat template:

def format_chat(example):
    """Render one example as text via the model's own chat template.

    Wraps the instruction/output pair in a user/assistant conversation
    and lets the tokenizer apply its built-in template, so the prompt
    format always matches what the instruct model was trained on.
    """
    conversation = [
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": example["output"]},
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return {"text": rendered}

dataset = dataset.map(format_chat)

Monitoring Training

With Weights & Biases

import wandb
wandb.login()

training_args = TrainingArguments(
    ...
    report_to="wandb",
    run_name="unsloth-llama-finetune",
)

Manual Logging

# Training will show progress like:
# Step 10: loss=2.345
# Step 20: loss=1.987
# Step 30: loss=1.654
# ...

Batch Size Optimization

Find the optimal batch size for your GPU:

# Start small and increase
batch_sizes = [1, 2, 4, 8]

for bs in batch_sizes:
    try:
        # Test with one step
        trainer = SFTTrainer(
            model=model,
            args=TrainingArguments(
                per_device_train_batch_size=bs,
                max_steps=1,
                ...
            ),
            ...
        )
        trainer.train()
        print(f"Batch size {bs}: OK")
    except RuntimeError as e:
        if "out of memory" in str(e):
            print(f"Batch size {bs}: OOM")
            break

Expected Performance

Training Llama 3.2 3B on 10K examples:

| Metric         | Standard | Unsloth |
|----------------|----------|---------|
| Time per epoch | 60 min   | 30 min  |
| VRAM usage     | 12 GB    | 4 GB    |
| Steps/second   | 2.5      | 5.0     |

Troubleshooting

Loss Not Decreasing

# Check learning rate
learning_rate = 2e-4  # Try 1e-4 or 5e-5

# Add warmup
warmup_steps = 100

Out of Memory

# Reduce batch size
per_device_train_batch_size = 1
gradient_accumulation_steps = 16

# Use 8-bit optimizer
optim = "adamw_8bit"

Slow Training

# Disable evaluation during training
eval_strategy = "no"

# Reduce logging frequency
logging_steps = 50

Tip: With Unsloth, you can train the same model in half the time on the same hardware. This means more experiments and faster iteration!

Next, let's learn how to export and convert our fine-tuned model for deployment.

Quiz

Module 4: Training with Unsloth

Take Quiz