Lesson 20 of 22

Production Deployment

Docker Deployment

3 min read

Deploy Ollama in containers for reproducible, isolated, and scalable local LLM infrastructure.

Why Docker for Ollama?

┌─────────────────────────────────────────────────────────────────┐
│                   Docker Deployment Benefits                    │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  Benefit            │ Description                               │
│  ───────────────────│──────────────────────────────────────     │
│  Isolation          │ Separate from host system                 │
│  Reproducibility    │ Same environment everywhere               │
│  GPU passthrough    │ Full GPU access in container              │
│  Easy updates       │ Pull new image, restart                   │
│  Resource limits    │ Control CPU, memory, GPU allocation       │
│  Orchestration      │ Works with Kubernetes, Swarm              │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Basic Ollama Docker Setup

# Run Ollama container (CPU only)
docker run -d \
  --name ollama \
  -p 11434:11434 \
  -v ollama:/root/.ollama \
  ollama/ollama

# Pull a model inside the container
docker exec ollama ollama pull llama3.2

# Test the API
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Hello from Docker!",
  "stream": false
}'
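
Because the model data lives in the named ollama volume, updating Ollama is just a matter of pulling a newer image and recreating the container (the "easy updates" benefit from the table above). A minimal sketch:

# Update to the latest Ollama image; models persist in the named volume
docker pull ollama/ollama:latest
docker stop ollama && docker rm ollama
docker run -d \
  --name ollama \
  -p 11434:11434 \
  -v ollama:/root/.ollama \
  ollama/ollama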

GPU-Enabled Docker (NVIDIA)

# Install NVIDIA Container Toolkit first
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html

# Run with GPU support
docker run -d \
  --name ollama-gpu \
  --gpus all \
  -p 11434:11434 \
  -v ollama:/root/.ollama \
  ollama/ollama

# Verify GPU is accessible
docker exec ollama-gpu nvidia-smi
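
If the host has multiple GPUs and you want to dedicate only one of them to Ollama, pass a device selector instead of --gpus all. A sketch, assuming you want GPU index 0:

# Restrict the container to GPU 0 only
docker run -d \
  --name ollama-gpu0 \
  --gpus '"device=0"' \
  -p 11434:11434 \
  -v ollama:/root/.ollama \
  ollama/ollama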

Docker Compose Configuration

# docker-compose.yml
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
      - ./models:/models  # Optional: mount local models
    environment:
      - OLLAMA_KEEP_ALIVE=24h
      - OLLAMA_NUM_PARALLEL=4
      - OLLAMA_MAX_LOADED_MODELS=2
    restart: unless-stopped
    # Uncomment for GPU support
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]

volumes:
  ollama_data:

# Start the stack
docker compose up -d

# View logs
docker compose logs -f ollama

# Stop
docker compose down
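
Routine operations map onto the same Compose commands. For example, upgrading the image or pulling a model into the running service (using the service name from the file above):

# Upgrade: pull the newer image and recreate the container
docker compose pull ollama
docker compose up -d

# Pull a model inside the running service
docker compose exec ollama ollama pull llama3.2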

Pre-loading Models on Start

# docker-compose.yml with model preloading
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    restart: unless-stopped

  # Init container to pull models
  ollama-init:
    image: ollama/ollama:latest
    container_name: ollama-init
    depends_on:
      - ollama
    entrypoint: >
      /bin/sh -c "
        sleep 5 &&
        ollama pull llama3.2 &&
        ollama pull nomic-embed-text &&
        echo 'Models ready!'
      "
    environment:
      - OLLAMA_HOST=ollama:11434
    restart: "no"

volumes:
  ollama_data:
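
After docker compose up -d, you can watch the init container and then confirm the models landed on the ollama service; the /api/tags endpoint lists locally available models:

# Watch the init container pull models, then verify via the API
docker compose logs -f ollama-init
curl http://localhost:11434/api/tags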

Custom Dockerfile with Models

# Dockerfile.ollama (the COPY <<EOF heredoc below requires BuildKit, the default builder in current Docker releases)
FROM ollama/ollama:latest

# Set environment
ENV OLLAMA_KEEP_ALIVE=24h
ENV OLLAMA_NUM_PARALLEL=4

# Copy custom Modelfiles
COPY modelfiles/ /modelfiles/

# Create entrypoint script
COPY <<EOF /entrypoint.sh
#!/bin/bash
ollama serve &
sleep 5

# Pull base models
ollama pull llama3.2
ollama pull nomic-embed-text

# Create custom models from Modelfiles
for f in /modelfiles/*.Modelfile; do
  name=\$(basename \$f .Modelfile)
  ollama create \$name -f \$f
done

# Keep container running
wait
EOF

RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

# Build custom image
docker build -f Dockerfile.ollama -t my-ollama:latest .

# Run with pre-loaded models
docker run -d --name my-ollama -p 11434:11434 my-ollama:latest
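
Model creation happens on first start, so give the pulls a moment to finish, then confirm the custom models exist:

# List models inside the custom container
docker exec my-ollama ollama list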

Health Checks

# docker-compose.yml with health check
services:
  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    healthcheck:
      # The ollama/ollama image may not ship with curl; if this check fails,
      # use ["CMD", "ollama", "list"] instead
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/version"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    restart: unless-stopped
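
Once the health check is in place, Docker tracks the container's status, which you can query from the host (the inspect command assumes the container is named ollama, as in the earlier examples that set container_name):

# Shows "healthy" once the check passes
docker inspect --format '{{.State.Health.Status}}' ollama

# Compose also reports health in its status output
docker compose ps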

Python Client for Dockerized Ollama

import ollama
import os

# Configure client for Docker deployment
client = ollama.Client(
    host=os.getenv("OLLAMA_HOST", "http://localhost:11434")
)

def check_ollama_health():
    """Check if Ollama container is healthy."""
    try:
        # List models to verify connection
        models = client.list()
        return {
            "healthy": True,
            "models": [m["name"] for m in models.get("models", [])]
        }
    except Exception as e:
        return {"healthy": False, "error": str(e)}

def ensure_model(model_name: str):
    """Ensure a model is available, pull if needed."""
    try:
        client.show(model_name)
        return True
    except ollama.ResponseError:
        print(f"Pulling {model_name}...")
        client.pull(model_name)
        return True

# Usage
health = check_ollama_health()
print(f"Ollama healthy: {health['healthy']}")

if health["healthy"]:
    ensure_model("llama3.2")
    response = client.generate(model="llama3.2", prompt="Hello from Python!")
    print(response["response"])

Resource Limits

# docker-compose.yml with resource limits
services:
  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 16G
        reservations:
          cpus: '2'
          memory: 8G
          # GPU reservation
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
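
Roughly the same constraints can be applied directly on docker run if you are not using Compose (a sketch; adjust the numbers to your hardware):

# Approximate docker run equivalents of the limits above
docker run -d \
  --name ollama \
  --cpus=4 \
  --memory=16g \
  --gpus 1 \
  -p 11434:11434 \
  -v ollama:/root/.ollama \
  ollama/ollama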

Multi-Container Application

# Full stack with Ollama, API, and frontend
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    volumes:
      - ollama_data:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  api:
    build: ./api
    ports:
      - "8000:8000"
    environment:
      - OLLAMA_HOST=http://ollama:11434
    depends_on:
      - ollama

  frontend:
    build: ./frontend
    ports:
      - "3000:3000"
    depends_on:
      - api

volumes:
  ollama_data:
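
The ./api and ./frontend build contexts are not shown in this lesson. As an illustration only, the api service could be a small FastAPI app (a hypothetical sketch, not a prescribed implementation) that reads OLLAMA_HOST from the environment and forwards prompts to the Ollama container:

# api/main.py -- hypothetical minimal API service for the stack above
import os

import ollama
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# OLLAMA_HOST is injected by docker-compose (http://ollama:11434)
client = ollama.Client(host=os.getenv("OLLAMA_HOST", "http://localhost:11434"))

class GenerateRequest(BaseModel):
    prompt: str
    model: str = "llama3.2"

@app.post("/generate")
def generate(req: GenerateRequest):
    # Forward the prompt to the Ollama container and return the generated text
    result = client.generate(model=req.model, prompt=req.prompt)
    return {"response": result["response"]}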

Security Considerations

# Production-ready security settings
services:
  ollama:
    image: ollama/ollama:latest
    ports:
      - "127.0.0.1:11434:11434"  # Bind to localhost only
    volumes:
      - ollama_data:/root/.ollama:rw
    read_only: false  # Ollama needs write access
    security_opt:
      - no-new-privileges:true
    # Avoid running as root in production; with the official image this also
    # requires the model volume and home directory to be writable by the chosen UID
    # user: "1000:1000"
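
To verify the port is only published on the loopback interface, check the binding from the host (assuming the container is named ollama):

# Should print 127.0.0.1:11434 rather than 0.0.0.0:11434
docker port ollama 11434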

Docker provides the foundation for production deployment. Next, we'll explore scaling strategies for handling multiple users.
