Production Deployment
Docker Deployment
Deploy Ollama in containers for reproducible, isolated, and scalable local LLM infrastructure.
Why Docker for Ollama?
┌─────────────────────────────────────────────────────────────────┐
│ Docker Deployment Benefits                                      │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│   Benefit         │ Description                                 │
│   ────────────────┼─────────────────────────────────────────────│
│   Isolation       │ Separate from host system                   │
│   Reproducibility │ Same environment everywhere                 │
│   GPU passthrough │ Full GPU access in container                │
│   Easy updates    │ Pull new image, restart                     │
│   Resource limits │ Control CPU, memory, GPU allocation         │
│   Orchestration   │ Works with Kubernetes, Swarm                │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘
Basic Ollama Docker Setup
# Run Ollama container (CPU only)
docker run -d \
  --name ollama \
  -p 11434:11434 \
  -v ollama:/root/.ollama \
  ollama/ollama
# Pull a model inside the container
docker exec ollama ollama pull llama3.2
# Test the API
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Hello from Docker!",
  "stream": false
}'
GPU-Enabled Docker (NVIDIA)
# Install NVIDIA Container Toolkit first
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
# Run with GPU support
docker run -d \
  --name ollama-gpu \
  --gpus all \
  -p 11434:11434 \
  -v ollama:/root/.ollama \
  ollama/ollama
# Verify GPU is accessible
docker exec ollama-gpu nvidia-smi
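nvidia-smi only proves the device is visible inside the container. To confirm that inference actually runs on the GPU, load a model and check ollama ps; the PROCESSOR column should report GPU rather than CPU. This assumes llama3.2 has been pulled and fits in VRAM.
# Pull and run a model, then check where it is loaded
docker exec ollama-gpu ollama pull llama3.2
docker exec ollama-gpu ollama run llama3.2 "Say hello" > /dev/null
docker exec ollama-gpu ollama ps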
Docker Compose Configuration
# docker-compose.yml
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
      - ./models:/models   # Optional: mount local models
    environment:
      - OLLAMA_KEEP_ALIVE=24h
      - OLLAMA_NUM_PARALLEL=4
      - OLLAMA_MAX_LOADED_MODELS=2
    restart: unless-stopped
    # Uncomment for GPU support
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]

volumes:
  ollama_data:
# Start the stack
docker compose up -d
# View logs
docker compose logs -f ollama
# Stop
docker compose down
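Upgrading Ollama is the "easy updates" benefit in practice: pull the newer image and recreate the container. Models persist in the named volume, so nothing is re-downloaded.
# Update to the latest Ollama image without losing models
docker compose pull ollama
docker compose up -d ollama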
Pre-loading Models on Start
# docker-compose.yml with model preloading
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    restart: unless-stopped

  # Init container to pull models
  ollama-init:
    image: ollama/ollama:latest
    container_name: ollama-init
    depends_on:
      - ollama
    entrypoint: >
      /bin/sh -c "
      sleep 5 &&
      ollama pull llama3.2 &&
      ollama pull nomic-embed-text &&
      echo 'Models ready!'
      "
    environment:
      - OLLAMA_HOST=ollama:11434
    restart: "no"

volumes:
  ollama_data:
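The fixed sleep 5 is a race against the server's startup time. If the pulls occasionally fail, a polling loop is more reliable; this sketch could replace the sleep inside the init entrypoint and relies only on the ollama CLI already present in the image (OLLAMA_HOST is set as above).
# Wait until the Ollama server answers before pulling models
until ollama list > /dev/null 2>&1; do
  echo "Waiting for Ollama..."
  sleep 1
done
ollama pull llama3.2
ollama pull nomic-embed-text
echo 'Models ready!'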
Custom Dockerfile with Models
# Dockerfile.ollama
FROM ollama/ollama:latest
# Set environment
ENV OLLAMA_KEEP_ALIVE=24h
ENV OLLAMA_NUM_PARALLEL=4
# Copy custom Modelfiles
COPY modelfiles/ /modelfiles/
# Create entrypoint script
COPY <<EOF /entrypoint.sh
#!/bin/bash
ollama serve &
sleep 5
# Pull base models
ollama pull llama3.2
ollama pull nomic-embed-text
# Create custom models from Modelfiles
for f in /modelfiles/*.Modelfile; do
  name=\$(basename \$f .Modelfile)
  ollama create \$name -f \$f
done
# Keep container running
wait
EOF
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
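The build expects Modelfiles under modelfiles/. As a purely illustrative example (the file name, base model, and parameters are assumptions), one could be created like this before building:
# Hypothetical Modelfile baked into the custom image
mkdir -p modelfiles
cat > modelfiles/assistant.Modelfile <<'EOF'
FROM llama3.2
PARAMETER temperature 0.2
SYSTEM "You are a concise assistant for infrastructure questions."
EOF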
# Build custom image
docker build -f Dockerfile.ollama -t my-ollama:latest .
# Run with pre-loaded models
docker run -d --name my-ollama -p 11434:11434 my-ollama:latest
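Note that the models are pulled and created at container start, not at build time, so the first start can take a while. Once the entrypoint has finished, the base and custom models should all be listed:
# Verify base and custom models are available
docker exec my-ollama ollama list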
Health Checks
# docker-compose.yml with health check
services:
  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    healthcheck:
      # If curl is not available in the image, use
      # ["CMD", "ollama", "list"] as the test instead
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/version"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    restart: unless-stopped
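With the health check in place, Docker tracks the container's state, which can be queried directly (assuming the container is named ollama, e.g. via container_name):
# Health status as reported by Docker
docker inspect --format '{{.State.Health.Status}}' ollama
# The STATUS column also shows (healthy) / (unhealthy)
docker ps --filter name=ollama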
Python Client for Dockerized Ollama
import ollama
import os

# Configure client for Docker deployment
client = ollama.Client(
    host=os.getenv("OLLAMA_HOST", "http://localhost:11434")
)


def check_ollama_health():
    """Check if Ollama container is healthy."""
    try:
        # List models to verify connection
        models = client.list()
        return {
            "healthy": True,
            # Note: newer ollama clients expose this field as "model" rather than "name"
            "models": [m["name"] for m in models.get("models", [])],
        }
    except Exception as e:
        return {"healthy": False, "error": str(e)}


def ensure_model(model_name: str):
    """Ensure a model is available, pull if needed."""
    try:
        client.show(model_name)
        return True
    except ollama.ResponseError:
        print(f"Pulling {model_name}...")
        client.pull(model_name)
        return True


# Usage
health = check_ollama_health()
print(f"Ollama healthy: {health['healthy']}")

if health["healthy"]:
    ensure_model("llama3.2")
    response = client.generate(model="llama3.2", prompt="Hello from Python!")
    print(response["response"])
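Run on the host, the script reaches the container through the published port; from another container on the same Compose network, point OLLAMA_HOST at the service name instead. The script filename below is illustrative.
# From the host (published port 11434)
python ollama_client.py
# From a sibling container on the Compose network
OLLAMA_HOST=http://ollama:11434 python ollama_client.py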
Resource Limits
# docker-compose.yml with resource limits
services:
  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 16G
        reservations:
          cpus: '2'
          memory: 8G
          # GPU reservation
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
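Whether the container stays within these limits can be checked at runtime:
# One-shot snapshot of CPU and memory usage for the container
docker stats ollama --no-stream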
Multi-Container Application
# Full stack with Ollama, API, and frontend
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    volumes:
      - ollama_data:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  api:
    build: ./api
    ports:
      - "8000:8000"
    environment:
      - OLLAMA_HOST=http://ollama:11434
    depends_on:
      - ollama

  frontend:
    build: ./frontend
    ports:
      - "3000:3000"
    depends_on:
      - api

volumes:
  ollama_data:
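Note that the ollama service publishes no ports here, so it is reachable only on the internal Compose network. A quick connectivity check from the api container (assuming curl is installed in that image):
# Verify the api container can reach Ollama by service name
docker compose exec api curl -s http://ollama:11434/api/version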
Security Considerations
# Production-ready security settings
services:
  ollama:
    image: ollama/ollama:latest
    ports:
      - "127.0.0.1:11434:11434"   # Bind to localhost only
    volumes:
      - ollama_data:/root/.ollama:rw
    read_only: false              # Ollama needs write access
    security_opt:
      - no-new-privileges:true
    # Don't run as root in production
    # user: "1000:1000"           # Uncomment when Ollama supports it
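Binding to 127.0.0.1 keeps the API off the network; if remote clients need access, put an authenticating reverse proxy in front rather than publishing the port directly. To confirm the binding took effect:
# Published port should be bound to 127.0.0.1 only
docker port ollama
# API still answers locally
curl -s http://127.0.0.1:11434/api/version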
Docker provides the foundation for production deployment. Next, we'll explore scaling strategies for handling multiple users.