Production Deployment
Scaling Local Inference
3 min read
Handle multiple users and high traffic by scaling your local LLM infrastructure with load balancing and replicas.
Scaling Strategies
| Strategy | Description | Best For |
|---|---|---|
| Vertical | Bigger GPU / more RAM | Single server |
| Horizontal | Multiple replicas | High availability |
| Model routing | Different models/GPUs | Mixed workloads |
| Request queuing | Async processing | Batch workloads |
| Caching | Cache common queries | Repeated questions |
Load Balancing with NGINX
# nginx.conf
upstream ollama_backend {
    # Round-robin load balancing across replicas
    server ollama1:11434;
    server ollama2:11434;
    server ollama3:11434;

    # Optional: switch to least-connections balancing
    # least_conn;

    # Reuse upstream connections instead of opening one per request
    keepalive 32;
}

server {
    listen 80;
    server_name llm.example.com;

    location / {
        proxy_pass http://ollama_backend;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;

        # Streaming support
        proxy_buffering off;
        proxy_cache off;
        chunked_transfer_encoding on;

        # Timeouts for long-running inference
        proxy_connect_timeout 300s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;
    }
}
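Once NGINX sits in front of the replicas, clients talk to the load balancer instead of an individual instance. A minimal sketch, assuming the config above is live and llama3.2 is pulled on every backend (llm.example.com is the placeholder server_name from the config, not a real endpoint):

import ollama

# Point the client at the NGINX endpoint instead of a single Ollama instance.
# "llm.example.com" is the placeholder hostname from the config above;
# swap in your own hostname or the load balancer's IP.
client = ollama.Client(host="http://llm.example.com")

response = client.generate(model="llama3.2", prompt="Hello from behind the load balancer!")
print(response["response"])

Streaming responses still work through the proxy because buffering is disabled in the location block above.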
Docker Compose with Replicas
# docker-compose.scale.yml
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    volumes:
      - ollama_shared:/root/.ollama
    deploy:
      replicas: 3
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    environment:
      - OLLAMA_KEEP_ALIVE=24h

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - ollama

volumes:
  ollama_shared:

Note that scaled replicas all share the service name `ollama`; either point the NGINX upstream at `ollama:11434` and let Docker's internal DNS round-robin across the replicas, or define `ollama1`, `ollama2`, and `ollama3` as separate services so the hostnames in nginx.conf resolve.

# Scale to 3 replicas
docker compose -f docker-compose.scale.yml up -d --scale ollama=3
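To sanity-check that the stack spreads load, fire a handful of concurrent requests through the balancer. A rough sketch, assuming the compose stack above is running on the local machine (NGINX published on port 80) and the model is available on each replica:

import ollama
from concurrent.futures import ThreadPoolExecutor

# NGINX from the compose stack, assumed to be published on localhost:80
client = ollama.Client(host="http://localhost:80")

def ask(prompt: str) -> str:
    return client.generate(model="llama3.2", prompt=prompt)["response"]

prompts = [f"Give a one-sentence fact about the number {i}" for i in range(6)]

# Concurrent requests are distributed across replicas by the load balancer
with ThreadPoolExecutor(max_workers=6) as pool:
    for answer in pool.map(ask, prompts):
        print(answer[:80])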
Request Queue with Redis
import redis
import json
import ollama
from typing import Optional
import threading
import time


class OllamaQueue:
    """Queue-based request handling for rate limiting."""

    def __init__(self, redis_url: str = "redis://localhost:6379"):
        self.redis = redis.from_url(redis_url)
        self.queue_name = "ollama_requests"
        self.results_prefix = "ollama_result:"

    def submit(self, request_id: str, model: str, prompt: str) -> str:
        """Submit a request to the queue."""
        request = {
            "id": request_id,
            "model": model,
            "prompt": prompt,
            "timestamp": time.time()
        }
        self.redis.rpush(self.queue_name, json.dumps(request))
        return request_id

    def get_result(self, request_id: str, timeout: int = 300) -> Optional[dict]:
        """Wait for and retrieve result."""
        key = f"{self.results_prefix}{request_id}"
        for _ in range(timeout):
            result = self.redis.get(key)
            if result:
                self.redis.delete(key)
                return json.loads(result)
            time.sleep(1)
        return None

    def process_queue(self):
        """Worker that processes queued requests."""
        while True:
            # Blocking pop from queue
            _, request_json = self.redis.blpop(self.queue_name)
            request = json.loads(request_json)
            try:
                # Process with Ollama
                response = ollama.generate(
                    model=request["model"],
                    prompt=request["prompt"]
                )
                result = {
                    "success": True,
                    "response": response["response"]
                }
            except Exception as e:
                result = {
                    "success": False,
                    "error": str(e)
                }
            # Store result (expires after 1 hour)
            key = f"{self.results_prefix}{request['id']}"
            self.redis.setex(key, 3600, json.dumps(result))


# Start worker in background
queue = OllamaQueue()
worker = threading.Thread(target=queue.process_queue, daemon=True)
worker.start()

# Submit requests
request_id = queue.submit("req-001", "llama3.2", "What is AI?")
result = queue.get_result(request_id)
print(result)
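The single background worker above handles one request at a time; if your hardware can run concurrent generations, you can start several workers against the same queue. A small sketch building on the queue instance above (the worker count and prompts are arbitrary examples):

import threading
import uuid

# Add more workers to the existing queue; each blocks on the shared Redis
# list and processes whatever request it pops next.
for _ in range(2):  # worker count is an arbitrary example
    threading.Thread(target=queue.process_queue, daemon=True).start()

# Submit a batch of requests, then collect the results as they complete
ids = [queue.submit(str(uuid.uuid4()), "llama3.2", q)
       for q in ["What is AI?", "What is Redis?", "What is a GPU?"]]

for request_id in ids:
    print(queue.get_result(request_id, timeout=120))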
Response Caching
import hashlib
import json
import redis
import ollama


class CachedOllama:
    """Ollama client with response caching."""

    def __init__(self, redis_url: str = "redis://localhost:6379", ttl: int = 3600):
        self.redis = redis.from_url(redis_url)
        self.ttl = ttl

    def _cache_key(self, model: str, prompt: str, options: dict) -> str:
        """Generate cache key from request parameters."""
        content = json.dumps({
            "model": model,
            "prompt": prompt,
            "options": options
        }, sort_keys=True)
        return f"ollama_cache:{hashlib.sha256(content.encode()).hexdigest()}"

    def generate(self, model: str, prompt: str, **options) -> dict:
        """Generate with caching."""
        cache_key = self._cache_key(model, prompt, options)

        # Check cache
        cached = self.redis.get(cache_key)
        if cached:
            return json.loads(cached)

        # Generate new response
        response = ollama.generate(model=model, prompt=prompt, **options)

        # Cache result as a plain dict so it serializes regardless of client version
        self.redis.setex(cache_key, self.ttl, json.dumps(dict(response)))
        return response

    def cache_stats(self) -> dict:
        """Get cache statistics."""
        keys = self.redis.keys("ollama_cache:*")
        return {
            "cached_responses": len(keys),
            "memory_usage_bytes": sum(
                self.redis.memory_usage(k) or 0 for k in keys
            )
        }


# Usage
client = CachedOllama()

# First call - generates and caches
response1 = client.generate("llama3.2", "What is Python?")

# Second call - returns the cached response almost instantly
response2 = client.generate("llama3.2", "What is Python?")

print(client.cache_stats())
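To see the effect, time a cold call against a warm one. A quick check, assuming Redis and Ollama are running locally, llama3.2 is pulled, and the CachedOllama class above is in the same module:

import time

cached_client = CachedOllama(ttl=3600)

start = time.perf_counter()
cached_client.generate("llama3.2", "Explain list comprehensions in one sentence.")
print(f"Cache miss: {time.perf_counter() - start:.2f}s")   # full inference time

start = time.perf_counter()
cached_client.generate("llama3.2", "Explain list comprehensions in one sentence.")
print(f"Cache hit:  {time.perf_counter() - start:.4f}s")   # Redis lookup only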
Model-Based Routing
import ollama
from typing import Optional


class ModelRouter:
    """Route requests to appropriate models based on task."""

    def __init__(self):
        self.model_map = {
            "code": "deepseek-coder:6.7b",
            "chat": "llama3.2",
            "embedding": "nomic-embed-text",
            "fast": "phi3:mini",
        }

    def classify_task(self, prompt: str) -> str:
        """Classify the task type from the prompt."""
        prompt_lower = prompt.lower()
        if any(kw in prompt_lower for kw in ["code", "function", "debug", "program"]):
            return "code"
        if any(kw in prompt_lower for kw in ["embed", "vector", "similarity"]):
            return "embedding"
        if len(prompt) < 50:
            return "fast"
        return "chat"

    def generate(self, prompt: str, task_type: Optional[str] = None) -> dict:
        """Route the request to the appropriate model."""
        if task_type is None:
            task_type = self.classify_task(prompt)
        model = self.model_map.get(task_type, "llama3.2")
        if task_type == "embedding":
            return ollama.embed(model=model, input=prompt)
        return ollama.generate(model=model, prompt=prompt)


# Usage
router = ModelRouter()

# Automatically routes to the code model
code_response = router.generate("Write a Python function to sort a list")

# Automatically routes to the fast model
quick_response = router.generate("What is 2+2?")

# Explicitly specify the task
chat_response = router.generate("Tell me about AI", task_type="chat")
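Routing and caching compose naturally: classify first, then send the request through the cached client so repeated questions skip inference entirely. A sketch, assuming the ModelRouter and CachedOllama classes from earlier in this lesson live in the same module (embedding requests are passed straight through because they use a different API call):

router = ModelRouter()
cached = CachedOllama()

def smart_generate(prompt: str) -> dict:
    """Pick a model by task type, then answer through the response cache."""
    task_type = router.classify_task(prompt)
    if task_type == "embedding":
        # Embeddings use a different endpoint, so bypass the text cache here
        return router.generate(prompt, task_type="embedding")
    model = router.model_map.get(task_type, "llama3.2")
    return cached.generate(model, prompt)

print(smart_generate("Write a function to reverse a string")["response"])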
Health Monitoring
import ollama
import time
import threading
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class ServerHealth:
    url: str
    healthy: bool
    latency_ms: float
    last_check: float


class HealthMonitor:
    """Monitor health of multiple Ollama instances."""

    def __init__(self, servers: List[str], check_interval: int = 30):
        self.servers = servers
        self.check_interval = check_interval
        self.health: Dict[str, ServerHealth] = {}
        self._start_monitoring()

    def _check_server(self, url: str) -> ServerHealth:
        """Check health of a single server."""
        try:
            client = ollama.Client(host=url)
            start = time.time()
            client.list()  # Simple health check
            latency = (time.time() - start) * 1000
            return ServerHealth(
                url=url,
                healthy=True,
                latency_ms=latency,
                last_check=time.time()
            )
        except Exception:
            return ServerHealth(
                url=url,
                healthy=False,
                latency_ms=-1,
                last_check=time.time()
            )

    def _monitor_loop(self):
        """Continuous monitoring loop."""
        while True:
            for server in self.servers:
                self.health[server] = self._check_server(server)
            time.sleep(self.check_interval)

    def _start_monitoring(self):
        """Start background monitoring thread."""
        thread = threading.Thread(target=self._monitor_loop, daemon=True)
        thread.start()

    def get_healthy_servers(self) -> List[str]:
        """Get list of healthy servers."""
        return [
            url for url, health in self.health.items()
            if health.healthy
        ]

    def get_fastest_server(self) -> str:
        """Get the server with the lowest latency."""
        healthy = [h for h in self.health.values() if h.healthy]
        if not healthy:
            raise Exception("No healthy servers available")
        return min(healthy, key=lambda h: h.latency_ms).url


# Usage
monitor = HealthMonitor([
    "http://ollama1:11434",
    "http://ollama2:11434",
    "http://ollama3:11434"
])

# Wait for the initial health check to complete
time.sleep(2)

# Use the fastest healthy server
fastest = monitor.get_fastest_server()
client = ollama.Client(host=fastest)
response = client.generate(model="llama3.2", prompt="Hello!")
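The monitor also enables simple failover: prefer the fastest healthy server, and fall back to the next one if a request fails mid-flight. A sketch building on the HealthMonitor instance above (the retry behavior is an illustrative choice, not part of the class):

def generate_with_failover(monitor: HealthMonitor, model: str, prompt: str) -> dict:
    """Try healthy servers in order of latency until one succeeds."""
    candidates = sorted(
        (h for h in monitor.health.values() if h.healthy),
        key=lambda h: h.latency_ms,
    )
    last_error = None
    for server in candidates:
        try:
            return ollama.Client(host=server.url).generate(model=model, prompt=prompt)
        except Exception as e:
            last_error = e  # this server failed; move on to the next one
    raise RuntimeError(f"All servers failed, last error: {last_error}")

print(generate_with_failover(monitor, "llama3.2", "Hello!")["response"])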
Scaling Decision Matrix
| Users | Strategy |
|---|---|
| 1-5 | Single Ollama instance |
| 5-20 | 2-3 replicas with NGINX |
| 20-100 | Multiple GPUs + caching |
| 100+ | Consider vLLM + Kubernetes |
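If you want that guidance encoded in a deployment script, it maps onto a trivial helper (the thresholds are the rough bands from the table above, not hard limits):

def pick_scaling_strategy(concurrent_users: int) -> str:
    """Rough guidance mirroring the decision matrix above."""
    if concurrent_users <= 5:
        return "single Ollama instance"
    if concurrent_users <= 20:
        return "2-3 replicas behind NGINX"
    if concurrent_users <= 100:
        return "multiple GPUs + response caching"
    return "consider vLLM + Kubernetes"

print(pick_scaling_strategy(12))  # -> "2-3 replicas behind NGINX"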
Scaling ensures your local LLM infrastructure can handle production load. In the final lesson, we'll discuss next steps for advancing your skills.