Lesson 21 of 22

Production Deployment

Scaling Local Inference

3 min read

Handle multiple users and high traffic by scaling your local LLM infrastructure with load balancing and replicas.

Scaling Strategies

┌─────────────────────────────────────────────────────────────────┐
│                   Local LLM Scaling Strategies                  │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  Strategy          │ Description           │ Best For           │
│  ──────────────────│───────────────────────│─────────────────   │
│  Vertical          │ Bigger GPU/more RAM   │ Single server      │
│  Horizontal        │ Multiple replicas     │ High availability  │
│  Model routing     │ Different models/GPUs │ Mixed workloads    │
│  Request queuing   │ Async processing      │ Batch workloads    │
│  Caching           │ Cache common queries  │ Repeated questions │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Load Balancing with NGINX

# nginx.conf
upstream ollama_backend {
    # Round-robin load balancing across replicas
    # (hostnames must resolve to your Ollama instances: container names, service names, or hosts)
    server ollama1:11434;
    server ollama2:11434;
    server ollama3:11434;

    # Optional: least connections strategy
    # least_conn;

    # Keep idle connections open to upstream servers to reduce reconnect overhead
    keepalive 32;
}

server {
    listen 80;
    server_name llm.example.com;

    location / {
        proxy_pass http://ollama_backend;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;

        # Streaming support
        proxy_buffering off;
        proxy_cache off;
        chunked_transfer_encoding on;

        # Timeouts for long-running inference
        proxy_connect_timeout 300s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;
    }
}
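
With the load balancer in place, clients talk to the single NGINX endpoint instead of individual replicas. A minimal sketch with the Python client, assuming llm.example.com resolves to the NGINX host:

import ollama

# Point the client at the NGINX endpoint; NGINX spreads requests across
# ollama1-3 using the upstream block defined above.
client = ollama.Client(host="http://llm.example.com:80")

response = client.generate(model="llama3.2", prompt="Hello from behind the load balancer!")
print(response["response"])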

Docker Compose with Replicas

# docker-compose.scale.yml
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    volumes:
      - ollama_shared:/root/.ollama
    deploy:
      replicas: 3
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    environment:
      - OLLAMA_KEEP_ALIVE=24h

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - ollama

volumes:
  ollama_shared:

# Scale to 3 replicas
docker compose -f docker-compose.scale.yml up -d --scale ollama=3
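
Because every replica mounts the same ollama_shared volume, a model pulled once is on disk for all of them. A minimal sketch that pre-pulls the model through the load balancer before sending traffic (assumes the stack above is running and NGINX is published on localhost:80):

import ollama

# Pull once through the NGINX endpoint; the weights land in the shared
# ollama_shared volume, so every replica can serve the model from disk.
client = ollama.Client(host="http://localhost:80")
client.pull("llama3.2")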

Request Queue with Redis

import redis
import json
import ollama
from typing import Optional
import threading
import time

class OllamaQueue:
    """Queue-based request handling for rate limiting."""

    def __init__(self, redis_url: str = "redis://localhost:6379"):
        self.redis = redis.from_url(redis_url)
        self.queue_name = "ollama_requests"
        self.results_prefix = "ollama_result:"

    def submit(self, request_id: str, model: str, prompt: str) -> str:
        """Submit a request to the queue."""
        request = {
            "id": request_id,
            "model": model,
            "prompt": prompt,
            "timestamp": time.time()
        }
        self.redis.rpush(self.queue_name, json.dumps(request))
        return request_id

    def get_result(self, request_id: str, timeout: int = 300) -> Optional[dict]:
        """Wait for and retrieve result."""
        key = f"{self.results_prefix}{request_id}"

        for _ in range(timeout):
            result = self.redis.get(key)
            if result:
                self.redis.delete(key)
                return json.loads(result)
            time.sleep(1)

        return None

    def process_queue(self):
        """Worker that processes queued requests."""
        while True:
            # Blocking pop from queue
            _, request_json = self.redis.blpop(self.queue_name)
            request = json.loads(request_json)

            try:
                # Process with Ollama
                response = ollama.generate(
                    model=request["model"],
                    prompt=request["prompt"]
                )

                result = {
                    "success": True,
                    "response": response["response"]
                }
            except Exception as e:
                result = {
                    "success": False,
                    "error": str(e)
                }

            # Store result
            key = f"{self.results_prefix}{request['id']}"
            self.redis.setex(key, 3600, json.dumps(result))

# Start worker in background
queue = OllamaQueue()
worker = threading.Thread(target=queue.process_queue, daemon=True)
worker.start()

# Submit requests
request_id = queue.submit("req-001", "llama3.2", "What is AI?")
result = queue.get_result(request_id)
print(result)
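
Throughput can then be raised by running several workers against the same queue: BLPOP delivers each request to exactly one consumer, so no request is processed twice. A minimal sketch reusing the OllamaQueue above (NUM_WORKERS is an illustrative value; extra workers only help if the backend serves requests in parallel or each worker targets a different replica):

# Instead of a single worker thread, run several against the same queue;
# Redis BLPOP hands each queued request to exactly one of them.
NUM_WORKERS = 3  # illustrative: roughly one worker per Ollama backend

workers = [
    threading.Thread(target=queue.process_queue, daemon=True)
    for _ in range(NUM_WORKERS)
]
for w in workers:
    w.start()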

Response Caching

import hashlib
import json
import redis
import ollama
from typing import Optional

class CachedOllama:
    """Ollama client with response caching."""

    def __init__(self, redis_url: str = "redis://localhost:6379", ttl: int = 3600):
        self.redis = redis.from_url(redis_url)
        self.ttl = ttl

    def _cache_key(self, model: str, prompt: str, options: dict) -> str:
        """Generate cache key from request parameters."""
        content = json.dumps({
            "model": model,
            "prompt": prompt,
            "options": options
        }, sort_keys=True)
        return f"ollama_cache:{hashlib.sha256(content.encode()).hexdigest()}"

    def generate(self, model: str, prompt: str, **options) -> dict:
        """Generate with caching."""
        cache_key = self._cache_key(model, prompt, options)

        # Check cache
        cached = self.redis.get(cache_key)
        if cached:
            return json.loads(cached)

        # Generate new response
        response = ollama.generate(model=model, prompt=prompt, **options)

        # Cache result
        self.redis.setex(cache_key, self.ttl, json.dumps(response))

        return response

    def cache_stats(self) -> dict:
        """Get cache statistics."""
        # SCAN iterates incrementally instead of blocking Redis the way KEYS can
        keys = list(self.redis.scan_iter("ollama_cache:*"))
        return {
            "cached_responses": len(keys),
            "memory_usage_bytes": sum(
                self.redis.memory_usage(k) or 0 for k in keys
            )
        }

# Usage
client = CachedOllama()

# First call - generates and caches
response1 = client.generate("llama3.2", "What is Python?")

# Second call - returns cached response instantly
response2 = client.generate("llama3.2", "What is Python?")

print(client.cache_stats())
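
Cached entries above only disappear when their TTL expires; after upgrading a model you may want to drop stale answers explicitly. A minimal sketch of an invalidation helper (clear_cache is our own name, built on the CachedOllama client above):

def clear_cache(client: CachedOllama) -> int:
    """Delete every cached response, e.g. after switching to a new model version."""
    deleted = 0
    # SCAN walks the keyspace incrementally instead of blocking Redis like KEYS
    for key in client.redis.scan_iter("ollama_cache:*"):
        client.redis.delete(key)
        deleted += 1
    return deleted

print(f"Removed {clear_cache(client)} cached responses")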

Model-Based Routing

import ollama
from typing import Optional

class ModelRouter:
    """Route requests to appropriate models based on task."""

    def __init__(self):
        self.model_map = {
            "code": "deepseek-coder:6.7b",
            "chat": "llama3.2",
            "embedding": "nomic-embed-text",
            "fast": "phi3:mini",
        }

    def classify_task(self, prompt: str) -> str:
        """Classify the task type from prompt."""
        prompt_lower = prompt.lower()

        if any(kw in prompt_lower for kw in ["code", "function", "debug", "program"]):
            return "code"
        if any(kw in prompt_lower for kw in ["embed", "vector", "similarity"]):
            return "embedding"
        if len(prompt) < 50:
            return "fast"
        return "chat"

    def generate(self, prompt: str, task_type: Optional[str] = None) -> dict:
        """Route request to appropriate model."""
        if task_type is None:
            task_type = self.classify_task(prompt)

        model = self.model_map.get(task_type, "llama3.2")

        if task_type == "embedding":
            return ollama.embed(model=model, input=prompt)
        else:
            return ollama.generate(model=model, prompt=prompt)

# Usage
router = ModelRouter()

# Automatically routes to code model
code_response = router.generate("Write a Python function to sort a list")

# Automatically routes to fast model
quick_response = router.generate("What is 2+2?")

# Explicitly specify task
chat_response = router.generate("Tell me about AI", task_type="chat")
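
The same idea extends across machines, matching the "different models/GPUs" strategy from the table: each task type can map to its own Ollama host as well as its own model. A minimal sketch building on the ModelRouter above; the host URLs and the generate_on_host helper are illustrative assumptions:

# Route each task type to its own Ollama host so heavy code models and
# light chat models can live on separate GPUs or machines.
HOST_MAP = {
    "code": "http://ollama-gpu0:11434",
    "chat": "http://ollama-gpu1:11434",
    "fast": "http://ollama-gpu1:11434",
    "embedding": "http://ollama-gpu1:11434",
}

def generate_on_host(router: ModelRouter, prompt: str) -> dict:
    """Classify the prompt, then send it to the host and model for that task."""
    task_type = router.classify_task(prompt)
    model = router.model_map.get(task_type, "llama3.2")
    client = ollama.Client(host=HOST_MAP.get(task_type, "http://localhost:11434"))
    if task_type == "embedding":
        return client.embed(model=model, input=prompt)
    return client.generate(model=model, prompt=prompt)

code_result = generate_on_host(router, "Debug this Python function for me")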

Health Monitoring

import ollama
import time
import threading
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class ServerHealth:
    url: str
    healthy: bool
    latency_ms: float
    last_check: float

class HealthMonitor:
    """Monitor health of multiple Ollama instances."""

    def __init__(self, servers: List[str], check_interval: int = 30):
        self.servers = servers
        self.check_interval = check_interval
        self.health: Dict[str, ServerHealth] = {}
        self._start_monitoring()

    def _check_server(self, url: str) -> ServerHealth:
        """Check health of a single server."""
        try:
            client = ollama.Client(host=url)
            start = time.time()
            client.list()  # Simple health check
            latency = (time.time() - start) * 1000

            return ServerHealth(
                url=url,
                healthy=True,
                latency_ms=latency,
                last_check=time.time()
            )
        except Exception:
            return ServerHealth(
                url=url,
                healthy=False,
                latency_ms=-1,
                last_check=time.time()
            )

    def _monitor_loop(self):
        """Continuous monitoring loop."""
        while True:
            for server in self.servers:
                self.health[server] = self._check_server(server)
            time.sleep(self.check_interval)

    def _start_monitoring(self):
        """Start background monitoring thread."""
        thread = threading.Thread(target=self._monitor_loop, daemon=True)
        thread.start()

    def get_healthy_servers(self) -> List[str]:
        """Get list of healthy servers."""
        return [
            url for url, health in self.health.items()
            if health.healthy
        ]

    def get_fastest_server(self) -> str:
        """Get the server with lowest latency."""
        healthy = [h for h in self.health.values() if h.healthy]
        if not healthy:
            raise RuntimeError("No healthy servers available")
        return min(healthy, key=lambda h: h.latency_ms).url

# Usage
monitor = HealthMonitor([
    "http://ollama1:11434",
    "http://ollama2:11434",
    "http://ollama3:11434"
])

# Wait for initial health check
time.sleep(2)

# Use fastest healthy server
fastest = monitor.get_fastest_server()
client = ollama.Client(host=fastest)
response = client.generate(model="llama3.2", prompt="Hello!")
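
The health data can also drive failover at request time: try the fastest healthy server first and fall back if it fails mid-request. A minimal sketch built on the HealthMonitor above (generate_with_failover is our own helper):

def generate_with_failover(monitor: HealthMonitor, model: str, prompt: str) -> dict:
    """Try healthy servers fastest-first, falling back if one fails mid-request."""
    candidates = sorted(
        (h for h in monitor.health.values() if h.healthy),
        key=lambda h: h.latency_ms,
    )
    last_error = None
    for server in candidates:
        try:
            client = ollama.Client(host=server.url)
            return client.generate(model=model, prompt=prompt)
        except Exception as e:
            # Server likely went down since the last health check; try the next one
            last_error = e
    raise RuntimeError(f"All Ollama servers failed; last error: {last_error}")

response = generate_with_failover(monitor, "llama3.2", "Hello with failover!")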

Scaling Decision Matrix

Concurrent users │ Strategy
─────────────────│────────────────────────────
1-5              │ Single Ollama instance
5-20             │ 2-3 replicas with NGINX
20-100           │ Multiple GPUs + caching
100+             │ Consider vLLM + Kubernetes

Scaling ensures your local LLM infrastructure can handle production load. In the final lesson, we'll discuss next steps for advancing your skills.
