vLLM & Open-Source Inference Engines
vLLM Production Deployment
4 min read
Moving from development to production requires careful configuration for reliability, performance, and scalability.
Docker Deployment
Basic Docker Setup
# Dockerfile for vLLM production
FROM vllm/vllm-openai:latest

# Set environment variables
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV CUDA_VISIBLE_DEVICES=0,1,2,3

# Health check. A 70B model takes minutes to load, so give the container a
# generous start period before failed probes count against it (matches the
# docker-compose healthcheck settings used elsewhere in this guide).
HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=120s \
  CMD curl -f http://localhost:8000/health || exit 1

# Default command (arguments are appended to the image's vLLM entrypoint)
CMD ["--model", "meta-llama/Llama-3.3-70B-Instruct", \
     "--tensor-parallel-size", "4", \
     "--port", "8000"]
Docker Compose for Multi-GPU
# docker-compose.yml
version: '3.8'  # informational under Compose v2; kept for older tooling

services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: vllm-server
    ports:
      - "8000:8000"
    volumes:
      - ~/.cache/huggingface:/root/.cache/huggingface
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
      - CUDA_VISIBLE_DEVICES=0,1,2,3
    # Tensor-parallel workers exchange data over shared memory; the 64MB
    # container default for /dev/shm is far too small. Share the host IPC
    # namespace (alternatively, set a large shm_size).
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 4
              capabilities: [gpu]
    command: >
      --model meta-llama/Llama-3.3-70B-Instruct
      --tensor-parallel-size 4
      --gpu-memory-utilization 0.9
      --max-model-len 32768
      --enable-prefix-caching
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s  # allow time for model weights to load
    restart: unless-stopped
Kubernetes Deployment
Basic Deployment
# vllm-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
  labels:
    app: vllm
spec:
  replicas: 1  # Scale with HPA
  selector:
    matchLabels:
      app: vllm
  template:
    metadata:
      labels:
        app: vllm
    spec:
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          ports:
            - containerPort: 8000
          args:
            - "--model"
            - "meta-llama/Llama-3.3-70B-Instruct"
            - "--tensor-parallel-size"
            - "4"
            - "--gpu-memory-utilization"
            - "0.9"
            - "--port"
            - "8000"
          resources:
            limits:
              nvidia.com/gpu: 4
              memory: "128Gi"
            requests:
              nvidia.com/gpu: 4
              memory: "64Gi"
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          volumeMounts:
            - name: model-cache
              mountPath: /root/.cache/huggingface
            # Tensor-parallel NCCL communication needs far more shared
            # memory than the 64Mi container default provides.
            - name: shm
              mountPath: /dev/shm
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 120  # model load can take minutes
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 60
            periodSeconds: 10
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache-pvc
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: "16Gi"
      nodeSelector:
        nvidia.com/gpu.product: "NVIDIA-A100-SXM4-80GB"
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-service
spec:
  selector:
    app: vllm
  ports:
    - port: 8000
      targetPort: 8000
  type: ClusterIP
Horizontal Pod Autoscaler
# vllm-hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: vllm-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: vllm-server
  minReplicas: 1
  maxReplicas: 4
  metrics:
    # NOTE(review): a Pods-type metric requires a custom-metrics adapter
    # (e.g. prometheus-adapter) exposing vllm_num_requests_waiting — verify
    # one is installed in the cluster.
    - type: Pods
      pods:
        metric:
          name: vllm_num_requests_waiting
        target:
          type: AverageValue
          averageValue: "10"  # Scale when queue > 10
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 80
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Pods
          value: 1
          periodSeconds: 60
    scaleDown:
      # Scale down slowly: each replica is expensive and slow to restart.
      stabilizationWindowSeconds: 300
      policies:
        - type: Pods
          value: 1
          periodSeconds: 120
Production Configuration
Optimized Server Settings
#!/bin/bash
# Production launch script.
#
# Flags are collected in an array because comment lines are NOT legal inside
# a backslash-continued command: backslash-newline splices the comment onto
# the command line, the `#` comments out the continuation backslash, and the
# command is silently truncated. Inside an array literal, comments are fine.
set -euo pipefail

export CUDA_VISIBLE_DEVICES=0,1,2,3
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_LOGGING_LEVEL=INFO

VLLM_ARGS=(
  # Parallelism
  --tensor-parallel-size 4
  # Memory optimization
  --gpu-memory-utilization 0.92
  --max-model-len 32768
  --enable-prefix-caching
  # Scheduling
  --enable-chunked-prefill
  --max-num-seqs 256
  --max-num-batched-tokens 8192
  # Quantization (optional)
  --quantization fp8
  # Server settings
  --port 8000
  --host 0.0.0.0
  --api-key "${VLLM_API_KEY}"
  # Logging
  --disable-log-requests  # Reduce I/O
  --disable-log-stats     # Or enable for monitoring
)

vllm serve meta-llama/Llama-3.3-70B-Instruct "${VLLM_ARGS[@]}"
Load Balancing
# nginx.conf for vLLM load balancing
upstream vllm_cluster {
    least_conn;                      # Route to the server with fewest active connections
    server vllm-1:8000 weight=1;
    server vllm-2:8000 weight=1;
    server vllm-3:8000 weight=1;
    keepalive 32;                    # Pool of idle upstream connections to reuse
}

server {
    listen 80;

    location /v1 {
        proxy_pass http://vllm_cluster;

        # HTTP/1.1 with a cleared Connection header is required for
        # upstream keepalive to take effect.
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_set_header Host $host;

        # Streaming support: don't buffer token-by-token responses
        proxy_buffering off;
        proxy_cache off;

        # Timeouts for long generations
        proxy_read_timeout 300s;
        proxy_send_timeout 300s;
    }

    location /health {
        proxy_pass http://vllm_cluster/health;
    }
}
Multi-Node Deployment
For models larger than single-node capacity:
# Node 0 (Ray head) — starts the cluster coordinator
ray start --head --port=6379
# Node 1-N (Ray workers) — replace head-node-ip with the head node's address
ray start --address='head-node-ip:6379'
# Launch vLLM across the cluster (run on the head node):
# 8-way tensor parallel per stage x 2 pipeline stages = 16 GPUs total
vllm serve meta-llama/Llama-3.1-405B-Instruct \
--tensor-parallel-size 8 \
--pipeline-parallel-size 2 \
--distributed-executor-backend ray
# Kubernetes multi-node with Ray (requires the KubeRay operator)
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: vllm-cluster
spec:
  rayVersion: '2.9.0'
  headGroupSpec:
    rayStartParams:
      dashboard-host: '0.0.0.0'
    template:
      spec:
        containers:
          - name: ray-head
            image: vllm/vllm-openai:latest
            resources:
              limits:
                nvidia.com/gpu: 8
  workerGroupSpecs:
    # groupName is a required field of the KubeRay RayCluster CRD
    - groupName: gpu-workers
      replicas: 1
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-worker
              image: vllm/vllm-openai:latest
              resources:
                limits:
                  nvidia.com/gpu: 8
Monitoring & Health Checks
# Custom health check endpoint
import httpx
import asyncio
async def check_vllm_health(base_url: str, timeout: float = 5.0) -> dict:
    """Query a vLLM server's health endpoint and key Prometheus metrics.

    Args:
        base_url: Root URL of the vLLM server, e.g. "http://localhost:8000".
        timeout: Per-request timeout in seconds (defaults to 5.0).

    Returns:
        Dict with a boolean "healthy" flag plus queue/cache gauges parsed
        from the /metrics endpoint. Metric values are None when the metric
        is absent; all are None when the server is unreachable.
    """
    import re  # local import keeps the snippet self-contained

    def _parse_metric(text: str, name: str):
        # Prometheus exposition format: `<name>{labels} <value>`. Take the
        # first sample for the metric, ignoring any label set.
        pattern = rf"^{re.escape(name)}(?:\{{[^}}]*\}})?\s+([0-9.eE+-]+)"
        match = re.search(pattern, text, flags=re.MULTILINE)
        return float(match.group(1)) if match else None

    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            health = await client.get(f"{base_url}/health")
            metrics = await client.get(f"{base_url}/metrics")
    except httpx.HTTPError:
        # Server unreachable or timed out: report unhealthy, no metrics.
        return {
            "healthy": False,
            "num_requests_running": None,
            "num_requests_waiting": None,
            "gpu_cache_usage": None,
        }

    body = metrics.text
    return {
        "healthy": health.status_code == 200,
        "num_requests_running": _parse_metric(body, "vllm_num_requests_running"),
        "num_requests_waiting": _parse_metric(body, "vllm_num_requests_waiting"),
        "gpu_cache_usage": _parse_metric(body, "vllm_gpu_cache_usage_perc"),
    }
Production deployment requires balancing performance, cost, and reliability. Start conservative and tune based on actual load patterns.
Next, we'll explore alternative inference engines and when to choose them.