LLM Gateways & Routing
Model Routing Strategies
3 min read
Effective model routing optimizes for cost, latency, and quality by dynamically selecting the best model for each request. This lesson covers routing patterns for production LLM deployments.
Routing Decision Factors
┌─────────────────────────────────────────────────────────────┐
│                   Routing Decision Matrix                   │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Request Attributes          Model Selection Criteria       │
│  ──────────────────          ────────────────────────       │
│  • Task complexity           • Cost per token               │
│  • Required quality          • Latency (TTFT, generation)   │
│  • Context length            • Context window size          │
│  • User tier                 • Rate limits available        │
│  • Budget remaining          • Current availability         │
│                                                             │
│                   ┌─────────────────────┐                   │
│                   │  Routing Decision   │                   │
│                   │       Engine        │                   │
│                   └──────────┬──────────┘                   │
│                              │                              │
│              ┌───────────────┼───────────────┐              │
│              ▼               ▼               ▼              │
│           GPT-4o       Claude Sonnet     Llama 70B          │
│          (Premium)      (Balanced)      (Cost-opt)          │
│                                                             │
└─────────────────────────────────────────────────────────────┘
Strategy 1: Complexity-Based Routing
Route based on task complexity analysis:
from litellm import completion
import re
# One compiled pattern per complexity tier. Dict order is the check order, so
# "high" keywords win over "medium", which win over "low".
# Each keyword must start at a word boundary: this still matches inflections
# ("compares", "created") but no longer fires on keywords buried inside other
# words ("list" in "specialist", "format" in "information").
_COMPLEXITY_PATTERNS = {
    level: re.compile(r"\b(?:" + "|".join(map(re.escape, keywords)) + r")")
    for level, keywords in {
        "high": ["analyze", "compare", "evaluate", "synthesize", "create"],
        "medium": ["explain", "describe", "summarize", "list"],
        "low": ["what is", "define", "translate", "format"],
    }.items()
}


def analyze_complexity(prompt: str) -> str:
    """Estimate task complexity from prompt.

    The prompt is lower-cased and scanned for tier keywords; the first tier
    (in the order high, medium, low) with a keyword starting a word in the
    prompt is returned. When no keyword matches, the prompt length decides.

    Args:
        prompt: Raw user prompt text.

    Returns:
        One of ``"high"``, ``"medium"`` or ``"low"``.
    """
    prompt_lower = prompt.lower()
    for level, pattern in _COMPLEXITY_PATTERNS.items():
        if pattern.search(prompt_lower):
            return level

    # No keyword hit: fall back to prompt length as a rough proxy.
    if len(prompt) > 2000:
        return "high"
    if len(prompt) > 500:
        return "medium"
    return "low"
def route_by_complexity(prompt: str, messages: list) -> str:
    """Pick the model name for a prompt from its estimated complexity tier.

    Args:
        prompt: Prompt text handed to ``analyze_complexity``.
        messages: Accepted for interface compatibility; not consulted here.

    Returns:
        The model name mapped to the tier of ``prompt``.
    """
    tier_to_model = dict(
        high="gpt-4o",  # $5/1M tokens
        medium="claude-sonnet",  # $3/1M tokens
        low="gpt-4o-mini",  # $0.15/1M tokens
    )
    return tier_to_model[analyze_complexity(prompt)]
# Usage
prompt = "Compare the economic policies of three countries"
# Build the conversation once so the router and the completion call see the
# same messages (the router was previously handed a placeholder empty list).
messages = [{"role": "user", "content": prompt}]
model = route_by_complexity(prompt, messages)
response = completion(model=model, messages=messages)
Strategy 2: Semantic Routing
Use embeddings to classify requests:
from litellm import embedding, completion
import numpy as np
# Routing table. Each route holds:
#   "examples": sample requests that get_route_embeddings() embeds and
#               averages into one vector for the route;
#   "model":    the model name semantic_route() returns when this route is
#               the best match for a query.
# Note: this table stores example text, not embeddings; the embeddings are
# computed from it by get_route_embeddings().
ROUTES = {
    "code_generation": {
        "examples": ["Write a Python function", "Create a REST API"],
        "model": "gpt-4o",  # Best for code
    },
    "creative_writing": {
        "examples": ["Write a story", "Compose a poem"],
        "model": "claude-sonnet",  # Best for creative
    },
    "factual_qa": {
        "examples": ["What is the capital of France?", "When was Einstein born?"],
        "model": "gpt-4o-mini",  # Cost-effective for simple QA
    }
}
def get_route_embeddings():
    """Build one averaged embedding vector per route in ``ROUTES``.

    Every example string of a route is embedded with its own
    ``text-embedding-3-small`` call, and the resulting vectors are averaged
    element-wise.

    Returns:
        Dict mapping each route name to the mean of its example embeddings.
    """

    def embed(text):
        # One embedding request per example text.
        reply = embedding(model="text-embedding-3-small", input=text)
        return reply.data[0].embedding

    return {
        name: np.mean([embed(sample) for sample in spec["examples"]], axis=0)
        for name, spec in ROUTES.items()
    }
def semantic_route(query: str, route_embeddings: dict) -> str:
    """Route based on semantic similarity.

    Embeds ``query`` and returns the model of the route whose vector has the
    highest cosine similarity with it.

    Args:
        query: Request text to classify.
        route_embeddings: Mapping of route name to route vector, e.g. the
            result of ``get_route_embeddings()``.

    Returns:
        The ``"model"`` entry of the best-matching route in ``ROUTES``.

    Raises:
        KeyError: If no route could be scored (``route_embeddings`` is empty
            or every vector has zero length), or if the best route name is
            not present in ``ROUTES``.
    """
    query_vec = np.asarray(
        embedding(model="text-embedding-3-small", input=query).data[0].embedding,
        dtype=float,
    )
    query_norm = np.linalg.norm(query_vec)

    best_route = None
    # Start below any reachable score so the first scored route always wins.
    best_similarity = float("-inf")
    for route, route_emb in route_embeddings.items():
        route_vec = np.asarray(route_emb, dtype=float)
        # Averaged route vectors are generally not unit length, so a raw dot
        # product would favour routes with larger norms; normalise to get
        # cosine similarity instead.
        denom = query_norm * np.linalg.norm(route_vec)
        if denom == 0:
            continue  # A zero vector has no direction to compare against.
        similarity = float(np.dot(query_vec, route_vec) / denom)
        if similarity > best_similarity:
            best_similarity = similarity
            best_route = route

    if best_route is None:
        raise KeyError("no route embeddings available to match the query against")
    return ROUTES[best_route]["model"]
Strategy 3: Fallback Chains
Handle failures gracefully with fallbacks:
from litellm import acompletion, completion
from litellm.exceptions import RateLimitError, ServiceUnavailableError, Timeout
# Ordered list of models that completion_with_fallback() tries, first entry
# first. Each entry's "timeout" is passed through to the completion call
# (presumably seconds -- confirm against the client library).
FALLBACK_CHAIN = [
    {"model": "gpt-4o", "timeout": 30},
    {"model": "azure/gpt-4o", "timeout": 30},
    {"model": "claude-sonnet-4-20250514", "timeout": 30},
    {"model": "gpt-4o-mini", "timeout": 60},  # Last resort
]
async def completion_with_fallback(messages: list) -> dict:
    """Try models in sequence until success.

    Walks ``FALLBACK_CHAIN`` in order and returns the first successful
    completion. Rate-limit, service-unavailable and timeout errors move on to
    the next entry; any other exception propagates immediately.

    Args:
        messages: Chat messages forwarded unchanged to every attempt.

    Returns:
        Dict with ``"response"`` (the completion result), ``"model_used"``
        (model name of the entry that succeeded) and ``"fallback_count"``
        (how many earlier entries failed first).

    Raises:
        RuntimeError: If ``FALLBACK_CHAIN`` is empty.
        RateLimitError, ServiceUnavailableError, Timeout: The error from the
            last entry when every entry failed.
    """
    last_error = None
    # enumerate() gives the attempt position directly; looking the config up
    # with list.index() would report the wrong count for duplicate entries.
    for attempt, config in enumerate(FALLBACK_CHAIN):
        try:
            response = await acompletion(
                model=config["model"],
                messages=messages,
                timeout=config["timeout"],
            )
        except (RateLimitError, ServiceUnavailableError, Timeout) as e:
            # A timed-out provider is treated like an unavailable one so the
            # per-entry timeout actually triggers the fallback.
            last_error = e
            continue
        return {
            "response": response,
            "model_used": config["model"],
            "fallback_count": attempt,
        }

    if last_error is None:
        # Nothing was attempted; `raise None` would be a confusing TypeError.
        raise RuntimeError("FALLBACK_CHAIN is empty; no model to call")
    raise last_error
Strategy 4: A/B Testing Router
Compare model performance in production:
import random
from dataclasses import dataclass
from typing import Optional
@dataclass
class ABConfig:
    """Settings for a two-variant model experiment."""

    control_model: str  # model returned for the "control" variant
    treatment_model: str  # model returned for the "treatment" variant
    treatment_percentage: float  # fraction of users in treatment, e.g. 0.2 = 20%


class ABRouter:
    """Assign users to control/treatment models and collect per-variant scores."""

    # Number of hash buckets; 10,000 allows splits as fine as 0.01%.
    _BUCKETS = 10_000

    def __init__(self, config: ABConfig):
        self.config = config
        self.results = {"control": [], "treatment": []}

    def route(self, user_id: str) -> tuple[str, str]:
        """Deterministic routing based on user_id.

        Args:
            user_id: Stable identifier of the user being routed.

        Returns:
            ``(model_name, variant)`` where variant is ``"treatment"`` or
            ``"control"``.
        """
        import hashlib

        # The built-in hash() of a str is salted per process, so it would
        # reshuffle users on every restart and across workers. A
        # cryptographic digest gives the same bucket everywhere.
        digest = hashlib.sha256(user_id.encode("utf-8")).digest()
        bucket = int.from_bytes(digest[:8], "big") % self._BUCKETS
        # Compare as a fraction: scaling the percentage up instead
        # (e.g. 0.07 * 100 == 7.000000000000001) lets an extra bucket through.
        if bucket / self._BUCKETS < self.config.treatment_percentage:
            return self.config.treatment_model, "treatment"
        return self.config.control_model, "control"

    def record_result(self, variant: str, score: float):
        """Append ``score`` to the results of ``variant``.

        Raises:
            KeyError: If ``variant`` is not ``"control"`` or ``"treatment"``.
        """
        self.results[variant].append(score)

    def get_statistics(self) -> dict:
        """Return ``{variant: {"count": n, "mean": m}}``; mean is 0 with no scores."""
        return {
            variant: {
                "count": len(scores),
                "mean": sum(scores) / len(scores) if scores else 0,
            }
            for variant, scores in self.results.items()
        }
# Usage: split traffic between a control and a treatment model.
ab_router = ABRouter(
    config=ABConfig(
        control_model="gpt-4o",
        treatment_model="claude-sonnet-4-20250514",
        treatment_percentage=0.2,  # 20% to treatment
    )
)
model, variant = ab_router.route("user-123")
Strategy 5: Cost-Optimized Routing
Stay within budget while maintaining quality:
from dataclasses import dataclass
@dataclass
class ModelCost:
    """Price and quality metadata for one routable model."""

    model: str
    input_cost_per_1m: float  # cost per 1M input tokens
    output_cost_per_1m: float  # cost per 1M output tokens
    quality_score: float  # 0-1


# Catalogue of models that BudgetRouter chooses from.
MODELS = [
    ModelCost("gpt-4o", 5.0, 15.0, 0.95),
    ModelCost("claude-sonnet", 3.0, 15.0, 0.92),
    ModelCost("gpt-4o-mini", 0.15, 0.6, 0.80),
    ModelCost("llama-3.1-70b", 0.9, 0.9, 0.85),
]


class BudgetRouter:
    """Pick models under a daily spending cap and a minimum quality score."""

    def __init__(self, daily_budget: float, min_quality: float = 0.8):
        self.daily_budget = daily_budget
        self.min_quality = min_quality
        self.spent_today = 0.0

    def route(self, estimated_tokens: int) -> str:
        """Select cheapest model meeting quality threshold.

        Candidates are ordered by combined input + output list price. The
        first one whose estimated request cost fits the remaining budget is
        returned; if none fits, the cheapest candidate is returned anyway.

        Args:
            estimated_tokens: Expected input tokens; output is assumed to be
                half that many tokens when estimating cost.

        Returns:
            The chosen model name.

        Raises:
            ValueError: If no model in ``MODELS`` reaches ``min_quality``.
        """
        remaining = self.daily_budget - self.spent_today

        # Filter by quality and sort by cost
        candidates = sorted(
            (m for m in MODELS if m.quality_score >= self.min_quality),
            key=lambda m: m.input_cost_per_1m + m.output_cost_per_1m,
        )
        if not candidates:
            # Fail with a clear message instead of a bare IndexError below.
            raise ValueError(
                f"no model meets min_quality={self.min_quality!r}"
            )

        for model in candidates:
            estimated_cost = (
                (estimated_tokens * model.input_cost_per_1m / 1_000_000) +
                (estimated_tokens * 0.5 * model.output_cost_per_1m / 1_000_000)
            )
            if estimated_cost <= remaining:
                return model.model

        # Budget exceeded, use cheapest
        return candidates[0].model

    def record_spend(self, tokens: int, model: str, output_tokens: "int | None" = None):
        """Add the cost of a finished request to ``spent_today``.

        Args:
            tokens: Token count. Without ``output_tokens`` it is billed at
                the input and the output rate together (the original,
                conservative behaviour). With ``output_tokens`` it is the
                input token count.
            model: Name of a model listed in ``MODELS``.
            output_tokens: Optional output token count, billed at the output
                rate, for exact accounting.

        Raises:
            ValueError: If ``model`` is not in ``MODELS``.
        """
        model_config = next((m for m in MODELS if m.model == model), None)
        if model_config is None:
            # A bare next() would leak StopIteration to the caller.
            raise ValueError(f"unknown model: {model!r}")

        if output_tokens is None:
            cost = tokens * (model_config.input_cost_per_1m + model_config.output_cost_per_1m) / 1_000_000
        else:
            cost = (
                tokens * model_config.input_cost_per_1m
                + output_tokens * model_config.output_cost_per_1m
            ) / 1_000_000
        self.spent_today += cost
Routing Best Practices
┌─────────────────────────────────────────────────────────────┐
│                Production Routing Checklist                 │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  1. Always have fallbacks                                   │
│     └── Never depend on a single provider                   │
│                                                             │
│  2. Monitor routing decisions                               │
│     └── Log model selection with request metadata           │
│                                                             │
│  3. A/B test model changes                                  │
│     └── Don't assume newer = better for your use case       │
│                                                             │
│  4. Consider latency, not just cost                         │
│     └── Cheaper models may have higher latency              │
│                                                             │
│  5. Implement circuit breakers                              │
│     └── Stop routing to failing providers quickly           │
│                                                             │
│  6. Cache routing decisions                                 │
│     └── Semantic routing embeddings are expensive           │
│                                                             │
└─────────────────────────────────────────────────────────────┘
:::