LLM Application Architecture
Cost Optimization
LLM costs can spiral quickly. At scale, optimizing costs becomes as important as optimizing performance. This lesson covers strategies to keep costs under control.
Understanding LLM Costs
Token-Based Pricing
```python
# Typical pricing (as of 2024, USD per 1K tokens)
PRICING = {
    "gpt-4": {"input": 0.03, "output": 0.06},
    "gpt-4-turbo": {"input": 0.01, "output": 0.03},
    "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
    "claude-3-opus": {"input": 0.015, "output": 0.075},
    "claude-3-sonnet": {"input": 0.003, "output": 0.015},
}

def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    pricing = PRICING[model]
    input_cost = (input_tokens / 1000) * pricing["input"]
    output_cost = (output_tokens / 1000) * pricing["output"]
    return input_cost + output_cost

# Example: 2K input, 500 output with GPT-4
cost = estimate_cost("gpt-4", 2000, 500)
# = (2 * 0.03) + (0.5 * 0.06) = $0.09 per request
```
Cost at Scale
Assuming the same 2K-input / 500-output request profile as the example above and a 30-day month:

| Daily Requests | Model | Monthly Cost |
|---|---|---|
| 10,000 | GPT-4 | ~$27,000 |
| 10,000 | GPT-4-Turbo | ~$10,500 |
| 10,000 | GPT-3.5-Turbo | ~$525 |
| 10,000 | Claude-3-Sonnet | ~$4,050 |
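These figures follow directly from `estimate_cost`; a minimal sketch of the arithmetic (the request profile and the 30-day month are assumptions, not provider terms):

```python
def monthly_cost(model: str, daily_requests: int = 10_000) -> float:
    # Assumes 2K input / 500 output tokens per request, 30-day month
    per_request = estimate_cost(model, 2000, 500)
    return per_request * daily_requests * 30

print(f"${monthly_cost('gpt-4'):,.0f}")  # $27,000
```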
Strategy 1: Model Routing
Route requests to the cheapest model that can handle them.
```python
class ModelRouter:
    def __init__(self, complexity_classifier=None):
        # Optional trained classifier; heuristics alone are used without one
        self.complexity_classifier = complexity_classifier
        self.models = {
            "simple": "gpt-3.5-turbo",   # Simple queries
            "medium": "gpt-4-turbo",     # Complex but not critical
            "complex": "gpt-4",          # Critical, complex tasks
        }

    def classify_complexity(self, query: str) -> str:
        # Cheap heuristic: short queries without analysis keywords are simple
        if len(query) < 100 and not any(
            word in query.lower()
            for word in ["analyze", "complex", "detailed", "compare"]
        ):
            return "simple"
        if self.complexity_classifier is None:
            return "medium"  # Conservative default when no classifier is configured
        # Use a classifier for more accuracy
        complexity_score = self.complexity_classifier.predict(query)
        if complexity_score < 0.3:
            return "simple"
        elif complexity_score < 0.7:
            return "medium"
        return "complex"

    async def route(self, query: str) -> str:
        complexity = self.classify_complexity(query)
        model = self.models[complexity]
        # call_model is assumed to wrap your provider SDK
        return await self.call_model(model, query)
```
Production results: 40-60% cost reduction with <5% quality degradation.
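A quick sanity check of the heuristic path (no classifier configured, so non-trivial queries default to "medium"):

```python
router = ModelRouter()  # Heuristics only
print(router.classify_complexity("What is the capital of France?"))        # simple
print(router.classify_complexity("Analyze the tradeoffs of SQL vs NoSQL")) # medium
```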
Strategy 2: Prompt Optimization
Reduce token count without losing quality.
```python
class PromptOptimizer:
    def __init__(self):
        # Verbose phrase -> concise replacement
        self.abbreviations = {
            "Please provide": "Give",
            "In order to": "To",
            "It is important to note that": "Note:",
        }

    def optimize(self, prompt: str) -> str:
        optimized = prompt
        # Remove verbose phrases
        for verbose, concise in self.abbreviations.items():
            optimized = optimized.replace(verbose, concise)
        # Remove excessive whitespace
        optimized = " ".join(optimized.split())
        return optimized

    def estimate_savings(self, original: str, optimized: str) -> dict:
        original_tokens = len(original) / 4  # Rough estimate: ~4 chars/token
        optimized_tokens = len(optimized) / 4
        savings = (original_tokens - optimized_tokens) / original_tokens
        return {
            "original_tokens": int(original_tokens),
            "optimized_tokens": int(optimized_tokens),
            "savings_percent": round(savings * 100, 1),
        }
```
Before and After
```python
# Before: 47 tokens
prompt_before = """
Please provide a detailed summary of the following document.
It is important to note that you should focus on the key points
and main arguments presented by the author.
"""

# After: 18 tokens
prompt_after = """
Summarize this document. Focus on key points and main arguments.
"""

# Savings: 62% fewer tokens
```
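Running the optimizer over `prompt_before` shows the gap between mechanical substitutions and a genuine rewrite (figures come from the rough 4-chars-per-token estimate):

```python
optimizer = PromptOptimizer()
mechanical = optimizer.optimize(prompt_before)
print(optimizer.estimate_savings(prompt_before, mechanical))
# Phrase substitutions alone save roughly 20%; the manual rewrite
# above reaches ~62%. Automated trimming helps, but rewriting wins.
```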
Strategy 3: Context Window Management
Optimize what goes into the context.
```python
class ContextManager:
    def __init__(self, max_tokens: int = 4000):
        self.max_tokens = max_tokens

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        return len(text) // 4  # Rough estimate: ~4 chars/token

    def optimize_context(
        self,
        system_prompt: str,
        history: list,
        retrieved_docs: list,
        query: str
    ) -> dict:
        # Reserve tokens for the model's output
        available = self.max_tokens - 500
        # Priority allocation (all budgets in estimated tokens)
        allocations = {
            "system": min(500, self._estimate_tokens(system_prompt)),
            "query": self._estimate_tokens(query),
        }
        remaining = available - allocations["system"] - allocations["query"]
        # Prioritize retrieved docs over old history
        doc_budget = int(remaining * 0.7)
        history_budget = int(remaining * 0.3)
        return {
            # ~4 chars/token converts the budget back to characters
            "system": system_prompt[:allocations["system"] * 4],
            "history": self._truncate_history(history, history_budget),
            "docs": self._truncate_docs(retrieved_docs, doc_budget),
            "query": query
        }

    def _truncate_history(self, history: list, budget: int) -> list:
        # Keep the most recent messages that fit within the budget
        result = []
        tokens_used = 0
        for msg in reversed(history):
            msg_tokens = self._estimate_tokens(msg["content"])
            if tokens_used + msg_tokens > budget:
                break
            result.insert(0, msg)
            tokens_used += msg_tokens
        return result

    def _truncate_docs(self, docs: list, budget: int) -> list:
        # Keep the highest-ranked docs that fit within the budget
        result = []
        tokens_used = 0
        for doc in docs:
            doc_tokens = self._estimate_tokens(doc)
            if tokens_used + doc_tokens > budget:
                break
            result.append(doc)
            tokens_used += doc_tokens
        return result
```
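Usage with illustrative inputs (the documents and messages here are placeholders):

```python
manager = ContextManager(max_tokens=4000)
context = manager.optimize_context(
    system_prompt="You are a helpful assistant.",
    history=[{"role": "user", "content": "Hi"},
             {"role": "assistant", "content": "Hello! How can I help?"}],
    retrieved_docs=["Doc 1: pricing is per token...", "Doc 2: caching cuts cost..."],
    query="How do I reduce my LLM bill?",
)
print(len(context["docs"]), "docs,", len(context["history"]), "messages kept")
```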
Strategy 4: Fallback Chains
Use expensive models only when cheaper ones fail.
```python
class FallbackChain:
    def __init__(self):
        # Ordered cheapest-first; later entries are slower but stronger
        self.chain = [
            {"model": "gpt-3.5-turbo", "timeout": 5},
            {"model": "gpt-4-turbo", "timeout": 10},
            {"model": "gpt-4", "timeout": 30},
        ]

    async def execute(self, prompt: str, quality_check) -> dict:
        for i, config in enumerate(self.chain):
            try:
                # call_model is assumed to wrap your provider SDK
                response = await self.call_model(
                    config["model"],
                    prompt,
                    timeout=config["timeout"]
                )
                # Check if the response meets the quality threshold
                if quality_check(response):
                    return {
                        "response": response,
                        "model_used": config["model"],
                        "fallback_level": i
                    }
            except Exception:
                continue  # Try the next model
        raise RuntimeError("All models failed")

# Usage (inside an async context)
chain = FallbackChain()
result = await chain.execute(
    prompt="Explain quantum computing",
    quality_check=lambda r: len(r) > 100 and "quantum" in r.lower()
)
```
Cost Monitoring Dashboard
Track costs in real time:
```python
from collections import defaultdict
from datetime import datetime

DAILY_BUDGET = 100.0  # USD; tune to your own spend targets

class CostTracker:
    def __init__(self):
        self.daily_costs = defaultdict(float)
        self.model_costs = defaultdict(float)
        self.alerts = []

    def record(self, model: str, input_tokens: int, output_tokens: int):
        # estimate_cost is defined at the top of this lesson
        cost = estimate_cost(model, input_tokens, output_tokens)
        today = datetime.utcnow().date().isoformat()
        self.daily_costs[today] += cost
        self.model_costs[model] += cost
        # Check budget alerts
        if self.daily_costs[today] > DAILY_BUDGET:
            self.trigger_alert("Daily budget exceeded")

    def trigger_alert(self, message: str):
        self.alerts.append(message)  # Wire up to paging/Slack in production

    def get_report(self) -> dict:
        return {
            "today": self.daily_costs[datetime.utcnow().date().isoformat()],
            "this_week": sum(
                cost for date, cost in self.daily_costs.items()
                if self._is_this_week(date)
            ),
            "by_model": dict(self.model_costs)
        }

    def _is_this_week(self, date_str: str) -> bool:
        date = datetime.fromisoformat(date_str).date()
        return date.isocalendar()[:2] == datetime.utcnow().date().isocalendar()[:2]
```
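A minimal smoke test (a real deployment would call `record` from middleware around every model call):

```python
tracker = CostTracker()
tracker.record("gpt-4", input_tokens=2000, output_tokens=500)           # ~$0.090
tracker.record("gpt-3.5-turbo", input_tokens=2000, output_tokens=500)   # ~$0.002
print(tracker.get_report())
# -> today's spend (~$0.092), this week's total, and a per-model breakdown
```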
Now let's move to RAG system design, one of the most common interview topics.