Cost Optimization

LLM costs can spiral quickly. At scale, optimizing costs becomes as important as optimizing performance. This lesson covers strategies to keep costs under control.

Understanding LLM Costs

Token-Based Pricing

# Typical pricing (as of 2024)
PRICING = {
    "gpt-4": {"input": 0.03, "output": 0.06},      # per 1K tokens
    "gpt-4-turbo": {"input": 0.01, "output": 0.03},
    "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
    "claude-3-opus": {"input": 0.015, "output": 0.075},
    "claude-3-sonnet": {"input": 0.003, "output": 0.015},
}

def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    pricing = PRICING[model]
    input_cost = (input_tokens / 1000) * pricing["input"]
    output_cost = (output_tokens / 1000) * pricing["output"]
    return input_cost + output_cost

# Example: 2K input, 500 output with GPT-4
cost = estimate_cost("gpt-4", 2000, 500)
# = (2 * 0.03) + (0.5 * 0.06) = $0.09 per request
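
The character-count heuristics used later in this lesson are rough. For exact counts on OpenAI models you can use the tiktoken tokenizer. A minimal sketch, assuming the tiktoken package is installed (count_tokens is a helper introduced here; Anthropic models use their own tokenizer):

import tiktoken

def count_tokens(text: str, model: str = "gpt-4") -> int:
    # Exact token count for OpenAI models
    enc = tiktoken.encoding_for_model(model)
    return len(enc.encode(text))

prompt = "Summarize this document. Focus on key points and main arguments."
# 500 is an assumed output-length estimate; outputs can't be counted in advance
cost = estimate_cost("gpt-4", count_tokens(prompt), 500)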

Cost at Scale

Assuming roughly 2K input and 500 output tokens per request (the profile from the example above) over a 30-day month:

Daily Requests    Model              Monthly Cost
10,000            GPT-4              ~$27,000
10,000            GPT-4-Turbo        ~$10,500
10,000            GPT-3.5-Turbo      ~$600
10,000            Claude-3-Sonnet    ~$4,500
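
The GPT-4 row follows directly from the estimator above:

per_request = estimate_cost("gpt-4", 2000, 500)  # $0.09
monthly_cost = per_request * 10_000 * 30         # ≈ $27,000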

Strategy 1: Model Routing

Route requests to the cheapest model that can handle them.

class ModelRouter:
    def __init__(self, complexity_classifier=None):
        self.models = {
            "simple": "gpt-3.5-turbo",    # Simple queries
            "medium": "gpt-4-turbo",      # Complex but not critical
            "complex": "gpt-4",           # Critical, complex tasks
        }
        # Optional trained classifier; heuristics alone are used without one
        self.complexity_classifier = complexity_classifier

    def classify_complexity(self, query: str) -> str:
        # Cheap heuristic: short queries without analysis keywords are simple
        if len(query) < 100 and not any(
            word in query.lower()
            for word in ["analyze", "complex", "detailed", "compare"]
        ):
            return "simple"

        # Without a classifier, default to the mid-tier model
        if self.complexity_classifier is None:
            return "medium"

        # Use a trained classifier for more accuracy
        complexity_score = self.complexity_classifier.predict(query)
        if complexity_score < 0.3:
            return "simple"
        elif complexity_score < 0.7:
            return "medium"
        return "complex"

    async def route(self, query: str) -> str:
        complexity = self.classify_complexity(query)
        model = self.models[complexity]
        return await self.call_model(model, query)

    async def call_model(self, model: str, query: str) -> str:
        # Plug in your provider SDK (OpenAI, Anthropic, etc.) here
        raise NotImplementedError
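
A quick check of the heuristic path, with illustrative queries and no classifier attached:

router = ModelRouter()
print(router.classify_complexity("What's the capital of France?"))
# "simple" (short, no analysis keywords)
print(router.classify_complexity("Analyze the trade-offs between caching strategies"))
# "medium" (keyword hit, no classifier to escalate further)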

Typical production results: a 40-60% cost reduction with less than 5% quality degradation.

Strategy 2: Prompt Optimization

Reduce token count without losing quality.

class PromptOptimizer:
    def __init__(self):
        self.abbreviations = {
            "Please provide": "Give",
            "In order to": "To",
            "It is important to note that": "Note:",
        }

    def optimize(self, prompt: str) -> str:
        optimized = prompt

        # Remove verbose phrases
        for verbose, concise in self.abbreviations.items():
            optimized = optimized.replace(verbose, concise)

        # Remove excessive whitespace
        optimized = " ".join(optimized.split())

        return optimized

    def estimate_savings(self, original: str, optimized: str) -> dict:
        original_tokens = len(original) / 4  # Rough estimate
        optimized_tokens = len(optimized) / 4
        savings = (original_tokens - optimized_tokens) / original_tokens
        return {
            "original_tokens": int(original_tokens),
            "optimized_tokens": int(optimized_tokens),
            "savings_percent": round(savings * 100, 1)
        }

Before and After

# Before: 47 tokens
prompt_before = """
Please provide a detailed summary of the following document.
It is important to note that you should focus on the key points
and main arguments presented by the author.
"""

# After: 18 tokens
prompt_after = """
Summarize this document. Focus on key points and main arguments.
"""

# Savings: 62% fewer tokens
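
To tie the two together, the optimizer's automated pass can be run over the verbose prompt. It applies only phrase substitutions, so the hand-rewritten version above saves more:

optimizer = PromptOptimizer()
optimized = optimizer.optimize(prompt_before)
print(optimizer.estimate_savings(prompt_before, optimized))
# Prints a dict with original/optimized token counts and savings_percent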

Strategy 3: Context Window Management

Optimize what goes into the context.

class ContextManager:
    def __init__(self, max_tokens: int = 4000):
        self.max_tokens = max_tokens

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        return len(text) // 4  # Rough heuristic: ~4 characters per token

    def optimize_context(
        self,
        system_prompt: str,
        history: list,
        retrieved_docs: list,
        query: str
    ) -> dict:
        # Reserve tokens for the model's output
        available = self.max_tokens - 500

        # Fixed allocations (in tokens) for the system prompt and query
        allocations = {
            "system": min(500, self._estimate_tokens(system_prompt)),
            "query": self._estimate_tokens(query),
        }

        remaining = available - allocations["system"] - allocations["query"]

        # Prioritize retrieved docs over old conversation history
        doc_budget = int(remaining * 0.7)
        history_budget = int(remaining * 0.3)

        return {
            # Truncate by characters, converting tokens back (~4 chars/token)
            "system": system_prompt[:allocations["system"] * 4],
            "history": self._truncate_history(history, history_budget),
            "docs": self._truncate_docs(retrieved_docs, doc_budget),
            "query": query
        }

    def _truncate_history(self, history: list, budget: int) -> list:
        # Keep the most recent messages that fit within the token budget
        result = []
        tokens_used = 0
        for msg in reversed(history):
            msg_tokens = self._estimate_tokens(msg["content"])
            if tokens_used + msg_tokens > budget:
                break
            result.insert(0, msg)
            tokens_used += msg_tokens
        return result

    def _truncate_docs(self, docs: list, budget: int) -> list:
        # Keep the top-ranked docs (assumed sorted by relevance) that fit
        result = []
        tokens_used = 0
        for doc in docs:
            doc_tokens = self._estimate_tokens(doc)
            if tokens_used + doc_tokens > budget:
                break
            result.append(doc)
            tokens_used += doc_tokens
        return result
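
A sketch of how this slots into a RAG-style request (all inputs hypothetical):

manager = ContextManager(max_tokens=4000)
context = manager.optimize_context(
    system_prompt="You are a helpful research assistant.",
    history=[{"role": "user", "content": "What did we discuss earlier?"}],
    retrieved_docs=["First retrieved passage...", "Second retrieved passage..."],
    query="Summarize the report's conclusions."
)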

Strategy 4: Fallback Chains

Use expensive models only when cheaper ones fail.

class FallbackChain:
    def __init__(self):
        # Ordered from cheapest to most expensive
        self.chain = [
            {"model": "gpt-3.5-turbo", "timeout": 5},
            {"model": "gpt-4-turbo", "timeout": 10},
            {"model": "gpt-4", "timeout": 30},
        ]

    async def execute(self, prompt: str, quality_check) -> dict:
        for i, config in enumerate(self.chain):
            try:
                response = await self.call_model(
                    config["model"],
                    prompt,
                    timeout=config["timeout"]
                )

                # Return only if the response meets the quality threshold;
                # otherwise fall through to the next, more capable model
                if quality_check(response):
                    return {
                        "response": response,
                        "model_used": config["model"],
                        "fallback_level": i
                    }

            except Exception:
                continue  # Model errored or timed out; try the next one

        raise RuntimeError("All models failed or fell below the quality bar")

    async def call_model(self, model: str, prompt: str, timeout: int) -> str:
        # Plug in your provider SDK here
        raise NotImplementedError

# Usage (inside an async function, since execute is a coroutine)
chain = FallbackChain()
result = await chain.execute(
    prompt="Explain quantum computing",
    quality_check=lambda r: len(r) > 100 and "quantum" in r.lower()
)

Cost Monitoring Dashboard

Track costs in real-time:

from collections import defaultdict
from datetime import datetime, timedelta

DAILY_BUDGET = 100.0  # USD; placeholder threshold, tune per deployment

class CostTracker:
    def __init__(self):
        self.daily_costs = defaultdict(float)
        self.model_costs = defaultdict(float)
        self.alerts = []

    def record(self, model: str, input_tokens: int, output_tokens: int):
        cost = estimate_cost(model, input_tokens, output_tokens)
        today = datetime.utcnow().date().isoformat()
        self.daily_costs[today] += cost
        self.model_costs[model] += cost

        # Check budget alerts
        if self.daily_costs[today] > DAILY_BUDGET:
            self.alerts.append(f"Daily budget exceeded: {today}")

    def get_report(self) -> dict:
        today = datetime.utcnow().date()
        week_ago = (today - timedelta(days=7)).isoformat()
        return {
            "today": self.daily_costs[today.isoformat()],
            # ISO date strings compare correctly in lexicographic order
            "this_week": sum(
                cost for date, cost in self.daily_costs.items()
                if date >= week_ago
            ),
            "by_model": dict(self.model_costs),
        }
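
Wiring it into the request path, using the lesson's earlier 2K/500 example:

tracker = CostTracker()
tracker.record("gpt-4", 2000, 500)  # adds $0.09 to today's total
print(tracker.get_report())
# Roughly: {"today": 0.09, "this_week": 0.09, "by_model": {"gpt-4": 0.09}}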

Now let's move on to RAG system design, one of the most common interview topics.
