Guardrails AI Framework

LiteLLM Integration


LiteLLM provides a unified interface to 100+ LLM providers. Combined with Guardrails AI, you get schema-validated outputs from any supported model, plus automatic fallback and load balancing through the LiteLLM Router.

LiteLLM Setup

pip install litellm

import litellm
import os

# Configure API keys for each provider
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."
os.environ["AZURE_API_KEY"] = "..."
os.environ["AZURE_API_BASE"] = "https://your-resource.openai.azure.com"

Basic LiteLLM with Guardrails

from guardrails import Guard
from pydantic import BaseModel, Field
import litellm

class AnalysisResult(BaseModel):
    summary: str = Field(description="Analysis summary")
    key_findings: list[str] = Field(description="Key findings")
    confidence: float = Field(ge=0, le=1, description="Confidence score")

guard = Guard.for_pydantic(AnalysisResult)

# Use any LiteLLM-supported model
result = guard(
    model="gpt-4o",  # OpenAI
    messages=[{"role": "user", "content": "Analyze the Q3 earnings report"}]
)

# Switch providers easily
result = guard(
    model="claude-3-5-sonnet-20241022",  # Anthropic
    messages=[{"role": "user", "content": "Analyze the Q3 earnings report"}]
)

result = guard(
    model="azure/gpt-4o",  # Azure OpenAI
    messages=[{"role": "user", "content": "Analyze the Q3 earnings report"}]
)
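
Each guard call returns a ValidationOutcome rather than raw text. A quick way to inspect it (attribute names as in recent Guardrails releases; for a Pydantic guard the validated output is a schema-shaped dict):

# Inspect the most recent call
print(result.validation_passed)   # True when the output matched AnalysisResult
print(result.raw_llm_output)      # Unvalidated text returned by the model
print(result.validated_output)    # Schema-shaped output once validation passes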

Fallback Configuration

from guardrails import Guard
from litellm import Router
import os

# Configure a fallback chain: each deployment gets its own model group name,
# and the fallbacks map routes failed "gpt-4" requests to the other groups.
router = Router(
    model_list=[
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "gpt-4o",
                "api_key": os.environ["OPENAI_API_KEY"]
            }
        },
        {
            "model_name": "azure-gpt-4",
            "litellm_params": {
                "model": "azure/gpt-4o",
                "api_key": os.environ["AZURE_API_KEY"],
                "api_base": os.environ["AZURE_API_BASE"]
            }
        },
        {
            "model_name": "claude-sonnet",
            "litellm_params": {
                "model": "claude-3-5-sonnet-20241022",
                "api_key": os.environ["ANTHROPIC_API_KEY"]
            }
        }
    ],
    fallbacks=[
        {"gpt-4": ["azure-gpt-4", "claude-sonnet"]}
    ],
    retry_after=5  # Wait at least 5 seconds before retrying a failed request
)

guard = Guard.for_pydantic(AnalysisResult)

# Use router for automatic fallback
async def analyze_with_fallback(content: str):
    try:
        response = await router.acompletion(
            model="gpt-4",
            messages=[{"role": "user", "content": content}]
        )
        return guard.parse(response.choices[0].message.content)
    except Exception as e:
        # The router has already exhausted retries and fallbacks by this point
        raise
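
To drive the coroutine from synchronous code, wrap it in asyncio.run. This sketch assumes the router and guard defined above are in scope:

import asyncio

# Run one fallback-aware analysis from a script
outcome = asyncio.run(
    analyze_with_fallback("Analyze the Q3 earnings report")
)
print(outcome.validated_output)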

Cost-Optimized Routing

from litellm import Router
from guardrails import Guard
import os
# Route based on complexity
router = Router(
    model_list=[
        # Cheap models for simple tasks
        {
            "model_name": "simple",
            "litellm_params": {
                "model": "gpt-4o-mini",
                "api_key": os.environ["OPENAI_API_KEY"]
            }
        },
        # Expensive models for complex tasks
        {
            "model_name": "complex",
            "litellm_params": {
                "model": "gpt-4o",
                "api_key": os.environ["OPENAI_API_KEY"]
            }
        }
    ]
)

class TaskRouter:
    def __init__(self, guard: Guard):
        self.guard = guard
        self.router = router

    async def route(self, task: str, complexity: str = "auto") -> dict:
        """Route task to appropriate model based on complexity."""
        if complexity == "auto":
            complexity = self._estimate_complexity(task)

        model = "simple" if complexity == "low" else "complex"

        # Look up the underlying LiteLLM model string by group name
        litellm_model = next(m["litellm_params"]["model"]
                             for m in self.router.model_list
                             if m["model_name"] == model)
        result = self.guard(
            model=litellm_model,
            messages=[{"role": "user", "content": task}]
        )

        return {
            "result": result.validated_output,
            "model_used": model,
            "estimated_cost": self._estimate_cost(model, task)
        }

    def _estimate_complexity(self, task: str) -> str:
        """Simple complexity estimation."""
        complex_keywords = ["analyze", "compare", "synthesize", "evaluate"]
        return "high" if any(kw in task.lower() for kw in complex_keywords) else "low"

    def _estimate_cost(self, model: str, task: str) -> float:
        """Estimate cost based on model and input length."""
        costs = {"simple": 0.00015, "complex": 0.005}  # Per 1K tokens
        tokens = len(task.split()) * 1.3  # Rough token estimate
        return costs[model] * tokens / 1000
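
A minimal usage sketch, reusing the AnalysisResult guard from earlier; the task string is a placeholder:

import asyncio

task_router = TaskRouter(guard=Guard.for_pydantic(AnalysisResult))

# "Analyze" trips the complexity heuristic, so this routes to the expensive model
routed = asyncio.run(task_router.route("Analyze the Q3 earnings report"))
print(routed["model_used"], routed["estimated_cost"])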

Rate Limiting and Quotas

from guardrails import Guard
from litellm import Router
import asyncio
from datetime import datetime

class RateLimitedRouter:
    """Router with rate limiting per model."""

    def __init__(self, guard: Guard):
        self.guard = guard
        self.request_counts = {}
        self.rate_limits = {
            "gpt-4o": 60,        # 60 RPM
            "gpt-4o-mini": 200,  # 200 RPM
            "claude-3-5-sonnet-20241022": 40   # 40 RPM
        }
        # Optional Router for provider fallback; completion() below calls the
        # guard directly with the requested model.
        self.router = Router(
            model_list=[
                {"model_name": "primary", "litellm_params": {"model": "gpt-4o"}},
                {"model_name": "fallback", "litellm_params": {"model": "gpt-4o-mini"}},
            ]
        )

    async def completion(self, model: str, messages: list) -> dict:
        """Make completion with rate limit awareness."""
        await self._wait_for_rate_limit(model)

        result = self.guard(
            model=model,
            messages=messages
        )

        self._record_request(model)
        return result

    async def _wait_for_rate_limit(self, model: str):
        """Wait if rate limit would be exceeded."""
        limit = self.rate_limits.get(model, 60)
        current = self._get_request_count(model)

        if current >= limit:
            wait_time = 60 - (datetime.now().second)
            await asyncio.sleep(wait_time)

    def _record_request(self, model: str):
        """Record a request for rate limiting."""
        now = datetime.now()
        minute_key = now.strftime("%Y%m%d%H%M")

        if model not in self.request_counts:
            self.request_counts[model] = {}

        self.request_counts[model][minute_key] = \
            self.request_counts[model].get(minute_key, 0) + 1

    def _get_request_count(self, model: str) -> int:
        """Get request count for current minute."""
        minute_key = datetime.now().strftime("%Y%m%d%H%M")
        return self.request_counts.get(model, {}).get(minute_key, 0)
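
Usage follows the same pattern; the guard is the AnalysisResult guard from earlier, and the model argument should match a key in rate_limits:

import asyncio

limited = RateLimitedRouter(guard=Guard.for_pydantic(AnalysisResult))

# Counts the request against gpt-4o's 60 RPM budget before calling the guard
result = asyncio.run(limited.completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Analyze the Q3 earnings report"}]
))
print(result.validated_output)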

Streaming with Guardrails

from guardrails import Guard
import litellm

guard = Guard.for_pydantic(AnalysisResult)

async def stream_with_validation(prompt: str):
    """Stream response and validate at the end."""
    full_response = ""

    async for chunk in await litellm.acompletion(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        stream=True
    ):
        if chunk.choices[0].delta.content:
            content = chunk.choices[0].delta.content
            full_response += content
            yield {"type": "chunk", "content": content}

    # Validate complete response
    try:
        outcome = guard.parse(full_response)
        yield {"type": "validated", "result": outcome.validated_output}
    except Exception as e:
        yield {"type": "error", "error": str(e)}

Production Wrapper

from guardrails import Guard
from pydantic import BaseModel
import litellm
from typing import Optional, Type, TypeVar
import logging

T = TypeVar("T", bound=BaseModel)

class GuardedLLM:
    """Production-ready LLM wrapper with Guardrails."""

    def __init__(
        self,
        primary_model: str = "gpt-4o",
        fallback_model: str = "gpt-4o-mini",
        max_retries: int = 3
    ):
        self.primary_model = primary_model
        self.fallback_model = fallback_model
        self.max_retries = max_retries
        self.logger = logging.getLogger(__name__)

    async def generate(
        self,
        schema: Type[T],
        prompt: str,
        system_prompt: Optional[str] = None,
        num_reasks: int = 2
    ) -> T:
        """Generate validated response."""
        guard = Guard.for_pydantic(schema)

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        # Try primary model
        try:
            result = guard(
                model=self.primary_model,
                messages=messages,
                num_reasks=num_reasks
            )

            if result.validated_output:
                return result.validated_output

        except Exception as e:
            self.logger.warning(f"Primary model failed: {e}")

        # Fallback
        try:
            result = guard(
                model=self.fallback_model,
                messages=messages,
                num_reasks=num_reasks + 1  # Extra retry for fallback
            )

            if result.validated_output:
                return result.validated_output

        except Exception as e:
            self.logger.error(f"Fallback model failed: {e}")
            raise

        raise ValueError("Failed to generate valid response")

# Usage (inside an async context; otherwise wrap the call in asyncio.run)
llm = GuardedLLM()

result = await llm.generate(
    schema=AnalysisResult,
    prompt="Analyze the market trends for Q4",
    system_prompt="You are a financial analyst."
)

LiteLLM Tip: Use environment variable prefixes for provider-specific configs: OPENAI_API_KEY, ANTHROPIC_API_KEY, AZURE_API_KEY, etc. LiteLLM automatically picks them up.

Next: Production deployment patterns and observability.
