Advanced Local LLM Patterns
Multi-Model Workflows
3 min read
Running multiple models locally enables sophisticated workflows: routing queries to specialized models, ensembling responses for reliability, and chaining models into multi-step pipelines.
Why Multiple Models?
```
┌──────────────────────────────────────────────────────────────┐
│ Multi-Model Strategies                                       │
├──────────────────────────────────────────────────────────────┤
│                                                              │
│ 1. Routing: Send queries to the best model for each task     │
│    • Code questions → deepseek-coder                         │
│    • General chat   → llama3.2                               │
│    • Multilingual   → qwen2.5                                │
│                                                              │
│ 2. Ensemble: Combine outputs from multiple models            │
│    • Generate with 3 models, vote on best answer             │
│    • Increases reliability for critical tasks                │
│                                                              │
│ 3. Chaining: Use output of one model as input to another     │
│    • Fast model for draft, large model for refinement        │
│    • Classifier → Specialist model                           │
│                                                              │
└──────────────────────────────────────────────────────────────┘
```
Model Router
```python
import ollama


class ModelRouter:
    """Route queries to the most appropriate model."""

    def __init__(self):
        self.models = {
            "code": "deepseek-coder",
            "general": "llama3.2",
            "multilingual": "qwen2.5:7b",
            "fast": "phi3:mini",  # available for latency-sensitive paths
        }
        self.classifier = "llama3.2"

    def classify_query(self, query: str) -> str:
        """Classify the query type."""
        response = ollama.chat(
            model=self.classifier,
            messages=[{
                "role": "user",
                "content": f"""Classify this query into ONE category:
- code (programming, debugging, code review)
- multilingual (non-English or translation)
- general (everything else)

Query: {query}

Reply with just the category name."""
            }]
        )
        category = response["message"]["content"].strip().lower()
        # Map the (possibly noisy) classifier output to a model key
        if "code" in category:
            return "code"
        elif "multilingual" in category or "translation" in category:
            return "multilingual"
        return "general"

    def route(self, query: str) -> str:
        """Route query to the appropriate model and get a response."""
        category = self.classify_query(query)
        model = self.models[category]
        print(f"[Routing to {model}]")
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": query}]
        )
        return response["message"]["content"]


# Usage
router = ModelRouter()
print(router.route("Write a Python function to sort a list"))
print(router.route("Translate 'hello' to Japanese"))
print(router.route("What causes rain?"))
```
Ensemble Responses
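Ensembling trades latency for reliability: every model answers the same prompt, and either a synthesizer model or a simple majority vote decides the final output.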
```python
import ollama
from collections import Counter


class EnsembleModel:
    """Combine responses from multiple models."""

    def __init__(self, models: list[str]):
        self.models = models

    def generate_all(self, query: str) -> list[str]:
        """Get responses from all models."""
        responses = []
        for model in self.models:
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": query}]
            )
            responses.append(response["message"]["content"])
        return responses

    def synthesize(self, query: str) -> str:
        """Synthesize responses into a final answer."""
        responses = self.generate_all(query)
        # Works for any number of models, not just three
        answers = "\n\n".join(
            f"Answer {i + 1}: {r}" for i, r in enumerate(responses)
        )
        # Use another model to synthesize
        synthesis_prompt = f"""You received these answers to the question: "{query}"

{answers}

Synthesize these into a single, accurate response. If they disagree, go with the majority or most detailed answer."""
        final = ollama.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": synthesis_prompt}]
        )
        return final["message"]["content"]

    def vote(self, query: str, options: list[str]) -> str:
        """Have models vote on multiple-choice options."""
        # Label the options A, B, C, ... so "just the letter" is answerable
        letters = [chr(ord("A") + i) for i in range(len(options))]
        labeled = "\n".join(f"{l}. {o}" for l, o in zip(letters, options))
        votes = []
        for model in self.models:
            response = ollama.chat(
                model=model,
                messages=[{
                    "role": "user",
                    "content": f"""Question: {query}

Options:
{labeled}

Reply with just the letter of the correct answer."""
                }]
            )
            vote = response["message"]["content"].strip().upper()[:1]
            if vote in letters:
                votes.append(vote)
        # Return the majority vote
        if votes:
            return Counter(votes).most_common(1)[0][0]
        return "Unable to determine"


# Usage
ensemble = EnsembleModel(["llama3.2", "mistral", "phi3:medium"])
answer = ensemble.synthesize("What are the benefits of exercise?")
print(answer)
```
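`vote()` pairs naturally with factual multiple-choice questions. A usage sketch (option letters are generated by `vote()` itself):

```python
choice = ensemble.vote(
    "Which data structure offers O(1) average-case lookup by key?",
    ["Linked list", "Hash table", "Binary search tree", "Sorted array"],
)
print(choice)  # the majority letter, e.g. "B"
```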
Model Chaining
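Chaining feeds one model's output into the next prompt. Three common shapes are shown below: draft-then-refine, translate-answer-translate, and a fan-out code review.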
````python
import ollama


class ModelChain:
    """Chain models for multi-step processing."""

    def draft_and_refine(self, query: str) -> str:
        """Use a fast model for the draft, a larger model for refinement."""
        # Step 1: Fast draft
        draft = ollama.chat(
            model="phi3:mini",
            messages=[{"role": "user", "content": query}]
        )["message"]["content"]

        # Step 2: Refine with a larger model
        refined = ollama.chat(
            model="llama3.2",
            messages=[{
                "role": "user",
                "content": f"""Improve this response for clarity and accuracy:

Original response: {draft}

Original question was: {query}

Provide an improved version:"""
            }]
        )
        return refined["message"]["content"]

    def translate_then_answer(self, query: str, source_lang: str) -> str:
        """Translate a non-English query, answer it, translate back."""
        # Step 1: Translate to English
        english_query = ollama.chat(
            model="qwen2.5:7b",
            messages=[{
                "role": "user",
                "content": f"Translate to English: {query}"
            }]
        )["message"]["content"]

        # Step 2: Answer in English (best general model)
        answer = ollama.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": english_query}]
        )["message"]["content"]

        # Step 3: Translate the answer back
        final = ollama.chat(
            model="qwen2.5:7b",
            messages=[{
                "role": "user",
                "content": f"Translate to {source_lang}: {answer}"
            }]
        )
        return final["message"]["content"]

    def code_review_chain(self, code: str) -> dict:
        """Multi-model code review."""
        # Security review
        security = ollama.chat(
            model="deepseek-coder",
            messages=[{
                "role": "user",
                "content": f"Review for security issues only:\n```\n{code}\n```"
            }]
        )["message"]["content"]

        # Performance review
        performance = ollama.chat(
            model="deepseek-coder",
            messages=[{
                "role": "user",
                "content": f"Review for performance issues only:\n```\n{code}\n```"
            }]
        )["message"]["content"]

        # Final summary
        summary = ollama.chat(
            model="llama3.2",
            messages=[{
                "role": "user",
                "content": f"""Summarize this code review:

Security findings: {security}

Performance findings: {performance}

Provide a brief executive summary."""
            }]
        )["message"]["content"]

        return {
            "security": security,
            "performance": performance,
            "summary": summary,
        }


# Usage
chain = ModelChain()
result = chain.draft_and_refine("Explain quantum computing")
print(result)
````
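The other two chains work the same way. A usage sketch (assumes qwen2.5:7b and deepseek-coder are pulled locally):

```python
# Spanish in, Spanish out; English reasoning in the middle
print(chain.translate_then_answer("¿Qué causa la lluvia?", source_lang="Spanish"))

# Fan-out review: two focused passes, one summary
review = chain.code_review_chain("def get(d, k):\n    return d[k]")
print(review["summary"])
```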
Parallel Processing
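The module-level ollama client is synchronous, so the simplest way to overlap requests to several models is a thread pool wrapped in asyncio: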
```python
import ollama
import asyncio
from concurrent.futures import ThreadPoolExecutor


async def parallel_models(query: str, models: list[str]) -> dict[str, str]:
    """Query multiple models in parallel."""
    loop = asyncio.get_running_loop()

    def query_model(model: str) -> tuple[str, str]:
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": query}]
        )
        return model, response["message"]["content"]

    # The ollama client call is blocking, so run each one in a thread pool
    with ThreadPoolExecutor(max_workers=len(models)) as executor:
        futures = [
            loop.run_in_executor(executor, query_model, model)
            for model in models
        ]
        results = await asyncio.gather(*futures)
    return dict(results)


# Usage
async def main():
    results = await parallel_models(
        "What is the capital of France?",
        ["llama3.2", "mistral", "phi3:mini"]
    )
    for model, response in results.items():
        print(f"{model}: {response[:100]}...")

asyncio.run(main())
```
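Parallel requests only help if the server can actually serve them concurrently: on a single GPU, Ollama may load and unload models sequentially. The OLLAMA_MAX_LOADED_MODELS and OLLAMA_NUM_PARALLEL environment variables control how many models stay resident and how many requests each model handles at once. Recent versions of the ollama Python library also ship an `ollama.AsyncClient` if you prefer native async calls over a thread pool.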
Model Selection Matrix
| Task | Recommended Model | Why |
|---|---|---|
| Code generation | deepseek-coder | Specialized for code |
| Fast responses | phi3:mini | Smallest, fastest |
| Complex reasoning | llama3.1:70b | Most capable |
| Multilingual | qwen2.5:7b | Best language coverage |
| General chat | llama3.2 | Good all-rounder |
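If you want the matrix in code form, a hypothetical mapping (names mirror the table; adjust tags to whatever you have pulled) drops straight into `ModelRouter.models`:

```python
# Hypothetical: the selection matrix above as a dict
TASK_TO_MODEL = {
    "code": "deepseek-coder",
    "fast": "phi3:mini",
    "reasoning": "llama3.1:70b",
    "multilingual": "qwen2.5:7b",
    "general": "llama3.2",
}
```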
Multi-model workflows maximize the strengths of each model while running entirely locally. In the next module, we'll optimize performance.