Advanced Local LLM Patterns

Multi-Model Workflows

Running multiple models locally enables more sophisticated workflows: routing queries to specialized models, combining outputs into ensemble responses, and chaining models together.

Why Multiple Models?

┌─────────────────────────────────────────────────────────────────┐
│                    Multi-Model Strategies                        │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  1. Routing: Send queries to the best model for each task       │
│     • Code questions → deepseek-coder                           │
│     • General chat → llama3.2                                   │
│     • Multilingual → qwen2.5                                    │
│                                                                 │
│  2. Ensemble: Combine outputs from multiple models              │
│     • Generate with 3 models, vote on best answer               │
│     • Increases reliability for critical tasks                  │
│                                                                 │
│  3. Chaining: Use output of one model as input to another       │
│     • Fast model for draft, large model for refinement          │
│     • Classifier → Specialist model                             │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Model Router

import ollama
from typing import Literal

class ModelRouter:
    """Route queries to the most appropriate model."""

    def __init__(self):
        self.models = {
            "code": "deepseek-coder",
            "general": "llama3.2",
            "multilingual": "qwen2.5:7b",
            "fast": "phi3:mini"
        }
        self.classifier = "llama3.2"

    def classify_query(self, query: str) -> Literal["code", "multilingual", "general"]:
        """Classify the query type."""
        response = ollama.chat(
            model=self.classifier,
            messages=[{
                "role": "user",
                "content": f"""Classify this query into ONE category:
- code (programming, debugging, code review)
- multilingual (non-English or translation)
- general (everything else)

Query: {query}

Reply with just the category name."""
            }]
        )
        category = response["message"]["content"].strip().lower()

        # Map to model type
        if "code" in category:
            return "code"
        elif "multilingual" in category or "translation" in category:
            return "multilingual"
        return "general"

    def route(self, query: str) -> str:
        """Route query to appropriate model and get response."""
        category = self.classify_query(query)
        model = self.models[category]

        print(f"[Routing to {model}]")

        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": query}]
        )
        return response["message"]["content"]

# Usage
router = ModelRouter()
print(router.route("Write a Python function to sort a list"))
print(router.route("Translate 'hello' to Japanese"))
print(router.route("What causes rain?"))
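One caveat with this design: every request pays for an extra classifier round trip before the real model runs. A lightweight refinement is to pre-route obvious cases with plain keyword checks and only fall back to the LLM classifier when nothing matches; it also gives the otherwise unused "fast" entry a purpose for very short queries. The keyword lists and the six-word threshold below are illustrative assumptions, not part of ModelRouter itself.

# Sketch: cheap keyword pre-filter in front of the LLM classifier.
# The hint lists and length threshold are illustrative, not exhaustive.
CODE_HINTS = ("def ", "function", "bug", "error", "compile", "regex", "sql")
LANG_HINTS = ("translate", "japanese", "french", "spanish", "german")

def quick_route(router: ModelRouter, query: str) -> str:
    """Route with a heuristic first, falling back to the LLM classifier."""
    q = query.lower()
    if any(hint in q for hint in CODE_HINTS):
        category = "code"
    elif any(hint in q for hint in LANG_HINTS):
        category = "multilingual"
    elif len(q.split()) <= 6:
        category = "fast"          # short queries go to the small model
    else:
        category = router.classify_query(query)   # full LLM classification

    response = ollama.chat(
        model=router.models[category],
        messages=[{"role": "user", "content": query}]
    )
    return response["message"]["content"]

# print(quick_route(router, "Fix this regex: ^[a-z+$"))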

Ensemble Responses

import ollama
from collections import Counter

class EnsembleModel:
    """Combine responses from multiple models."""

    def __init__(self, models: list[str]):
        self.models = models

    def generate_all(self, query: str) -> list[str]:
        """Get responses from all models."""
        responses = []
        for model in self.models:
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": query}]
            )
            responses.append(response["message"]["content"])
        return responses

    def synthesize(self, query: str) -> str:
        """Synthesize responses into a final answer."""
        responses = self.generate_all(query)

        # Build the prompt from however many models were queried
        numbered = "\n\n".join(
            f"Answer {i + 1}: {text}" for i, text in enumerate(responses)
        )
        synthesis_prompt = f"""You received these answers to the question: "{query}"

{numbered}

Synthesize these into a single, accurate response. If they disagree, go with the majority or most detailed answer."""

        # Use another model to synthesize
        final = ollama.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": synthesis_prompt}]
        )
        return final["message"]["content"]

    def vote(self, query: str, options: list[str]) -> str:
        """Have models vote on multiple-choice options."""
        # Label options A, B, C, ... so "reply with a letter" is answerable
        labels = [chr(ord("A") + i) for i in range(len(options))]
        labelled = "\n".join(f"{label}. {opt}" for label, opt in zip(labels, options))

        votes = []
        for model in self.models:
            response = ollama.chat(
                model=model,
                messages=[{
                    "role": "user",
                    "content": f"""Question: {query}
Options:
{labelled}
Reply with just the letter of the correct answer."""
                }]
            )
            vote = response["message"]["content"].strip().upper()[:1]
            if vote in labels:
                votes.append(vote)

        # Return majority vote
        if votes:
            return Counter(votes).most_common(1)[0][0]
        return "Unable to determine"

# Usage
ensemble = EnsembleModel(["llama3.2", "mistral", "phi3:medium"])
answer = ensemble.synthesize("What are the benefits of exercise?")
print(answer)
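
synthesize is demonstrated above; vote handles multiple-choice questions instead. A short usage sketch (the question and options are invented for illustration):

# Majority vote over multiple-choice options
choice = ensemble.vote(
    "Which data structure gives O(1) average-case lookup by key?",
    ["Linked list", "Hash table", "Binary search tree", "Array"]
)
print(choice)  # expected: "B" (Hash table)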

Model Chaining

import ollama

class ModelChain:
    """Chain models for multi-step processing."""

    def draft_and_refine(self, query: str) -> str:
        """Use fast model for draft, large model for refinement."""
        # Step 1: Fast draft
        draft = ollama.chat(
            model="phi3:mini",
            messages=[{"role": "user", "content": query}]
        )["message"]["content"]

        # Step 2: Refine with larger model
        refined = ollama.chat(
            model="llama3.2",
            messages=[{
                "role": "user",
                "content": f"""Improve this response for clarity and accuracy:

Original response: {draft}

Original question was: {query}

Provide an improved version:"""
            }]
        )
        return refined["message"]["content"]

    def translate_then_answer(self, query: str, source_lang: str) -> str:
        """Translate non-English query, answer, translate back."""
        # Step 1: Translate to English
        english_query = ollama.chat(
            model="qwen2.5:7b",
            messages=[{
                "role": "user",
                "content": f"Translate to English: {query}"
            }]
        )["message"]["content"]

        # Step 2: Answer in English (best model)
        answer = ollama.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": english_query}]
        )["message"]["content"]

        # Step 3: Translate answer back
        final = ollama.chat(
            model="qwen2.5:7b",
            messages=[{
                "role": "user",
                "content": f"Translate to {source_lang}: {answer}"
            }]
        )
        return final["message"]["content"]

    def code_review_chain(self, code: str) -> dict:
        """Multi-model code review."""
        # Security review
        security = ollama.chat(
            model="deepseek-coder",
            messages=[{
                "role": "user",
                "content": f"Review for security issues only:\n```\n{code}\n```"
            }]
        )["message"]["content"]

        # Performance review
        performance = ollama.chat(
            model="deepseek-coder",
            messages=[{
                "role": "user",
                "content": f"Review for performance issues only:\n```\n{code}\n```"
            }]
        )["message"]["content"]

        # Final summary
        summary = ollama.chat(
            model="llama3.2",
            messages=[{
                "role": "user",
                "content": f"""Summarize this code review:

Security findings: {security}

Performance findings: {performance}

Provide a brief executive summary."""
            }]
        )["message"]["content"]

        return {
            "security": security,
            "performance": performance,
            "summary": summary
        }

# Usage
chain = ModelChain()
result = chain.draft_and_refine("Explain quantum computing")
print(result)
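
Only draft_and_refine is exercised above; the other two chains are called the same way. The French query and the deliberately unsafe snippet below are illustrative inputs, not examples from the lesson:

# Non-English query in, answer returned in the same language
answer_fr = chain.translate_then_answer(
    "Quels sont les avantages de l'informatique quantique ?",
    source_lang="French"
)
print(answer_fr)

# Multi-pass code review returns a dict with security, performance, and summary keys
sample_code = '''
def get_user(user_id):
    return db.query(f"SELECT * FROM users WHERE id={user_id}")
'''
review = chain.code_review_chain(sample_code)
print(review["summary"])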

Parallel Processing

import ollama
import asyncio
from concurrent.futures import ThreadPoolExecutor

async def parallel_models(query: str, models: list[str]) -> dict[str, str]:
    """Query multiple models in parallel."""
    loop = asyncio.get_running_loop()

    def query_model(model: str) -> tuple[str, str]:
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": query}]
        )
        return model, response["message"]["content"]

    # Run in parallel using thread pool
    with ThreadPoolExecutor(max_workers=len(models)) as executor:
        futures = [
            loop.run_in_executor(executor, query_model, model)
            for model in models
        ]
        results = await asyncio.gather(*futures)

    return dict(results)

# Usage
async def main():
    results = await parallel_models(
        "What is the capital of France?",
        ["llama3.2", "mistral", "phi3:mini"]
    )
    for model, response in results.items():
        print(f"{model}: {response[:100]}...")

asyncio.run(main())
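
The thread pool is needed because ollama.chat is a blocking call. The ollama package also ships an AsyncClient, so the same fan-out can be written without threads; a minimal sketch, assuming the AsyncClient.chat interface of recent ollama-python releases:

import asyncio
import ollama

async def parallel_models_async(query: str, models: list[str]) -> dict[str, str]:
    """Fan one query out to several models using the async client."""
    client = ollama.AsyncClient()

    async def ask(model: str) -> tuple[str, str]:
        response = await client.chat(
            model=model,
            messages=[{"role": "user", "content": query}]
        )
        return model, response["message"]["content"]

    results = await asyncio.gather(*(ask(m) for m in models))
    return dict(results)

# asyncio.run(parallel_models_async("What is the capital of France?", ["llama3.2", "mistral"]))

In both versions, how many requests actually run concurrently depends on the Ollama server's own parallelism settings rather than on the client code.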

Model Selection Matrix

Task                 Recommended Model    Why
Code generation      deepseek-coder       Specialized for code
Fast responses       phi3:mini            Smallest, fastest
Complex reasoning    llama3.1:70b         Most capable
Multilingual         qwen2.5:7b           Best language coverage
General chat         llama3.2             Good all-rounder
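
Before routing to any of these, it is worth checking that they are actually pulled. A hedged sketch using ollama.list(); the field names assume a recent version of the ollama Python package (older releases return a plain dict), and the RECOMMENDED mapping simply mirrors the table above:

import ollama

# Which of the recommended models are available locally?
# Assumption: ollama.list() returns an object whose .models entries expose a
# .model name attribute (recent ollama-python versions).
RECOMMENDED = {
    "Code generation": "deepseek-coder",
    "Fast responses": "phi3:mini",
    "Complex reasoning": "llama3.1:70b",
    "Multilingual": "qwen2.5:7b",
    "General chat": "llama3.2",
}

installed = {m.model for m in ollama.list().models}

for task, model in RECOMMENDED.items():
    # Installed tags usually carry a suffix like ":latest", so match on the prefix
    ready = any(name.startswith(model) for name in installed)
    status = "ready" if ready else f"missing - run: ollama pull {model}"
    print(f"{task:<20} {model:<16} {status}")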

Multi-model workflows maximize the strengths of each model while running entirely locally. In the next module, we'll optimize performance.
