Lesson 13 of 22

Advanced Local LLM Patterns

Local RAG Pipeline

3 min read

Build a complete Retrieval-Augmented Generation (RAG) system that runs entirely on your machine. No data ever leaves your network.

Architecture Overview

┌─────────────────────────────────────────────────────────────────┐
│                    Fully Local RAG Pipeline                      │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  Documents ──► Chunking ──► Local Embeddings ──► Vector Store   │
│                              (nomic-embed)        (FAISS/Chroma)│
│                                                                 │
│  Query ──► Embed ──► Retrieve ──► Local LLM ──► Answer          │
│            (nomic)    (top-k)      (llama3.2)                   │
│                                                                 │
│  Everything runs locally - complete data privacy                │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘
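
Before running the examples below, the Ollama server must be running and both models must be available locally. A minimal setup sketch using the Python client's pull() helper (pulling a model that is already cached should only re-verify its layers):

import ollama

# Download the embedding model and the LLM if they are not already present
for model in ("nomic-embed-text", "llama3.2"):
    ollama.pull(model)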

Complete Implementation

import ollama
import numpy as np

class LocalRAG:
    """Fully local RAG pipeline with Ollama."""

    def __init__(
        self,
        embedding_model: str = "nomic-embed-text",
        llm_model: str = "llama3.2",
        chunk_size: int = 500,
        chunk_overlap: int = 50
    ):
        self.embedding_model = embedding_model
        self.llm_model = llm_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.chunks: list[str] = []
        self.embeddings: list[list[float]] = []

    def add_documents(self, documents: list[str]):
        """Chunk and embed documents."""
        # Chunk documents
        for doc in documents:
            doc_chunks = self._chunk_text(doc)
            self.chunks.extend(doc_chunks)

        # Embed all chunks
        if self.chunks:
            response = ollama.embed(
                model=self.embedding_model,
                input=self.chunks
            )
            self.embeddings = response["embeddings"]

    def _chunk_text(self, text: str) -> list[str]:
        """Split text into overlapping chunks (chunk_overlap must be < chunk_size)."""
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            chunk = text[start:end]
            if chunk.strip():
                chunks.append(chunk)
            if end >= len(text):
                break  # reached the end of the text; avoid re-emitting the tail
            start = end - self.chunk_overlap
        return chunks

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity."""
        a, b = np.array(a), np.array(b)
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def retrieve(self, query: str, top_k: int = 3) -> list[str]:
        """Retrieve relevant chunks for a query."""
        # Embed query
        query_embedding = ollama.embed(
            model=self.embedding_model,
            input=query
        )["embeddings"][0]

        # Calculate similarities
        similarities = [
            (i, self._cosine_similarity(query_embedding, emb))
            for i, emb in enumerate(self.embeddings)
        ]

        # Sort and get top-k
        similarities.sort(key=lambda x: x[1], reverse=True)
        top_indices = [i for i, _ in similarities[:top_k]]

        return [self.chunks[i] for i in top_indices]

    def query(self, question: str, top_k: int = 3) -> str:
        """Answer a question using RAG."""
        # Retrieve relevant context
        context_chunks = self.retrieve(question, top_k)
        context = "\n\n".join(context_chunks)

        # Generate answer
        prompt = f"""Answer the question based on the following context.
If the answer isn't in the context, say "I don't have enough information."

Context:
{context}

Question: {question}

Answer:"""

        response = ollama.chat(
            model=self.llm_model,
            messages=[{"role": "user", "content": prompt}]
        )

        return response["message"]["content"]

# Usage
rag = LocalRAG()

# Add your documents
documents = [
    """Python is a high-level programming language created by Guido van Rossum.
    It was first released in 1991. Python emphasizes code readability with its
    notable use of significant indentation. Python supports multiple programming
    paradigms including procedural, object-oriented, and functional programming.""",

    """Machine learning is a subset of artificial intelligence that focuses on
    building systems that learn from data. Common ML algorithms include decision
    trees, neural networks, and support vector machines. Python is the most
    popular language for machine learning due to libraries like scikit-learn,
    TensorFlow, and PyTorch."""
]

rag.add_documents(documents)

# Query the system
answer = rag.query("Who created Python and when?")
print(answer)
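
The LocalRAG index above lives only in memory, so documents are re-embedded on every run. A small sketch of hypothetical save/load helpers that persist the chunks and embeddings to disk (file names are illustrative):

import json
import numpy as np

def save_index(rag: LocalRAG, path: str = "rag_index") -> None:
    """Persist chunks and embeddings so they don't need to be recomputed."""
    np.save(f"{path}_embeddings.npy", np.array(rag.embeddings))
    with open(f"{path}_chunks.json", "w") as f:
        json.dump(rag.chunks, f)

def load_index(rag: LocalRAG, path: str = "rag_index") -> None:
    """Restore a previously saved index into a fresh LocalRAG instance."""
    rag.embeddings = np.load(f"{path}_embeddings.npy").tolist()
    with open(f"{path}_chunks.json") as f:
        rag.chunks = json.load(f)

save_index(rag)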

Using LangChain with ChromaDB

from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Initialize local models
embeddings = OllamaEmbeddings(model="nomic-embed-text")
llm = ChatOllama(model="llama3.2")

# Sample documents
documents = [
    "Ollama is an open-source tool for running large language models locally.",
    "LangChain is a framework for developing applications powered by LLMs.",
    "ChromaDB is an open-source vector database for AI applications.",
    "RAG combines retrieval with generation for more accurate responses."
]

# Split documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
splits = text_splitter.create_documents(documents)

# Create vector store (persisted locally)
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# RAG prompt
prompt = ChatPromptTemplate.from_template("""
Answer based only on this context:

{context}

Question: {question}
""")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Build chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Query
answer = rag_chain.invoke("What is Ollama?")
print(answer)
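
To sanity-check what the retriever feeds into the prompt, you can query the vector store directly. Chroma's similarity_search_with_score returns a distance score alongside each document (lower generally means a closer match, depending on the configured distance metric):

# Inspect the retrieved chunks and their distance scores
for doc, score in vectorstore.similarity_search_with_score("What is Ollama?", k=3):
    print(f"{score:.3f}  {doc.page_content[:60]}")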

Loading Documents from Files

from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    DirectoryLoader
)

# Load a single text file
text_loader = TextLoader("document.txt")
text_docs = text_loader.load()

# Load a PDF
pdf_loader = PyPDFLoader("document.pdf")
pdf_docs = pdf_loader.load()

# Load all files from a directory
dir_loader = DirectoryLoader(
    "./documents",
    glob="**/*.txt",
    loader_cls=TextLoader
)
all_docs = dir_loader.load()

# Add to vector store
vectorstore.add_documents(all_docs)
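
Whole files can be arbitrarily long, so rather than indexing them as-is, they are usually split first. A short follow-up that reuses the text_splitter defined in the LangChain example above (note that PyPDFLoader requires the pypdf package to be installed):

# Chunk the loaded documents before adding them to the vector store
splits = text_splitter.split_documents(all_docs)
vectorstore.add_documents(splits)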

Streaming RAG Responses

from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = OllamaEmbeddings(model="nomic-embed-text")
llm = ChatOllama(model="llama3.2")

# Assume vectorstore already created
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embeddings
)

def stream_rag_response(question: str):
    """Stream a RAG response token by token."""
    # Retrieve context
    docs = vectorstore.similarity_search(question, k=3)
    context = "\n\n".join(doc.page_content for doc in docs)

    prompt = f"""Answer based on this context:

{context}

Question: {question}
Answer:"""

    # Stream response
    for chunk in llm.stream(prompt):
        print(chunk.content, end="", flush=True)
    print()

stream_rag_response("What are the benefits of local LLMs?")
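
The LCEL chain from the ChromaDB section streams the same way, since every Runnable exposes a .stream() method; with StrOutputParser at the end of the chain, the chunks arrive as plain strings:

# Stream through the rag_chain built earlier; chunks are plain strings
for chunk in rag_chain.stream("What is ChromaDB?"):
    print(chunk, end="", flush=True)
print()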

Evaluation

def evaluate_rag(rag_system, test_questions: list[dict]) -> dict:
    """Evaluate RAG system on test questions."""
    results = {
        "total": len(test_questions),
        "correct": 0,
        "details": []
    }

    for item in test_questions:
        question = item["question"]
        expected = item["expected_answer"]

        answer = rag_system.query(question)

        # Simple keyword matching (use RAGAS for production)
        is_correct = any(
            keyword.lower() in answer.lower()
            for keyword in expected.split()
        )

        results["details"].append({
            "question": question,
            "answer": answer,
            "expected": expected,
            "correct": is_correct
        })

        if is_correct:
            results["correct"] += 1

    results["accuracy"] = results["correct"] / results["total"]
    return results

# Test
test_set = [
    {"question": "Who created Python?", "expected_answer": "Guido van Rossum"},
    {"question": "When was Python released?", "expected_answer": "1991"}
]

eval_results = evaluate_rag(rag, test_set)
print(f"Accuracy: {eval_results['accuracy']:.1%}")

Key Advantages of Local RAG

Feature          Local RAG         Cloud RAG
Data privacy     Complete          Data sent to cloud
Cost             Hardware only     Per-token charges
Latency          Consistent        Variable
Internet         Not required      Required
Customization    Full control      Limited

Your RAG pipeline now runs entirely locally. Next, we'll add function calling capabilities.
