Advanced Local LLM Patterns
Local RAG Pipeline
Build a complete Retrieval-Augmented Generation system that runs entirely on your machine. No data ever leaves your network.
Architecture Overview
┌──────────────────────────────────────────────────────────────────┐
│                     Fully Local RAG Pipeline                      │
├──────────────────────────────────────────────────────────────────┤
│                                                                  │
│  Documents ──► Chunking ──► Local Embeddings ──► Vector Store    │
│                             (nomic-embed)        (FAISS/Chroma)  │
│                                                                  │
│  Query ──► Embed ──► Retrieve ──► Local LLM ──► Answer           │
│            (nomic)   (top-k)      (llama3.2)                     │
│                                                                  │
│         Everything runs locally - complete data privacy          │
│                                                                  │
└──────────────────────────────────────────────────────────────────┘
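Before building the pipeline, make sure both models from the diagram are available locally. Here's a quick sketch using the official ollama Python client; it assumes the Ollama server is already running (running `ollama pull nomic-embed-text` and `ollama pull llama3.2` on the command line does the same thing):
import ollama

# Pull the embedding model and the chat model used throughout this guide.
# Each pull is effectively a no-op if the model is already present locally.
for model in ["nomic-embed-text", "llama3.2"]:
    ollama.pull(model)

print(ollama.list())  # confirm both models appear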
Complete Implementation
import ollama
import numpy as np


class LocalRAG:
    """Fully local RAG pipeline with Ollama."""

    def __init__(
        self,
        embedding_model: str = "nomic-embed-text",
        llm_model: str = "llama3.2",
        chunk_size: int = 500,
        chunk_overlap: int = 50
    ):
        self.embedding_model = embedding_model
        self.llm_model = llm_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.chunks: list[str] = []
        self.embeddings: list[list[float]] = []

    def add_documents(self, documents: list[str]):
        """Chunk and embed documents."""
        # Chunk documents
        for doc in documents:
            doc_chunks = self._chunk_text(doc)
            self.chunks.extend(doc_chunks)

        # Embed all chunks
        if self.chunks:
            response = ollama.embed(
                model=self.embedding_model,
                input=self.chunks
            )
            self.embeddings = response["embeddings"]

    def _chunk_text(self, text: str) -> list[str]:
        """Split text into overlapping chunks."""
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            chunk = text[start:end]
            if chunk.strip():
                chunks.append(chunk)
            start = end - self.chunk_overlap
        return chunks

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity."""
        a, b = np.array(a), np.array(b)
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def retrieve(self, query: str, top_k: int = 3) -> list[str]:
        """Retrieve relevant chunks for a query."""
        # Embed query
        query_embedding = ollama.embed(
            model=self.embedding_model,
            input=query
        )["embeddings"][0]

        # Calculate similarities
        similarities = [
            (i, self._cosine_similarity(query_embedding, emb))
            for i, emb in enumerate(self.embeddings)
        ]

        # Sort and get top-k
        similarities.sort(key=lambda x: x[1], reverse=True)
        top_indices = [i for i, _ in similarities[:top_k]]
        return [self.chunks[i] for i in top_indices]

    def query(self, question: str, top_k: int = 3) -> str:
        """Answer a question using RAG."""
        # Retrieve relevant context
        context_chunks = self.retrieve(question, top_k)
        context = "\n\n".join(context_chunks)

        # Generate answer
        prompt = f"""Answer the question based on the following context.
If the answer isn't in the context, say "I don't have enough information."

Context:
{context}

Question: {question}

Answer:"""

        response = ollama.chat(
            model=self.llm_model,
            messages=[{"role": "user", "content": prompt}]
        )
        return response["message"]["content"]


# Usage
rag = LocalRAG()

# Add your documents
documents = [
    """Python is a high-level programming language created by Guido van Rossum.
    It was first released in 1991. Python emphasizes code readability with its
    notable use of significant indentation. Python supports multiple programming
    paradigms including procedural, object-oriented, and functional programming.""",
    """Machine learning is a subset of artificial intelligence that focuses on
    building systems that learn from data. Common ML algorithms include decision
    trees, neural networks, and support vector machines. Python is the most
    popular language for machine learning due to libraries like scikit-learn,
    TensorFlow, and PyTorch."""
]

rag.add_documents(documents)

# Query the system
answer = rag.query("Who created Python and when?")
print(answer)
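The brute-force similarity loop in retrieve is fine for a few hundred chunks, but the architecture diagram also lists FAISS as a vector store option. Below is a minimal sketch of running the same search through a FAISS index instead, reusing the rag object from above; it assumes the faiss-cpu package is installed and relies on the fact that inner product over L2-normalized vectors equals cosine similarity:
import faiss
import numpy as np
import ollama

# Index the chunk embeddings once (replaces the per-query Python loop).
chunk_vecs = np.array(rag.embeddings, dtype="float32")
faiss.normalize_L2(chunk_vecs)                  # normalize so inner product == cosine
index = faiss.IndexFlatIP(chunk_vecs.shape[1])  # exact inner-product search
index.add(chunk_vecs)

# Embed and normalize the query, then take the top-3 chunks.
query_vecs = np.array(
    ollama.embed(model="nomic-embed-text", input="Who created Python and when?")["embeddings"],
    dtype="float32"
)
faiss.normalize_L2(query_vecs)
scores, ids = index.search(query_vecs, 3)
top_chunks = [rag.chunks[i] for i in ids[0]]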
Using LangChain with ChromaDB
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Initialize local models
embeddings = OllamaEmbeddings(model="nomic-embed-text")
llm = ChatOllama(model="llama3.2")

# Sample documents
documents = [
    "Ollama is an open-source tool for running large language models locally.",
    "LangChain is a framework for developing applications powered by LLMs.",
    "ChromaDB is an open-source vector database for AI applications.",
    "RAG combines retrieval with generation for more accurate responses."
]

# Split documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
splits = text_splitter.create_documents(documents)

# Create vector store (persisted locally)
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# RAG prompt
prompt = ChatPromptTemplate.from_template("""
Answer based only on this context:

{context}

Question: {question}
""")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Build chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Query
answer = rag_chain.invoke("What is Ollama?")
print(answer)
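Because the retriever is itself a runnable, you can also call it directly to see exactly which chunks the chain will hand to the model; this is a quick sanity check when an answer looks off:
# Inspect the chunks retrieved for a question before they reach the LLM.
docs = retriever.invoke("What is Ollama?")
for i, doc in enumerate(docs, start=1):
    print(f"[{i}] {doc.page_content}")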
Loading Documents from Files
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    DirectoryLoader
)

# Load a single text file
text_loader = TextLoader("document.txt")
text_docs = text_loader.load()

# Load a PDF
pdf_loader = PyPDFLoader("document.pdf")
pdf_docs = pdf_loader.load()

# Load all files from a directory
dir_loader = DirectoryLoader(
    "./documents",
    glob="**/*.txt",
    loader_cls=TextLoader
)
all_docs = dir_loader.load()

# Add to vector store
vectorstore.add_documents(all_docs)
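The snippet above indexes whole files; for anything longer than a page it usually works better to chunk them first with the same splitter. A short sketch, assuming the text_splitter and vectorstore from the previous section:
# Split the loaded files into overlapping chunks, then index the chunks.
file_chunks = text_splitter.split_documents(all_docs)
vectorstore.add_documents(file_chunks)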
Streaming RAG Responses
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = OllamaEmbeddings(model="nomic-embed-text")
llm = ChatOllama(model="llama3.2")

# Assume vectorstore already created
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embeddings
)

def stream_rag_response(question: str):
    """Stream a RAG response token by token."""
    # Retrieve context
    docs = vectorstore.similarity_search(question, k=3)
    context = "\n\n".join(doc.page_content for doc in docs)

    prompt = f"""Answer based on this context:

{context}

Question: {question}

Answer:"""

    # Stream response
    for chunk in llm.stream(prompt):
        print(chunk.content, end="", flush=True)
    print()

stream_rag_response("What are the benefits of local LLMs?")
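If you built the LCEL chain from the earlier section, you get the same effect without a helper function: every runnable exposes .stream(), and because the chain ends in StrOutputParser it yields plain strings. A brief sketch, assuming the rag_chain defined above:
# Stream the chain's output token by token.
for token in rag_chain.stream("What are the benefits of local LLMs?"):
    print(token, end="", flush=True)
print()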
Evaluation
def evaluate_rag(rag_system, test_questions: list[dict]) -> dict:
    """Evaluate RAG system on test questions."""
    results = {
        "total": len(test_questions),
        "correct": 0,
        "details": []
    }

    for item in test_questions:
        question = item["question"]
        expected = item["expected_answer"]
        answer = rag_system.query(question)

        # Simple keyword matching (use RAGAS for production)
        is_correct = any(
            keyword.lower() in answer.lower()
            for keyword in expected.split()
        )

        results["details"].append({
            "question": question,
            "answer": answer,
            "expected": expected,
            "correct": is_correct
        })
        if is_correct:
            results["correct"] += 1

    results["accuracy"] = results["correct"] / results["total"]
    return results

# Test
test_set = [
    {"question": "Who created Python?", "expected_answer": "Guido van Rossum"},
    {"question": "When was Python released?", "expected_answer": "1991"}
]

eval_results = evaluate_rag(rag, test_set)
print(f"Accuracy: {eval_results['accuracy']:.1%}")
Key Advantages of Local RAG
| Feature | Local RAG | Cloud RAG |
|---|---|---|
| Data privacy | Complete | Data sent to cloud |
| Cost | Hardware only | Per-token charges |
| Latency | Consistent | Variable |
| Internet | Not required | Required |
| Customization | Full control | Limited |
Your RAG pipeline now runs entirely locally. Next, we'll add function calling capabilities.