Advanced Local LLM Patterns
Local Embeddings
Embeddings are the foundation of RAG, semantic search, and many AI applications. Running them locally gives you privacy and cost savings at scale.
Local Embedding Models
Popular local embedding models available through Ollama:

| Model | Dimensions | Speed | Best For |
|---|---|---|---|
| nomic-embed-text | 768 | Fast | General text |
| mxbai-embed-large | 1024 | Medium | High accuracy |
| all-minilm | 384 | Very fast | Speed priority |
| snowflake-arctic-embed | 1024 | Medium | Enterprise search |
| bge-m3 | 1024 | Medium | Multilingual |
Using Ollama for Embeddings
# Pull an embedding model
ollama pull nomic-embed-text
# Generate embeddings via CLI
echo '{"model": "nomic-embed-text", "input": "Hello world"}' | \
curl -s http://localhost:11434/api/embed -d @-
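If you prefer to call the HTTP API directly from Python rather than through the client library used in the next section, the same request looks roughly like this (a minimal sketch assuming Ollama is running locally on its default port, 11434):

import requests

# POST the same payload to the local /api/embed endpoint.
resp = requests.post(
    "http://localhost:11434/api/embed",
    json={"model": "nomic-embed-text", "input": "Hello world"},
    timeout=30,
)
resp.raise_for_status()
embedding = resp.json()["embeddings"][0]
print(len(embedding))  # 768 dimensions for nomic-embed-text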
Python Implementation
import ollama
import numpy as np
def get_embedding(text: str, model: str = "nomic-embed-text") -> list[float]:
    """Generate embedding for a single text."""
    response = ollama.embed(model=model, input=text)
    return response["embeddings"][0]

def get_embeddings(texts: list[str], model: str = "nomic-embed-text") -> list[list[float]]:
    """Generate embeddings for multiple texts."""
    response = ollama.embed(model=model, input=texts)
    return response["embeddings"]
# Example usage
embedding = get_embedding("Machine learning is transforming industries")
print(f"Embedding dimension: {len(embedding)}")
Semantic Similarity Search
import ollama
import numpy as np
from typing import List, Tuple
class LocalVectorStore:
    """Simple in-memory vector store with local embeddings."""

    def __init__(self, model: str = "nomic-embed-text"):
        self.model = model
        self.documents: list[str] = []
        self.embeddings: list[list[float]] = []

    def add_documents(self, documents: list[str]):
        """Add documents to the store."""
        new_embeddings = ollama.embed(
            model=self.model,
            input=documents
        )["embeddings"]
        self.documents.extend(documents)
        self.embeddings.extend(new_embeddings)

    def search(self, query: str, top_k: int = 3) -> list[tuple[str, float]]:
        """Search for similar documents."""
        query_embedding = ollama.embed(
            model=self.model,
            input=query
        )["embeddings"][0]

        # Calculate similarities
        similarities = []
        for i, doc_embedding in enumerate(self.embeddings):
            sim = self._cosine_similarity(query_embedding, doc_embedding)
            similarities.append((self.documents[i], sim))

        # Sort by similarity (descending)
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity between two vectors."""
        a, b = np.array(a), np.array(b)
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# Usage
store = LocalVectorStore()
store.add_documents([
    "Python is a programming language",
    "JavaScript runs in web browsers",
    "Machine learning uses neural networks",
    "Python is popular for data science"
])

results = store.search("What language is good for AI?")
for doc, score in results:
    print(f"{score:.3f}: {doc}")
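The per-document loop in search() is fine for small collections but becomes the bottleneck as the store grows. Below is a rough sketch of a vectorized alternative that reuses the LocalVectorStore fields above and scores every document with a single NumPy matrix-vector product; search_vectorized is an illustrative helper, not part of the class:

import ollama
import numpy as np

def search_vectorized(store: LocalVectorStore, query: str, top_k: int = 3) -> list[tuple[str, float]]:
    """Score all stored documents against the query in one NumPy operation."""
    query_emb = np.array(ollama.embed(model=store.model, input=query)["embeddings"][0])
    matrix = np.array(store.embeddings)                            # shape: (n_docs, dim)
    # Cosine similarity: normalize rows and the query, then take dot products.
    matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    query_emb = query_emb / np.linalg.norm(query_emb)
    scores = matrix @ query_emb                                    # shape: (n_docs,)
    top = np.argsort(scores)[::-1][:top_k]
    return [(store.documents[i], float(scores[i])) for i in top]

For collections in the hundreds of thousands of vectors, a dedicated index such as the FAISS store shown later in this lesson is a better fit.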
Batch Processing for Large Datasets
import ollama
from tqdm import tqdm
def embed_large_dataset(
    documents: list[str],
    model: str = "nomic-embed-text",
    batch_size: int = 100
) -> list[list[float]]:
    """Embed large datasets in batches."""
    all_embeddings = []
    for i in tqdm(range(0, len(documents), batch_size)):
        batch = documents[i:i + batch_size]
        response = ollama.embed(model=model, input=batch)
        all_embeddings.extend(response["embeddings"])
    return all_embeddings
# Process 10,000 documents
documents = [f"Document {i} content here" for i in range(10000)]
embeddings = embed_large_dataset(documents)
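Nested Python lists are a memory-hungry way to hold that many vectors; a float32 NumPy matrix stores each value in 4 bytes. A quick sketch, reusing the embeddings result from above:

import numpy as np

# 10,000 x 768 float32 values is roughly 30 MB, versus several times that as Python lists.
matrix = np.asarray(embeddings, dtype=np.float32)
print(matrix.shape, f"{matrix.nbytes / 1e6:.1f} MB")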
Comparing Embedding Models
import ollama
import numpy as np
import time
def benchmark_embedding_model(model: str, texts: list[str]) -> dict:
    """Benchmark an embedding model."""
    start = time.time()
    response = ollama.embed(model=model, input=texts)
    elapsed = time.time() - start

    embeddings = response["embeddings"]
    return {
        "model": model,
        "dimension": len(embeddings[0]),
        "time_seconds": elapsed,
        "texts_per_second": len(texts) / elapsed
    }
# Test texts
test_texts = ["This is a test sentence"] * 100
# Compare models
models = ["nomic-embed-text", "all-minilm", "mxbai-embed-large"]
for model in models:
    try:
        result = benchmark_embedding_model(model, test_texts)
        print(f"{result['model']}: {result['dimension']}d, "
              f"{result['texts_per_second']:.1f} texts/sec")
    except Exception as e:
        print(f"{model}: Not available ({e})")
Persisting Embeddings
import ollama
import numpy as np
import json
def save_embeddings(documents: list[str], embeddings: list[list[float]], path: str):
    """Save documents and embeddings to disk."""
    data = {
        "documents": documents,
        "embeddings": embeddings
    }
    with open(path, "w") as f:
        json.dump(data, f)

def load_embeddings(path: str) -> tuple[list[str], list[list[float]]]:
    """Load documents and embeddings from disk."""
    with open(path, "r") as f:
        data = json.load(f)
    return data["documents"], data["embeddings"]
# Save
documents = ["Doc 1", "Doc 2", "Doc 3"]
embeddings = ollama.embed(model="nomic-embed-text", input=documents)["embeddings"]
save_embeddings(documents, embeddings, "embeddings.json")
# Load later
docs, embs = load_embeddings("embeddings.json")
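JSON is convenient but verbose for large float arrays. One more compact option is NumPy's compressed .npz format; the helpers below are an illustrative sketch, not a drop-in for the JSON versions above:

import numpy as np

def save_embeddings_npz(documents: list[str], embeddings: list[list[float]], path: str):
    """Save documents and embeddings as a compressed NumPy archive."""
    np.savez_compressed(
        path,
        documents=np.array(documents, dtype=object),
        embeddings=np.asarray(embeddings, dtype=np.float32),
    )

def load_embeddings_npz(path: str) -> tuple[list[str], list[list[float]]]:
    """Load documents and embeddings back from the archive."""
    data = np.load(path, allow_pickle=True)  # object arrays require allow_pickle
    return data["documents"].tolist(), data["embeddings"].tolist()

save_embeddings_npz(documents, embeddings, "embeddings.npz")
docs, embs = load_embeddings_npz("embeddings.npz")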
LangChain Integration
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
# Initialize local embeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")
# Create vector store with local embeddings
documents = [
    "Python is great for AI",
    "JavaScript is for web development",
    "Rust is for systems programming"
]
vectorstore = FAISS.from_texts(documents, embeddings)

# Search
results = vectorstore.similarity_search("AI programming", k=2)
for doc in results:
    print(doc.page_content)
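To avoid re-embedding the same documents on every run, the FAISS index can be saved to disk and reloaded with the same local embeddings. A short sketch using the save_local/load_local methods available in recent langchain-community releases:

# Persist the index and reload it later with the same embedding model.
vectorstore.save_local("faiss_index")
restored = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,  # the index metadata is pickle-backed
)
print(restored.similarity_search("AI programming", k=1)[0].page_content)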
Model Selection Guide
| Use Case | Recommended Model | Why |
|---|---|---|
| General purpose | nomic-embed-text | Good balance of speed and quality |
| High accuracy | mxbai-embed-large | Best retrieval performance |
| Speed priority | all-minilm | Fastest, smaller dimensions |
| Multilingual | bge-m3 | Built for multilingual retrieval |
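Because every helper in this lesson takes the model name as a parameter, switching models is a one-line change, for example:

# Trade speed for accuracy (or vice versa) by changing only the model name.
fast_store = LocalVectorStore(model="all-minilm")
accurate_store = LocalVectorStore(model="mxbai-embed-large")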
Local embeddings unlock privacy-preserving semantic search. In the next lesson, we'll build a complete local RAG pipeline.