Lesson 12 of 22

Advanced Local LLM Patterns

Local Embeddings

3 min read

Embeddings are the foundation of RAG, semantic search, and many AI applications. Running them locally gives you privacy and cost savings at scale.

Local Embedding Models

┌─────────────────────────────────────────────────────────────────┐
│                   Popular Local Embedding Models                 │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  Model               │ Dimensions │ Speed    │ Best For         │
│  ────────────────────│────────────│──────────│─────────────     │
│  nomic-embed-text    │ 768        │ Fast     │ General text     │
│  mxbai-embed-large   │ 1024       │ Medium   │ High accuracy    │
│  all-minilm          │ 384        │ Very fast│ Speed priority   │
│  snowflake-arctic    │ 1024       │ Medium   │ Enterprise       │
│  bge-large           │ 1024       │ Medium   │ Multilingual     │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Using Ollama for Embeddings

# Pull an embedding model
ollama pull nomic-embed-text

# Generate embeddings via CLI
echo '{"model": "nomic-embed-text", "input": "Hello world"}' | \
  curl -s http://localhost:11434/api/embed -d @-
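
If you prefer calling the HTTP API from Python without the client library, a minimal sketch using the third-party requests package is shown below. It assumes the /api/embed response carries an "embeddings" field, matching what the official Python client returns.

import requests

# POST to the same endpoint the CLI example uses
resp = requests.post(
    "http://localhost:11434/api/embed",
    json={"model": "nomic-embed-text", "input": "Hello world"},
)
resp.raise_for_status()

embedding = resp.json()["embeddings"][0]
print(len(embedding))  # nomic-embed-text should produce 768-dimensional vectors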

Python Implementation

import ollama
import numpy as np

def get_embedding(text: str, model: str = "nomic-embed-text") -> list[float]:
    """Generate embedding for a single text."""
    response = ollama.embed(model=model, input=text)
    return response["embeddings"][0]

def get_embeddings(texts: list[str], model: str = "nomic-embed-text") -> list[list[float]]:
    """Generate embeddings for multiple texts."""
    response = ollama.embed(model=model, input=texts)
    return response["embeddings"]

# Example usage
embedding = get_embedding("Machine learning is transforming industries")
print(f"Embedding dimension: {len(embedding)}")

Building a Simple Vector Store

With embeddings in hand, you can build a small in-memory vector store that keeps each document alongside its embedding and ranks results by cosine similarity.

import ollama
import numpy as np

class LocalVectorStore:
    """Simple in-memory vector store with local embeddings."""

    def __init__(self, model: str = "nomic-embed-text"):
        self.model = model
        self.documents: list[str] = []
        self.embeddings: list[list[float]] = []

    def add_documents(self, documents: list[str]):
        """Add documents to the store."""
        new_embeddings = ollama.embed(
            model=self.model,
            input=documents
        )["embeddings"]

        self.documents.extend(documents)
        self.embeddings.extend(new_embeddings)

    def search(self, query: str, top_k: int = 3) -> list[tuple[str, float]]:
        """Search for similar documents."""
        query_embedding = ollama.embed(
            model=self.model,
            input=query
        )["embeddings"][0]

        # Calculate similarities
        similarities = []
        for i, doc_embedding in enumerate(self.embeddings):
            sim = self._cosine_similarity(query_embedding, doc_embedding)
            similarities.append((self.documents[i], sim))

        # Sort by similarity (descending)
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity between two vectors."""
        a, b = np.array(a), np.array(b)
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Usage
store = LocalVectorStore()
store.add_documents([
    "Python is a programming language",
    "JavaScript runs in web browsers",
    "Machine learning uses neural networks",
    "Python is popular for data science"
])

results = store.search("What language is good for AI?")
for doc, score in results:
    print(f"{score:.3f}: {doc}")
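
For larger collections, the per-document Python loop in search() becomes the bottleneck. Below is a minimal vectorized sketch using NumPy; it assumes the store object from the usage example above is still in scope, and it normalizes all vectors once so cosine similarity reduces to a matrix-vector product.

import numpy as np
import ollama

# Stack stored embeddings into a matrix and L2-normalize each row
matrix = np.array(store.embeddings)
matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)

# Embed and normalize the query the same way
query = np.array(
    ollama.embed(model=store.model, input="What language is good for AI?")["embeddings"][0]
)
query = query / np.linalg.norm(query)

# Cosine similarities for every document at once
scores = matrix @ query
for i in np.argsort(scores)[::-1][:3]:
    print(f"{scores[i]:.3f}: {store.documents[i]}")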

Batch Processing for Large Datasets

import ollama
from tqdm import tqdm

def embed_large_dataset(
    documents: list[str],
    model: str = "nomic-embed-text",
    batch_size: int = 100
) -> list[list[float]]:
    """Embed large datasets in batches."""
    all_embeddings = []

    for i in tqdm(range(0, len(documents), batch_size)):
        batch = documents[i:i + batch_size]
        response = ollama.embed(model=model, input=batch)
        all_embeddings.extend(response["embeddings"])

    return all_embeddings

# Process 10,000 documents
documents = [f"Document {i} content here" for i in range(10000)]
embeddings = embed_large_dataset(documents)

Comparing Embedding Models

import ollama
import time

def benchmark_embedding_model(model: str, texts: list[str]) -> dict:
    """Benchmark an embedding model."""
    start = time.time()
    response = ollama.embed(model=model, input=texts)
    elapsed = time.time() - start

    embeddings = response["embeddings"]
    return {
        "model": model,
        "dimension": len(embeddings[0]),
        "time_seconds": elapsed,
        "texts_per_second": len(texts) / elapsed
    }

# Test texts
test_texts = ["This is a test sentence"] * 100

# Compare models
models = ["nomic-embed-text", "all-minilm", "mxbai-embed-large"]
for model in models:
    try:
        result = benchmark_embedding_model(model, test_texts)
        print(f"{result['model']}: {result['dimension']}d, "
              f"{result['texts_per_second']:.1f} texts/sec")
    except Exception as e:
        print(f"{model}: Not available ({e})")

Persisting Embeddings

import ollama
import json

def save_embeddings(documents: list[str], embeddings: list[list[float]], path: str):
    """Save documents and embeddings to disk."""
    data = {
        "documents": documents,
        "embeddings": embeddings
    }
    with open(path, "w") as f:
        json.dump(data, f)

def load_embeddings(path: str) -> tuple[list[str], list[list[float]]]:
    """Load documents and embeddings from disk."""
    with open(path, "r") as f:
        data = json.load(f)
    return data["documents"], data["embeddings"]

# Save
documents = ["Doc 1", "Doc 2", "Doc 3"]
embeddings = ollama.embed(model="nomic-embed-text", input=documents)["embeddings"]
save_embeddings(documents, embeddings, "embeddings.json")

# Load later
docs, embs = load_embeddings("embeddings.json")
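
To make the saved file useful again, you can feed it back into the in-memory store. The sketch below assumes the LocalVectorStore class defined earlier in this lesson is available; it reassigns the loaded lists directly rather than re-embedding the documents.

# Rebuild the store from disk without recomputing embeddings
docs, embs = load_embeddings("embeddings.json")

store = LocalVectorStore()
store.documents = docs
store.embeddings = embs

print(store.search("Doc 2", top_k=1))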

LangChain Integration

from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

# Initialize local embeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Create vector store with local embeddings
documents = [
    "Python is great for AI",
    "JavaScript is for web development",
    "Rust is systems programming"
]

vectorstore = FAISS.from_texts(documents, embeddings)

# Search
results = vectorstore.similarity_search("AI programming", k=2)
for doc in results:
    print(doc.page_content)
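
The FAISS index can also be persisted so you don't re-embed on every run. The sketch below uses FAISS's save_local and load_local methods; the "faiss_index" path is arbitrary, and recent langchain_community releases require the allow_dangerous_deserialization flag when reloading the pickled metadata.

# Save the index to disk
vectorstore.save_local("faiss_index")

# Reload it later with the same local embedding model
restored = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)
print(restored.similarity_search("AI programming", k=1)[0].page_content)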

Model Selection Guide

Use Case         │ Recommended Model  │ Why
─────────────────│────────────────────│───────────────────────────────────
General purpose  │ nomic-embed-text   │ Good balance of speed and quality
High accuracy    │ mxbai-embed-large  │ Best retrieval performance
Speed priority   │ all-minilm         │ Fastest, with smaller dimensions
Multilingual     │ bge-large          │ Handles multiple languages

Local embeddings unlock privacy-preserving semantic search. In the next lesson, we'll build a complete local RAG pipeline.
