Building Applications with Ollama
Python Integration
3 min read
The official ollama Python library provides a clean, Pythonic interface to Ollama. Let's build real applications.
Installation
pip install ollama
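To confirm the library is installed and that a local Ollama server is reachable, a quick check like the one below can help. It only calls ollama.list(); in recent versions of the library a failed connection surfaces as a ConnectionError, in which case start the server with ollama serve.

import ollama

# Sanity check: list locally available models.
# If the server isn't running, recent library versions raise ConnectionError.
try:
    models = ollama.list()
    print(f"Connected. {len(models['models'])} model(s) available locally.")
except ConnectionError:
    print("Ollama server not reachable. Start it with: ollama serve")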
Basic Usage
Simple Generation
import ollama

# Basic text generation
response = ollama.generate(
    model='llama3.2',
    prompt='Explain what an API is in one sentence.'
)
print(response['response'])
# "An API (Application Programming Interface) is a set of rules
# that allows different software applications to communicate."
Chat Conversations
import ollama

# Chat request with a system prompt and a user message
response = ollama.chat(
    model='llama3.2',
    messages=[
        {'role': 'system', 'content': 'You are a Python tutor.'},
        {'role': 'user', 'content': 'What is a list comprehension?'}
    ]
)
print(response['message']['content'])
Maintaining Conversation Context
import ollama

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'}
]

def chat(user_message: str) -> str:
    """Send a message and get a response, maintaining context."""
    messages.append({'role': 'user', 'content': user_message})
    response = ollama.chat(
        model='llama3.2',
        messages=messages
    )
    assistant_message = response['message']['content']
    messages.append({'role': 'assistant', 'content': assistant_message})
    return assistant_message

# Multi-turn conversation
print(chat("What's the capital of France?"))
print(chat("What's the population there?"))      # Context preserved
print(chat("And what language do they speak?"))  # Still knows "France"
Streaming Responses
For real-time output in applications:
import ollama

# Stream tokens as they're generated
stream = ollama.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Write a haiku about coding'}],
    stream=True
)

for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)
print()  # Final newline
Streaming While Capturing the Full Response
import ollama
import sys

def stream_response(prompt: str, model: str = 'llama3.2') -> str:
    """Stream response with real-time output, returning the full text."""
    full_response = []
    stream = ollama.chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        stream=True
    )
    for chunk in stream:
        content = chunk['message']['content']
        full_response.append(content)
        sys.stdout.write(content)
        sys.stdout.flush()
    print()  # Final newline
    return ''.join(full_response)

response = stream_response("Explain recursion in 3 sentences")
Generating Embeddings
import ollama
import numpy as np

# Single embedding
response = ollama.embed(
    model='llama3.2',
    input='Python is a great programming language'
)
embedding = response['embeddings'][0]
print(f"Embedding dimension: {len(embedding)}")

# Multiple embeddings
texts = [
    "Python is great for AI",
    "JavaScript runs in browsers",
    "Python is excellent for machine learning"
]
response = ollama.embed(model='llama3.2', input=texts)
embeddings = response['embeddings']

# Calculate similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Python-AI and Python-ML should be more similar
sim_01 = cosine_similarity(embeddings[0], embeddings[1])
sim_02 = cosine_similarity(embeddings[0], embeddings[2])
print(f"Python-AI vs JavaScript: {sim_01:.3f}")
print(f"Python-AI vs Python-ML: {sim_02:.3f}")
Model Management
import ollama

# List all models
models = ollama.list()
for model in models['models']:
    print(f"{model['name']}: {model['size'] / 1e9:.1f} GB")

# Pull a model
ollama.pull('mistral')

# Show model details
info = ollama.show('llama3.2')
print(info['modelfile'])

# Delete a model
ollama.delete('old-model')
Custom Options
import ollama

response = ollama.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Write creative story ideas'}],
    options={
        'temperature': 1.2,   # More creative
        'top_p': 0.95,
        'num_ctx': 4096,      # Larger context
        'num_predict': 500,   # Max tokens to generate
        'stop': ['THE END']   # Custom stop sequence
    }
)
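The same calls also accept a format parameter. Passing format='json' asks the model to return valid JSON, which pairs naturally with a low temperature when you want structured, machine-readable output. A small sketch:

import json
import ollama

# Request structured output: format='json' constrains the reply to valid JSON.
response = ollama.chat(
    model='llama3.2',
    messages=[{
        'role': 'user',
        'content': 'List three Python web frameworks as a JSON array of objects with "name" and "year" keys.'
    }],
    format='json',
    options={'temperature': 0}  # Keep the output deterministic
)
data = json.loads(response['message']['content'])
print(data)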
Error Handling
import ollama
from ollama import ResponseError

def safe_generate(prompt: str, model: str = 'llama3.2') -> str:
    """Generate with proper error handling."""
    try:
        response = ollama.generate(model=model, prompt=prompt)
        return response['response']
    except ResponseError as e:
        if 'not found' in str(e):
            print(f"Model {model} not found. Pulling...")
            ollama.pull(model)
            return safe_generate(prompt, model)
        raise
    except ConnectionError:
        print("Ollama server not running. Start with: ollama serve")
        raise

# Usage
result = safe_generate("Hello!", "llama3.2")
Async Support
import asyncio
import ollama

async def async_chat(messages: list) -> str:
    """Async chat with Ollama."""
    client = ollama.AsyncClient()
    response = await client.chat(
        model='llama3.2',
        messages=messages
    )
    return response['message']['content']

async def main():
    # Concurrent requests
    tasks = [
        async_chat([{'role': 'user', 'content': 'What is 2+2?'}]),
        async_chat([{'role': 'user', 'content': 'What is 3+3?'}]),
        async_chat([{'role': 'user', 'content': 'What is 4+4?'}])
    ]
    results = await asyncio.gather(*tasks)
    for i, result in enumerate(results):
        print(f"Result {i+1}: {result}")

asyncio.run(main())
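Streaming works asynchronously as well: with stream=True, AsyncClient().chat returns an async iterator, so chunks can be consumed with async for. A short sketch:

import asyncio
import ollama

async def stream_chat(prompt: str, model: str = 'llama3.2') -> None:
    """Stream an async chat response chunk by chunk."""
    client = ollama.AsyncClient()
    async for chunk in await client.chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        stream=True
    ):
        print(chunk['message']['content'], end='', flush=True)
    print()  # Final newline

asyncio.run(stream_chat("Explain asyncio in two sentences"))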
Building a Simple CLI Chatbot
import ollama

def chatbot(model: str = 'llama3.2'):
    """Interactive chatbot with conversation history."""
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'}
    ]
    print(f"Chatbot ready (using {model}). Type 'quit' to exit.\n")
    while True:
        user_input = input("You: ").strip()
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break
        if not user_input:
            continue
        messages.append({'role': 'user', 'content': user_input})
        print("Assistant: ", end='', flush=True)
        stream = ollama.chat(
            model=model,
            messages=messages,
            stream=True
        )
        response_content = []
        for chunk in stream:
            content = chunk['message']['content']
            response_content.append(content)
            print(content, end='', flush=True)
        print()  # Newline after response
        messages.append({
            'role': 'assistant',
            'content': ''.join(response_content)
        })

if __name__ == '__main__':
    chatbot()
The Python library makes Ollama integration simple and Pythonic. In the next lesson, we'll use LangChain for more sophisticated applications.