Building Applications with Ollama
Python Integration
3 min read
The official ollama Python library provides a clean, Pythonic interface to Ollama. Let's build real applications.
Installation
pip install ollama
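To confirm the library is installed and that a local Ollama server is reachable, a quick check like the one below can help. It only calls ollama.list(); in recent versions of the library a failed connection surfaces as a ConnectionError, in which case start the server with ollama serve.

import ollama

# Sanity check: list locally available models.
# If the server isn't running, recent library versions raise ConnectionError.
try:
    models = ollama.list()
    print(f"Connected. {len(models['models'])} model(s) available locally.")
except ConnectionError:
    print("Ollama server not reachable. Start it with: ollama serve")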
Basic Usage
Simple Generation
import ollama

# Basic text generation
response = ollama.generate(
    model='llama3.2',
    prompt='Explain what an API is in one sentence.'
)
print(response['response'])
# "An API (Application Programming Interface) is a set of rules
# that allows different software applications to communicate."
Chat Conversations
import ollama

# Chat request with a system prompt and a user message
response = ollama.chat(
    model='llama3.2',
    messages=[
        {'role': 'system', 'content': 'You are a Python tutor.'},
        {'role': 'user', 'content': 'What is a list comprehension?'}
    ]
)
print(response['message']['content'])
Maintaining Conversation Context
import ollama

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'}
]

def chat(user_message: str) -> str:
    """Send a message and get a response, maintaining context."""
    messages.append({'role': 'user', 'content': user_message})
    response = ollama.chat(
        model='llama3.2',
        messages=messages
    )
    assistant_message = response['message']['content']
    messages.append({'role': 'assistant', 'content': assistant_message})
    return assistant_message

# Multi-turn conversation
print(chat("What's the capital of France?"))
print(chat("What's the population there?"))      # Context preserved
print(chat("And what language do they speak?"))  # Still knows "France"
Streaming Responses
For real-time output in applications:
import ollama

# Stream tokens as they're generated
stream = ollama.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Write a haiku about coding'}],
    stream=True
)

for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)
print()  # Final newline
Streaming While Capturing the Full Response
import ollama
import sys

def stream_response(prompt: str, model: str = 'llama3.2') -> str:
    """Stream response with real-time output, returning the full text."""
    full_response = []
    stream = ollama.chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        stream=True
    )
    for chunk in stream:
        content = chunk['message']['content']
        full_response.append(content)
        sys.stdout.write(content)
        sys.stdout.flush()
    print()  # Final newline
    return ''.join(full_response)

response = stream_response("Explain recursion in 3 sentences")
Generating Embeddings
import ollama
import numpy as np

# Single embedding
response = ollama.embed(
    model='llama3.2',
    input='Python is a great programming language'
)
embedding = response['embeddings'][0]
print(f"Embedding dimension: {len(embedding)}")

# Multiple embeddings
texts = [
    "Python is great for AI",
    "JavaScript runs in browsers",
    "Python is excellent for machine learning"
]
response = ollama.embed(model='llama3.2', input=texts)
embeddings = response['embeddings']

# Calculate similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Python-AI and Python-ML should be more similar
sim_01 = cosine_similarity(embeddings[0], embeddings[1])
sim_02 = cosine_similarity(embeddings[0], embeddings[2])
print(f"Python-AI vs JavaScript: {sim_01:.3f}")
print(f"Python-AI vs Python-ML: {sim_02:.3f}")
Model Management
import ollama

# List all models
models = ollama.list()
for model in models['models']:
    print(f"{model['name']}: {model['size'] / 1e9:.1f} GB")

# Pull a model
ollama.pull('mistral')

# Show model details
info = ollama.show('llama3.2')
print(info['modelfile'])

# Delete a model
ollama.delete('old-model')
Custom Options
import ollama

response = ollama.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Write creative story ideas'}],
    options={
        'temperature': 1.2,   # More creative
        'top_p': 0.95,
        'num_ctx': 4096,      # Larger context
        'num_predict': 500,   # Max tokens to generate
        'stop': ['THE END']   # Custom stop sequence
    }
)
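The same calls also accept a format parameter. Passing format='json' asks the model to return valid JSON, which pairs naturally with a low temperature when you want structured, machine-readable output. A small sketch:

import json
import ollama

# Request structured output: format='json' constrains the reply to valid JSON.
response = ollama.chat(
    model='llama3.2',
    messages=[{
        'role': 'user',
        'content': 'List three Python web frameworks as a JSON array of objects with "name" and "year" keys.'
    }],
    format='json',
    options={'temperature': 0}  # Keep the output deterministic
)
data = json.loads(response['message']['content'])
print(data)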
Error Handling
import ollama
from ollama import ResponseError

def safe_generate(prompt: str, model: str = 'llama3.2') -> str:
    """Generate with proper error handling."""
    try:
        response = ollama.generate(model=model, prompt=prompt)
        return response['response']
    except ResponseError as e:
        if 'not found' in str(e):
            print(f"Model {model} not found. Pulling...")
            ollama.pull(model)
            return safe_generate(prompt, model)
        raise
    except ConnectionError:
        print("Ollama server not running. Start with: ollama serve")
        raise

# Usage
result = safe_generate("Hello!", "llama3.2")
Async Support
import asyncio
import ollama

async def async_chat(messages: list) -> str:
    """Async chat with Ollama."""
    client = ollama.AsyncClient()
    response = await client.chat(
        model='llama3.2',
        messages=messages
    )
    return response['message']['content']

async def main():
    # Concurrent requests
    tasks = [
        async_chat([{'role': 'user', 'content': 'What is 2+2?'}]),
        async_chat([{'role': 'user', 'content': 'What is 3+3?'}]),
        async_chat([{'role': 'user', 'content': 'What is 4+4?'}])
    ]
    results = await asyncio.gather(*tasks)
    for i, result in enumerate(results):
        print(f"Result {i+1}: {result}")

asyncio.run(main())
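Streaming works asynchronously as well: with stream=True, AsyncClient().chat returns an async iterator, so chunks can be consumed with async for. A short sketch:

import asyncio
import ollama

async def stream_chat(prompt: str, model: str = 'llama3.2') -> None:
    """Stream an async chat response chunk by chunk."""
    client = ollama.AsyncClient()
    async for chunk in await client.chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        stream=True
    ):
        print(chunk['message']['content'], end='', flush=True)
    print()  # Final newline

asyncio.run(stream_chat("Explain asyncio in two sentences"))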
Building a Simple CLI Chatbot
import ollama

def chatbot(model: str = 'llama3.2'):
    """Interactive chatbot with conversation history."""
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'}
    ]
    print(f"Chatbot ready (using {model}). Type 'quit' to exit.\n")
    while True:
        user_input = input("You: ").strip()
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break
        if not user_input:
            continue
        messages.append({'role': 'user', 'content': user_input})
        print("Assistant: ", end='', flush=True)
        stream = ollama.chat(
            model=model,
            messages=messages,
            stream=True
        )
        response_content = []
        for chunk in stream:
            content = chunk['message']['content']
            response_content.append(content)
            print(content, end='', flush=True)
        print()  # Newline after response
        messages.append({
            'role': 'assistant',
            'content': ''.join(response_content)
        })

if __name__ == '__main__':
    chatbot()
The Python library makes Ollama integration simple and Pythonic. In the next lesson, we'll use LangChain for more sophisticated applications.