Build a Research Agent
Testing & Validation
Testing AI agents differs from testing traditional software: we need to handle non-determinism and evaluate quality, not just correctness.
Test Structure
# tests/test_agent.py
import pytest
from unittest.mock import Mock, patch
from agent import ResearchAgent
from config import Config
class TestResearchAgent:
    @pytest.fixture
    def agent(self):
        config = Config()
        config.MAX_ITERATIONS = 3  # Limit for tests
        return ResearchAgent(config)

    @pytest.fixture
    def mock_search_results(self):
        return {
            "success": True,
            "results": [
                {
                    "title": "Test Article",
                    "url": "https://example.com/test",
                    "snippet": "This is test content about AI agents."
                }
            ]
        }
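The mock_search_results fixture exists so the agent loop can be exercised without a real search call, which keeps tests deterministic. One way to use it is an extra method inside TestResearchAgent, sketched below; the patch target ("agent.WebSearchTool.run") is an assumption about where your agent imports its search tool, and the LLM call inside research() is not mocked here, so patch that too if you want the test fully offline.

    def test_research_with_mocked_search(self, agent, mock_search_results):
        # Patch the search tool so no search API call is made. The target path
        # "agent.WebSearchTool.run" is an assumption about your imports.
        with patch("agent.WebSearchTool.run", return_value=mock_search_results):
            report = agent.research("AI agents")
        # The agent should still produce output built from the mocked snippet
        assert report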
Unit Tests
Test individual components:
# tests/test_tools.py
from tools.search import WebSearchTool
class TestWebSearchTool:
    def test_search_returns_results(self):
        tool = WebSearchTool()
        result = tool.run("Python programming")
        assert result["success"] is True
        assert len(result["results"]) > 0
        assert "title" in result["results"][0]

    def test_search_handles_errors(self):
        tool = WebSearchTool()
        # Test with a problematic query
        result = tool.run("")
        assert "success" in result
        # Should not raise an exception
# tests/test_memory.py
from memory.store import ResearchMemory
class TestResearchMemory:
    def test_add_and_retrieve_finding(self):
        memory = ResearchMemory()
        memory.add_finding(
            query="AI agents",
            content="Agents are autonomous systems",
            url="https://example.com",
            title="About Agents"
        )
        assert len(memory.findings) == 1
        assert memory.has_searched("AI agents")
        assert not memory.has_searched("Unknown topic")

    def test_get_sources_deduplicates(self):
        memory = ResearchMemory()
        # Add the same source twice
        for _ in range(2):
            memory.add_finding(
                query="test",
                content="content",
                url="https://same-url.com",
                title="Same Title"
            )
        sources = memory.get_sources()
        assert len(sources) == 1  # Deduplicated
Integration Tests
Test the full agent flow:
# tests/test_integration.py
import pytest
from agent import ResearchAgent
from config import Config
class TestAgentIntegration:
    @pytest.mark.integration
    def test_full_research_flow(self):
        """Test complete research on a known topic"""
        agent = ResearchAgent(Config())
        report = agent.research("What is machine learning?")
        # Check report structure
        assert len(report) > 100
        assert "machine learning" in report.lower()

    @pytest.mark.integration
    def test_handles_unknown_topic(self):
        """Agent should still produce output for obscure topics"""
        agent = ResearchAgent(Config())
        report = agent.research("xyznonexistenttopic123")
        # Should not crash, should indicate limited results
        assert report is not None
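Because these tests exercise real search and LLM calls, it helps to skip them automatically when credentials are missing rather than letting them fail. A minimal sketch; the OPENAI_API_KEY variable name is an assumption about which provider your agent uses:

# tests/test_integration.py (top of file) -- skip the whole module when no
# API key is configured. OPENAI_API_KEY is an assumption; use whatever
# environment variable your Config expects.
import os
import pytest

pytestmark = pytest.mark.skipif(
    not os.getenv("OPENAI_API_KEY"),
    reason="integration tests need a real API key",
)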
Output Validation
# utils/validators.py
from pydantic import BaseModel, validator
from typing import List, Optional
import re
class ResearchReport(BaseModel):
    content: str
    sources: List[str]

    @validator('content')
    def content_not_empty(cls, v):
        if len(v.strip()) < 50:
            raise ValueError('Report too short')
        return v

    @validator('content')
    def no_hallucinated_urls(cls, v):
        # Check for suspicious URL patterns
        urls = re.findall(r'https?://[^\s]+', v)
        for url in urls:
            if 'example.com' in url and 'real' not in url:
                raise ValueError(f'Potentially hallucinated URL: {url}')
        return v


def validate_report(report: str, sources: List[dict]) -> dict:
    """Validate a generated report"""
    issues = []

    # Check minimum length
    if len(report) < 100:
        issues.append("Report is too short")

    # Check for source citations
    citation_pattern = r'\[\d+\]'
    citations = re.findall(citation_pattern, report)
    if not citations:
        issues.append("No source citations found")

    # Check citation numbers match sources
    cited_numbers = set(int(c[1:-1]) for c in citations)
    available_numbers = set(s['id'] for s in sources)
    invalid_citations = cited_numbers - available_numbers
    if invalid_citations:
        issues.append(f"Invalid citation numbers: {invalid_citations}")

    return {
        "valid": len(issues) == 0,
        "issues": issues
    }
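The validator can run right after report generation, either inside the agent or in a test. A short usage sketch; the report text and source list are made up for illustration, and the source "id" values are assumed to be the integers used in the [n] citations:

# Example usage of validate_report (illustrative data only)
report = (
    "Renewable energy adoption keeps accelerating [1]. Falling panel and "
    "turbine costs have made wind and solar the cheapest new generation "
    "in many markets [2]."
)
sources = [
    {"id": 1, "url": "https://example.org/adoption"},
    {"id": 2, "url": "https://example.org/costs"},
]
result = validate_report(report, sources)
assert result["valid"], result["issues"]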
Quality Evaluation
# tests/test_quality.py
import json
import re

import pytest
from langchain_openai import ChatOpenAI

from agent import ResearchAgent
from config import Config


def parse_json(text: str) -> dict:
    """Pull the first JSON object out of the model's reply.

    Minimal helper; swap in your project's JSON parser if you already have one.
    """
    match = re.search(r"\{.*\}", text, re.DOTALL)
    return json.loads(match.group(0)) if match else {}


def evaluate_report_quality(report: str, topic: str) -> dict:
    """Use an LLM to evaluate report quality"""
    llm = ChatOpenAI(model="gpt-4o")
    eval_prompt = f"""
    Evaluate this research report on "{topic}".

    Report:
    {report}

    Score each criterion from 1-5:
    1. Relevance: Does it address the topic?
    2. Accuracy: Are facts correct (to your knowledge)?
    3. Completeness: Are key aspects covered?
    4. Clarity: Is it well-organized and readable?
    5. Sources: Are claims properly cited?

    Return JSON: {{"relevance": X, "accuracy": X, "completeness": X, "clarity": X, "sources": X, "overall": X, "feedback": "..."}}
    """
    response = llm.invoke([{"role": "user", "content": eval_prompt}])
    return parse_json(response.content)


class TestReportQuality:
    @pytest.mark.quality
    def test_report_meets_quality_threshold(self):
        agent = ResearchAgent(Config())
        report = agent.research("Benefits of renewable energy")
        scores = evaluate_report_quality(report, "renewable energy")
        assert scores["overall"] >= 3.5
        assert scores["relevance"] >= 4
Running Tests
# Run all tests
pytest tests/
# Run only unit tests
pytest tests/ -m "not integration and not quality"
# Run with coverage
pytest tests/ --cov=. --cov-report=html
# Run quality evaluation tests
pytest tests/ -m quality -v
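The integration and quality markers used above are custom, so register them to avoid pytest's unknown-mark warnings and keep the -m filters reliable. One way to do this (assuming a tests/conftest.py, which this chapter has not introduced) is:

# tests/conftest.py -- register the custom markers used in this chapter
def pytest_configure(config):
    config.addinivalue_line("markers", "integration: tests that call real APIs")
    config.addinivalue_line("markers", "quality: LLM-based report quality checks")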
Next: Learn how to extend and deploy your research agent.