Build a Research Agent

Testing & Validation

Testing AI agents differs from testing traditional software: you need to handle non-determinism and evaluate output quality, not just check correctness.

Test Structure

# tests/test_agent.py
import pytest
from unittest.mock import Mock, patch
from agent import ResearchAgent
from config import Config

class TestResearchAgent:
    @pytest.fixture
    def agent(self):
        config = Config()
        config.MAX_ITERATIONS = 3  # Limit for tests
        return ResearchAgent(config)

    @pytest.fixture
    def mock_search_results(self):
        return {
            "success": True,
            "results": [
                {
                    "title": "Test Article",
                    "url": "https://example.com/test",
                    "snippet": "This is test content about AI agents."
                }
            ]
        }
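
The Mock and patch imports come into play when you want the agent to run against canned search results instead of the live search API. A minimal sketch of such a test, added to TestResearchAgent above; the patch target tools.search.WebSearchTool.run is an assumption about how the agent wires in its search tool, and the LLM calls themselves are not stubbed here:

    def test_research_uses_mocked_search(self, agent, mock_search_results):
        # Assumption: the agent calls WebSearchTool.run for every search;
        # adjust the patch target to wherever your agent imports the tool from
        with patch("tools.search.WebSearchTool.run", return_value=mock_search_results):
            report = agent.research("What are AI agents?")

        # The agent should still produce a report built from the canned results
        assert isinstance(report, str)
        assert len(report) > 0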

Unit Tests

Test individual components:

# tests/test_tools.py
from tools.search import WebSearchTool

class TestWebSearchTool:
    def test_search_returns_results(self):
        # Note: this exercises the real search tool, so it needs network
        # access (and whatever API key the tool uses)
        tool = WebSearchTool()
        result = tool.run("Python programming")

        assert result["success"] is True
        assert len(result["results"]) > 0
        assert "title" in result["results"][0]

    def test_search_handles_errors(self):
        tool = WebSearchTool()
        # Test with problematic query
        result = tool.run("")

        assert "success" in result
        # Should not raise exception
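
If you want broader coverage of malformed input, pytest's parametrize marker lets one test sweep several edge-case queries. A short sketch for the same tests/test_tools.py file (the specific queries are illustrative):

import pytest

@pytest.mark.parametrize("query", ["", "   ", "a" * 500])
def test_search_returns_dict_for_edge_cases(query):
    # Whatever the input, run() should return a result dict rather than raising
    tool = WebSearchTool()
    result = tool.run(query)
    assert isinstance(result, dict)
    assert "success" in result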

# tests/test_memory.py
from memory.store import ResearchMemory

class TestResearchMemory:
    def test_add_and_retrieve_finding(self):
        memory = ResearchMemory()

        memory.add_finding(
            query="AI agents",
            content="Agents are autonomous systems",
            url="https://example.com",
            title="About Agents"
        )

        assert len(memory.findings) == 1
        assert memory.has_searched("AI agents")
        assert not memory.has_searched("Unknown topic")

    def test_get_sources_deduplicates(self):
        memory = ResearchMemory()

        # Add same source twice
        for _ in range(2):
            memory.add_finding(
                query="test",
                content="content",
                url="https://same-url.com",
                title="Same Title"
            )

        sources = memory.get_sources()
        assert len(sources) == 1  # Deduplicated

Integration Tests

Test the full agent flow:

# tests/test_integration.py
import pytest
from agent import ResearchAgent
from config import Config

class TestAgentIntegration:
    @pytest.mark.integration
    def test_full_research_flow(self):
        """Test complete research on a known topic"""
        agent = ResearchAgent(Config())

        report = agent.research("What is machine learning?")

        # Check report structure
        assert len(report) > 100
        assert "machine learning" in report.lower()

    @pytest.mark.integration
    def test_handles_unknown_topic(self):
        """Agent should still produce output for obscure topics"""
        agent = ResearchAgent(Config())

        report = agent.research("xyznonexistenttopic123")

        # Should not crash, should indicate limited results
        assert report is not None
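
Custom marks like integration (and the quality mark used later) should be registered, otherwise pytest warns about unknown markers and a typo in an -m filter can silently select nothing. One way to register them is a conftest.py (a sketch; pytest.ini or pyproject.toml work equally well):

# tests/conftest.py
def pytest_configure(config):
    config.addinivalue_line("markers", "integration: tests that call live services")
    config.addinivalue_line("markers", "quality: LLM-based quality evaluations")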

Output Validation

# utils/validators.py
# Note: @validator is the Pydantic v1 API; on Pydantic v2 use @field_validator
from pydantic import BaseModel, validator
from typing import List
import re

class ResearchReport(BaseModel):
    content: str
    sources: List[str]

    @validator('content')
    def content_not_empty(cls, v):
        if len(v.strip()) < 50:
            raise ValueError('Report too short')
        return v

    @validator('content')
    def no_hallucinated_urls(cls, v):
        # Check for suspicious URL patterns
        urls = re.findall(r'https?://[^\s]+', v)
        for url in urls:
            if 'example.com' in url and 'real' not in url:
                raise ValueError(f'Potentially hallucinated URL: {url}')
        return v

def validate_report(report: str, sources: List[dict]) -> dict:
    """Validate a generated report"""
    issues = []

    # Check minimum length
    if len(report) < 100:
        issues.append("Report is too short")

    # Check for source citations
    citation_pattern = r'\[\d+\]'
    citations = re.findall(citation_pattern, report)
    if not citations:
        issues.append("No source citations found")

    # Check citation numbers match sources
    cited_numbers = set(int(c[1:-1]) for c in citations)
    available_numbers = set(s['id'] for s in sources)
    invalid_citations = cited_numbers - available_numbers
    if invalid_citations:
        issues.append(f"Invalid citation numbers: {invalid_citations}")

    return {
        "valid": len(issues) == 0,
        "issues": issues
    }
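
These validators are straightforward to unit test on their own. A short sketch, assuming the validate_report helper above and sources shaped like {"id": 1, "url": ...}:

# tests/test_validators.py
from utils.validators import validate_report

def test_missing_citations_are_flagged():
    sources = [{"id": 1, "url": "https://en.wikipedia.org/wiki/Machine_learning"}]
    report = "A long enough report about machine learning, but with no citations at all. " * 3
    result = validate_report(report, sources)
    assert not result["valid"]
    assert "No source citations found" in result["issues"]

def test_matching_citations_pass():
    sources = [{"id": 1, "url": "https://en.wikipedia.org/wiki/Machine_learning"}]
    report = "Machine learning lets systems improve from data [1]. " * 3
    result = validate_report(report, sources)
    assert result["valid"]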

Quality Evaluation

# tests/test_quality.py
import json
import re

import pytest
from langchain_openai import ChatOpenAI

from agent import ResearchAgent
from config import Config

def parse_json(text: str) -> dict:
    """Extract the first JSON object from an LLM reply (it may be wrapped in prose or code fences)."""
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if not match:
        raise ValueError("No JSON object found in LLM response")
    return json.loads(match.group())

def evaluate_report_quality(report: str, topic: str) -> dict:
    """Use an LLM to evaluate report quality"""
    llm = ChatOpenAI(model="gpt-4o")

    eval_prompt = f"""
    Evaluate this research report on "{topic}".

    Report:
    {report}

    Score each criterion from 1-5:
    1. Relevance: Does it address the topic?
    2. Accuracy: Are facts correct (to your knowledge)?
    3. Completeness: Are key aspects covered?
    4. Clarity: Is it well-organized and readable?
    5. Sources: Are claims properly cited?

    Return JSON: {{"relevance": X, "accuracy": X, "completeness": X, "clarity": X, "sources": X, "overall": X, "feedback": "..."}}
    """

    response = llm.invoke([{"role": "user", "content": eval_prompt}])
    return parse_json(response.content)

class TestReportQuality:
    @pytest.mark.quality
    def test_report_meets_quality_threshold(self):
        agent = ResearchAgent(Config())
        report = agent.research("Benefits of renewable energy")

        scores = evaluate_report_quality(report, "renewable energy")

        assert scores["overall"] >= 3.5
        assert scores["relevance"] >= 4

Running Tests

# Run all tests
pytest tests/

# Run only unit tests
pytest tests/ -m "not integration and not quality"

# Run with coverage (requires the pytest-cov plugin)
pytest tests/ --cov=. --cov-report=html

# Run quality evaluation tests
pytest tests/ -m quality -v

Next: Learn how to extend and deploy your research agent.
