Lesson 19 of 23

RAG Evaluation & Testing

Automated Testing Pipelines

3 min read

Manual evaluation doesn't scale. This lesson covers how to set up automated testing pipelines that run on every change and catch regressions before they reach production.

Pipeline Architecture

┌─────────────────────────────────────────────────────────────────┐
│                    RAG Testing Pipeline                          │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│  ┌──────────┐    ┌──────────┐    ┌──────────┐    ┌──────────┐  │
│  │  TRIGGER │───▶│   RUN    │───▶│ EVALUATE │───▶│  REPORT  │  │
│  └──────────┘    └──────────┘    └──────────┘    └──────────┘  │
│       │               │               │               │         │
│       ▼               ▼               ▼               ▼         │
│  • Git push      • Load test set  • RAGAS metrics • Dashboard   │
│  • PR created    • Run RAG        • Thresholds    • Alerts      │
│  • Schedule      • Collect output • Compare       • Artifacts   │
│  • Manual        • Log latency    • to baseline   • Slack/Email │
│                                                                  │
└─────────────────────────────────────────────────────────────────┘
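
The four stages above map onto a small amount of orchestration code. Below is a minimal sketch of the RUN, EVALUATE, and REPORT stages, assuming the RAGTestPipeline, RegressionDetector, and AlertManager classes built later in this lesson (the TRIGGER stage is handled by CI, shown in the GitHub Actions section; the function name here is illustrative):

import os

def run_quality_gate(rag_function) -> bool:
    """Sketch of one pipeline run: execute, score, compare, report."""
    # RUN + EVALUATE: answer every test question and score it with RAGAS
    pipeline = RAGTestPipeline(rag_function, "tests/data/test_set.json")
    result = pipeline.run()

    # Compare against recent history before recording the new run
    detector = RegressionDetector("test_results/history.json")
    regressions = detector.check_regression(result.metrics)
    detector.record_run(result)

    # REPORT: push alerts on failure; the return value gates the CI job
    AlertManager(slack_webhook=os.environ.get("SLACK_WEBHOOK")).alert_failure(result)
    return result.passed and not regressions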

Test Framework Setup

import json
import time
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass
from typing import Callable
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision

@dataclass
class TestResult:
    """Results from a single test run."""
    question: str
    answer: str
    contexts: list[str]
    latency_ms: float
    metrics: dict

@dataclass
class PipelineResult:
    """Results from a full pipeline run."""
    timestamp: str
    total_questions: int
    avg_latency_ms: float
    metrics: dict
    failures: list[dict]
    passed: bool

class RAGTestPipeline:
    """Automated RAG testing pipeline."""

    def __init__(
        self,
        rag_function: Callable,
        test_set_path: str,
        thresholds: dict | None = None,
    ):
        self.rag_function = rag_function
        self.test_set = self._load_test_set(test_set_path)
        self.thresholds = thresholds or {
            "faithfulness": 0.8,
            "answer_relevancy": 0.75,
            "context_precision": 0.7,
            "avg_latency_ms": 2000,
        }

    def _load_test_set(self, path: str) -> list[dict]:
        return json.loads(Path(path).read_text())

    def run(self) -> PipelineResult:
        """Execute the test pipeline."""

        results = []
        latencies = []

        # Run RAG on each test question
        for test_case in self.test_set:
            start = time.time()

            output = self.rag_function(test_case["question"])

            latency = (time.time() - start) * 1000
            latencies.append(latency)

            results.append(TestResult(
                question=test_case["question"],
                answer=output["answer"],
                contexts=output["contexts"],
                latency_ms=latency,
                metrics={},
            ))

        # Evaluate with RAGAS
        from datasets import Dataset
        eval_dataset = Dataset.from_dict({
            "question": [r.question for r in results],
            "answer": [r.answer for r in results],
            "contexts": [r.contexts for r in results],
        })

        scores = evaluate(
            eval_dataset,
            metrics=[faithfulness, answer_relevancy, context_precision],
        )

        # Check thresholds
        avg_latency = sum(latencies) / len(latencies)
        failures = []
        passed = True

        for metric, threshold in self.thresholds.items():
            if metric == "avg_latency_ms":
                # Latency must stay at or below the threshold
                actual = avg_latency
                failed = actual > threshold
                expected = f"<= {threshold}"
            else:
                # Quality scores must stay at or above the threshold
                actual = scores.get(metric, 0)
                failed = actual < threshold
                expected = f">= {threshold}"

            if failed:
                failures.append({
                    "metric": metric,
                    "expected": expected,
                    "actual": actual,
                })
                passed = False

        return PipelineResult(
            timestamp=datetime.now().isoformat(),
            total_questions=len(results),
            avg_latency_ms=avg_latency,
            metrics=dict(scores),
            failures=failures,
            passed=passed,
        )
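
A hypothetical usage sketch follows: rag_function is any callable that takes a question and returns a dict with "answer" and "contexts" keys, which is the shape run() reads. The retriever and generator names below are placeholders for your own components.

# Hypothetical usage sketch -- retriever and generator stand in for
# whatever your RAG stack actually uses.
def my_rag(question: str) -> dict:
    docs = retriever.search(question, k=5)
    answer = generator.generate(question, docs)
    return {"answer": answer, "contexts": [d.text for d in docs]}

pipeline = RAGTestPipeline(
    rag_function=my_rag,
    test_set_path="tests/data/test_set.json",  # JSON list of {"question": ...} objects
    thresholds={
        "faithfulness": 0.8,
        "answer_relevancy": 0.75,
        "context_precision": 0.7,
        "avg_latency_ms": 2000,
    },
)

result = pipeline.run()
print(f"passed={result.passed}  avg_latency={result.avg_latency_ms:.0f}ms")
for failure in result.failures:
    print(failure)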

GitHub Actions Integration

# .github/workflows/rag-tests.yml
name: RAG Evaluation

on:
  push:
    branches: [main]
    paths:
      - 'src/rag/**'
      - 'prompts/**'
  pull_request:
    branches: [main]
  schedule:
    - cron: '0 0 * * *'  # Daily at midnight

jobs:
  evaluate:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install ragas pytest

      - name: Run RAG evaluation
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python -m pytest tests/test_rag_quality.py \
            --json-report \
            --json-report-file=results.json

      - name: Check thresholds
        run: python scripts/check_thresholds.py results.json

      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: rag-evaluation-results
          path: results.json

      - name: Comment on PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const results = JSON.parse(fs.readFileSync('results.json'));

            const body = `## RAG Evaluation Results

            | Metric | Score | Threshold | Status |
            |--------|-------|-----------|--------|
            | Faithfulness | ${results.faithfulness.toFixed(2)} | 0.80 | ${results.faithfulness >= 0.8 ? '✅' : '❌'} |
            | Relevancy | ${results.answer_relevancy.toFixed(2)} | 0.75 | ${results.answer_relevancy >= 0.75 ? '✅' : '❌'} |
            | Precision | ${results.context_precision.toFixed(2)} | 0.70 | ${results.context_precision >= 0.7 ? '✅' : '❌'} |
            `;

            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });
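
The workflow calls scripts/check_thresholds.py, which is not shown above. Here is a minimal sketch, assuming results.json exposes the three RAGAS scores as top-level keys (the same flat structure the PR-comment step reads); adapt the key names to whatever your evaluation step actually writes.

# scripts/check_thresholds.py -- hypothetical sketch
import json
import sys
from pathlib import Path

THRESHOLDS = {
    "faithfulness": 0.80,
    "answer_relevancy": 0.75,
    "context_precision": 0.70,
}

def main(path: str) -> int:
    # Assumes the scores sit at the top level of the results file;
    # adapt to your real report format.
    results = json.loads(Path(path).read_text())
    failed = False
    for metric, threshold in THRESHOLDS.items():
        score = results.get(metric, 0.0)
        status = "PASS" if score >= threshold else "FAIL"
        print(f"{metric}: {score:.2f} (threshold {threshold:.2f}) {status}")
        failed = failed or score < threshold
    return 1 if failed else 0  # non-zero exit fails the CI job

if __name__ == "__main__":
    sys.exit(main(sys.argv[1]))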

pytest Integration

# tests/test_rag_quality.py
import json
import time
from pathlib import Path

import pytest
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision

def load_test_questions(path: str) -> list[dict]:
    """Load the JSON test set of question/ground-truth pairs."""
    return json.loads(Path(path).read_text())

@pytest.fixture(scope="module")
def rag_results():
    """Run RAG pipeline on test set once per module."""
    from src.rag import RAGPipeline

    pipeline = RAGPipeline()
    test_questions = load_test_questions("tests/data/test_set.json")

    results = []
    for q in test_questions:
        start = time.time()
        output = pipeline.query(q["question"])
        latency_ms = (time.time() - start) * 1000

        results.append({
            "question": q["question"],
            "answer": output["answer"],
            "contexts": output["contexts"],
            "ground_truth": q.get("ground_truth"),
            "latency_ms": latency_ms,  # recorded so the performance tests below have real data
        })

    return results

@pytest.fixture(scope="module")
def ragas_scores(rag_results):
    """Evaluate RAG results with RAGAS."""
    dataset = Dataset.from_dict({
        "question": [r["question"] for r in rag_results],
        "answer": [r["answer"] for r in rag_results],
        "contexts": [r["contexts"] for r in rag_results],
    })

    return evaluate(
        dataset,
        metrics=[faithfulness, answer_relevancy, context_precision],
    )

class TestRAGQuality:
    """RAG quality tests with thresholds."""

    def test_faithfulness_threshold(self, ragas_scores):
        """Answers should be grounded in context."""
        assert ragas_scores["faithfulness"] >= 0.8, \
            f"Faithfulness {ragas_scores['faithfulness']:.2f} below threshold 0.8"

    def test_answer_relevancy_threshold(self, ragas_scores):
        """Answers should address the questions."""
        assert ragas_scores["answer_relevancy"] >= 0.75, \
            f"Relevancy {ragas_scores['answer_relevancy']:.2f} below threshold 0.75"

    def test_context_precision_threshold(self, ragas_scores):
        """Retrieved context should be relevant."""
        assert ragas_scores["context_precision"] >= 0.7, \
            f"Precision {ragas_scores['context_precision']:.2f} below threshold 0.7"

class TestRAGPerformance:
    """RAG performance tests."""

    def test_latency_p95(self, rag_results):
        """95th percentile latency should be under 3 seconds."""
        latencies = [r.get("latency_ms", 0) for r in rag_results]
        p95 = sorted(latencies)[int(len(latencies) * 0.95)]

        assert p95 < 3000, f"P95 latency {p95}ms exceeds 3000ms threshold"

Regression Detection

import json
from pathlib import Path

class RegressionDetector:
    """Detect quality regressions between runs."""

    def __init__(self, history_path: str):
        self.history_path = Path(history_path)
        self.history = self._load_history()

    def _load_history(self) -> list[dict]:
        if self.history_path.exists():
            return json.loads(self.history_path.read_text())
        return []

    def record_run(self, result: PipelineResult):
        """Record a new test run."""
        self.history.append({
            "timestamp": result.timestamp,
            "metrics": result.metrics,
            "avg_latency_ms": result.avg_latency_ms,
        })
        self.history_path.write_text(json.dumps(self.history, indent=2))

    def check_regression(
        self,
        current: dict,
        threshold: float = 0.05
    ) -> list[dict]:
        """Check if current metrics regressed from baseline."""

        if len(self.history) < 5:
            return []  # Not enough history

        # Use last 5 runs as baseline
        baseline = {}
        for metric in current.keys():
            values = [h["metrics"].get(metric, 0) for h in self.history[-5:]]
            baseline[metric] = sum(values) / len(values)

        regressions = []
        for metric, current_value in current.items():
            baseline_value = baseline.get(metric, 0)

            if baseline_value > 0:
                change = (current_value - baseline_value) / baseline_value

                if change < -threshold:  # More than 5% regression
                    regressions.append({
                        "metric": metric,
                        "baseline": baseline_value,
                        "current": current_value,
                        "change_pct": change * 100,
                    })

        return regressions

# Usage in pipeline
detector = RegressionDetector("test_results/history.json")

result = pipeline.run()
detector.record_run(result)

regressions = detector.check_regression(result.metrics)
if regressions:
    print("REGRESSION DETECTED!")
    for r in regressions:
        print(f"  {r['metric']}: {r['baseline']:.2f} -> {r['current']:.2f} ({r['change_pct']:.1f}%)")

Alerting and Monitoring

import os

import requests

class AlertManager:
    """Send alerts for test failures."""

    def __init__(self, slack_webhook: str | None = None, email_config: dict | None = None):
        self.slack_webhook = slack_webhook
        self.email_config = email_config

    def alert_failure(self, result: PipelineResult):
        """Send alerts for failed tests."""

        if result.passed:
            return

        message = self._format_message(result)

        if self.slack_webhook:
            self._send_slack(message)

        if self.email_config:
            self._send_email(message)

    def _format_message(self, result: PipelineResult) -> str:
        failures = "\n".join([
            f"• {f['metric']}: {f['actual']:.2f} (expected {f['expected']})"
            for f in result.failures
        ])

        return f"""RAG Quality Alert

Test run failed at {result.timestamp}

Failures:
{failures}

Metrics:
• Faithfulness: {result.metrics.get('faithfulness', 0):.2f}
• Relevancy: {result.metrics.get('answer_relevancy', 0):.2f}
• Precision: {result.metrics.get('context_precision', 0):.2f}
• Avg Latency: {result.avg_latency_ms:.0f}ms
"""

    def _send_slack(self, message: str):
        requests.post(self.slack_webhook, json={"text": message})
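
    def _send_email(self, message: str):
        # Hypothetical sketch: send the alert via SMTP using the standard
        # library. The email_config keys used here (from, to, host, port,
        # user, password) are assumptions -- adapt them to your own config.
        import smtplib
        from email.message import EmailMessage

        msg = EmailMessage()
        msg["Subject"] = "RAG Quality Alert"
        msg["From"] = self.email_config["from"]
        msg["To"] = self.email_config["to"]
        msg.set_content(message)

        with smtplib.SMTP(self.email_config["host"], self.email_config.get("port", 587)) as smtp:
            smtp.starttls()
            smtp.login(self.email_config["user"], self.email_config["password"])
            smtp.send_message(msg)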

# Integration
alert_manager = AlertManager(
    slack_webhook=os.environ.get("SLACK_WEBHOOK"),
)

result = pipeline.run()
alert_manager.alert_failure(result)

Best Practices

Aspect          Recommendation
--------------  --------------------------------------------------------
Frequency       Run on every PR + daily scheduled run
Test set size   50-100 questions (balance cost vs. coverage)
Thresholds      Start conservative, tighten over time
History         Keep 30+ days for trend analysis
Alerts          Alert on failures AND regressions
Cost            Use cheaper models for frequent runs (see sketch below)
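
On the cost point: RAGAS lets you pass your own judge LLM to evaluate(), so frequent CI runs can use a cheaper model than your nightly full evaluation. A sketch, assuming a recent ragas release and the langchain-openai package; the wrapper API varies between versions, so check your installed release.

# Sketch: use a cheaper judge model for frequent runs. Assumes a recent
# ragas release and langchain-openai; the exact wrapper API may differ.
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import faithfulness, answer_relevancy, context_precision

cheap_judge = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

scores = evaluate(
    eval_dataset,  # the Dataset built earlier in the lesson
    metrics=[faithfulness, answer_relevancy, context_precision],
    llm=cheap_judge,
)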

Key Insight: Automated testing catches issues early, but review failures manually. Low scores might indicate test set problems, not RAG problems. Update your test set when you find false positives.

Next module: Production RAG Systems - optimization, reliability, and monitoring.
