RAG Evaluation & Testing

RAGAS Framework

RAGAS (Retrieval Augmented Generation Assessment) is a widely used framework for evaluating RAG pipelines. It provides automated, largely reference-free evaluation by using LLMs as judges; of the core metrics, only context recall requires a ground-truth answer.

Why RAGAS?

┌────────────────────────────────────────────────────────────┐
│                    Traditional Evaluation                   │
├────────────────────────────────────────────────────────────┤
│  • Requires human-labeled ground truth                     │
│  • Expensive and time-consuming                            │
│  • Hard to scale                                           │
│  • Static test sets become stale                           │
└────────────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────┐
│                    RAGAS Evaluation                         │
├────────────────────────────────────────────────────────────┤
│  • Uses LLMs to evaluate LLM outputs                       │
│  • Reference-free (no ground truth needed)                 │
│  • Scalable and automated                                  │
│  • Evaluates multiple dimensions                           │
└────────────────────────────────────────────────────────────┘

Installation and Setup

pip install ragas langchain-openai datasets

import os
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from datasets import Dataset

# Set up your LLM provider
os.environ["OPENAI_API_KEY"] = "your-key"

RAGAS Core Metrics

1. Faithfulness

Measures if the answer is grounded in the context:

from ragas.metrics import faithfulness

# How it works:
# 1. Extract statements from the answer
# 2. For each statement, check if context supports it
# 3. Score = supported_statements / total_statements

# Score interpretation:
# 1.0 = Every claim in answer is supported by context
# 0.5 = Half the claims are unsupported (hallucinations)
# 0.0 = Answer is completely made up
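
A toy illustration of the arithmetic (not the RAGAS internals): in practice an LLM extracts the statements and produces the support verdicts, which are hard-coded here for clarity.

# Hypothetical statements extracted from an answer, with support verdicts
statements = [
    ("Paris is the capital of France", True),    # supported by the retrieved context
    ("Paris has 10 million residents", False),   # not in the context -> hallucination
]

supported = sum(1 for _, is_supported in statements if is_supported)
faithfulness_score = supported / len(statements)
print(faithfulness_score)  # 0.5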

2. Answer Relevancy

Measures if the answer addresses the question:

from ragas.metrics import answer_relevancy

# How it works:
# 1. Generate N questions from the answer
# 2. Calculate cosine similarity with original question
# 3. Score = average similarity

# Score interpretation:
# 1.0 = Answer perfectly addresses the question
# 0.5 = Answer partially relevant
# 0.0 = Answer completely off-topic
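
A minimal sketch of that idea, assuming OpenAI embeddings and two hypothetical questions an LLM generated from the answer (RAGAS automates both steps internally):

import numpy as np
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

original_question = "What is the capital of France?"
generated_questions = [  # would normally be produced by an LLM from the answer
    "Which city is the capital of France?",
    "What is France's capital city?",
]

def cosine(a, b):
    a, b = np.array(a), np.array(b)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

q_vec = embeddings.embed_query(original_question)
similarities = [cosine(q_vec, embeddings.embed_query(g)) for g in generated_questions]
answer_relevancy_score = sum(similarities) / len(similarities)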

3. Context Precision

Measures if retrieved contexts are relevant:

from ragas.metrics import context_precision

# How it works:
# 1. For each context chunk, determine if it's relevant
# 2. Weight by position (earlier = more important)
# 3. Score = weighted precision

# Score interpretation:
# 1.0 = All retrieved contexts are relevant
# 0.5 = Half the contexts are noise
# 0.0 = No relevant contexts retrieved
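
A toy illustration of one common position-weighted formulation (mean precision@k over the positions judged relevant). In RAGAS the relevance verdicts come from an LLM; here they are hard-coded:

# Relevance verdicts for four retrieved chunks, in retrieval order
relevance = [True, True, False, True]

precisions_at_hits = []
hits = 0
for k, is_relevant in enumerate(relevance, start=1):
    if is_relevant:
        hits += 1
        precisions_at_hits.append(hits / k)  # precision@k at each relevant position

context_precision_score = sum(precisions_at_hits) / max(hits, 1)
print(round(context_precision_score, 2))  # 0.92 -- relevant chunks ranked early score higher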

4. Context Recall

Measures if all necessary information was retrieved:

from ragas.metrics import context_recall

# Requires ground truth answer
# How it works:
# 1. Extract claims from ground truth answer
# 2. Check if each claim is attributable to contexts
# 3. Score = attributable_claims / total_claims

# Score interpretation:
# 1.0 = All information needed is in context
# 0.5 = Missing half the necessary information
# 0.0 = None of the needed information retrieved
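
The same kind of toy illustration for the recall ratio, with hypothetical attribution verdicts (an LLM produces these in RAGAS) for claims taken from the ground-truth answer:

# Claims from the ground-truth answer, with attribution verdicts
ground_truth_claims = [
    ("Photosynthesis converts light into chemical energy", True),   # found in the contexts
    ("It takes place in chloroplasts", False),                      # never retrieved
]

attributable = sum(1 for _, in_context in ground_truth_claims if in_context)
context_recall_score = attributable / len(ground_truth_claims)
print(context_recall_score)  # 0.5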

Running RAGAS Evaluation

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from datasets import Dataset

# Prepare your evaluation data
eval_data = {
    "question": [
        "What is the capital of France?",
        "Who wrote Romeo and Juliet?",
        "What is photosynthesis?",
    ],
    "answer": [
        "Paris is the capital of France.",
        "William Shakespeare wrote Romeo and Juliet.",
        "Photosynthesis is how plants convert sunlight to energy.",
    ],
    "contexts": [
        ["Paris is the capital and largest city of France."],
        ["Romeo and Juliet is a tragedy by William Shakespeare."],
        ["Photosynthesis is a process used by plants to convert light into chemical energy."],
    ],
    "ground_truth": [
        "The capital of France is Paris.",
        "William Shakespeare wrote Romeo and Juliet.",
        "Photosynthesis is the process plants use to convert sunlight into food.",
    ],
}

# Create dataset
dataset = Dataset.from_dict(eval_data)

# Run evaluation
results = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    ],
)

print(results)
# Example output (your numbers will vary):
# {
#     'faithfulness': 0.95,
#     'answer_relevancy': 0.92,
#     'context_precision': 0.88,
#     'context_recall': 0.90
# }
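
Aggregate numbers can hide per-question failures. The object returned by evaluate can be converted to a pandas DataFrame for row-level inspection (available in recent ragas releases; check the API of your installed version):

# Per-sample scores alongside the inputs (column names vary slightly by version)
df = results.to_pandas()
print(df.head())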

Evaluating Your RAG Pipeline

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from datasets import Dataset

# The pipeline below assumes an already-populated Chroma index and an OpenAI
# chat model; swap in whatever retriever and LLM your application uses.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
vectorstore = Chroma(
    persist_directory="./chroma_db",            # path to your existing index
    embedding_function=OpenAIEmbeddings(),
)

# Your RAG pipeline
def run_rag(question: str) -> dict:
    """Run RAG and return results in RAGAS format."""

    # Retrieve
    docs = vectorstore.similarity_search(question, k=5)
    contexts = [doc.page_content for doc in docs]

    # Generate: join chunks into plain text so the prompt doesn't contain a Python list repr
    context_block = "\n\n".join(contexts)
    prompt = (
        "Answer based on the context.\n"
        f"Context: {context_block}\n"
        f"Question: {question}\n"
    )
    answer = llm.invoke(prompt).content

    return {
        "question": question,
        "answer": answer,
        "contexts": contexts,
    }

# Evaluation questions
test_questions = [
    "How do I reset my password?",
    "What are the pricing tiers?",
    "How do I contact support?",
]

# Run RAG on all questions
results = [run_rag(q) for q in test_questions]

# Format for RAGAS
eval_dataset = Dataset.from_dict({
    "question": [r["question"] for r in results],
    "answer": [r["answer"] for r in results],
    "contexts": [r["contexts"] for r in results],
})

# Evaluate
scores = evaluate(
    eval_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision],
)

print(f"Faithfulness: {scores['faithfulness']:.2f}")
print(f"Answer Relevancy: {scores['answer_relevancy']:.2f}")
print(f"Context Precision: {scores['context_precision']:.2f}")

Interpreting Results

Metric               Good      Acceptable    Poor
Faithfulness         > 0.9     0.7 - 0.9     < 0.7
Answer Relevancy     > 0.85    0.7 - 0.85    < 0.7
Context Precision    > 0.8     0.6 - 0.8     < 0.6
Context Recall       > 0.8     0.6 - 0.8     < 0.6

Diagnosing Issues

def diagnose_rag_issues(scores: dict) -> list[str]:
    """Identify RAG pipeline issues from RAGAS scores."""
    issues = []

    if scores.get("faithfulness", 1) < 0.7:
        issues.append(
            "LOW FAITHFULNESS: LLM is hallucinating. "
            "Try: Stronger prompts, lower temperature, better context."
        )

    if scores.get("answer_relevancy", 1) < 0.7:
        issues.append(
            "LOW RELEVANCY: Answers don't match questions. "
            "Try: Better prompts, query rewriting, improved retrieval."
        )

    if scores.get("context_precision", 1) < 0.6:
        issues.append(
            "LOW PRECISION: Too much irrelevant context. "
            "Try: Better embeddings, reranking, metadata filtering."
        )

    if scores.get("context_recall", 1) < 0.6:
        issues.append(
            "LOW RECALL: Missing relevant information. "
            "Try: More chunks, better chunking, hybrid search."
        )

    return issues

# Example
issues = diagnose_rag_issues({
    "faithfulness": 0.85,
    "answer_relevancy": 0.90,
    "context_precision": 0.55,  # Problem!
    "context_recall": 0.75,
})

# Output:
# ["LOW PRECISION: Too much irrelevant context. Try: Better embeddings, reranking, metadata filtering."]

Custom Metrics

The custom-metric interface has changed across ragas releases, so treat the class below as a sketch of the general shape (subclass the metric base class, give it a name, return a score per row) and check ragas.metrics.base in your installed version for the exact hooks:

from ragas.metrics.base import Metric

class AnswerConciseness(Metric):
    """Custom metric for answer length appropriateness."""

    name = "answer_conciseness"

    def _score(self, row: dict) -> float:
        answer = row["answer"]
        question = row["question"]

        # Simple heuristic: penalize very long answers
        word_count = len(answer.split())

        if word_count < 10:
            return 0.5  # Too short
        elif word_count < 100:
            return 1.0  # Good length
        elif word_count < 200:
            return 0.8  # Getting long
        else:
            return 0.5  # Too verbose

# Use custom metric
scores = evaluate(
    dataset,
    metrics=[faithfulness, AnswerConciseness()],
)

Key Insight: RAGAS provides automated evaluation at scale, but you should still validate its scores against human judgments periodically. LLM-as-judge evaluation can have blind spots, especially for domain-specific content.

Next, let's learn how to build comprehensive test datasets.
