Advanced NeMo Guardrails
RAG Retrieval Rails
3 min read
NeMo Guardrails provides built-in support for Retrieval-Augmented Generation (RAG) with guardrails that ensure responses are grounded in your knowledge base while preventing hallucinations.
Knowledge Base Setup
Configuration
# config/config.yml
models:
  - type: main
    engine: openai
    model: gpt-4o
  - type: embeddings
    engine: openai
    model: text-embedding-3-small

knowledge_base:
  - type: file
    path: ./kb/
    formats: [md, txt, pdf]
  - type: vector_db
    provider: chroma
    collection: company_docs
    connection_string: ./chroma_db

rails:
  retrieval:
    flows:
      - retrieve relevant chunks
      - check retrieval relevance
      - ground response in context
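The flows and actions referenced in this config live alongside it; NeMo Guardrails loads Colang files and custom actions from the config directory. A typical layout for the files used in this guide:

config/
├── config.yml
├── kb/                      # documents to index (md, txt, pdf)
├── rails/
│   ├── retrieval.co
│   ├── grounding.co
│   └── anti_hallucination.co
└── actions/
    ├── retrieval.py
    └── grounding.py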
Document Indexing
from nemoguardrails import RailsConfig
from nemoguardrails.kb.index import KnowledgeBaseIndex

config = RailsConfig.from_path("./config")

# Build an index over the knowledge base
# (check your nemoguardrails version for the exact KB indexing API)
index = KnowledgeBaseIndex(config)

# Add documents with source metadata so responses can cite them
index.add_documents([
    {
        "content": "Our product supports Python 3.9+...",
        "metadata": {"source": "docs/requirements.md", "category": "technical"}
    },
    {
        "content": "Pricing starts at $99/month...",
        "metadata": {"source": "docs/pricing.md", "category": "business"}
    }
])

# Build the index
index.build()
Retrieval Rails
Basic Retrieval Flow
# rails/retrieval.co
define flow retrieve relevant chunks
  """Retrieve context before generating a response."""
  $chunks = execute search_knowledge_base(query=$user_message, top_k=5)

  if len($chunks) == 0
    bot inform no relevant information
    stop

  # Store the chunks in the context for generation
  $context.retrieved_chunks = $chunks

define bot inform no relevant information
  "I don't have specific information about that in my knowledge base. Let me provide general guidance instead."
Relevance Checking
# actions/retrieval.py
from typing import Dict, List

import numpy as np

from nemoguardrails.actions import action


@action(name="search_knowledge_base")
async def search_knowledge_base(
    query: str,
    top_k: int = 5,
    relevance_threshold: float = 0.7
) -> List[Dict]:
    """Search the knowledge base, keeping only sufficiently relevant chunks."""
    # Embed the query with the configured embeddings model.
    # (get_embedding is an app-level helper; a sketch follows this block.
    # vector_db is your vector store client, e.g. a Chroma collection.)
    query_embedding = await get_embedding(query)

    # Search the vector DB, over-fetching so filtering still yields top_k
    results = await vector_db.search(
        embedding=query_embedding,
        top_k=top_k * 2
    )

    # Keep only chunks at or above the relevance threshold
    relevant = [r for r in results if r["score"] >= relevance_threshold][:top_k]
    return relevant


@action(name="check_retrieval_quality")
async def check_retrieval_quality(query: str, chunks: List[Dict]) -> Dict:
    """Assess whether the retrieved chunks are sufficient to answer the query."""
    if not chunks:
        return {"sufficient": False, "reason": "no_chunks"}

    # Check coverage: average relevance score across the retrieved chunks
    avg_relevance = float(np.mean([c["score"] for c in chunks]))
    if avg_relevance < 0.6:
        return {
            "sufficient": False,
            "reason": "low_relevance",
            "avg_score": avg_relevance
        }

    # Check diversity: redundant chunks from a single source add little
    unique_sources = len({c["metadata"]["source"] for c in chunks})

    return {
        "sufficient": True,
        "avg_relevance": avg_relevance,
        "source_diversity": unique_sources,
        "chunk_count": len(chunks)
    }
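search_knowledge_base assumes a get_embedding helper and a vector_db client; neither ships with NeMo Guardrails. A minimal sketch of the embedding side, assuming the official openai SDK and the text-embedding-3-small model from the config (the vector store client depends on your provider, e.g. Chroma):

# helpers/embeddings.py (hypothetical module)
from openai import AsyncOpenAI

client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

async def get_embedding(text: str, model: str = "text-embedding-3-small") -> list:
    """Embed a single string with the embeddings model from config.yml."""
    response = await client.embeddings.create(model=model, input=text)
    return response.data[0].embedding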
Grounding Verification
# rails/grounding.co
define flow ground response in context
  """Ensure the response is grounded in the retrieved context."""
  # First generate a response using the retrieved context
  $response = execute generate_grounded_response(query=$user_message, context=$context.retrieved_chunks)

  # Verify grounding
  $grounding_check = execute verify_grounding(response=$response, context=$context.retrieved_chunks)

  if not $grounding_check.is_grounded
    # Regenerate with stricter grounding
    $response = execute regenerate_with_citations(query=$user_message, context=$context.retrieved_chunks, ungrounded_claims=$grounding_check.ungrounded_claims)

  bot say $response
Grounding Verification Action
# actions/grounding.py
from typing import Dict, List

from transformers import pipeline

from nemoguardrails.actions import action

# Load the NLI model once at import time instead of on every call
nli = pipeline("text-classification", model="roberta-large-mnli")


@action(name="verify_grounding")
async def verify_grounding(response: str, context: List[Dict]) -> Dict:
    """
    Verify that the response's claims are grounded in the context,
    using an NLI (Natural Language Inference) model.
    """
    # Extract individual claims from the response
    # (extract_claims is an app-level helper; a sketch follows this block)
    claims = extract_claims(response)

    # Concatenate the retrieved chunks into a single premise
    context_text = " ".join(c["content"] for c in context)

    ungrounded = []
    for claim in claims:
        # Check whether the context entails the claim. Premise and hypothesis
        # go in as a pair; truncate long contexts to the model's input limit.
        result = nli([{"text": context_text, "text_pair": claim}], truncation=True)
        if result[0]["label"] != "ENTAILMENT":
            ungrounded.append({
                "claim": claim,
                "verdict": result[0]["label"],
                "confidence": result[0]["score"]
            })

    return {
        "is_grounded": len(ungrounded) == 0,
        "grounded_ratio": (len(claims) - len(ungrounded)) / len(claims) if claims else 1.0,
        "ungrounded_claims": ungrounded
    }


@action(name="generate_grounded_response")
async def generate_grounded_response(query: str, context: List[Dict]) -> str:
    """Generate a response strictly grounded in the retrieved context."""
    context_text = "\n\n".join(
        f"[Source: {c['metadata'].get('source', 'unknown')}]\n{c['content']}"
        for c in context
    )

    prompt = f"""Based ONLY on the following context, answer the user's question.
If the context doesn't contain enough information, say so.
Always cite your sources using [Source: filename] format.

Context:
{context_text}

Question: {query}

Answer (grounded in context only):"""

    # `llm` is the application's LLM handle (however it is wired up);
    # NeMo Guardrails can also inject the configured LLM into actions
    response = await llm.generate(prompt)
    return response


@action(name="regenerate_with_citations")
async def regenerate_with_citations(
    query: str,
    context: List[Dict],
    ungrounded_claims: List[Dict]
) -> str:
    """Regenerate the answer, dropping ungrounded claims and adding citations."""
    context_text = "\n\n".join(
        f"[Source: {c['metadata'].get('source', 'unknown')}]\n{c['content']}"
        for c in context
    )
    ungrounded_text = "\n".join(c["claim"] for c in ungrounded_claims)

    prompt = f"""Based ONLY on the following context, answer the user's question.
DO NOT include these unverified claims: {ungrounded_text}
Every statement must have a citation like [Source: filename].

Context:
{context_text}

Question: {query}

Answer (with citations):"""

    response = await llm.generate(prompt)
    return response
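verify_grounding leans on an extract_claims helper that the snippet above doesn't define. A minimal sketch, assuming sentence-level claims are good enough; production systems often use an LLM or a dedicated claim splitter instead:

import re

def extract_claims(response: str) -> list:
    """Naive claim extraction: treat each sentence as one claim.

    Strips citation markers like [Source: ...] first so they don't
    confuse the NLI model, then splits on sentence-ending punctuation.
    """
    text = re.sub(r"\[Source:[^\]]*\]", "", response)
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    # Skip fragments too short to carry a verifiable claim
    return [s.strip() for s in sentences if len(s.strip()) > 20]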
Production RAG Pipeline
# Complete RAG pipeline with guardrails
from dataclasses import dataclass

from nemoguardrails import LLMRails, RailsConfig


@dataclass
class RAGResult:
    answer: str
    sources: list
    grounding_score: float
    was_regenerated: bool


class GuardedRAGPipeline:
    """Production RAG with NeMo Guardrails."""

    def __init__(self, config_path: str):
        self.config = RailsConfig.from_path(config_path)
        self.rails = LLMRails(self.config)

    async def query(self, question: str, user_context: dict = None) -> RAGResult:
        """Execute a guarded RAG query."""
        # Seed context variables via a "context" message before the user turn
        messages = []
        if user_context:
            messages.append({"role": "context", "content": user_context})
        messages.append({"role": "user", "content": question})

        # Execute the rails pipeline
        response = await self.rails.generate_async(messages=messages)

        # Pull flow outputs back out of the response; the exact shape depends
        # on your NeMo Guardrails version (recent versions expose context
        # variables through generation options / output_vars)
        retrieved_chunks = response.get("context", {}).get("retrieved_chunks", [])
        grounding_check = response.get("context", {}).get("grounding_check", {})

        return RAGResult(
            answer=response["content"],
            sources=[c["metadata"]["source"] for c in retrieved_chunks],
            grounding_score=grounding_check.get("grounded_ratio", 1.0),
            was_regenerated=grounding_check.get("was_regenerated", False)
        )


# Usage (from inside an async context, e.g. asyncio.run(main()))
pipeline = GuardedRAGPipeline("./config")

result = await pipeline.query(
    "What programming languages does your product support?"
)

print(f"Answer: {result.answer}")
print(f"Sources: {result.sources}")
print(f"Grounding: {result.grounding_score:.1%}")
Hallucination Prevention Flow
# rails/anti_hallucination.co
define flow prevent hallucination
  """Complete anti-hallucination pipeline."""
  # Step 1: Retrieve
  $chunks = execute search_knowledge_base(query=$user_message, top_k=5)

  # Step 2: Check whether we have enough context
  $quality = execute check_retrieval_quality(query=$user_message, chunks=$chunks)

  if not $quality.sufficient
    if $quality.reason == "no_chunks"
      bot admit lack of knowledge
    else
      bot provide hedged response
    stop

  # Step 3: Generate a grounded response
  $response = execute generate_grounded_response(query=$user_message, context=$chunks)

  # Step 4: Verify grounding
  $check = execute verify_grounding(response=$response, context=$chunks)

  if $check.grounded_ratio < 0.8
    # Too many ungrounded claims: regenerate with citations
    $response = execute regenerate_with_citations(query=$user_message, context=$chunks, ungrounded_claims=$check.ungrounded_claims)

  # Step 5: Prefix a confidence indicator derived from the grounding score
  if $check.grounded_ratio > 0.9
    $confidence = "high"
  else
    $confidence = "moderate"
  $response = "(Confidence: " + $confidence + ") " + $response

  bot say $response

define bot admit lack of knowledge
  "I don't have verified information about that topic in my knowledge base. I'd recommend checking our official documentation or contacting support."

define bot provide hedged response
  "Based on the limited information I found (please verify independently): "
RAG Best Practice: Always verify grounding before returning responses. A response that admits uncertainty is better than one that confidently states unverified information.
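As a quick sanity check of that principle, you can exercise verify_grounding directly with a deliberately fabricated claim (hypothetical values; assumes the action module above is importable):

import asyncio

from actions.grounding import verify_grounding  # module path assumed

context = [{
    "content": "Our product supports Python 3.9+.",
    "metadata": {"source": "docs/requirements.md"}
}]

# One grounded sentence, one fabricated one
answer = "The product supports Python 3.9 and newer. It also runs natively on PHP 5."

check = asyncio.run(verify_grounding(response=answer, context=context))
print(check["is_grounded"])        # expect False if the NLI model flags the PHP claim
print(check["ungrounded_claims"])  # the fabricated claim with its verdict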
Next: Deep dive into the Guardrails AI framework for schema validation.