Advanced NeMo Guardrails
RAG Retrieval Rails
3 min read
NeMo Guardrails provides built-in support for Retrieval-Augmented Generation (RAG) with guardrails that ensure responses are grounded in your knowledge base while preventing hallucinations.
Knowledge Base Setup
Configuration
# config/config.yml
models:
  - type: main
    engine: openai
    model: gpt-4o
  - type: embeddings
    engine: openai
    model: text-embedding-3-small

knowledge_base:
  - type: file
    path: ./kb/
    formats: [md, txt, pdf]
  - type: vector_db
    provider: chroma
    collection: company_docs
    connection_string: ./chroma_db

rails:
  retrieval:
    flows:
      - retrieve relevant chunks
      - check retrieval relevance
      - ground response in context
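The flows and actions referenced in this config live alongside it; NeMo Guardrails loads Colang files and custom actions from the config directory. A typical layout for the files used in this guide:

config/
├── config.yml
├── kb/                      # documents to index (md, txt, pdf)
├── rails/
│   ├── retrieval.co
│   ├── grounding.co
│   └── anti_hallucination.co
└── actions/
    ├── retrieval.py
    └── grounding.py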
Document Indexing
from nemoguardrails import RailsConfig
from nemoguardrails.kb.index import KnowledgeBaseIndex

config = RailsConfig.from_path("./config")

# Build an index over the knowledge base
# (check your nemoguardrails version for the exact KB indexing API)
index = KnowledgeBaseIndex(config)

# Add documents with source metadata so responses can cite them
index.add_documents([
    {
        "content": "Our product supports Python 3.9+...",
        "metadata": {"source": "docs/requirements.md", "category": "technical"}
    },
    {
        "content": "Pricing starts at $99/month...",
        "metadata": {"source": "docs/pricing.md", "category": "business"}
    }
])

# Build the index
index.build()
Retrieval Rails
Basic Retrieval Flow
# rails/retrieval.co
define flow retrieve relevant chunks
  """Retrieve context before generating a response."""
  $chunks = execute search_knowledge_base(query=$user_message, top_k=5)

  if len($chunks) == 0
    bot inform no relevant information
    stop

  # Store the chunks in the context for generation
  $context.retrieved_chunks = $chunks

define bot inform no relevant information
  "I don't have specific information about that in my knowledge base. Let me provide general guidance instead."
Relevance Checking
# actions/retrieval.py
from typing import Dict, List

import numpy as np

from nemoguardrails.actions import action


@action(name="search_knowledge_base")
async def search_knowledge_base(
    query: str,
    top_k: int = 5,
    relevance_threshold: float = 0.7
) -> List[Dict]:
    """Search the knowledge base, keeping only sufficiently relevant chunks."""
    # Embed the query with the configured embeddings model.
    # (get_embedding is an app-level helper; a sketch follows this block.
    # vector_db is your vector store client, e.g. a Chroma collection.)
    query_embedding = await get_embedding(query)

    # Search the vector DB, over-fetching so filtering still yields top_k
    results = await vector_db.search(
        embedding=query_embedding,
        top_k=top_k * 2
    )

    # Keep only chunks at or above the relevance threshold
    relevant = [r for r in results if r["score"] >= relevance_threshold][:top_k]
    return relevant


@action(name="check_retrieval_quality")
async def check_retrieval_quality(query: str, chunks: List[Dict]) -> Dict:
    """Assess whether the retrieved chunks are sufficient to answer the query."""
    if not chunks:
        return {"sufficient": False, "reason": "no_chunks"}

    # Check coverage: average relevance score across the retrieved chunks
    avg_relevance = float(np.mean([c["score"] for c in chunks]))
    if avg_relevance < 0.6:
        return {
            "sufficient": False,
            "reason": "low_relevance",
            "avg_score": avg_relevance
        }

    # Check diversity: redundant chunks from a single source add little
    unique_sources = len({c["metadata"]["source"] for c in chunks})

    return {
        "sufficient": True,
        "avg_relevance": avg_relevance,
        "source_diversity": unique_sources,
        "chunk_count": len(chunks)
    }
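search_knowledge_base assumes a get_embedding helper and a vector_db client; neither ships with NeMo Guardrails. A minimal sketch of the embedding side, assuming the official openai SDK and the text-embedding-3-small model from the config (the vector store client depends on your provider, e.g. Chroma):

# helpers/embeddings.py (hypothetical module)
from openai import AsyncOpenAI

client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

async def get_embedding(text: str, model: str = "text-embedding-3-small") -> list:
    """Embed a single string with the embeddings model from config.yml."""
    response = await client.embeddings.create(model=model, input=text)
    return response.data[0].embedding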
Grounding Verification
# rails/grounding.co
define flow ground response in context
  """Ensure the response is grounded in the retrieved context."""
  # First generate a response using the retrieved context
  $response = execute generate_grounded_response(query=$user_message, context=$context.retrieved_chunks)

  # Verify grounding
  $grounding_check = execute verify_grounding(response=$response, context=$context.retrieved_chunks)

  if not $grounding_check.is_grounded
    # Regenerate with stricter grounding
    $response = execute regenerate_with_citations(query=$user_message, context=$context.retrieved_chunks, ungrounded_claims=$grounding_check.ungrounded_claims)

  bot say $response
Grounding Verification Action
# actions/grounding.py
from typing import Dict, List

from transformers import pipeline

from nemoguardrails.actions import action

# Load the NLI model once at import time instead of on every call
nli = pipeline("text-classification", model="roberta-large-mnli")


@action(name="verify_grounding")
async def verify_grounding(response: str, context: List[Dict]) -> Dict:
    """
    Verify that the response's claims are grounded in the context,
    using an NLI (Natural Language Inference) model.
    """
    # Extract individual claims from the response
    # (extract_claims is an app-level helper; a sketch follows this block)
    claims = extract_claims(response)

    # Concatenate the retrieved chunks into a single premise
    context_text = " ".join(c["content"] for c in context)

    ungrounded = []
    for claim in claims:
        # Check whether the context entails the claim. Premise and hypothesis
        # go in as a pair; truncate long contexts to the model's input limit.
        result = nli([{"text": context_text, "text_pair": claim}], truncation=True)
        if result[0]["label"] != "ENTAILMENT":
            ungrounded.append({
                "claim": claim,
                "verdict": result[0]["label"],
                "confidence": result[0]["score"]
            })

    return {
        "is_grounded": len(ungrounded) == 0,
        "grounded_ratio": (len(claims) - len(ungrounded)) / len(claims) if claims else 1.0,
        "ungrounded_claims": ungrounded
    }


@action(name="generate_grounded_response")
async def generate_grounded_response(query: str, context: List[Dict]) -> str:
    """Generate a response strictly grounded in the retrieved context."""
    context_text = "\n\n".join(
        f"[Source: {c['metadata'].get('source', 'unknown')}]\n{c['content']}"
        for c in context
    )

    prompt = f"""Based ONLY on the following context, answer the user's question.
If the context doesn't contain enough information, say so.
Always cite your sources using [Source: filename] format.

Context:
{context_text}

Question: {query}

Answer (grounded in context only):"""

    # `llm` is the application's LLM handle (however it is wired up);
    # NeMo Guardrails can also inject the configured LLM into actions
    response = await llm.generate(prompt)
    return response


@action(name="regenerate_with_citations")
async def regenerate_with_citations(
    query: str,
    context: List[Dict],
    ungrounded_claims: List[Dict]
) -> str:
    """Regenerate the answer, dropping ungrounded claims and adding citations."""
    context_text = "\n\n".join(
        f"[Source: {c['metadata'].get('source', 'unknown')}]\n{c['content']}"
        for c in context
    )
    ungrounded_text = "\n".join(c["claim"] for c in ungrounded_claims)

    prompt = f"""Based ONLY on the following context, answer the user's question.
DO NOT include these unverified claims: {ungrounded_text}
Every statement must have a citation like [Source: filename].

Context:
{context_text}

Question: {query}

Answer (with citations):"""

    response = await llm.generate(prompt)
    return response
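verify_grounding leans on an extract_claims helper that the snippet above doesn't define. A minimal sketch, assuming sentence-level claims are good enough; production systems often use an LLM or a dedicated claim splitter instead:

import re

def extract_claims(response: str) -> list:
    """Naive claim extraction: treat each sentence as one claim.

    Strips citation markers like [Source: ...] first so they don't
    confuse the NLI model, then splits on sentence-ending punctuation.
    """
    text = re.sub(r"\[Source:[^\]]*\]", "", response)
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    # Skip fragments too short to carry a verifiable claim
    return [s.strip() for s in sentences if len(s.strip()) > 20]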
Production RAG Pipeline
# Complete RAG pipeline with guardrails
from dataclasses import dataclass

from nemoguardrails import LLMRails, RailsConfig


@dataclass
class RAGResult:
    answer: str
    sources: list
    grounding_score: float
    was_regenerated: bool


class GuardedRAGPipeline:
    """Production RAG with NeMo Guardrails."""

    def __init__(self, config_path: str):
        self.config = RailsConfig.from_path(config_path)
        self.rails = LLMRails(self.config)

    async def query(self, question: str, user_context: dict = None) -> RAGResult:
        """Execute a guarded RAG query."""
        # Seed context variables via a "context" message before the user turn
        messages = []
        if user_context:
            messages.append({"role": "context", "content": user_context})
        messages.append({"role": "user", "content": question})

        # Execute the rails pipeline
        response = await self.rails.generate_async(messages=messages)

        # Pull flow outputs back out of the response; the exact shape depends
        # on your NeMo Guardrails version (recent versions expose context
        # variables through generation options / output_vars)
        retrieved_chunks = response.get("context", {}).get("retrieved_chunks", [])
        grounding_check = response.get("context", {}).get("grounding_check", {})

        return RAGResult(
            answer=response["content"],
            sources=[c["metadata"]["source"] for c in retrieved_chunks],
            grounding_score=grounding_check.get("grounded_ratio", 1.0),
            was_regenerated=grounding_check.get("was_regenerated", False)
        )


# Usage (from inside an async context, e.g. asyncio.run(main()))
pipeline = GuardedRAGPipeline("./config")

result = await pipeline.query(
    "What programming languages does your product support?"
)

print(f"Answer: {result.answer}")
print(f"Sources: {result.sources}")
print(f"Grounding: {result.grounding_score:.1%}")
Hallucination Prevention Flow
# rails/anti_hallucination.co
define flow prevent hallucination
  """Complete anti-hallucination pipeline."""
  # Step 1: Retrieve
  $chunks = execute search_knowledge_base(query=$user_message, top_k=5)

  # Step 2: Check whether we have enough context
  $quality = execute check_retrieval_quality(query=$user_message, chunks=$chunks)

  if not $quality.sufficient
    if $quality.reason == "no_chunks"
      bot admit lack of knowledge
    else
      bot provide hedged response
    stop

  # Step 3: Generate a grounded response
  $response = execute generate_grounded_response(query=$user_message, context=$chunks)

  # Step 4: Verify grounding
  $check = execute verify_grounding(response=$response, context=$chunks)

  if $check.grounded_ratio < 0.8
    # Too many ungrounded claims: regenerate with citations
    $response = execute regenerate_with_citations(query=$user_message, context=$chunks, ungrounded_claims=$check.ungrounded_claims)

  # Step 5: Prefix a confidence indicator derived from the grounding score
  if $check.grounded_ratio > 0.9
    $confidence = "high"
  else
    $confidence = "moderate"
  $response = "(Confidence: " + $confidence + ") " + $response

  bot say $response

define bot admit lack of knowledge
  "I don't have verified information about that topic in my knowledge base. I'd recommend checking our official documentation or contacting support."

define bot provide hedged response
  "Based on the limited information I found (please verify independently): "
RAG Best Practice: Always verify grounding before returning responses. A response that admits uncertainty is better than one that confidently states unverified information.
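As a quick sanity check of that principle, you can exercise verify_grounding directly with a deliberately fabricated claim (hypothetical values; assumes the action module above is importable):

import asyncio

from actions.grounding import verify_grounding  # module path assumed

context = [{
    "content": "Our product supports Python 3.9+.",
    "metadata": {"source": "docs/requirements.md"}
}]

# One grounded sentence, one fabricated one
answer = "The product supports Python 3.9 and newer. It also runs natively on PHP 5."

check = asyncio.run(verify_grounding(response=answer, context=context))
print(check["is_grounded"])        # expect False if the NLI model flags the PHP claim
print(check["ungrounded_claims"])  # the fabricated claim with its verdict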
Next: Deep dive into the Guardrails AI framework for schema validation.