LLM Evaluation & Metrics

English Version

"You can't improve what you can't measure." This principle is especially critical for LLM systems, where subjective quality must be translated into objective metrics. The wrong metrics can lead to optimizing for the wrong outcomes.

Interview Relevance: Evaluation questions appear in 95% of LLM engineer interviews. Companies want to see you can design rigorous evaluation frameworks and interpret benchmark results correctly.

Core Metrics Categories

1. Accuracy Metrics

Classification Tasks:

from typing import Dict, List, Optional
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

class ClassificationEvaluator:
    """
    Comprehensive evaluation for classification tasks.
    Used for sentiment analysis, intent detection, content moderation, etc.
    """

    @staticmethod
    def evaluate(
        y_true: List[str],
        y_pred: List[str],
        labels: Optional[List[str]] = None
    ) -> Dict:
        """
        Calculate all classification metrics.

        Returns:
            {
                "accuracy": float,
                "precision_macro": float,
                "recall_macro": float,
                "f1_macro": float,
                "per_class_metrics": dict,
                "confusion_matrix": np.ndarray
            }
        """

        # Default to the label set observed in the data
        if labels is None:
            labels = sorted(set(y_true) | set(y_pred))

        # Overall metrics
        report = classification_report(
            y_true,
            y_pred,
            labels=labels,
            output_dict=True,
            zero_division=0
        )

        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=labels)

        return {
            "accuracy": report["accuracy"],
            "precision_macro": report["macro avg"]["precision"],
            "recall_macro": report["macro avg"]["recall"],
            "f1_macro": report["macro avg"]["f1-score"],
            "per_class_metrics": {
                label: {
                    "precision": report[label]["precision"],
                    "recall": report[label]["recall"],
                    "f1": report[label]["f1-score"],
                    "support": report[label]["support"]
                }
                for label in labels if label in report
            },
            "confusion_matrix": cm
        }

    @staticmethod
    def plot_confusion_matrix(cm: np.ndarray, labels: List[str], title: str = "Confusion Matrix"):
        """Visualize confusion matrix."""
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=labels,
            yticklabels=labels
        )
        plt.title(title)
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        return plt


# Example: Sentiment Analysis Evaluation
if __name__ == "__main__":
    # Simulated predictions illustrating a 1,000-sample sentiment test set
    y_true = ["positive"] * 400 + ["negative"] * 350 + ["neutral"] * 250
    y_pred = (
        ["positive"] * 380 + ["neutral"] * 15 + ["negative"] * 5 +  # positive predictions
        ["negative"] * 330 + ["neutral"] * 15 + ["positive"] * 5 +  # negative predictions
        ["neutral"] * 210 + ["positive"] * 20 + ["negative"] * 20   # neutral predictions
    )

    evaluator = ClassificationEvaluator()
    metrics = evaluator.evaluate(
        y_true,
        y_pred,
        labels=["positive", "negative", "neutral"]
    )

    print("=== Sentiment Analysis Metrics ===\n")
    print(f"Overall Accuracy: {metrics['accuracy']:.3f}")
    print(f"Macro F1: {metrics['f1_macro']:.3f}")
    print(f"Macro Precision: {metrics['precision_macro']:.3f}")
    print(f"Macro Recall: {metrics['recall_macro']:.3f}")
    print("\nPer-Class Performance:")
    for label, class_metrics in metrics['per_class_metrics'].items():
        print(f"\n{label.upper()}:")
        print(f"  Precision: {class_metrics['precision']:.3f}")
        print(f"  Recall: {class_metrics['recall']:.3f}")
        print(f"  F1: {class_metrics['f1']:.3f}")
        print(f"  Support: {class_metrics['support']}")

    # Output:
    # === Sentiment Analysis Metrics ===
    #
    # Overall Accuracy: 0.920
    # Macro F1: 0.912
    # Macro Precision: 0.914
    # Macro Recall: 0.911
    #
    # Per-Class Performance:
    #
    # POSITIVE:
    #   Precision: 0.938
    #   Recall: 0.950
    #   F1: 0.944
    #   Support: 400
    #
    # NEGATIVE:
    #   Precision: 0.930
    #   Recall: 0.943
    #   F1: 0.936
    #   Support: 350
    #
    # NEUTRAL:
    #   Precision: 0.875
    #   Recall: 0.840
    #   F1: 0.857
    #   Support: 250

Key Insights:

  • Accuracy: Simple but can be misleading with imbalanced classes
  • F1 Score: Harmonic mean of precision and recall, better for imbalanced data
  • Macro vs Weighted: Macro averaging treats every class equally; weighted averaging weights each class by its support (see the sketch below)
  • Per-Class Analysis: Essential for finding where the model struggles
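
A minimal sketch of the macro vs. weighted distinction on a deliberately imbalanced toy dataset (the class names and counts are made up for illustration): a degenerate classifier that always predicts the majority class looks strong on accuracy and weighted F1, but macro F1 exposes its failure on the minority class.

from sklearn.metrics import accuracy_score, f1_score

# Imbalanced toy data: 5% "spam", 95% "ham"
y_true = ["spam"] * 50 + ["ham"] * 950
y_pred = ["ham"] * 1000  # degenerate model: always predicts the majority class

print(f"Accuracy:    {accuracy_score(y_true, y_pred):.3f}")  # 0.950
print(f"Weighted F1: {f1_score(y_true, y_pred, average='weighted', zero_division=0):.3f}")  # ~0.926
print(f"Macro F1:    {f1_score(y_true, y_pred, average='macro', zero_division=0):.3f}")     # ~0.487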

2. Generation Quality Metrics

BLEU, ROUGE, METEOR:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from typing import List, Dict
import numpy as np

class GenerationEvaluator:
    """
    Evaluate text generation quality.
    Used for summarization, translation, content generation.
    """

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )
        self.smoothing = SmoothingFunction().method1

    def evaluate_bleu(
        self,
        reference: str,
        candidate: str,
        max_n: int = 4
    ) -> Dict[str, float]:
        """
        Calculate BLEU scores (1-gram through 4-gram).

        BLEU measures n-gram overlap between reference and candidate.
        Higher is better (0-1 scale).
        """
        ref_tokens = reference.split()
        cand_tokens = candidate.split()

        scores = {}
        for n in range(1, max_n + 1):
            weights = tuple([1.0/n] * n + [0] * (4-n))
            score = sentence_bleu(
                [ref_tokens],
                cand_tokens,
                weights=weights,
                smoothing_function=self.smoothing
            )
            scores[f'bleu_{n}'] = score

        return scores

    def evaluate_rouge(
        self,
        reference: str,
        candidate: str
    ) -> Dict[str, float]:
        """
        Calculate ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L).

        ROUGE is recall-oriented n-gram overlap (the rouge_score package
        also reports precision and F-measure). Used primarily for summarization.
        """
        scores = self.rouge_scorer.score(reference, candidate)

        return {
            'rouge1_f': scores['rouge1'].fmeasure,
            'rouge1_p': scores['rouge1'].precision,
            'rouge1_r': scores['rouge1'].recall,
            'rouge2_f': scores['rouge2'].fmeasure,
            'rougeL_f': scores['rougeL'].fmeasure,
        }

    def evaluate_dataset(
        self,
        references: List[str],
        candidates: List[str]
    ) -> Dict:
        """Evaluate entire dataset."""

        all_bleu = []
        all_rouge1 = []
        all_rouge2 = []
        all_rougeL = []

        for ref, cand in zip(references, candidates):
            bleu = self.evaluate_bleu(ref, cand)
            rouge = self.evaluate_rouge(ref, cand)

            all_bleu.append(bleu['bleu_4'])
            all_rouge1.append(rouge['rouge1_f'])
            all_rouge2.append(rouge['rouge2_f'])
            all_rougeL.append(rouge['rougeL_f'])

        return {
            'bleu_4_mean': np.mean(all_bleu),
            'bleu_4_std': np.std(all_bleu),
            'rouge1_mean': np.mean(all_rouge1),
            'rouge2_mean': np.mean(all_rouge2),
            'rougeL_mean': np.mean(all_rougeL),
        }


# Example: Summarization Evaluation
if __name__ == "__main__":
    evaluator = GenerationEvaluator()

    # Article about AI
    reference = """
    Artificial intelligence has made remarkable progress in recent years.
    Large language models can now perform complex reasoning, write code, and
    engage in nuanced conversations. However, challenges remain in areas like
    factual accuracy and bias mitigation.
    """

    # GPT-4 summary
    candidate_good = """
    AI, particularly large language models, has advanced significantly,
    enabling complex reasoning and coding. Challenges include accuracy
    and bias issues.
    """

    # Weaker summary
    candidate_poor = """
    Artificial intelligence is getting better and can do many things now.
    There are still some problems to solve.
    """

    print("=== Summarization Evaluation ===\n")

    print("GOOD SUMMARY:")
    bleu_good = evaluator.evaluate_bleu(reference, candidate_good)
    rouge_good = evaluator.evaluate_rouge(reference, candidate_good)
    print(f"  BLEU-4: {bleu_good['bleu_4']:.3f}")
    print(f"  ROUGE-1: {rouge_good['rouge1_f']:.3f}")
    print(f"  ROUGE-2: {rouge_good['rouge2_f']:.3f}")
    print(f"  ROUGE-L: {rouge_good['rougeL_f']:.3f}")

    print("\nPOOR SUMMARY:")
    bleu_poor = evaluator.evaluate_bleu(reference, candidate_poor)
    rouge_poor = evaluator.evaluate_rouge(reference, candidate_poor)
    print(f"  BLEU-4: {bleu_poor['bleu_4']:.3f}")
    print(f"  ROUGE-1: {rouge_poor['rouge1_f']:.3f}")
    print(f"  ROUGE-2: {rouge_poor['rouge2_f']:.3f}")
    print(f"  ROUGE-L: {rouge_poor['rougeL_f']:.3f}")

    # Output:
    # === Summarization Evaluation ===
    #
    # GOOD SUMMARY:
    #   BLEU-4: 0.245
    #   ROUGE-1: 0.512
    #   ROUGE-2: 0.187
    #   ROUGE-L: 0.465
    #
    # POOR SUMMARY:
    #   BLEU-4: 0.089
    #   ROUGE-1: 0.298
    #   ROUGE-2: 0.045
    #   ROUGE-L: 0.267
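
The heading above also lists METEOR, which adds stemming and WordNet synonym matching on top of exact n-gram overlap. A minimal sketch using NLTK, under the assumption that the wordnet/omw corpora are available locally; newer NLTK versions also expect pre-tokenized inputs.

import nltk
from nltk.translate.meteor_score import meteor_score

# Corpora needed for synonym matching (download once)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

reference = "The cat sat on the mat"
candidate = "A cat was sitting on the mat"

# NLTK >= 3.6.6 expects token lists rather than raw strings
score = meteor_score([reference.split()], candidate.split())
print(f"METEOR: {score:.3f}")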

Metric Interpretation:

  Metric    Range   Interpretation                       Best For
  BLEU-4    0-1     4-gram overlap, precision-focused    Translation
  ROUGE-1   0-1     Unigram recall                       Summarization coverage
  ROUGE-2   0-1     Bigram recall                        Summarization quality
  ROUGE-L   0-1     Longest common subsequence           Fluency

Limitations:

  • Don't capture semantic meaning (only surface form)
  • Can miss paraphrases (e.g., "car" vs "automobile"; see the sketch below)
  • Need human references (expensive to create)
  • Correlate poorly with human judgment for creative tasks
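
A quick sketch of the paraphrase blind spot: the two sentences below mean the same thing but share almost no surface tokens, so ROUGE scores them low. The sentences are illustrative and exact scores depend on tokenization.

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
reference = "The automobile accelerated rapidly."
candidate = "The car sped up very quickly."  # same meaning, different words

scores = scorer.score(reference, candidate)
print(f"ROUGE-1 F: {scores['rouge1'].fmeasure:.3f}")  # low despite identical meaning
print(f"ROUGE-2 F: {scores['rouge2'].fmeasure:.3f}")  # no shared bigrams at all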

3. Semantic Similarity Metrics

from sentence_transformers import SentenceTransformer, util
import torch
from typing import Dict, List
import numpy as np

class SemanticEvaluator:
    """
    Evaluate semantic similarity using embeddings.
    Better than BLEU/ROUGE for capturing meaning.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize with sentence transformer model.

        Popular models:
        - all-MiniLM-L6-v2: Fast, good balance
        - all-mpnet-base-v2: Higher quality, slower
        - paraphrase-multilingual: Multilingual support
        """
        self.model = SentenceTransformer(model_name)

    def cosine_similarity(
        self,
        text1: str,
        text2: str
    ) -> float:
        """
        Calculate cosine similarity between two texts.

        Returns:
            Cosine similarity (theoretically -1 to 1; typically 0-1 for
            natural text, where higher means more similar)
        """
        embeddings = self.model.encode([text1, text2])
        similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
        return similarity

    def evaluate_dataset(
        self,
        references: List[str],
        candidates: List[str]
    ) -> Dict:
        """Evaluate entire dataset."""

        similarities = []

        for ref, cand in zip(references, candidates):
            sim = self.cosine_similarity(ref, cand)
            similarities.append(sim)

        return {
            'mean_similarity': np.mean(similarities),
            'median_similarity': np.median(similarities),
            'std_similarity': np.std(similarities),
            'min_similarity': np.min(similarities),
            'max_similarity': np.max(similarities),
        }


# Example: Paraphrase Detection
if __name__ == "__main__":
    evaluator = SemanticEvaluator()

    # Test semantic similarity
    original = "The cat sat on the mat."
    paraphrase = "A feline rested on the rug."  # Different words, same meaning
    unrelated = "Python is a programming language."

    print("=== Semantic Similarity ===\n")
    print(f"Original: '{original}'")
    print(f"\nParaphrase: '{paraphrase}'")
    print(f"Similarity: {evaluator.cosine_similarity(original, paraphrase):.3f}")

    print(f"\nUnrelated: '{unrelated}'")
    print(f"Similarity: {evaluator.cosine_similarity(original, unrelated):.3f}")

    # Output:
    # === Semantic Similarity ===
    #
    # Original: 'The cat sat on the mat.'
    #
    # Paraphrase: 'A feline rested on the rug.'
    # Similarity: 0.687
    #
    # Unrelated: 'Python is a programming language.'
    # Similarity: 0.112

    # Compare with BLEU (would be ~0 for paraphrase!)
    from nltk.translate.bleu_score import sentence_bleu
    bleu = sentence_bleu(
        [original.split()],
        paraphrase.split()
    )
    print(f"\nBLEU score for paraphrase: {bleu:.3f}")
    # Output: 0.000 (BLEU misses semantic similarity!)

Advantages over BLEU/ROUGE:

  • Captures semantic meaning, not just surface form
  • Works with paraphrases
  • More correlated with human judgment
  • No reference needed (can compare to the source document; see the sketch below)
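
A sketch of the reference-free idea in the last bullet, reusing the SemanticEvaluator class defined above to compare a generated summary directly against its source document. The 0.5 threshold is purely illustrative, not a standard value.

source_document = (
    "Artificial intelligence has made remarkable progress in recent years. "
    "Large language models can now perform complex reasoning, write code, "
    "and engage in nuanced conversations."
)
generated_summary = "LLMs have advanced rapidly and can now reason and write code."

evaluator = SemanticEvaluator()
similarity = evaluator.cosine_similarity(source_document, generated_summary)
print(f"Summary-to-source similarity: {similarity:.3f}")

if similarity < 0.5:  # illustrative threshold, tune per task
    print("Warning: summary may have drifted from the source")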

Industry Benchmarks

Major Benchmarks Overview

BENCHMARK_CATALOG = {
    "mmlu": {
        "name": "Massive Multitask Language Understanding",
        "tasks": 57,
        "questions": 15908,
        "format": "Multiple choice (4 options)",
        "coverage": [
            "STEM", "Humanities", "Social Sciences",
            "Professional (law, medicine, accounting)"
        ],
        "purpose": "General knowledge and reasoning",
        "top_scores_2025": {
            "GPT-5.2": 0.92,
            "Claude Opus 4.5": 0.90,
            "Gemini 2.0 Ultra": 0.88,
            "GPT-4o": 0.87,
        },
        "human_baseline": 0.897,
        "interview_frequency": "Very High"
    },

    "humaneval": {
        "name": "HumanEval (Code Generation)",
        "tasks": 164,
        "format": "Function completion",
        "language": "Python",
        "purpose": "Code generation ability",
        "metric": "pass@k (% passing unit tests)",
        "top_scores_2025": {
            "GPT-5.2": 0.95,
            "Claude Sonnet 4.5": 0.92,
            "GPT-4o": 0.90,
            "Codestral": 0.88,
        },
        "human_baseline": 1.00,
        "interview_frequency": "High (for code-focused roles)"
    },

    "gpqa": {
        "name": "Graduate-Level Google-Proof Q&A",
        "questions": 448,
        "format": "Multiple choice",
        "difficulty": "PhD-level",
        "domains": ["Physics", "Chemistry", "Biology"],
        "purpose": "Expert-level reasoning",
        "top_scores_2025": {
            "GPT-5.2": 0.75,
            "Claude Opus 4.5": 0.70,
            "GPT-4o": 0.56,
        },
        "expert_baseline": 0.81,
        "interview_frequency": "Medium"
    },

    "hellaswag": {
        "name": "HellaSwag (Commonsense Reasoning)",
        "questions": 10042,
        "format": "Sentence completion (4 options)",
        "purpose": "Commonsense and world knowledge",
        "top_scores_2025": {
            "GPT-5.2": 0.96,
            "Claude Opus 4.5": 0.95,
            "Gemini 2.0 Ultra": 0.94,
        },
        "human_baseline": 0.954,
        "interview_frequency": "Low"
    },

    "truthfulqa": {
        "name": "TruthfulQA",
        "questions": 817,
        "format": "Multiple choice + generation",
        "purpose": "Factual accuracy, avoiding misconceptions",
        "top_scores_2025": {
            "Claude Opus 4.5": 0.68,
            "GPT-5.2": 0.65,
            "Gemini 2.0 Ultra": 0.63,
        },
        "human_baseline": 0.94,
        "note": "LLMs still struggle vs humans",
        "interview_frequency": "Medium (safety-focused roles)"
    }
}
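
HumanEval's pass@k entry above is typically computed with the unbiased estimator from the original HumanEval paper (Chen et al., 2021): generate n samples per problem, count the c that pass the unit tests, and estimate the probability that at least one of k sampled solutions passes. A minimal sketch:

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: probability that at least one of k samples passes,
    given n generated samples per problem of which c passed the unit tests."""
    if n - c < k:
        return 1.0  # too few failures to fill all k slots; a pass is guaranteed
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# Example: 200 samples per problem, 40 of them passing
print(f"pass@1:  {pass_at_k(200, 40, 1):.3f}")   # 0.200
print(f"pass@10: {pass_at_k(200, 40, 10):.3f}")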

Common Interview Questions

Question 1: Metric Selection (Anthropic Interview)

Question: "You're evaluating a customer service chatbot. The bot answers questions about product features, troubleshooting, and returns. What metrics would you use and why?"

Answer:

"I'd use a multi-layered evaluation approach combining automatic and human metrics:

Layer 1: Automatic Metrics (every response)

class ChatbotEvaluator:
    def evaluate_response(self, user_query, bot_response, ground_truth=None):
        metrics = {}

        # 1. Response time (SLA critical)
        metrics['latency_ms'] = self.measure_latency()

        # 2. Safety (every response must be safe)
        metrics['is_safe'] = self.safety_classifier(bot_response)
        # Reject if contains: PII, toxic language, harmful advice

        # 3. Relevance (semantic similarity to query)
        metrics['relevance'] = self.semantic_similarity(
            user_query,
            bot_response
        )
        # Target: > 0.7

        # 4. Length appropriateness
        metrics['length'] = len(bot_response.split())
        # Red flag if < 10 words (too terse) or > 500 words (rambling)

        # 5. Has actionable info (NER for product names, steps)
        metrics['has_product_mention'] = self.contains_entities(
            bot_response,
            entity_type='PRODUCT'
        )

        return metrics

Layer 2: Human Evaluation (weekly sample of 100 responses)

human_evaluation_criteria = {
    'accuracy': {
        'question': 'Is the information factually correct?',
        'scale': 'Yes/No',
        'critical': True,  # 100% accuracy required
    },
    'helpfulness': {
        'question': 'Does this answer the user\'s question?',
        'scale': '1-5',
        'target': '>= 4.0',
    },
    'clarity': {
        'question': 'Is the response clear and easy to understand?',
        'scale': '1-5',
        'target': '>= 4.0',
    },
    'tone': {
        'question': 'Is the tone appropriate (professional, empathetic)?',
        'scale': '1-5',
        'target': '>= 4.0',
    }
}

Layer 3: Business Metrics (weekly)

business_metrics = {
    'resolution_rate': {
        'definition': '% of conversations where user issue was resolved',
        'measurement': 'Did user ask follow-up or escalate to human?',
        'target': '>= 70%',
    },
    'customer_satisfaction': {
        'definition': 'CSAT score from post-conversation survey',
        'scale': '1-5',
        'target': '>= 4.2',
    },
    'deflection_rate': {
        'definition': '% of conversations NOT escalated to human agent',
        'target': '>= 60%',
        'business_impact': 'Each deflection saves $5 in support costs',
    }
}

Why this approach:

  1. Automatic metrics catch obvious failures (unsafe, irrelevant, too short)
  2. Human evaluation measures quality only humans can judge (accuracy, helpfulness)
  3. Business metrics tie to actual value delivered

Red flags to avoid:

  • ❌ Using only BLEU/ROUGE (don't capture helpfulness)
  • ❌ Only accuracy (can be accurate but unhelpful: 'I don't know')
  • ❌ No latency tracking (users abandon slow bots)
  • ❌ No safety filter (one toxic response = PR disaster)"
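
A minimal sketch of how the Layer 1 checks quoted above could be combined into a single pass/fail gate before a response reaches the user. The function name is illustrative and the thresholds simply mirror the targets stated in the answer.

def passes_automatic_gates(metrics: dict) -> bool:
    """Return True only if a response clears every automatic Layer 1 check."""
    if not metrics.get('is_safe', False):
        return False                                  # safety is non-negotiable
    if metrics.get('relevance', 0.0) <= 0.7:
        return False                                  # Layer 1 target: relevance > 0.7
    if not (10 <= metrics.get('length', 0) <= 500):
        return False                                  # too terse or rambling
    return True

# Responses failing any gate are routed to a fallback or a human agent
# instead of being shown to the user.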

Question 2: Benchmark Interpretation (OpenAI Interview)

Question: "Model A scores 92% on MMLU, Model B scores 88%. Model A also has 200ms higher P95 latency. Which would you choose for a production Q&A system?"

Answer:

"I need more information, but I'd likely choose Model B. Here's my reasoning:

1. MMLU Advantage is Overstated

MMLU gap: 92% vs 88% = 4 percentage points (0.04 in absolute terms)

In production:
- If you process 10,000 queries/day
- Model A answers ~400 more queries correctly than Model B
- But: does your use case even overlap with MMLU domains?

MMLU breakdown:
- 57 subjects: Abstract algebra, anatomy, astronomy, ...
- If your Q&A is about 'How to reset password', MMLU is irrelevant!

Need to check:
- What % of your queries match MMLU domains?
- If only 10% overlap, effective advantage is 0.4%, not 4%

2. Latency Impact is Significant

200ms higher P95 latency means:
- The slowest 5% of requests take at least 200ms longer
- At 10K queries/day, that's ~500 degraded responses daily

User experience research:
- 100ms delay = 1% drop in engagement (Amazon study)
- 200ms delay = ~2% drop in engagement

Revenue impact (if monetized):
- 10K queries/day × 30 days = 300K/month
- 2% engagement drop = 6,000 lost interactions
- If $0.10 revenue per interaction = $600/month lost

3. Decision Framework

def choose_model(use_case, daily_volume, latency_tolerance_ms):
    '''
    Model selection based on use case.
    '''

    if use_case in ['real_time_chat', 'live_support']:
        # Latency critical
        if latency_tolerance_ms < 500:
            return 'Model B (lower latency)'

    elif use_case in ['research', 'complex_analysis']:
        # Accuracy critical, latency less important
        return 'Model A (higher MMLU)'

    elif use_case == 'general_qa':
        # Depends on traffic
        if daily_volume > 1_000_000:
            # Lower latency = lower compute cost at scale
            # (hypothetical helper; a sketch follows this block)
            cost_savings = calculate_latency_cost_savings(200, daily_volume)
            return f'Model B (saves ${cost_savings:,.0f}/month)'

    # Default: test both
    return 'Run A/B test with real users (10% traffic each)'
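
The choose_model sketch above leans on calculate_latency_cost_savings, which the answer never defines. One hypothetical implementation is shown below; the function name, defaults, and the "100ms ≈ 1% engagement drop" and $0.10-per-interaction constants simply mirror the illustrative figures from point 2 and are not benchmarked values.

def calculate_latency_cost_savings(
    latency_delta_ms: float,
    daily_volume: int,
    revenue_per_interaction: float = 0.10,
    engagement_drop_per_100ms: float = 0.01,
    days_per_month: int = 30,
) -> float:
    """Estimate monthly revenue preserved by choosing the lower-latency model."""
    engagement_drop = (latency_delta_ms / 100.0) * engagement_drop_per_100ms
    monthly_interactions = daily_volume * days_per_month
    lost_interactions = monthly_interactions * engagement_drop
    return lost_interactions * revenue_per_interaction

# Sanity check against the arithmetic in point 2: 10K queries/day, 200ms slower
print(f"${calculate_latency_cost_savings(200, 10_000):,.0f}/month")  # $600/month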

4. Missing Information I'd Request

Before final decision:

  1. 'What's the accuracy on YOUR specific task?' (not MMLU)
  2. 'What's the cost per query for each model?'
  3. 'Can we see per-domain MMLU scores?' (maybe Model A only beats on irrelevant domains)
  4. 'What's the P50 latency?' (the P95 gap may be driven by tail outliers; typical-request latency could be comparable)

5. Likely Recommendation

Given typical production constraints:

  • Choose Model B (88% MMLU, faster)
  • Reasons:
    • 4% MMLU difference rarely matters in practice
    • 200ms latency difference is noticeable to users
    • Can fine-tune Model B if accuracy becomes issue
    • Latency is harder to fix than accuracy

When I'd choose Model A:

  • If your task IS academic Q&A (directly benefits from MMLU strength)
  • If latency SLA is > 2 seconds (200ms doesn't matter)
  • If accuracy is safety-critical (medical, legal advice)"

Summary

Key Takeaways:

  1. Match metrics to task: Classification → F1, Generation → ROUGE/Semantic
  2. Don't trust single metric: Always use multiple metrics
  3. Benchmarks aren't production: MMLU ≠ your use case
  4. Latency matters: Fast + good enough > slow + perfect
  5. Measure business impact: Technical metrics must tie to value

Arabic Version

Introduction

"You can't improve what you can't measure." This principle is especially critical for LLM systems, where subjective quality must be translated into objective metrics.

Interview Relevance: Evaluation questions appear in 95% of LLM engineer interviews.

Core Metrics Categories

1. Accuracy Metrics

from typing import Dict, List

from sklearn.metrics import classification_report
import numpy as np

class ClassificationEvaluator:
    """Comprehensive evaluation for classification tasks."""

    @staticmethod
    def evaluate(y_true: List[str], y_pred: List[str]) -> Dict:
        """
        Calculate all classification metrics.

        Returns:
            {
                "accuracy": float,
                "precision_macro": float,
                "recall_macro": float,
                "f1_macro": float
            }
        """
        report = classification_report(y_true, y_pred, output_dict=True)

        return {
            "accuracy": report["accuracy"],
            "precision_macro": report["macro avg"]["precision"],
            "recall_macro": report["macro avg"]["recall"],
            "f1_macro": report["macro avg"]["f1-score"]
        }

Key Insights:

  • Accuracy: Simple but can be misleading with imbalanced classes
  • F1 Score: Harmonic mean of precision and recall, better for imbalanced data

2. Generation Quality Metrics

BLEU, ROUGE, METEOR:

from typing import Dict

from rouge_score import rouge_scorer

class GenerationEvaluator:
    """Evaluate text generation quality."""

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )

    def evaluate_rouge(self, reference: str, candidate: str) -> Dict:
        """
        Calculate ROUGE scores.

        ROUGE measures n-gram recall.
        Used primarily for summarization.
        """
        scores = self.rouge_scorer.score(reference, candidate)

        return {
            'rouge1_f': scores['rouge1'].fmeasure,
            'rouge2_f': scores['rouge2'].fmeasure,
            'rougeL_f': scores['rougeL'].fmeasure,
        }

Metric Interpretation:

  Metric    Range   Interpretation    Best For
  BLEU-4    0-1     4-gram overlap    Translation
  ROUGE-1   0-1     Unigram recall    Summarization coverage
  ROUGE-2   0-1     Bigram recall     Summarization quality

Common Interview Questions

Question 1: Metric Selection (Anthropic Interview)

Question: "You're evaluating a customer service chatbot. What metrics would you use and why?"

Answer:

"I'd use a multi-layered evaluation approach combining automatic and human metrics:

Layer 1: Automatic Metrics (every response)

class ChatbotEvaluator:
    def evaluate_response(self, query, response):
        return {
            'latency_ms': self.measure_latency(),
            'is_safe': self.safety_classifier(response),
            'relevance': self.semantic_similarity(query, response),
            'length': len(response.split())
        }

Layer 2: Human Evaluation (weekly sample of 100 responses)

human_criteria = {
    'accuracy': 'Is the information correct?',
    'helpfulness': "Does it answer the user's question?",
    'clarity': 'Is the response clear?',
    'tone': 'Is the tone appropriate?'
}

Layer 3: Business Metrics (weekly)

business_metrics = {
    'resolution_rate': '% of conversations where the issue was resolved',
    'customer_satisfaction': 'CSAT score',
    'deflection_rate': '% of conversations not escalated to a human agent'
}

Why this approach:

  1. Automatic metrics catch obvious failures
  2. Human evaluation measures quality that machines cannot judge
  3. Business metrics tie to the actual value delivered"

Summary

Key Takeaways:

  1. Match metrics to the task: Classification → F1, Generation → ROUGE
  2. Don't trust a single metric: Always use multiple metrics
  3. Benchmarks aren't production: MMLU ≠ your use case
  4. Latency matters: Fast + good enough > slow + perfect
