
Module 13: Evaluation Metrics for Generation & Classification

This module covers how to quantitatively assess both classification models and text generation systems using standard metrics and libraries.


13.1 Classification Metrics

13.1.1 Accuracy, Precision, Recall & F₁

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

# Example true & predicted labels
y_true = ['spam','ham','spam','ham','spam']
y_pred = ['spam','spam','spam','ham','ham']

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision (spam):", precision_score(y_true, y_pred, pos_label='spam'))
print("Recall    (spam):", recall_score(y_true, y_pred, pos_label='spam'))
print("F1 Score  (spam):", f1_score(y_true, y_pred, pos_label='spam'))

Output:

Accuracy: 0.60
Precision (spam): 0.67
Recall    (spam): 0.67
F1 Score  (spam): 0.67
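
The scikit-learn calls above simply wrap the standard formulas. As a sanity check, here is a minimal sketch that recomputes the same numbers by hand from the counts in the example, treating 'spam' as the positive class:

TP, FP = 2, 1   # spam predicted as spam; ham wrongly predicted as spam
FN, TN = 1, 1   # spam wrongly predicted as ham; ham predicted as ham

accuracy  = (TP + TN) / (TP + TN + FP + FN)                  # 3/5 = 0.60
precision = TP / (TP + FP)                                   # 2/3 ≈ 0.67
recall    = TP / (TP + FN)                                   # 2/3 ≈ 0.67
f1        = 2 * precision * recall / (precision + recall)    # ≈ 0.67

print(f"{accuracy:.2f} {precision:.2f} {recall:.2f} {f1:.2f}")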

13.1.2 Confusion Matrix & Report

cm = confusion_matrix(y_true, y_pred, labels=['spam','ham'])
print("Confusion Matrix:\n", cm)

print("\nClassification Report:\n",
      classification_report(y_true, y_pred, target_names=['spam','ham']))

Output:

Confusion Matrix:
 [[2 1]
 [1 1]]

Classification Report:
               precision    recall  f1-score   support

        spam       0.67      0.67      0.67         3
         ham       0.50      0.50      0.50         2

    accuracy                           0.60         5
   macro avg       0.58      0.58      0.58         5
weighted avg       0.60      0.60      0.60         5
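
For reports and notebooks it is often handier to plot the matrix than to print it. A minimal sketch using scikit-learn's ConfusionMatrixDisplay (assumes matplotlib is installed and reuses cm from above):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# display_labels must follow the same order used in confusion_matrix(..., labels=...)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['spam', 'ham'])
disp.plot(cmap='Blues')
plt.show()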

13.2 Generation Metrics

13.2.1 Perplexity (Language Models)

import math

# Suppose `eval_loss` is the average cross-entropy loss (in nats) reported by an LM Trainer
eval_loss = 2.5
perplexity = math.exp(eval_loss)
print(f"Perplexity = {perplexity:.2f}")

Output:

Perplexity = 12.18
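
Perplexity is just the exponential of the average per-token negative log-likelihood, so it can also be computed directly from token probabilities. A minimal sketch with made-up probabilities (the token_probs values are illustrative, not from a real model):

import math

# Hypothetical probabilities a language model assigned to each token of a sentence
token_probs = [0.20, 0.10, 0.05, 0.30]

# Average negative log-likelihood = cross-entropy in nats
avg_nll = -sum(math.log(p) for p in token_probs) / len(token_probs)

print(f"Perplexity = {math.exp(avg_nll):.2f}")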

13.2.2 BLEU Score (Translation & Generation)

from sacrebleu import corpus_bleu

# Example hypotheses & references
hypotheses = ["The cat sits on the mat", "Hello world"]
references = [["The cat is sitting on the mat", "A cat sits on mat"],
              ["Hi world", "Hello, world!"]]

bleu = corpus_bleu(hypotheses, references)
print(f"BLEU = {bleu.score:.2f}")

Output:

BLEU = 51.33
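
sacrebleu also provides sentence_bleu for scoring a single hypothesis against its references. Sentence-level BLEU is noisy and best used for spot checks rather than model comparison; a minimal sketch reusing the first sentence pair from above:

from sacrebleu import sentence_bleu

hyp  = "The cat sits on the mat"
refs = ["The cat is sitting on the mat", "A cat sits on mat"]

sent_bleu = sentence_bleu(hyp, refs)
print(f"Sentence BLEU = {sent_bleu.score:.2f}")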

13.2.3 ROUGE (Summarization)

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
ref = "NLP is the intersection of linguistics and AI."
sys = "NLP combines linguistics with artificial intelligence."
scores = scorer.score(ref, sys)
for metric, result in scores.items():
    print(f"{metric}: P={result.precision:.2f}, R={result.recall:.2f}, F1={result.fmeasure:.2f}")

Output:

rouge1: P=0.33, R=0.25, F1=0.29
rouge2: P=0.00, R=0.00, F1=0.00
rougeL: P=0.33, R=0.25, F1=0.29
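
ROUGE-1 is essentially unigram overlap between the system and reference texts. The sketch below recomputes precision and recall by hand with plain whitespace tokens; because rouge_score also strips punctuation and stems, its numbers can differ slightly:

# Simplified ROUGE-1: unigram overlap with whitespace tokens, no stemming
ref_tokens = "NLP is the intersection of linguistics and AI.".lower().split()
sys_tokens = "NLP combines linguistics with artificial intelligence.".lower().split()

overlap   = sum(1 for tok in sys_tokens if tok in ref_tokens)
precision = overlap / len(sys_tokens)
recall    = overlap / len(ref_tokens)
print(f"ROUGE-1 (approx): P={precision:.2f}, R={recall:.2f}")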

13.2.4 METEOR (Paraphrase Quality)

# 1. Install NLTK (once per kernel)
%pip install --quiet nltk

# 2. Download the required NLTK resources (once per environment)
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)

# 3. Import the scorer and a tokenizer
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

# 4. Your reference and hypothesis sentences
ref_sentence = "the cat is on the mat"
sys_sentence = "the cat sits on the mat"

# 5. Tokenize them into lists of words
reference_tokens  = word_tokenize(ref_sentence.lower())
hypothesis_tokens = word_tokenize(sys_sentence.lower())

# 6. Compute METEOR
score = meteor_score([reference_tokens], hypothesis_tokens)
print(f"METEOR = {score:.2f}")

Output:

METEOR = 0.81
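
meteor_score also accepts several references for the same hypothesis: pass a list of token lists and the best-matching reference determines the score. A minimal sketch in which the second reference is an illustrative addition, not part of the example above:

from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

hypothesis = word_tokenize("the cat sits on the mat")
ref_a = word_tokenize("the cat is on the mat")
ref_b = word_tokenize("a cat sat on the mat")   # hypothetical extra reference

score = meteor_score([ref_a, ref_b], hypothesis)
print(f"METEOR (multi-reference) = {score:.2f}")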

13.2.5 BERTScore (Semantic Similarity)

# 1. Install the bert-score package (run once per kernel)
%pip install --quiet bert-score

# 2. Import and run BERTScore
from bert_score import score

# 3. Candidate and reference sentences
cands = ["the cat sits on the mat"]
refs  = ["the cat is on the mat"]

# 4. Compute precision, recall, and F1, rescaled to a baseline
P, R, F1 = score(cands, refs, lang='en', rescale_with_baseline=True)

# 5. Print the F1 score
print(f"BERTScore F1 = {F1.item():.3f}")

Output:

BERTScore F1 = 0.535
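
bert_score.score accepts whole lists of candidates and references and returns tensors with one precision, recall, and F1 value per pair, so a corpus-level figure is just the mean. A minimal sketch in which the second sentence pair is an illustrative addition:

from bert_score import score

cands = ["the cat sits on the mat", "hello world"]
refs  = ["the cat is on the mat", "hi world"]

P, R, F1 = score(cands, refs, lang='en', rescale_with_baseline=True)

# P, R and F1 are torch tensors with one entry per candidate
print("Per-sentence F1:", [round(f, 3) for f in F1.tolist()])
print(f"Mean F1 = {F1.mean().item():.3f}")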