
Module 2.5: Naïve Bayes Text Classifier

Naïve Bayes is a generative, probabilistic classifier that assumes feature independence given the class. In text classification, each word is treated as a feature, and the model computes:

$$\hat{c} \;=\; \arg\max_{c}\; P(c)\prod_{i=1}^{n} P(w_i \mid c)$$
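
where $w_1, \dots, w_n$ are the tokens of the document. Two practical details matter when turning this into code: multiplying many small probabilities underflows floating-point arithmetic, so the implementation below sums log-probabilities instead, and a word that never co-occurs with a class would otherwise zero out the whole product, so the likelihoods are estimated with Laplace (add-one) smoothing:

$$\hat{c} = \arg\max_{c}\Big(\log P(c) + \sum_{i=1}^{n}\log P(w_i \mid c)\Big), \qquad P(w \mid c) = \frac{\operatorname{count}(w, c) + 1}{N_c + V}$$

where $N_c$ is the number of word tokens observed in class $c$ and $V$ is the vocabulary size.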

1. Training the Classifier

import math
import re
from collections import defaultdict, Counter

# Simple tokenizer: lowercase the text and keep alphanumeric runs only,
# so punctuation is stripped and "offer," counts as the word "offer"
def tokenize(text):
    return re.findall(r"[a-z0-9]+", text.lower())

# 1. Toy training set: (label, text)
train_data = [
    ('spam', "limited time offer, win money now"),
    ('ham',  "project meeting at 10 am"),
    ('spam', "win free tickets now"),
    ('ham',  "lunch with the team tomorrow"),
]

# 2. Estimate priors and word counts
class_counts = Counter()
word_counts  = defaultdict(Counter)
total_words  = Counter()

for label, text in train_data:
    class_counts[label] += 1
    tokens = tokenize(text)
    word_counts[label].update(tokens)
    total_words[label] += len(tokens)

# 3. Class priors P(c)
total_docs = sum(class_counts.values())
priors = {c: class_counts[c]/total_docs for c in class_counts}

# 4. Vocabulary size V
vocab = set(w for counts in word_counts.values() for w in counts)
V = len(vocab)

# 5. Compute P(w | c) with Laplace smoothing
likelihoods = defaultdict(dict)
for c in class_counts:
    for w in vocab:
        likelihoods[c][w] = (word_counts[c].get(w, 0) + 1) / (total_words[c] + V)

# 6. Print priors and a few likelihoods
print("Class priors:")
for c, p in priors.items():
    print(f"  P({c}) = {p:.3f}")
print("\nExample likelihoods (P(word|class)):")
examples = ['win', 'lunch', 'team', 'offer']
for w in examples:
    for c in priors:
        print(f"  P({w!r} | {c}) = {likelihoods[c].get(w, 0):.3f}")
    print()

Output:

The script prints the two class priors and the smoothed likelihoods for the four example words.
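
As a quick sanity check on steps 3 and 5: each class accounts for 2 of the 4 training documents and contains 10 word tokens, and the combined vocabulary has 18 distinct words (with the tokenizer above), so for example

$$P(\text{spam}) = P(\text{ham}) = \tfrac{2}{4} = 0.5, \qquad P(\text{win} \mid \text{spam}) = \frac{2 + 1}{10 + 18} \approx 0.107, \qquad P(\text{win} \mid \text{ham}) = \frac{0 + 1}{10 + 18} \approx 0.036,$$

since "win" occurs twice in the spam documents and never in the ham documents.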

2. Prediction Function

# 7. Prediction function: score each class in log space
def predict(text):
    tokens = tokenize(text)
    # Compute log-posterior for each class
    log_probs = {}
    for c in priors:
        log_prob = math.log(priors[c])
        for w in tokens:
            # Unseen (out-of-vocabulary) words get the Laplace-smoothed
            # probability of a zero-count word: 1 / (N_c + V)
            p_w = likelihoods[c].get(w, 1/(total_words[c] + V))
            log_prob += math.log(p_w)
        log_probs[c] = log_prob
    # Return the class with highest log-probability
    return max(log_probs, key=log_probs.get), log_probs

# 8. Demo on unseen texts
test_texts = [
    "win money now",
    "team lunch tomorrow",
    "secret offer",
    "meeting at 10",
]

for txt in test_texts:
    label, scores = predict(txt)
    print(f"'{txt}' → {label}")

Output:

Each test text is printed with its predicted label.
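
With this toy corpus the four test texts come out as spam, ham, spam and ham respectively; "secret" is out-of-vocabulary, so "secret offer" is decided by "offer" alone. Note that predict() returns raw log scores rather than probabilities. If normalized posteriors are wanted, the scores can be converted with the log-sum-exp (softmax) trick; below is a minimal sketch that reuses predict() from above (the helper name posterior() is illustrative, not part of the original script):

def posterior(text):
    # Turn the per-class log scores into probabilities that sum to 1,
    # subtracting the maximum first for numerical stability (log-sum-exp)
    _, log_probs = predict(text)
    m = max(log_probs.values())
    exp_scores = {c: math.exp(lp - m) for c, lp in log_probs.items()}
    z = sum(exp_scores.values())
    return {c: s / z for c, s in exp_scores.items()}

print(posterior("win money now"))  # class probabilities summing to 1; 'spam' dominates here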

Continue to Module 3: Neural Network Fundamentals