# Module 2.5: Naïve Bayes Text Classifier
Naïve Bayes is a generative, probabilistic classifier that assumes feature independence given the class. In text classification, each word is treated as a feature, and the model computes:
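
$$\hat{c} \;=\; \arg\max_{c}\; P(c)\prod_{i=1}^{n} P(w_i \mid c)$$

where $P(c)$ is the class prior and $P(w_i \mid c)$ is the likelihood of word $w_i$ given class $c$. Below, both are estimated from raw counts, with Laplace (add-one) smoothing for the likelihoods.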
## 1. Training the Classifier
```python
import math
from collections import defaultdict, Counter

# 1. Toy training set: (label, text)
train_data = [
    ('spam', "limited time offer, win money now"),
    ('ham',  "project meeting at 10 am"),
    ('spam', "win free tickets now"),
    ('ham',  "lunch with the team tomorrow"),
]

# 2. Estimate priors and word counts
class_counts = Counter()
word_counts = defaultdict(Counter)
total_words = Counter()

for label, text in train_data:
    class_counts[label] += 1
    tokens = text.lower().split()
    word_counts[label].update(tokens)
    total_words[label] += len(tokens)

# 3. Class priors P(c)
total_docs = sum(class_counts.values())
priors = {c: class_counts[c] / total_docs for c in class_counts}

# 4. Vocabulary size V
vocab = set(w for counts in word_counts.values() for w in counts)
V = len(vocab)

# 5. Compute P(w | c) with Laplace (add-one) smoothing
likelihoods = defaultdict(dict)
for c in class_counts:
    for w in vocab:
        likelihoods[c][w] = (word_counts[c].get(w, 0) + 1) / (total_words[c] + V)

# 6. Print priors and a few likelihoods
print("Class priors:")
for c, p in priors.items():
    print(f"  P({c}) = {p:.3f}")

print("\nExample likelihoods (P(word|class)):")
examples = ['win', 'lunch', 'team', 'offer']
for w in examples:
    for c in priors:
        print(f"  P({w!r} | {c}) = {likelihoods[c].get(w, 0):.3f}")
    print()
```
Output:
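As a quick sanity check on step 5: this toy corpus has a vocabulary of $V = 18$ distinct tokens (the naive whitespace tokenizer keeps `offer,` with its trailing comma) and 10 tokens per class, so for example

$$P(\text{win} \mid \text{spam}) = \frac{2 + 1}{10 + 18} \approx 0.107, \qquad P(\text{win} \mid \text{ham}) = \frac{0 + 1}{10 + 18} \approx 0.036.$$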
## 2. Prediction Function

The classifier scores each class by its log-posterior (summing log-probabilities avoids floating-point underflow on longer documents) and predicts the class with the highest score:
```python
def predict(text):
    tokens = text.lower().split()
    # Compute the log-posterior for each class
    log_probs = {}
    for c in priors:
        log_prob = math.log(priors[c])
        for w in tokens:
            # Fall back to the Laplace-smoothed probability for out-of-vocabulary words
            p_w = likelihoods[c].get(w, 1 / (total_words[c] + V))
            log_prob += math.log(p_w)
        log_probs[c] = log_prob
    # Return the class with the highest log-probability
    return max(log_probs, key=log_probs.get), log_probs

# Demo on unseen texts
test_texts = [
    "win money now",
    "team lunch tomorrow",
    "secret offer",
    "meeting at 10",
]
for txt in test_texts:
    label, scores = predict(txt)
    print(f"'{txt}' → {label}")
```
Output:
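For comparison, here is a minimal sketch of the same classifier built with scikit-learn (assuming scikit-learn is installed and `train_data` from above is in scope); `CountVectorizer` builds the bag-of-words features and `MultinomialNB` with `alpha=1.0` applies the same add-one smoothing:

```python
# Minimal sketch: multinomial Naive Bayes with scikit-learn (assumes scikit-learn is installed)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

texts  = [text for _, text in train_data]
labels = [label for _, label in train_data]

# CountVectorizer tokenizes and builds the bag-of-words count matrix;
# MultinomialNB(alpha=1.0) uses add-one (Laplace) smoothing, as above.
model = make_pipeline(CountVectorizer(), MultinomialNB(alpha=1.0))
model.fit(texts, labels)

print(model.predict(["win money now", "team lunch tomorrow"]))
```

Note that `CountVectorizer`'s default tokenizer differs slightly from the whitespace split used above (it strips punctuation and drops single-character tokens), so the learned vocabulary is not identical.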
Continue to Module 3: Neural Network Fundamentals