# Module 4: Text Classification and Sentiment Analysis
This module demonstrates end-to-end pipelines for assigning labels to text (e.g. spam vs. ham, or positive vs. negative sentiment) using both classical and simple neural models. Each pipeline covers four stages:
- Data Loading: corpus of labeled documents
- Preprocessing: clean, tokenize, vectorize (BoW, TF–IDF; see the sketch after this list)
- Model Training: fit classifier
- Evaluation: accuracy, precision, recall, F₁
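Before the full pipeline, here is a minimal sketch of the two vectorizers named above, run on a made-up three-document corpus:

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = ["the cat sat", "the dog sat", "the cat ran"]  # toy corpus for illustration

# Bag-of-Words: raw term counts per document
bow = CountVectorizer()
print(bow.fit_transform(corpus).toarray())
print(bow.get_feature_names_out())

# TF-IDF: counts reweighted so terms present in every document carry less weight
tfidf = TfidfVectorizer()
print(tfidf.fit_transform(corpus).toarray().round(2))
```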
```python
import nltk, random
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# 1. Load all labelled reviews, shuffle, and keep a 200-document sample
nltk.download('movie_reviews')
docs = [(list(movie_reviews.words(f)), cat)
        for cat in movie_reviews.categories()
        for f in movie_reviews.fileids(cat)]
random.shuffle(docs)
texts  = [" ".join(words) for words, _ in docs[:200]]
labels = [label for _, label in docs[:200]]

# 2. Train/test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# 3. Pipeline: TF–IDF → Logistic Regression
model = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=200)
)
model.fit(X_train, y_train)

# 4. Evaluate on the held-out split
acc = model.score(X_test, y_test)
print(f"Accuracy = {acc:.2f}")
```
Sample output:

```text
Accuracy = 0.65
```
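A single 80/20 split of 200 documents gives a noisy estimate. A cross-validated check is cheap to add; this is a sketch layered on the pipeline above, not part of the original notebook:

```python
from sklearn.model_selection import cross_val_score

# 5-fold CV refits a clone of the pipeline on each fold
scores = cross_val_score(model, texts, labels, cv=5, scoring='accuracy')
print(f"5-fold accuracy: {scores.mean():.2f} +/- {scores.std():.2f}")
```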
The second demo trains a tiny Keras classifier on a toy four-sentence corpus:

```python
# sentiment_nn_full_demo.ipynb
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# --- 0. Example texts & labels (replace with your own) ---
texts = [
    "I love this movie, it was fantastic!",
    "Terrible film, I hated it.",
    "What a great performance!",
    "Worst movie ever."
]
labels = ['pos', 'neg', 'pos', 'neg']

# --- 1. Tokenize & sequence-encode ---
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts_to_sequences(texts)
data = pad_sequences(seqs, maxlen=200, padding='post', truncating='post')

# --- 2. Encode labels as integers and convert to a NumPy array ---
y = np.array([1 if lab == 'pos' else 0 for lab in labels])

# --- 3. Train/test split ---
X_tr, X_te, y_tr, y_te = train_test_split(
    data, y, test_size=0.2, random_state=42
)

# --- 4. Build and compile the model ---
model_nn = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=16),  # input_length no longer needed
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model_nn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# --- 5. Train (5 epochs, fast even on CPU) ---
history = model_nn.fit(
    X_tr, y_tr,
    epochs=5,
    batch_size=32,
    validation_data=(X_te, y_te),
    verbose=2
)
```
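To classify unseen text, encode it exactly as the training data was encoded (same tokenizer, same `maxlen` and padding) before calling `predict()`. A minimal usage sketch with made-up example sentences:

```python
new_texts = ["An absolutely wonderful film", "Dull and disappointing"]
new_seqs = pad_sequences(tokenizer.texts_to_sequences(new_texts),
                         maxlen=200, padding='post', truncating='post')
probs = model_nn.predict(new_seqs).flatten()  # sigmoid outputs in [0, 1]
for text, p in zip(new_texts, probs):
    print(f"{text!r}: P(pos) = {p:.2f}")
```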
Finally, evaluate both models with a confusion matrix and a per-class report:

```python
from sklearn.metrics import classification_report, confusion_matrix

# 1. Logistic Regression: score against class labels, not raw scores.
#    The pipeline's predict() already returns 'pos'/'neg' labels; if you
#    need probabilities instead, use:
#        y_prob_lr = model.predict_proba(X_test)[:, 1]
y_pred_lr = model.predict(X_test)
print("LogReg Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
print("\nLogReg Report:")
print(classification_report(y_test, y_pred_lr, zero_division=0))

# 2. Neural net: predict() returns sigmoid probabilities, so threshold
#    at 0.5 to recover hard 0/1 labels before scoring.
y_prob_nn = model_nn.predict(X_te)
y_pred_nn = (y_prob_nn > 0.5).astype(int).flatten()
print("\nNN Confusion Matrix:")
print(confusion_matrix(y_te, y_pred_nn))
print("\nNN Report:")
print(classification_report(y_te, y_pred_nn, zero_division=0))
```
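Beyond aggregate metrics, it can help to see which terms drive the logistic-regression decisions. A sketch for the scikit-learn pipeline above; the step names (`tfidfvectorizer`, `logisticregression`) are the lower-cased class names that `make_pipeline` assigns:

```python
import numpy as np

vec = model.named_steps['tfidfvectorizer']
clf = model.named_steps['logisticregression']
terms = vec.get_feature_names_out()
coefs = clf.coef_[0]  # one weight per term in a binary task

order = np.argsort(coefs)
print("strongest neg terms:", terms[order[:10]])   # push toward classes_[0] ('neg')
print("strongest pos terms:", terms[order[-10:]])  # push toward classes_[1] ('pos')
```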