Module 4: Text Classification & Sentiment Analysis

This module demonstrates end-to-end pipelines for assigning labels to text, such as "spam vs. ham" or positive vs. negative sentiment, using both a classical model and a simple neural model.


4.1 Pipeline Overview

  1. Data Loading: corpus of labeled documents
  2. Preprocessing: clean, tokenize, vectorize (BoW or TF–IDF; see the sketch after this list)
  3. Model Training: fit classifier
  4. Evaluation: accuracy, precision, recall, F₁
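
Step 2 is where raw strings become numeric features. A minimal sketch of the two vectorizers named above (the toy sentences are placeholders, not from the corpus used later):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

toy = ["the film was great", "the film was terrible"]

# Bag-of-Words: one row per document, one column of raw counts per term
bow = CountVectorizer()
print(bow.fit_transform(toy).toarray())
print(bow.get_feature_names_out())

# TF–IDF: the same counts, reweighted so terms shared by every document count less
tfidf = TfidfVectorizer()
print(tfidf.fit_transform(toy).toarray().round(2))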

4.2 Classical Model: Logistic Regression on Movie Reviews

import nltk, random
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# 1. Load all 2,000 labelled reviews, shuffle, and keep 200 for a quick demo
nltk.download('movie_reviews')
docs = [(list(movie_reviews.words(f)), cat)
        for cat in movie_reviews.categories()
        for f in movie_reviews.fileids(cat)]
random.shuffle(docs)  # unseeded, so the 200-review sample (and the accuracy) varies per run
texts  = [" ".join(words) for words, _ in docs[:200]]
labels = [label for _, label in docs[:200]]

# 2. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# 3. Pipeline: TF–IDF → Logistic Regression
model = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=200)
)
model.fit(X_train, y_train)

# 4. Evaluate
acc = model.score(X_test, y_test)
print(f"Accuracy = {acc:.2f}")

Output:

Accuracy = 0.65
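
Because the TF–IDF step lives inside the pipeline, the fitted model accepts raw strings directly. A quick sanity check (the two reviews below are made-up examples):

new_reviews = [
    "a moving, beautifully acted story",
    "dull plot and wooden dialogue"
]
print(model.predict(new_reviews))        # predicted labels, e.g. ['pos' 'neg']
print(model.predict_proba(new_reviews))  # per-class probabilities, columns ordered as model.classes_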

4.3 Neural Model: Simple Text Classifier with Keras

# sentiment_nn_full_demo.ipynb

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# --- 0. Example texts & labels (replace with your own) ---
texts  = [
    "I love this movie, it was fantastic!",
    "Terrible film, I hated it.",
    "What a great performance!",
    "Worst movie ever."
]
labels = ['pos', 'neg', 'pos', 'neg']

# --- 1. Tokenize & sequence-encode ---
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts_to_sequences(texts)
data = pad_sequences(seqs, maxlen=200, padding='post', truncating='post')

# --- 2. Encode labels as integers and convert to NumPy array ---
y = np.array([1 if lab == 'pos' else 0 for lab in labels])

# --- 3. Train/test split (with only 4 toy texts, the test set is a single example) ---
X_tr, X_te, y_tr, y_te = train_test_split(
    data, y, test_size=0.2, random_state=42
)

# --- 4. Build and compile the model ---
model_nn = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=16),  # input_length no longer needed
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model_nn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# --- 5. Train (5 epochs on CPU) ---
history = model_nn.fit(
    X_tr, y_tr,
    epochs=5,
    batch_size=32,
    validation_data=(X_te, y_te),
    verbose=2
)

Output:

*(screenshot: per-epoch Keras training log with loss and accuracy for the training and validation sets)*
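
To score unseen text, push it through the same tokenizer and padding before calling predict(). A minimal sketch (the sentence is a placeholder):

new_texts = ["an absolutely wonderful film"]
new_seqs  = pad_sequences(tokenizer.texts_to_sequences(new_texts),
                          maxlen=200, padding='post', truncating='post')
probs = model_nn.predict(new_seqs)  # sigmoid outputs in [0, 1]
print(["pos" if p > 0.5 else "neg" for p in probs.ravel()])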

4.4 Evaluation Metrics
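
For reference, the quantities reported below are the standard ones. With TP, FP and FN the true-positive, false-positive and false-negative counts for a class:

$$\text{Precision} = \frac{TP}{TP + FP}, \qquad \text{Recall} = \frac{TP}{TP + FN}, \qquad F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$$

Accuracy is the fraction of all predictions that are correct; classification_report prints precision, recall and F₁ per class, plus overall accuracy.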

from sklearn.metrics import classification_report, confusion_matrix

# 1. Log-Reg: feed class labels, not raw scores, to the metric functions.
#    predict() on the fitted Pipeline already returns labels ('pos'/'neg').
#    If you need probabilities instead, use:
# y_prob_lr = model.predict_proba(X_test)[:, 1]  # P(model.classes_[1])
y_pred_lr = model.predict(X_test)

print("LogReg Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
print("\nLogReg Report:")
print(classification_report(
    y_test,
    y_pred_lr,
    zero_division=0
))

# 2. Neural net: predict() returns sigmoid probabilities,
#    so threshold at 0.5 to turn them into 0/1 labels
y_prob_nn = model_nn.predict(X_te)
y_pred_nn = (y_prob_nn > 0.5).astype(int).flatten()

print("\nNN Confusion Matrix:")
print(confusion_matrix(y_te, y_pred_nn))
print("\nNN Report:")
print(classification_report(
    y_te,
    y_pred_nn,
    zero_division=0
))

Output:

*(screenshot: confusion matrices and classification reports for both models)*
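
If you prefer a picture to the printed matrix, scikit-learn can plot it directly. A sketch, not part of the original notebook (requires matplotlib):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# rows = true labels, columns = predicted labels
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_lr)
plt.show()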
