
Module 3: Text Representations (BoW, TF–IDF & Embeddings)

This module covers three common ways to convert text into numeric features for NLP models:

  • 3.1 Bag-of-Words & Count Vectors
  • 3.2 TF–IDF Representation
  • 3.3 Word Embeddings (Word2Vec & GloVe)

3.1 Bag-of-Words & Count Vectors

Bag-of-words represents each document as a vector of raw token counts; stacking one row per document gives the document–term matrix (rows = documents, columns = vocabulary terms).

3.1.1 scikit-learn CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the cat saw the dog"
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

df_counts = pd.DataFrame(
    X.toarray(),
    index=[f"doc{i+1}" for i in range(len(corpus))],
    columns=vectorizer.get_feature_names_out()
)
print(df_counts)

Output:

(image: document–term count matrix, one row per document and one column per vocabulary term)
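
Once fitted, the vectorizer can encode unseen documents against the learned vocabulary using transform (not fit_transform); out-of-vocabulary words are simply ignored. A minimal sketch, with a made-up query sentence:

# Encode a new document with the vocabulary learned above.
# The unseen word "chased" is silently dropped.
new_doc = ["the cat chased the dog"]   # hypothetical query
X_new = vectorizer.transform(new_doc)
print(pd.DataFrame(X_new.toarray(), columns=vectorizer.get_feature_names_out()))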

3.1.2 Manual Construction

from collections import Counter
import pandas as pd

# Build vocabulary
vocab = sorted({w for doc in corpus for w in doc.split()})

def doc_vec(doc):
    cnt = Counter(doc.split())
    return [cnt[w] for w in vocab]

matrix = [doc_vec(doc) for doc in corpus]
df_manual = pd.DataFrame(matrix, index=[f"doc{i+1}" for i in range(len(corpus))], columns=vocab)
print(df_manual)

Output:

(image: the same count matrix, built manually)
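
For this corpus the manual matrix should match the CountVectorizer result, because whitespace splitting and scikit-learn's default tokenizer (lowercasing, tokens of two or more word characters) coincide here. A quick sanity check, assuming df_counts and df_manual from above are still in scope:

# Both document–term matrices should agree on this corpus.
print((df_manual.values == df_counts.values).all())   # expected: True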

3.2 TF–IDF Representation

Weight each term by its frequency within a document, discounted by how common the term is across the corpus. A standard formulation is tf–idf(t, d) = tf(t, d) × log(N / df(t)), where N is the number of documents and df(t) is the number of documents containing t, so words that appear in every document contribute little.

3.2.1 scikit-learn TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2', use_idf=True)
X_tfidf = tfidf.fit_transform(corpus)

df_tfidf = pd.DataFrame(
    X_tfidf.toarray(),
    index=[f"doc{i+1}" for i in range(len(corpus))],
    columns=tfidf.get_feature_names_out()
).round(3)
print(df_tfidf)

Output:

(image: L2-normalised TF–IDF matrix, values rounded to three decimal places)

3.2.2 Manual TF–IDF

import math
from collections import Counter

N = len(corpus)
df = Counter()
for doc in corpus:
    for t in set(doc.split()):
        df[t] += 1

def compute_tfidf(doc):
    tf = Counter(doc.split())
    length = sum(tf.values())
    return {t: (tf[t]/length) * math.log(N/df[t]) for t in tf}

for i, doc in enumerate(corpus, 1):
    scores = compute_tfidf(doc)
    print(f"doc{i} TF–IDF:", {t: round(s,3) for t,s in scores.items()})

Output:

(image: per-document TF–IDF scores from the manual computation)
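
The manual scores will not match the TfidfVectorizer output exactly: scikit-learn's default is a smoothed IDF, idf(t) = ln((1 + N) / (1 + df(t))) + 1, followed by L2 normalisation of each row, whereas the manual version uses the plain ln(N / df(t)) with no normalisation. The fitted vectorizer exposes its IDF weights, so the two can be compared directly (assuming tfidf, N and df from the code above are still in scope):

# scikit-learn's smoothed IDF weights next to the plain ln(N/df) values
for term, idf_weight in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    print(f"{term:>5}  sklearn idf={idf_weight:.3f}  ln(N/df)={math.log(N/df[term]):.3f}")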

3.3 Word Embeddings (Word2Vec & GloVe)

Map words to dense vectors that capture semantic relationships.

3.3.1 Training Word2Vec from Scratch (Skip-Gram with Negative Sampling)

# pure_python_word2vec.ipynb

import numpy as np
import random
from collections import Counter

# 1. Build vocab & unigram freq for negative sampling
def build_vocab(sentences, min_count=1):
    words = [w for sent in sentences for w in sent]
    freq  = Counter(words)
    vocab = {w for w, c in freq.items() if c >= min_count}
    w2i   = {w:i for i,w in enumerate(sorted(vocab))}
    i2w   = {i:w for w,i in w2i.items()}
    counts = np.array([freq[i2w[i]] for i in range(len(i2w))], dtype=np.float32)
    uni = counts / counts.sum()
    return w2i, i2w, uni

# 2. Generate (centre, context) pairs
def generate_pairs(sentences, w2i, window):
    pairs = []
    for sent in sentences:
        idxs = [w2i[w] for w in sent if w in w2i]
        for pos, centre in enumerate(idxs):
            for offset in range(-window, window+1):
                ctx_pos = pos + offset
                if ctx_pos<0 or ctx_pos>=len(idxs) or ctx_pos==pos: continue
                pairs.append((centre, idxs[ctx_pos]))
    return pairs

# 3. Sigmoid helper
def sigmoid(x): return 1/(1+np.exp(-x))

# 4. Train skip-gram with negative sampling
def train_sg_ns(pairs, V, D=50, lr=0.025, epochs=100, neg=5, uni=None):
    W      = np.random.uniform(-0.5/D,0.5/D,(V,D))
    Wp     = np.zeros((V,D))
    for _ in range(epochs):
        random.shuffle(pairs)
        for c, o in pairs:
            v_c = W[c]; u_o = Wp[o]
            # positive
            s    = sigmoid(u_o.dot(v_c))
            grad = (s-1)
            Wp[o] -= lr * grad * v_c
            W[c]  -= lr * grad * u_o
            # negatives
            for _ in range(neg):
                k = np.random.choice(V, p=uni)
                if k==o: continue
                u_k = Wp[k]
                sn  = sigmoid(u_k.dot(v_c))
                gradn = sn
                Wp[k] -= lr * gradn * v_c
                W[c]  -= lr * gradn * u_k
    return W

# 5. Cosine-similarity helper
def most_similar(word, W, w2i, i2w, topn=3):
    if word not in w2i: return []
    v = W[w2i[word]]
    norms = np.linalg.norm(W, axis=1)
    sims  = W.dot(v)/(norms * np.linalg.norm(v)+1e-8)
    sims[w2i[word]] = -1
    idxs = np.argsort(-sims)[:topn]
    return [(i2w[i], sims[i]) for i in idxs]

# --- Demo on your toy corpus ---
corpus    = [
  "the cat sat on the mat",
  "the dog sat on the log",
  "the cat saw the dog",
  "the dog chased the cat",
  "the cat climbed the tree",
]
sents     = [s.split() for s in corpus]
w2i, i2w, uni = build_vocab(sents)
pairs     = generate_pairs(sents, w2i, window=2)
W         = train_sg_ns(pairs, V=len(w2i), D=50, epochs=200, uni=uni)

print("Vector for 'cat' (first 5 dims):", W[w2i['cat']][:5], "…\n")
print("Top-3 similar to 'cat':")
for w, score in most_similar('cat', W, w2i, i2w, topn=3):
    print(f"  {w} ({score:.3f})")

Output:

Vector for 'cat' (first 5 dims): [ 1.34053726  0.91767812  0.23705124 -0.53703058 -0.44004042] …

Top-3 similar to 'cat':
  dog (0.943)
  tree (0.843)
  log (0.698)
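
The same model can be trained in a few lines with gensim; the sketch below assumes gensim 4.x, where sg=1 selects skip-gram and negative=5 enables negative sampling, mirroring the settings used above. On a corpus this small the nearest neighbours will vary from run to run.

from gensim.models import Word2Vec

# Skip-gram with negative sampling on the same tokenised sentences
model = Word2Vec(sentences=sents, vector_size=50, window=2,
                 min_count=1, sg=1, negative=5, epochs=200, seed=42)

print("Vector for 'cat' (first 5 dims):", model.wv['cat'][:5])
print(model.wv.most_similar('cat', topn=3))   # nearest neighbours by cosine similarity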

3.3.2 Exploring Pre-trained GloVe

import gensim.downloader as api

glove = api.load("glove-wiki-gigaword-50")
print("Dim:", len(glove['king']))

analogy = glove.most_similar(positive=['king','woman'], negative=['man'], topn=3)
print("\nking – man + woman ≈")
for w, sim in analogy:
    print(f"  {w} ({sim:.3f})")

Output:

Dim: 50

king – man + woman ≈
  queen (0.852)
  throne (0.766)
  prince (0.759)