# Module 3: Text Representations (BoW, TF–IDF & Embeddings)
This module covers three common ways to convert text into numeric features for NLP models:
- 3.1 Bag-of-Words & Count Vectors
- 3.2 TF–IDF Representation
- 3.3 Word Embeddings (Word2Vec & GloVe)
## 3.1 Bag-of-Words & Count Vectors
Represent each document as a vector of raw token counts (the Document-Term Matrix).
### 3.1.1 scikit-learn CountVectorizer

```python
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the cat saw the dog"
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

df_counts = pd.DataFrame(
    X.toarray(),
    index=[f"doc{i+1}" for i in range(len(corpus))],
    columns=vectorizer.get_feature_names_out()
)
print(df_counts)
```
Output:

```text
      cat  dog  log  mat  on  sat  saw  the
doc1    1    0    0    1   1    1    0    2
doc2    0    1    1    0   1    1    0    2
doc3    1    1    0    0   0    0    1    2
```
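Once fitted, the vectorizer can score unseen text with `transform` (not `fit_transform`); words outside the learned vocabulary are simply ignored. A minimal sketch, where the new sentence is invented for illustration:

```python
# Score a new document against the vocabulary learned from `corpus`
new_docs = ["the dog saw the cat on the mat"]
X_new = vectorizer.transform(new_docs)
print(pd.DataFrame(
    X_new.toarray(),
    index=["new_doc"],
    columns=vectorizer.get_feature_names_out()
))
```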
### 3.1.2 Manual Construction
```python
from collections import Counter
import pandas as pd

# Build vocabulary
vocab = sorted({w for doc in corpus for w in doc.split()})

def doc_vec(doc):
    cnt = Counter(doc.split())
    return [cnt[w] for w in vocab]

matrix = [doc_vec(doc) for doc in corpus]
df_manual = pd.DataFrame(
    matrix,
    index=[f"doc{i+1}" for i in range(len(corpus))],
    columns=vocab
)
print(df_manual)
```
Output (identical to the CountVectorizer result, since the default tokenisation and the sorted vocabulary coincide on this corpus):

```text
      cat  dog  log  mat  on  sat  saw  the
doc1    1    0    0    1   1    1    0    2
doc2    0    1    1    0   1    1    0    2
doc3    1    1    0    0   0    0    1    2
```
## 3.2 TF–IDF Representation
Weight each term by its frequency within a document (TF), scaled by how rare the term is across the corpus (IDF), so that ubiquitous words such as "the" are down-weighted.
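A common formulation, and the one implemented manually in 3.2.2 below, is

$$
\operatorname{tf\text{-}idf}(t, d) = \frac{f_{t,d}}{\sum_{t'} f_{t',d}} \cdot \log\frac{N}{\operatorname{df}(t)}
$$

where $f_{t,d}$ is the count of term $t$ in document $d$, $N$ is the number of documents, and $\operatorname{df}(t)$ is the number of documents containing $t$. Note that scikit-learn's `TfidfVectorizer` uses a smoothed variant, $\log\frac{1+N}{1+\operatorname{df}(t)} + 1$, and L2-normalises each document vector by default, so its scores will not match the plain formula exactly.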
### 3.2.1 scikit-learn TfidfVectorizer
```python
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2', use_idf=True)
X_tfidf = tfidf.fit_transform(corpus)

df_tfidf = pd.DataFrame(
    X_tfidf.toarray(),
    index=[f"doc{i+1}" for i in range(len(corpus))],
    columns=tfidf.get_feature_names_out()
).round(3)
print(df_tfidf)
```
Output:
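Because the rows are L2-normalised, the dot product of two TF–IDF rows is their cosine similarity; scikit-learn's `cosine_similarity` computes this directly. A minimal sketch reusing `X_tfidf` from above:

```python
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the three TF–IDF document vectors
sim = cosine_similarity(X_tfidf)
for i in range(len(corpus)):
    for j in range(i + 1, len(corpus)):
        print(f"doc{i+1} vs doc{j+1}: {sim[i, j]:.3f}")
```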
### 3.2.2 Manual TF–IDF
```python
import math
from collections import Counter

N = len(corpus)

# Document frequency: number of documents containing each term
df = Counter()
for doc in corpus:
    for t in set(doc.split()):
        df[t] += 1

def compute_tfidf(doc):
    tf = Counter(doc.split())
    length = sum(tf.values())
    return {t: (tf[t] / length) * math.log(N / df[t]) for t in tf}

for i, doc in enumerate(corpus, 1):
    scores = compute_tfidf(doc)
    print(f"doc{i} TF–IDF:", {t: round(s, 3) for t, s in scores.items()})
```
Output:

```text
doc1 TF–IDF: {'the': 0.0, 'cat': 0.068, 'sat': 0.068, 'on': 0.068, 'mat': 0.183}
doc2 TF–IDF: {'the': 0.0, 'dog': 0.068, 'sat': 0.068, 'on': 0.068, 'log': 0.183}
doc3 TF–IDF: {'the': 0.0, 'cat': 0.081, 'saw': 0.22, 'dog': 0.081}
```

These values differ from the `TfidfVectorizer` output in 3.2.1 because scikit-learn applies the smoothed IDF and L2 normalisation described at the top of this section.
## 3.3 Word Embeddings (Word2Vec & GloVe)
Map words to dense vectors that capture semantic relationships.
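Similarity between two embedding vectors is usually measured with cosine similarity, which is what the `most_similar` helper in the code below computes:

$$
\cos(\mathbf{u}, \mathbf{v}) = \frac{\mathbf{u} \cdot \mathbf{v}}{\lVert \mathbf{u} \rVert \, \lVert \mathbf{v} \rVert}
$$

Values close to 1 indicate words that occur in similar contexts; values near 0 indicate unrelated words.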
### 3.3.1 Training Word2Vec from Scratch (Skip-gram with Negative Sampling)
```python
# pure_python_word2vec.ipynb
import numpy as np
import random
from collections import Counter

# 1. Build vocab & unigram freq for negative sampling
def build_vocab(sentences, min_count=1):
    words = [w for sent in sentences for w in sent]
    freq = Counter(words)
    vocab = {w for w, c in freq.items() if c >= min_count}
    w2i = {w: i for i, w in enumerate(sorted(vocab))}
    i2w = {i: w for w, i in w2i.items()}
    counts = np.array([freq[i2w[i]] for i in range(len(i2w))], dtype=np.float32)
    uni = counts / counts.sum()
    return w2i, i2w, uni

# 2. Generate (centre, context) pairs
def generate_pairs(sentences, w2i, window):
    pairs = []
    for sent in sentences:
        idxs = [w2i[w] for w in sent if w in w2i]
        for pos, centre in enumerate(idxs):
            for offset in range(-window, window + 1):
                ctx_pos = pos + offset
                if ctx_pos < 0 or ctx_pos >= len(idxs) or ctx_pos == pos:
                    continue
                pairs.append((centre, idxs[ctx_pos]))
    return pairs

# 3. Sigmoid helper
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# 4. Train skip-gram with negative sampling
def train_sg_ns(pairs, V, D=50, lr=0.025, epochs=100, neg=5, uni=None):
    W = np.random.uniform(-0.5 / D, 0.5 / D, (V, D))   # input (centre-word) vectors
    Wp = np.zeros((V, D))                               # output (context-word) vectors
    for _ in range(epochs):
        random.shuffle(pairs)
        for c, o in pairs:
            v_c = W[c]
            u_o = Wp[o]
            # positive pair: push sigmoid(u_o . v_c) towards 1
            s = sigmoid(u_o.dot(v_c))
            grad = (s - 1)
            Wp[o] -= lr * grad * v_c
            W[c] -= lr * grad * u_o
            # negative samples: push sigmoid(u_k . v_c) towards 0
            for _ in range(neg):
                k = np.random.choice(V, p=uni)
                if k == o:
                    continue
                u_k = Wp[k]
                sn = sigmoid(u_k.dot(v_c))
                gradn = sn
                Wp[k] -= lr * gradn * v_c
                W[c] -= lr * gradn * u_k
    return W

# 5. Cosine-similarity helper
def most_similar(word, W, w2i, i2w, topn=3):
    if word not in w2i:
        return []
    v = W[w2i[word]]
    norms = np.linalg.norm(W, axis=1)
    sims = W.dot(v) / (norms * np.linalg.norm(v) + 1e-8)
    sims[w2i[word]] = -1          # exclude the query word itself
    idxs = np.argsort(-sims)[:topn]
    return [(i2w[i], sims[i]) for i in idxs]

# --- Demo on your toy corpus ---
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the cat saw the dog",
    "the dog chased the cat",
    "the cat climbed the tree",
]
sents = [s.split() for s in corpus]

w2i, i2w, uni = build_vocab(sents)
pairs = generate_pairs(sents, w2i, window=2)
W = train_sg_ns(pairs, V=len(w2i), D=50, epochs=200, uni=uni)

print("Vector for 'cat' (first 5 dims):", W[w2i['cat']][:5], "…\n")
print("Top-3 similar to 'cat':")
for w, score in most_similar('cat', W, w2i, i2w, topn=3):
    print(f" {w} ({score:.3f})")
```
Output:

```text
Vector for 'cat' (first 5 dims): [ 1.34053726 0.91767812 0.23705124 -0.53703058 -0.44004042] …

Top-3 similar to 'cat':
 dog (0.943)
 tree (0.843)
 log (0.698)
```
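For comparison, gensim ships an optimised Word2Vec implementation. Below is a minimal sketch on the same toy corpus, assuming gensim ≥ 4 is installed; the hyper-parameters are chosen to mirror the from-scratch demo above, and the exact vectors will differ because initialisation and sampling are random.

```python
from gensim.models import Word2Vec

# `sents` is the tokenised toy corpus from the demo above
model = Word2Vec(
    sentences=sents,
    vector_size=50,   # embedding dimensionality (called `size` in gensim < 4)
    window=2,
    min_count=1,
    sg=1,             # 1 = skip-gram, 0 = CBOW
    negative=5,       # negative samples per positive pair
    epochs=200,
    seed=42,
)
print(model.wv['cat'][:5])
print(model.wv.most_similar('cat', topn=3))
```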
### 3.3.2 Exploring Pre-trained GloVe
```python
import gensim.downloader as api

# Downloaded and cached locally on first use
glove = api.load("glove-wiki-gigaword-50")

print("Dim:", len(glove['king']))

analogy = glove.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)
print("\nking – man + woman ≈")
for w, sim in analogy:
    print(f" {w} ({sim:.3f})")
```
Output:

```text
Dim: 50

king – man + woman ≈
 queen (0.852)
 throne (0.766)
 prince (0.759)
```
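Beyond analogies, the loaded `KeyedVectors` object supports other common queries; a short sketch (the word choices are just illustrative):

```python
# Nearest neighbours of a single word
print(glove.most_similar('cat', topn=3))

# Cosine similarity between two specific words
print(f"similarity(king, queen) = {glove.similarity('king', 'queen'):.3f}")

# Find the word that least belongs in a set
print(glove.doesnt_match(['cat', 'dog', 'car']))
```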