# Module 3.5: Word Embeddings (Word2Vec & GloVe)
Word embeddings map words to dense, low-dimensional vectors that capture semantic and syntactic relationships.
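Closeness in the embedding space is usually measured with cosine similarity, which the `most_similar` helper in the code below implements: $\cos(u, v) = \dfrac{u \cdot v}{\lVert u \rVert \, \lVert v \rVert}$.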
## Key Concepts
- Distributional Hypothesis: “You shall know a word by the company it keeps.”
- Word2Vec: two training architectures (see the gensim sketch after this list).
  - CBOW (Continuous Bag-of-Words): predicts a target word from its context.
  - Skip-gram: predicts surrounding context words given a target word.
- GloVe (Global Vectors): factorizes a global word–word co-occurrence matrix.
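Both Word2Vec architectures are exposed through `gensim` (the library used again in section 2 below). A minimal sketch, assuming gensim ≥ 4.0 (where the embedding dimension parameter is `vector_size`); the toy sentences are placeholders:

```python
from gensim.models import Word2Vec

# Tiny tokenised corpus, purely for illustration
sentences = [
    "the cat sat on the mat".split(),
    "the dog sat on the log".split(),
]

# sg=0 (the default) trains CBOW: context words predict the centre word
cbow = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=0)

# sg=1 trains Skip-gram: the centre word predicts its context words
skipgram = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=1)

print(cbow.wv["cat"][:5])                       # first 5 dimensions of the CBOW vector
print(skipgram.wv.most_similar("cat", topn=3))  # nearest neighbours under Skip-gram
```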
## 1. Training a Word2Vec Model from Scratch (NumPy)

```python
# word2vec_scratch_fixed.ipynb
import numpy as np
import random
from collections import Counter


def build_vocab(sentences, min_count=1):
    """
    Build vocabulary mappings and unigram frequency distribution.
    """
    words = [w for sent in sentences for w in sent]
    freq = Counter(words)
    vocab = {w for w, c in freq.items() if c >= min_count}
    word2idx = {w: i for i, w in enumerate(sorted(vocab))}
    idx2word = {i: w for w, i in word2idx.items()}
    counts = np.array([freq[idx2word[i]] for i in range(len(idx2word))], dtype=np.float32)
    word_freq = counts / counts.sum()
    return word2idx, idx2word, word_freq


def generate_training_data(sentences, word2idx, window_size):
    """
    Generate (centre, context) index pairs from sentences.
    """
    pairs = []
    for sent in sentences:
        indices = [word2idx[w] for w in sent if w in word2idx]
        for centre_pos, centre in enumerate(indices):
            for offset in range(-window_size, window_size + 1):
                context_pos = centre_pos + offset
                if context_pos < 0 or context_pos >= len(indices) or context_pos == centre_pos:
                    continue
                context = indices[context_pos]
                pairs.append((centre, context))
    return pairs


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def train_skipgram(pairs, vocab_size, embed_size=50, lr=0.025,
                   epochs=100, neg_samples=5, word_freq=None):
    """
    Train a Skip-gram Word2Vec model with negative sampling.
    Returns the input-to-hidden weight matrix W (embeddings).
    """
    # Initialise weight matrices
    W = np.random.uniform(-0.5 / embed_size, 0.5 / embed_size, (vocab_size, embed_size))
    W_prime = np.zeros((vocab_size, embed_size))
    for epoch in range(epochs):
        random.shuffle(pairs)
        for centre, context in pairs:
            v_c = W[centre]
            u_o = W_prime[context]
            # Positive update
            score = sigmoid(np.dot(u_o, v_c))
            grad_u = (score - 1.0) * v_c
            grad_v = (score - 1.0) * u_o
            W_prime[context] -= lr * grad_u
            W[centre] -= lr * grad_v
            # Negative sampling updates
            for _ in range(neg_samples):
                neg = np.random.choice(vocab_size, p=word_freq)
                if neg == context:
                    continue
                u_k = W_prime[neg]
                score_neg = sigmoid(np.dot(u_k, v_c))
                grad_u_neg = score_neg * v_c
                grad_v_neg = score_neg * u_k
                W_prime[neg] -= lr * grad_u_neg
                W[centre] -= lr * grad_v_neg
    return W


def most_similar(word, W, word2idx, idx2word, topn=3):
    """
    Return top-n most similar words to the given word by cosine similarity.
    """
    if word not in word2idx:
        return []
    vec = W[word2idx[word]]
    norms = np.linalg.norm(W, axis=1)
    sims = np.dot(W, vec) / (norms * np.linalg.norm(vec) + 1e-8)
    sims[word2idx[word]] = -1.0
    top_idx = np.argsort(-sims)[:topn]
    return [(idx2word[i], sims[i]) for i in top_idx]


# --- Example usage ---
if __name__ == "__main__":
    # Toy corpus
    corpus = [
        "the cat sat on the mat",
        "the dog sat on the log",
        "the cat saw the dog",
        "the dog chased the cat",
        "the cat climbed the tree"
    ]
    sentences = [sent.split() for sent in corpus]

    # Build vocabulary
    word2idx, idx2word, word_freq = build_vocab(sentences)

    # Generate training pairs with window size 2
    pairs = generate_training_data(sentences, word2idx, window_size=2)

    # Train Skip-gram Word2Vec
    W = train_skipgram(
        pairs,
        vocab_size=len(word2idx),
        embed_size=50,
        lr=0.025,
        epochs=200,
        neg_samples=5,
        word_freq=word_freq
    )

    # Inspect embedding for 'cat'
    vec_cat = W[word2idx['cat']]
    print("Vector for 'cat' (first 5 dims):", vec_cat[:5], "…\n")

    # Show top-3 similar words
    print("Top-3 words similar to 'cat':")
    for w, score in most_similar('cat', W, word2idx, idx2word):
        print(f" {w} ({score:.3f})")
```
Output:
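For reference, the gradient steps in `train_skipgram` follow from the skip-gram negative-sampling loss for a single (centre, context) pair, where $v_c$ is the centre-word row of `W`, $u_o$ the context-word row of `W_prime`, and $u_k$ the $K$ sampled negative rows:

$$
J = -\log \sigma(u_o^{\top} v_c) \;-\; \sum_{k=1}^{K} \log \sigma(-u_k^{\top} v_c)
$$

Differentiating gives $\partial J / \partial u_o = (\sigma(u_o^{\top} v_c) - 1)\, v_c$ and $\partial J / \partial u_k = \sigma(u_k^{\top} v_c)\, v_c$, which are exactly the `(score - 1.0) * v_c` and `score_neg * v_c` updates in the code above.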
## 2. Exploring Pre-trained GloVe Embeddings

```python
import gensim.downloader as api

# Download small GloVe vectors (50d)
glove = api.load("glove-wiki-gigaword-50")

# Inspect vector
print("GloVe vector dim:", len(glove['king']))

# Word analogy: king - man + woman ≈ ?
analogy = glove.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)
print("\nAnalogy result for 'king - man + woman':")
for word, score in analogy:
    print(f" {word} ({score:.3f})")
```
Output:
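Beyond analogies, the loaded `KeyedVectors` object answers other similarity queries; a short sketch (the query words are only examples):

```python
import gensim.downloader as api

glove = api.load("glove-wiki-gigaword-50")  # same 50-d vectors as above

# Nearest neighbours of a single word
print(glove.most_similar("frog", topn=5))

# Pairwise cosine similarity between two words
print(glove.similarity("cat", "dog"))

# Odd-one-out detection
print(glove.doesnt_match(["breakfast", "cereal", "dinner", "cat"]))
```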
Continue to Module 4: Classical Machine Learning for NLP