
Module 3.5: Word Embeddings (Word2Vec & GloVe)

Word embeddings map words to dense, low-dimensional vectors that capture semantic and syntactic relationships.


Key Concepts

  • Distributional Hypothesis: “You shall know a word by the company it keeps.”
  • Word2Vec (a minimal gensim sketch follows this list)
    • CBOW (Continuous Bag-of-Words): predicts a target word from its context.
    • Skip-gram: predicts surrounding context words given a target word.
  • GloVe (Global Vectors): factorizes a global word–word co-occurrence matrix.

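Both Word2Vec architectures can be trained in a few lines with gensim before building one from scratch. A minimal sketch, assuming gensim ≥ 4.0 (where the `sg` flag selects the architecture and the dimensionality parameter is `vector_size`); the two-sentence corpus is purely illustrative:

from gensim.models import Word2Vec

toy_corpus = [
    "the cat sat on the mat".split(),
    "the dog sat on the log".split(),
]

# sg=0 -> CBOW: predict the centre word from its context
cbow = Word2Vec(toy_corpus, vector_size=50, window=2, min_count=1, sg=0, epochs=50)

# sg=1 -> Skip-gram: predict the context words from the centre word
skipgram = Word2Vec(toy_corpus, vector_size=50, window=2, min_count=1, sg=1, epochs=50)

print(skipgram.wv.most_similar("cat", topn=3))
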
1. Training a Skip-gram Word2Vec Model from Scratch (NumPy)

# word2vec_scratch_fixed.ipynb

import numpy as np
import random
from collections import Counter

def build_vocab(sentences, min_count=1):
    """
    Build vocabulary mappings and unigram frequency distribution.
    """
    words = [w for sent in sentences for w in sent]
    freq = Counter(words)
    vocab = {w for w, c in freq.items() if c >= min_count}
    word2idx = {w: i for i, w in enumerate(sorted(vocab))}
    idx2word = {i: w for w, i in word2idx.items()}
    # float64 keeps the normalised frequencies summing to 1 within np.random.choice's tolerance
    counts = np.array([freq[idx2word[i]] for i in range(len(idx2word))], dtype=np.float64)
    word_freq = counts / counts.sum()
    return word2idx, idx2word, word_freq

def generate_training_data(sentences, word2idx, window_size):
    """
    Generate (centre, context) index pairs from sentences.
    """
    pairs = []
    for sent in sentences:
        indices = [word2idx[w] for w in sent if w in word2idx]
        for centre_pos, centre in enumerate(indices):
            for offset in range(-window_size, window_size + 1):
                context_pos = centre_pos + offset
                if context_pos < 0 or context_pos >= len(indices) or context_pos == centre_pos:
                    continue
                context = indices[context_pos]
                pairs.append((centre, context))
    return pairs

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def train_skipgram(pairs, vocab_size, embed_size=50, lr=0.025,
                   epochs=100, neg_samples=5, word_freq=None):
    """
    Train a Skip-gram Word2Vec model with negative sampling.
    Returns the input-to-hidden weight matrix W (the embeddings).
    """
    # Initialise weight matrices
    W = np.random.uniform(-0.5/embed_size, 0.5/embed_size, (vocab_size, embed_size))
    W_prime = np.zeros((vocab_size, embed_size))
    
    for epoch in range(epochs):
        random.shuffle(pairs)
        for centre, context in pairs:
            v_c = W[centre]
            u_o = W_prime[context]
            
            # Positive update
            score = sigmoid(np.dot(u_o, v_c))
            grad_u = (score - 1.0) * v_c
            grad_v = (score - 1.0) * u_o
            W_prime[context] -= lr * grad_u
            W[centre]      -= lr * grad_v
            
            # Negative sampling updates
            for _ in range(neg_samples):
                neg = np.random.choice(vocab_size, p=word_freq)
                if neg == context:
                    continue
                u_k = W_prime[neg]
                score_neg = sigmoid(np.dot(u_k, v_c))
                grad_u_neg = score_neg * v_c
                grad_v_neg = score_neg * u_k
                W_prime[neg]  -= lr * grad_u_neg
                W[centre]     -= lr * grad_v_neg
                
    return W

def most_similar(word, W, word2idx, idx2word, topn=3):
    """
    Return top-n most similar words to the given word by cosine similarity.
    """
    if word not in word2idx:
        return []
    vec = W[word2idx[word]]
    norms = np.linalg.norm(W, axis=1)
    sims = np.dot(W, vec) / (norms * np.linalg.norm(vec) + 1e-8)
    sims[word2idx[word]] = -1.0
    top_idx = np.argsort(-sims)[:topn]
    return [(idx2word[i], sims[i]) for i in top_idx]

# --- Example usage ---
if __name__ == "__main__":
    # Toy corpus
    corpus = [
        "the cat sat on the mat",
        "the dog sat on the log",
        "the cat saw the dog",
        "the dog chased the cat",
        "the cat climbed the tree"
    ]
    sentences = [sent.split() for sent in corpus]
    
    # Build vocabulary
    word2idx, idx2word, word_freq = build_vocab(sentences)
    
    # Generate training pairs with window size 2
    pairs = generate_training_data(sentences, word2idx, window_size=2)
    
    # Train Skip-gram Word2Vec
    W = train_skipgram(
        pairs,
        vocab_size=len(word2idx),
        embed_size=50,
        lr=0.025,
        epochs=200,
        neg_samples=5,
        word_freq=word_freq
    )
    
    # Inspect embedding for 'cat'
    vec_cat = W[word2idx['cat']]
    print("Vector for 'cat' (first 5 dims):", vec_cat[:5], "…\n")
    
    # Show top-3 similar words
    print("Top-3 words similar to 'cat':")
    for w, score in most_similar('cat', W, word2idx, idx2word):
        print(f"  {w} ({score:.3f})")

Output:

(screenshot of the printed output: the first five dimensions of the 'cat' vector and its top-3 nearest neighbours)
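
Note that the sketch above draws negative samples from the raw unigram distribution, whereas the original word2vec implementation smooths it by raising word frequencies to the 3/4 power, which gives rare words a better chance of being sampled (Mikolov et al., 2013). A minimal adjustment, reusing word_freq from build_vocab above:

# 3/4-power smoothing of the unigram distribution
neg_dist = word_freq ** 0.75
neg_dist /= neg_dist.sum()

# train with the smoothed distribution instead of the raw frequencies
W = train_skipgram(pairs, vocab_size=len(word2idx), word_freq=neg_dist)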

2. Exploring Pre-trained GloVe Embeddings
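
For context: GloVe fits word and context vectors by weighted least-squares regression on the logarithm of the global co-occurrence counts $X_{ij}$ (Pennington et al., 2014):

$$J = \sum_{i,j=1}^{V} f(X_{ij})\,\bigl(w_i^\top \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij}\bigr)^2$$

where $f$ is a weighting function that damps very frequent co-occurrences and $b_i$, $\tilde{b}_j$ are bias terms. Below we simply load 50-dimensional vectors pre-trained on Wikipedia and Gigaword via gensim's downloader.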

import gensim.downloader as api

# Download small GloVe vectors (50d)
glove = api.load("glove-wiki-gigaword-50")

# Inspect vector
print("GloVe vector dim:", len(glove['king']))

# Word analogy: king – man + woman ≈ ?
analogy = glove.most_similar(positive=['king','woman'], negative=['man'], topn=3)
print("\nAnalogy result for 'king - man + woman':")
for word, score in analogy:
    print(f"  {word} ({score:.3f})")

Output:

(screenshot of the printed output: the GloVe vector dimensionality and the top-3 analogy results)
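
The object returned by the downloader is a gensim KeyedVectors instance, which supports a few other quick probes; a brief sketch (the word choices are only illustrative):

# pairwise cosine similarity
print(glove.similarity('cat', 'dog'))

# odd-one-out among a set of words
print(glove.doesnt_match(['breakfast', 'cereal', 'dinner', 'lunch']))

# nearest neighbours of a single word
print(glove.most_similar('frog', topn=3))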

Continue to Module 4: Classical Machine Learning for NLP