# Module 3.2: Stemming & Lemmatization
Morphological normalization reduces words to a base form.
- Stemming uses heuristic rules to strip affixes (may yield non-words).
- Lemmatization uses vocabulary and POS information to return valid lemmas.
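To see the difference on a single word, here is a minimal sketch using NLTK (it assumes the WordNet data has already been downloaded, as shown in section 3 below):

```python
from nltk.stem import PorterStemmer, WordNetLemmatizer

word = 'studies'
print(PorterStemmer().stem(word))           # 'studi'  -- a truncated non-word
print(WordNetLemmatizer().lemmatize(word))  # 'study'  -- a valid dictionary lemma
```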
## 1. Toy Rule-Based Stemmer
```python
def simple_stemmer(word):
    """Strip the first matching suffix, provided the remaining stem has at least 3 characters."""
    # Longer suffixes are checked before 's', so 'kindness' -> 'kind', not 'kindnes'.
    for suffix in ('ing', 'ly', 'ed', 'ness', 's'):
        if word.endswith(suffix) and len(word) - len(suffix) >= 3:
            return word[:-len(suffix)]
    return word

words = ['running', 'happily', 'tested', 'kindness', 'cats', 'play']
stems = [(w, simple_stemmer(w)) for w in words]
for w, s in stems:
    print(f"{w:10} → {s}")
```
Output:
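```
running    → runn
happily    → happi
tested     → test
kindness   → kind
cats       → cat
play       → play
```

Note that `runn` and `happi` are not real words: this over-stemming is exactly the trade-off mentioned above, and it is why stemmers are fast but crude.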
## 2. NLTK’s Porter & Snowball Stemmers
```python
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer

nltk.download('punkt')  # for tokenization, if needed

words = ['running', 'happily', 'tested', 'kindness', 'cats', 'play']

porter = PorterStemmer()
snowball = SnowballStemmer('english')

print(f"{'Word':10}{'Porter':10}{'Snowball':10}")
for w in words:
    print(f"{w:10}{porter.stem(w):10}{snowball.stem(w):10}")
```
Output:
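The two stemmers agree on most of these words, but they are not the same algorithm: Snowball (sometimes called "Porter2") revises several of Porter's rules and adds special-case words. A small probe of the differences; the word choices here are just illustrations, and exact stems can vary slightly across NLTK versions:

```python
from nltk.stem import PorterStemmer, SnowballStemmer

porter = PorterStemmer()
snowball = SnowballStemmer('english')

# Words where the two algorithms typically diverge,
# e.g. 'fairly' -> Porter 'fairli' vs Snowball 'fair'.
for w in ['fairly', 'dying']:
    print(f"{w:10}{porter.stem(w):12}{snowball.stem(w):12}")
```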
## 3. NLTK WordNet Lemmatizer
```python
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

lemmatizer = WordNetLemmatizer()

# Helper to map the first letter of a Penn Treebank tag to WordNet format
pos_map = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

words = ['running', 'better', 'cats', 'studies', 'wolves']

print(f"{'Word':10}{'Lemma (default)':20}{'Lemma (+POS)':20}")
for w in words:
    default = lemmatizer.lemmatize(w)    # POS defaults to noun
    tag = pos_tag([w])[0][1][0].upper()  # first letter of the Treebank tag
    pos = pos_map.get(tag, 'n')          # fall back to noun
    with_pos = lemmatizer.lemmatize(w, pos)
    print(f"{w:10}{default:20}{with_pos:20}")
```
Output:
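One caveat: `pos_tag` sees each word above in isolation, so the tag it assigns (and therefore the lemma) can differ from what the same word would receive inside a running sentence. A minimal sketch of sentence-level lemmatization using the same tag-mapping idea (the example sentence is just an illustration):

```python
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
pos_map = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

tokens = word_tokenize("The striped bats were hanging on their feet")
lemmas = [
    lemmatizer.lemmatize(tok, pos_map.get(tag[0], 'n'))
    for tok, tag in pos_tag(tokens)
]
print(lemmas)  # e.g. 'were' -> 'be', 'feet' -> 'foot'
```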
## 4. Putting It Together: A Reusable POS-Aware Lemmatization Helper
```python
# lemmatization_demo.ipynb
import nltk

# 1. Download required NLTK data (only needs to run once)
nltk.download('averaged_perceptron_tagger')  # for POS tagging
nltk.download('wordnet')                     # WordNet lemmatizer data
nltk.download('omw-1.4')                     # WordNet multilingual data

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """
    Convert NLTK POS tags (Penn Treebank) to WordNet POS tags.
    Defaults to NOUN for any tag we don't recognise.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_tokens(tokens):
    """
    Given a list of tokens, POS-tag them and lemmatize each
    using the appropriate WordNet POS.
    """
    lemmatizer = WordNetLemmatizer()
    tagged = pos_tag(tokens)  # list of (token, treebank_tag) pairs
    return [
        lemmatizer.lemmatize(tok, get_wordnet_pos(tag))
        for tok, tag in tagged
    ]

# --- Example usage ---
tokens = ['running', 'better', 'cats', 'studies', 'wolves']
lemmas = lemmatize_tokens(tokens)
print("Tokens:", tokens)
print("Lemmas:", lemmas)
```
Output:
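## 5. (Optional) spaCy Lemmatizer

spaCy tags and lemmatizes in a single pass, so no manual tag mapping is needed. A minimal sketch, assuming spaCy is installed along with its small English model (`pip install spacy`, then `python -m spacy download en_core_web_sm`):

```python
import spacy

# Load the small English pipeline (includes a tagger and lemmatizer)
nlp = spacy.load('en_core_web_sm')

doc = nlp("The striped bats were hanging on their feet")
for token in doc:
    print(f"{token.text:10}{token.pos_:8}{token.lemma_}")
```

Because spaCy lemmatizes tokens in sentence context, ambiguous forms are resolved by the tags its pipeline assigns rather than by a per-word lookup.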
Continue to 3.3 Bag-of-Words & Count Vectors