Module 1: Linguistic & Computational Foundations
This module lays the groundwork for all NLP tasks by introducing the basic linguistic units and the simplest computational models for processing text.
1.1 Morphology: Stems & Affixes
Learn how words are built from stems and affixes, and how to normalize them via stemming or lemmatization.
1.2 Tokenization: Regex & Rule-based Methods
Split raw text into tokens (words, punctuation, URLs, etc.) using regular expressions and simple rules.
1.3 Finite-State Automata: Concept & Implementation
Model tokenizers and simple pattern matchers as state machines; implement DFAs/NFAs in Python.
1.4 Basic Syntax: Phrase Structure Trees
Represent sentence structure with context-free grammars and parse trees (e.g. via NLTK).
1.1 Morphology: Stems & Affixes
Morphology studies how words are built from stems (core lexical units) and affixes (prefixes and suffixes); for example, unhappiness decomposes into the prefix un-, the stem happy, and the suffix -ness. Normalizing words to stems or lemmas simplifies many NLP tasks.
Rule-Based Stemmer
```python
def simple_stemmer(word):
    """Strip common suffixes if the remaining stem has length >= 3."""
    for suffix in ('ing', 'ly', 'ed', 'ness', 's'):
        if word.endswith(suffix) and len(word) - len(suffix) >= 3:
            return word[:-len(suffix)]
    return word

words = ['running', 'happily', 'tested', 'kindness', 'cats', 'play']
for w in words:
    print(f"{w:10} → {simple_stemmer(w)}")
```
Output:
```
running    → runn
happily    → happi
tested     → test
kindness   → kind
cats       → cat
play       → play
```
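The length check keeps very short words intact, but blind suffix stripping still over-stems; a quick illustration using the same function:

```python
print(simple_stemmer('fling'))   # fling  (stripping 'ing' would leave only 2 chars)
print(simple_stemmer('caring'))  # car    (over-stemmed to a different lexeme)
```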
NLTK Stemmer & Lemmatizer
```python
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

words = ['running', 'happily', 'tested', 'kindness', 'cats', 'play']
porter = PorterStemmer()
snow = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

# Align the header with the 10-character columns used below
print(f"{'Word':10}{'Porter':10}{'Snowball':10}{'Lemma':10}")
for w in words:
    lemma = lemmatizer.lemmatize(w, pos='v')  # treat every word as a verb
    print(f"{w:10}{porter.stem(w):10}{snow.stem(w):10}{lemma:10}")
```
Output:
```
Word      Porter    Snowball  Lemma
running   run       run       run
happily   happili   happili   happily
tested    test      test      test
kindness  kind      kind      kindness
cats      cat       cat       cat
play      play      play      play
```
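Hard-coding pos='v' treats every word as a verb, which is why kindness passes through unchanged. In practice the part of speech is derived automatically with a tagger. A minimal sketch, assuming NLTK's standard perceptron tagger and a small tag-mapping helper of our own (get_wordnet_pos is not an NLTK function):

```python
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS constant (noun by default)."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
for word, tag in pos_tag(word_tokenize("The cats were running happily")):
    print(f"{word:10} {tag:5} → {lemmatizer.lemmatize(word, get_wordnet_pos(tag))}")
```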
1.2 Tokenization: Regex & Rule-based Methods
Tokenization splits raw text into units (“tokens”) such as words, punctuation marks, and URLs.
Simple Regex Tokenizer
```python
import re

def simple_tokenize(text):
    # A token is either a run of word characters or a single non-space symbol
    return re.findall(r"\w+|[^\w\s]", text)

text = "Hello, NLP world! Let's tokenize: words, punctuation..."
print(simple_tokenize(text))
```
Output:
```
['Hello', ',', 'NLP', 'world', '!', 'Let', "'", 's', 'tokenize', ':', 'words', ',', 'punctuation', '.', '.', '.']
```
Enhanced Regex for URLs & Contractions
The simple tokenizer above breaks “Let's” into three tokens and would also shred URLs. The pattern below keeps contractions, URLs, mentions and hashtags intact. Note that in re.VERBOSE mode whitespace is ignored and an unescaped # starts a comment, so the hashtag branch must escape it:

```python
import re

pattern = r"""
  https?://\S+[\w/]            # URLs (must end in a word character or slash)
| @\w+                         # mentions
| \#\w+                        # hashtags (# must be escaped in VERBOSE mode)
| [A-Za-z]+(?:'[A-Za-z]+)?     # words, optionally with an internal apostrophe
| \d+(?:\.\d+)?                # integers and decimals
| [^\s\w]                      # any other single non-space character
"""
tokenizer = re.compile(pattern, re.VERBOSE)

text = "Visit https://ex.com, it's #NLP @user!"
print(tokenizer.findall(text))
```
Output:
```
['Visit', 'https://ex.com', ',', "it's", '#NLP', '@user', '!']
```
NLTK Tokenizer
```python
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
print(word_tokenize("Dr. Smith isn't here. He'll come soon."))
```
Output:
```
['Dr.', 'Smith', 'is', "n't", 'here', '.', 'He', "'ll", 'come', 'soon', '.']
```
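NLTK's Punkt models also power sentence splitting, which is trained to recognize abbreviations such as “Dr.”; a quick sketch using the same example text:

```python
from nltk.tokenize import sent_tokenize

text = "Dr. Smith isn't here. He'll come soon."
print(sent_tokenize(text))
# Expected: two sentences, with "Dr." not treated as a sentence boundary
```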
1.3 Finite-State Automata: Concept & Implementation
Finite-State Automata (FSAs) are state machines that accept or reject sequences of symbols. A deterministic automaton (DFA) has exactly one successor state for each (state, symbol) pair; a nondeterministic one (NFA) may have several.
DFA for Strings Ending in “01”
```python
# q0 = start, q1 = "just read a 0", q2 = accept ("just read 01")
states = {'q0', 'q1', 'q2'}
alphabet = {'0', '1'}
start, accept = 'q0', {'q2'}
trans = {
    ('q0', '0'): 'q1', ('q0', '1'): 'q0',
    ('q1', '0'): 'q1', ('q1', '1'): 'q2',
    ('q2', '0'): 'q1', ('q2', '1'): 'q0',
}

class DFA:
    def __init__(self, trans, start, accept):
        self.trans, self.start, self.accept = trans, start, accept

    def accepts(self, s):
        q = self.start
        for ch in s:
            q = self.trans.get((q, ch))
            if q is None:        # symbol outside the alphabet
                return False
        return q in self.accept

dfa = DFA(trans, start, accept)
tests = ['01', '101', '1101', '100', '010', '']
print({t: dfa.accepts(t) for t in tests})
```
Output:
```
{'01': True, '101': True, '1101': True, '100': False, '010': False, '': False}
```
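An NFA can be simulated by tracking the set of all states it could currently be in. Below is a minimal sketch (the class and the state names p0–p2 are our own) that recognizes the same ends-in-“01” language by nondeterministically guessing where the final “01” begins:

```python
class NFA:
    def __init__(self, trans, start, accept):
        self.trans = trans      # maps (state, symbol) -> set of successor states
        self.start = start
        self.accept = accept

    def accepts(self, s):
        current = {self.start}
        for ch in s:
            # Follow every possible transition from every active state
            current = set().union(*(self.trans.get((q, ch), set()) for q in current))
        return bool(current & self.accept)

nfa_trans = {
    ('p0', '0'): {'p0', 'p1'},  # stay, or guess this 0 starts the final "01"
    ('p0', '1'): {'p0'},
    ('p1', '1'): {'p2'},        # the guessed 0 must be followed by the final 1
}
nfa = NFA(nfa_trans, 'p0', {'p2'})
tests = ['01', '101', '1101', '100', '010', '']
print({t: nfa.accepts(t) for t in tests})  # same results as the DFA above
```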
1.4 Basic Syntax: Phrase Structure Trees
Phrase structure trees represent sentence hierarchy via a Context-Free Grammar (CFG).
Define Grammar & Parse
```python
import nltk
from nltk import CFG
from nltk.parse import RecursiveDescentParser

grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> V NP
Det -> 'the' | 'a'
N -> 'dog' | 'cat'
V -> 'sees' | 'pets'
""")

parser = RecursiveDescentParser(grammar)
sentence = "the dog sees a cat".split()
trees = list(parser.parse(sentence))
for tree in trees:
    tree.pretty_print()
```
Output: the parser finds exactly one tree, (S (NP (Det the) (N dog)) (VP (V sees) (NP (Det a) (N cat)))), which pretty_print() renders as ASCII art.
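Reading the tree top-down corresponds to a leftmost derivation of the sentence, expanding the leftmost nonterminal at each step:

```
S ⇒ NP VP
  ⇒ Det N VP
  ⇒ the N VP
  ⇒ the dog VP
  ⇒ the dog V NP
  ⇒ the dog sees NP
  ⇒ the dog sees Det N
  ⇒ the dog sees a N
  ⇒ the dog sees a cat
```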
Save the Tree as a PNG (Matplotlib Only)
To save a parse tree as an image without Graphviz or Tk, lay out the nodes and draw them with matplotlib:

```python
# tree_save_no_graphviz.ipynb
import os
import matplotlib.pyplot as plt
from nltk import Tree

def save_tree_png(tree: Tree, filename: str):
    """Render an nltk.Tree to PNG using matplotlib only."""
    # 1) Compute (x, y) positions: leaves get consecutive x slots, each
    #    parent is centred over its children, and depth sets y. Positions
    #    are keyed by id(node), so node objects must be distinct.
    x_coords, y_coords = {}, {}
    leaf_x = [0]  # mutable counter for the next free leaf slot

    def _layout(node, depth=0):
        if isinstance(node, Tree) and node:
            child_xs = [_layout(child, depth + 1) for child in node]
            x = sum(child_xs) / len(child_xs)
        else:
            x = leaf_x[0]
            leaf_x[0] += 1
        x_coords[id(node)], y_coords[id(node)] = x, -depth
        return x

    _layout(tree)

    # 2) Draw labelled boxes and parent-child edges
    fig, ax = plt.subplots(figsize=(8, 6))

    def _draw(node):
        x, y = x_coords[id(node)], y_coords[id(node)]
        label = node.label() if isinstance(node, Tree) else str(node)
        ax.text(x, y, label,
                ha='center', va='center',
                bbox=dict(boxstyle='round,pad=0.3', fc='white', ec='black'))
        if isinstance(node, Tree):
            for child in node:
                cx, cy = x_coords[id(child)], y_coords[id(child)]
                ax.plot([x, cx], [y, cy], '-', color='black')
                _draw(child)

    _draw(tree)
    ax.axis('off')
    plt.tight_layout()

    # 3) Ensure the output directory exists
    out_dir = os.path.dirname(filename)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # 4) Save and close the figure
    fig.savefig(filename, dpi=300)
    plt.close(fig)
    print(f"Saved tree PNG to {filename!r}")

# --- Usage ---
# Assuming you already have a list of parsed trees in `trees`:
save_tree_png(trees[0], "images/module1_4_tree.png")
```