
Module 1: Linguistic & Computational Foundations

This module lays the groundwork for all NLP tasks by introducing the basic linguistic units and the simplest computational models for processing text.


1.1 Morphology: Stems & Affixes

Learn how words are built from stems and affixes, and how to normalize them via stemming or lemmatization.
Link: 1.1 Morphology: Stems & Affixes

1.2 Tokenization: Regex & Rule-based Methods

Split raw text into tokens (words, punctuation, URLs, etc.) using regular expressions and simple rules.
Link: 1.2 Tokenization: Regex & Rule-based Methods

1.3 Finite-State Automata: Concept & Implementation

Model tokenizers and simple pattern matchers as state machines; implement DFAs/NFAs in Python.
Link: 1.3 Finite-State Automata: Concept & Implementation

1.4 Basic Syntax: Phrase Structure Trees

Represent sentence structure with context-free grammars and parse trees (e.g. via NLTK).
Link: 1.4 Basic Syntax: Phrase Structure Trees



1.1 Morphology: Stems & Affixes

Morphology studies how words are built from stems (core lexical units) and affixes (prefixes, suffixes). Normalizing words to stems or lemmas simplifies many NLP tasks.

Rule-Based Stemmer

def simple_stemmer(word):
    """Strip common suffixes if word length remains ≥ 3."""
    for suffix in ('ing','ly','ed','ness','s'):
        if word.endswith(suffix) and len(word)-len(suffix) >= 3:
            return word[:-len(suffix)]
    return word

words = ['running','happily','tested','kindness','cats','play']
for w in words:
    print(f"{w:10} → {simple_stemmer(w)}")

Output:

running    → runn
happily    → happi
tested     → test
kindness   → kind
cats       → cat
play       → play

Note the over-stemmed "runn": naive suffix stripping often yields non-words, which is what the library stemmers below improve on.

NLTK Stemmer & Lemmatizer

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

words = ['running','happily','tested','kindness','cats','play']
porter = PorterStemmer()
snow  = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

print("Word      Porter   Snowball  Lemma")
for w in words:
    lemma = lemmatizer.lemmatize(w, pos='v')
    print(f"{w:10}{porter.stem(w):10}{snow.stem(w):10}{lemma:10}")

Output:

running   run       run       run
happily   happili   happili   happily
tested    test      test      test
kindness  kind      kind      kindness
cats      cat       cat       cat
play      play      play      play
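
POS-Aware Lemmatization

The pos_tag and word_tokenize imports above are what make lemmatization accurate in context: WordNetLemmatizer treats every word as a noun unless told otherwise. A minimal sketch of the usual tag-mapping trick (the wn_pos helper and the sample sentence are illustrative, not NLTK API):

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def wn_pos(penn_tag):
    """Map a Penn Treebank tag (e.g. 'VBD') to a WordNet POS constant."""
    return {'J': wordnet.ADJ, 'V': wordnet.VERB,
            'N': wordnet.NOUN, 'R': wordnet.ADV}.get(penn_tag[0], wordnet.NOUN)

tokens = word_tokenize("The cats were running happily")
for word, tag in pos_tag(tokens):
    print(f"{word:10}{tag:6}{lemmatizer.lemmatize(word, wn_pos(tag))}")
# e.g. cats/NNS → cat, were/VBD → be, running/VBG → run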

1.2 Tokenization: Regex & Rule-based Methods

Tokenization splits raw text into units (“tokens”) such as words, punctuation, URLs.

Simple Regex Tokenizer

import re

def simple_tokenize(text):
    return re.findall(r"\w+|[^\w\s]", text)

text = "Hello, NLP world! Let's tokenize: words, punctuation..."
print(simple_tokenize(text))

Output:

['Hello', ',', 'NLP', 'world', '!', 'Let', "'", 's', 'tokenize', ':', 'words', ',', 'punctuation', '.', '.', '.']

Enhanced Regex for URLs & Contractions

pattern = r"""
  https?://\S+\w             # URLs (stop at the last word character)
  |@\w+                      # mentions
  |#\w+                      # hashtags
  |[A-Za-z]+(?:'[A-Za-z]+)?  # words with apostrophes
  |\d+(?:\.\d+)?             # numbers
  |[^\w\s]                   # other characters
"""
tokenizer = re.compile(pattern, re.VERBOSE)
text = "Visit https://ex.com, it's #NLP @user!"
print(tokenizer.findall(text))

Output:

['Visit', 'https://ex.com', ',', "it's", '#NLP', '@user', '!']

The trailing \w in the URL alternative matters: a bare \S+ would greedily swallow the comma after the URL.

NLTK Tokenizer

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

print(word_tokenize("Dr. Smith isn't here. He'll come soon."))

Output:

['Dr.', 'Smith', 'is', "n't", 'here', '.', 'He', "'ll", 'come', 'soon', '.']
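
NLTK Sentence Tokenizer

word_tokenize first runs Punkt sentence splitting, which is why "Dr." above is treated as an abbreviation rather than a sentence boundary. A quick check with sent_tokenize (expected output shown as a comment):

from nltk.tokenize import sent_tokenize

print(sent_tokenize("Dr. Smith isn't here. He'll come soon."))
# ["Dr. Smith isn't here.", "He'll come soon."]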

1.3 Finite-State Automata: Concept & Implementation

Finite-State Automata (FSA) are state machines that accept or reject sequences of symbols.

DFA for Strings Ending in “01”

states = {'q0','q1','q2'}      # q0 = start, q1 = "just saw 0", q2 = "just saw 01" (accept)
alphabet = {'0','1'}
start, accept = 'q0', {'q2'}
trans = {
    ('q0','0'):'q1', ('q0','1'):'q0',
    ('q1','0'):'q1', ('q1','1'):'q2',
    ('q2','0'):'q1', ('q2','1'):'q0'
}

class DFA:
    def __init__(self, trans, start, accept):
        self.trans, self.start, self.accept = trans, start, accept
    def accepts(self, s):
        q = self.start
        for ch in s:
            q = self.trans.get((q,ch), None)
            if q is None: return False
        return q in self.accept

dfa = DFA(trans, start, accept)
tests = ['01','101','1101','100','010','']
print({t:dfa.accepts(t) for t in tests})

Output:

{'01': True, '101': True, '1101': True, '100': False, '010': False, '': False}
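
NFA via Subset Simulation

The subsection title promises NFAs as well. A nondeterministic machine for the same "ends in 01" language can simply guess where the final 01 begins; the sketch below (class and transition table are illustrative, not from the original page) simulates it by tracking the set of currently reachable states:

class NFA:
    def __init__(self, trans, start, accept):
        # trans maps (state, symbol) -> set of possible next states
        self.trans, self.start, self.accept = trans, start, accept
    def accepts(self, s):
        current = {self.start}          # all states reachable so far
        for ch in s:
            current = {q2 for q in current
                          for q2 in self.trans.get((q, ch), set())}
        return bool(current & self.accept)

ntrans = {
    ('q0','0'): {'q0','q1'},   # stay, or guess this 0 starts the final "01"
    ('q0','1'): {'q0'},
    ('q1','1'): {'q2'},        # the 1 that completes "01"
}
nfa = NFA(ntrans, 'q0', {'q2'})
print({t: nfa.accepts(t) for t in ['01','101','1101','100','010','']})
# {'01': True, '101': True, '1101': True, '100': False, '010': False, '': False}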

1.4 Basic Syntax: Phrase Structure Trees

Phrase structure trees represent sentence hierarchy via a Context-Free Grammar (CFG).

Define Grammar & Parse

import nltk
from nltk import CFG
from nltk.parse import RecursiveDescentParser

grammar = CFG.fromstring("""
  S   -> NP VP
  NP  -> Det N
  VP  -> V NP
  Det -> 'the' | 'a'
  N   -> 'dog' | 'cat'
  V   -> 'sees' | 'pets'
""")
parser = RecursiveDescentParser(grammar)
sentence = "the dog sees a cat".split()
trees = list(parser.parse(sentence))

for tree in trees:
    tree.pretty_print()

Output:

pretty_print() renders the single parse as ASCII art; in bracketed form it is
(S (NP (Det the) (N dog)) (VP (V sees) (NP (Det a) (N cat))))
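
Chart Parser Alternative

RecursiveDescentParser is easy to trace but backtracks heavily and cannot handle left-recursive rules; NLTK's ChartParser accepts the same grammar object and avoids both problems. A minimal sketch reusing the grammar above (the test sentence is illustrative):

from nltk.parse import ChartParser

chart = ChartParser(grammar)
for tree in chart.parse("a cat pets the dog".split()):
    print(tree)
# (S (NP (Det a) (N cat)) (VP (V pets) (NP (Det the) (N dog))))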

Saving the Tree as a PNG (matplotlib only)

The helper below computes a top-down layout for the tree and draws it with matplotlib, so no Graphviz install is needed.

# tree_save_no_graphviz.ipynb

import os
import matplotlib.pyplot as plt
from nltk import Tree

def save_tree_png(tree: Tree, filename: str):
    """
    Render an nltk.Tree to PNG using matplotlib only.
    """
    # 1) Compute (x,y) positions
    x_coords, y_coords = {}, {}
    leaf_x = [0]  # mutable counter

    def _layout(node, depth=0):
        if isinstance(node, Tree) and node:
            child_xs = [_layout(child, depth+1) for child in node]
            x = sum(child_xs) / len(child_xs)
        else:
            # NB: nodes are keyed by id(); equal leaf strings can be interned
            # to the same object, so repeated words may overlap in this sketch
            x = leaf_x[0]
            leaf_x[0] += 1
        x_coords[id(node)], y_coords[id(node)] = x, -depth
        return x

    _layout(tree)

    # 2) Draw with matplotlib
    fig, ax = plt.subplots(figsize=(8, 6))
    def _draw(node):
        x, y = x_coords[id(node)], y_coords[id(node)]
        label = node.label() if isinstance(node, Tree) else str(node)
        ax.text(x, y, label,
                ha='center', va='center',
                bbox=dict(boxstyle='round,pad=0.3', fc='white', ec='black'))
        if isinstance(node, Tree):
            for child in node:
                cx, cy = x_coords[id(child)], y_coords[id(child)]
                ax.plot([x, cx], [y, cy], '-', color='black')
                _draw(child)

    _draw(tree)
    ax.axis('off')
    plt.tight_layout()

    # 3) Ensure output dir exists
    out_dir = os.path.dirname(filename)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # 4) Save and close
    fig.savefig(filename, dpi=300)
    plt.close(fig)
    print(f"Saved tree PNG to {filename!r}")

# --- Usage ---
# Assuming you already have a list of parsed trees in `trees`:
save_tree_png(trees[0], "images/module1_4_tree.png")

(The saved PNG shows the same parse tree, now rendered by the matplotlib helper.)