Module 1.4 Phrase Structure Trees - iffatAGheyas/NLP-handbook GitHub Wiki

Module 1.4: Basic Syntax – Phrase Structure Trees

Syntax studies how words combine into phrases and sentences. A phrase structure tree visually represents the hierarchical structure that a Context-Free Grammar (CFG) assigns to a sentence.


Key Concepts

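  • Context-Free Grammar (CFG)
    A 4-tuple G = (N, Σ, R, S): N is a set of non-terminal symbols, Σ is a set of terminal symbols (the words), R is a set of production rules, and S ∈ N is the start symbol.
    image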

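  • Phrase Structure Rules
    Productions of the form A → α, where A is a single non-terminal and α is a sequence of terminals and/or non-terminals; for example, S → NP VP.
    image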

  • Parse Tree
    A tree showing expansion from the start symbol to terminals via CFG rules.

1. Defining a Grammar and Parsing

import nltk
from nltk import CFG
from nltk.parse import RecursiveDescentParser

# Toy grammar: every sentence is a noun phrase followed by a verb phrase,
# and each pre-terminal (Det, N, V) rewrites to one of two quoted words.
grammar = CFG.fromstring("""
S   -> NP VP
NP  -> Det N
VP  -> V NP
Det -> 'the' | 'a'
N   -> 'dog' | 'cat'
V   -> 'sees' | 'pets'
""")

# Whitespace-tokenise the input, then collect every parse the parser yields.
sentence = "the dog sees a cat".split()
parser = RecursiveDescentParser(grammar)
trees = [*parser.parse(sentence)]

# Print each parse as an ASCII-art tree.
for tree in trees:
    tree.pretty_print()

image

2. Exporting a Graphical Parse Tree

# tree_matplotlib_demo.ipynb

import os
import matplotlib.pyplot as plt
from nltk import Tree

def save_tree_png_no_graphviz(
    tree: Tree,
    filename: str,
    *,
    figsize=(8, 6),
    dpi=300,
) -> None:
    """
    Render an nltk.Tree to PNG using matplotlib only.

    Leaves are placed left to right at consecutive integer x positions, each
    interior node is centred over the mean x of its children, and a node at
    depth d is drawn at y = -d (the root is at y = 0).

    Args:
        tree: The tree to draw. Anything that is not a ``Tree`` instance is
            treated as a leaf and labelled with ``str()`` of itself.
        filename: Output path. Its parent directory is created if missing.
        figsize: Figure size in inches, passed to ``plt.subplots``.
        dpi: Resolution passed to ``fig.savefig``.
    """
    # 1. Compute (x, y) for each node, keyed by the node's path from the root
    #    (a tuple of child indices). Paths are unique per position, whereas
    #    id() is not: leaves are plain strings, and two equal leaves may be
    #    the very same object, which would make them share one coordinate.
    positions = {}
    next_leaf_x = 0

    def _layout(node, path=(), depth=0):
        """Recursively assign coords; returns the x-position of node."""
        nonlocal next_leaf_x
        if isinstance(node, Tree) and len(node) > 0:
            # Interior node: lay out the children first, then centre on them.
            child_xs = [
                _layout(child, path + (i,), depth + 1)
                for i, child in enumerate(node)
            ]
            x = sum(child_xs) / len(child_xs)
        else:
            # Leaf (or childless Tree): take the next free column.
            x = next_leaf_x
            next_leaf_x += 1

        positions[path] = (x, -depth)
        return x

    _layout(tree)

    # 2. Draw with matplotlib
    fig, ax = plt.subplots(figsize=figsize)
    try:
        def _draw(node, path=()):
            x, y = positions[path]

            # Node label; str() so non-string labels/leaves render as text.
            label = node.label() if isinstance(node, Tree) else node
            ax.text(x, y, str(label),
                    ha='center', va='center',
                    bbox=dict(boxstyle='round,pad=0.3',
                              fc='white', ec='black', lw=1))

            # Edges to children
            if isinstance(node, Tree):
                for i, child in enumerate(node):
                    child_path = path + (i,)
                    cx, cy = positions[child_path]
                    ax.plot([x, cx], [y, cy], '-', color='black')
                    _draw(child, child_path)

        _draw(tree)

        ax.axis('off')
        plt.tight_layout()

        # 3. Ensure output dir exists (exist_ok avoids a check-then-create race)
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # 4. Save
        fig.savefig(filename, dpi=dpi)
    finally:
        # Always release the figure, even if drawing or saving failed;
        # otherwise pyplot keeps it registered.
        plt.close(fig)

    print(f"Saved tree PNG to {filename!r}")

# --- Usage (assuming you have `trees` from your parser) ---
# `trees` is an empty list when the parser found no parse for the sentence,
# so check before indexing instead of failing with a bare IndexError.
if trees:
    save_tree_png_no_graphviz(trees[0], "images/module1_4_tree.png")
else:
    print("No parse tree to save: the parser returned no parses.")

image

Continue to Module 2: Probability and Statistics for NLP