Module 4.4 Conditional Random Fields for Sequence Labeling - iffatAGheyas/NLP-handbook GitHub Wiki

Module 4.4: Conditional Random Fields for Sequence Labeling

image


Key Concepts

  • Discriminative vs. Generative
    image image

1. Installing the CRF Library

pip install sklearn-crfsuite

2. Defining Feature Extraction

Features for each token can include the word itself, prefixes/suffixes, word shape, and neighboring words.

The three functions shown only *define* how to extract features and labels—they don’t print anything until they are *called*. To see their output, apply them to a sample sentence and print the results. For example:

```python
# 1. Feature extraction on a list of words
# 1. Feature extraction on a tokenized sentence.
#    NOTE: word2features reads sent[i][0], so every token must be a
#    (word, tag) pair — passing bare strings would silently extract
#    features of the FIRST CHARACTER of each word ('J', 'l', ...).
sent_words = [("John", "O"), ("lives", "O"), ("in", "O"), ("London", "O")]
features = sent2features(sent_words)
for i, feat in enumerate(features):
    print(f"Token {i} features:", feat)

# Expected output (approximate):
# Token 0 features: {'bias': 1.0, 'word.lower()': 'john', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'BOS': True, '+1:word.lower()': 'lives', ...}
# Token 1 features: {'bias': 1.0, 'word.lower()': 'lives', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'john', '+1:word.lower()': 'in', ...}
# …and so on for each token (interior tokens carry no 'BOS'/'EOS' key at all)

# 2. Label extraction on a list of (word, tag) pairs
sent_labeled = [("John", "B-PER"), ("lives", "O"), ("in", "O"), ("London", "B-LOC")]
labels = sent2labels(sent_labeled)
print("Labels:", labels)
# Expected output:
# Labels: ['B-PER', 'O', 'O', 'B-LOC']

```

Output:

Token 0 features: {'bias': 1.0, 'word.lower()': 'john', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'BOS': True, '+1:word.lower()': 'lives', '+1:word.istitle()': False, '+1:word.isupper()': False}
Token 1 features: {'bias': 1.0, 'word.lower()': 'lives', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'john', '-1:word.istitle()': True, '-1:word.isupper()': False, '+1:word.lower()': 'in', '+1:word.istitle()': False, '+1:word.isupper()': False}
Token 2 features: {'bias': 1.0, 'word.lower()': 'in', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'lives', '-1:word.istitle()': False, '-1:word.isupper()': False, '+1:word.lower()': 'london', '+1:word.istitle()': True, '+1:word.isupper()': False}
Token 3 features: {'bias': 1.0, 'word.lower()': 'london', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, '-1:word.lower()': 'in', '-1:word.istitle()': False, '-1:word.isupper()': False, 'EOS': True}
Labels: ['B-PER', 'O', 'O', 'B-LOC']

3. Preparing a Toy Dataset

# ner_toy_dataset_demo.ipynb

import pprint

# 1. Toy dataset: each sentence is a list of (word, tag) tuples
#    Tags use the BIO scheme: 'B-PER'/'B-LOC'/'B-ORG' mark the start of a
#    person/location/organization entity; 'O' marks tokens outside any entity.
train_sents = [
    [('John', 'B-PER'), ('lives', 'O'), ('in', 'O'), ('London', 'B-LOC')],
    [('Mary', 'B-PER'), ('works', 'O'), ('at', 'O'), ('Google', 'B-ORG')],
]
# Held-out sentence used later to evaluate the trained tagger.
test_sents = [
    [('Alice', 'B-PER'), ('visited', 'O'), ('Paris', 'B-LOC')],
]

# 2. Feature extraction functions
def word2features(sent, i):
    """
    Build the CRF feature dictionary for the token at position i.

    sent: list of (word, tag) tuples
    i: index of the token to featurize

    Returns a dict of feature-name -> value. Boundary tokens get a
    'BOS' or 'EOS' marker in place of the missing neighbor features.
    """
    token = sent[i][0]
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left-neighbor features, or a beginning-of-sentence marker.
    if i == 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1][0]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right-neighbor features, or an end-of-sentence marker.
    if i == len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1][0]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats

def sent2features(sent):
    """Return the feature dict for every token position in sent."""
    return [word2features(sent, idx) for idx, _ in enumerate(sent)]

def sent2labels(sent):
    """Return just the tag from each (word, tag) pair in sent."""
    return [pair[1] for pair in sent]

# 3. Convert each sentence into parallel feature/label sequences.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

# 4. Pretty-print a few examples (same output as three hand-written stanzas).
pp = pprint.PrettyPrinter(indent=2)

_examples = [
    ("Training example 0", X_train[0], y_train[0]),
    ("Training example 1", X_train[1], y_train[1]),
    ("Test example 0", X_test[0], y_test[0]),
]
for _idx, (_title, _feats, _labels) in enumerate(_examples):
    # A leading newline separates every section after the first.
    _prefix = "\n" if _idx else ""
    print(f"{_prefix}=== {_title} ===")
    print("Input features (per token):")
    pp.pprint(_feats)
    print("Target labels:")
    pp.pprint(_labels)

Output

=== Training example 0 ===
Input features (per token):
[ { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'lives',
    'BOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'john'},
  { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'in',
    '-1:word.istitle()': True,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'john',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'lives'},
  { '+1:word.istitle()': True,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'london',
    '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'lives',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'in'},
  { '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'in',
    'EOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'london'}]
Target labels:
['B-PER', 'O', 'O', 'B-LOC']

=== Training example 1 ===
Input features (per token):
[ { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'works',
    'BOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'mary'},
  { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'at',
    '-1:word.istitle()': True,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'mary',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'works'},
  { '+1:word.istitle()': True,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'google',
    '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'works',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'at'},
  { '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'at',
    'EOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'google'}]
Target labels:
['B-PER', 'O', 'O', 'B-ORG']

=== Test example 0 ===
Input features (per token):
[ { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'visited',
    'BOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'alice'},
  { '+1:word.istitle()': True,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'paris',
    '-1:word.istitle()': True,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'alice',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'visited'},
  { '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'visited',
    'EOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'paris'}]
Target labels:
['B-PER', 'O', 'B-LOC']

4. Training and Prediction

import sklearn_crfsuite

# Linear-chain CRF trained with L-BFGS and elastic-net regularisation
# (c1 = L1 strength, c2 = L2 strength).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,            # L1 penalty
    c2=0.1,            # L2 penalty
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Tag the held-out sentence and show the words next to the predictions.
y_pred = crf.predict(X_test)
words = [token for token, _tag in test_sents[0]]
print("Input Sentence:", words)
print("Predicted Tags:", y_pred[0])

Output:

Input Sentence: ['Alice', 'visited', 'Paris']
Predicted Tags: ['B-PER', 'O', 'B-ORG']

Continue to Module 4.5: Evaluation Metrics – Precision, Recall & F₁-Score