# Module 4.4: Conditional Random Fields for Sequence Labeling
## Key Concepts

- **Discriminative vs. generative**: a generative tagger such as an HMM models the joint probability $P(x, y)$ of words and tags, which forces strong independence assumptions about the input. A CRF is discriminative: it models the conditional probability $P(y \mid x)$ directly, so it can use rich, overlapping features of the whole sentence.
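Concretely, a linear-chain CRF (the standard variant for sequence labeling) scores an entire tag sequence $y = (y_1, \dots, y_T)$ given the word sequence $x$:

$$
P(y \mid x) \;=\; \frac{1}{Z(x)} \exp\!\Bigg(\sum_{t=1}^{T} \sum_{k} \lambda_k\, f_k(y_{t-1}, y_t, x, t)\Bigg)
$$

where each $f_k$ is a feature function (such as the token features extracted below), $\lambda_k$ is its learned weight, and $Z(x)$ is a normalization constant that sums over all possible tag sequences.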
## 1. Installing the CRF Library

```bash
pip install sklearn-crfsuite
```
## 2. Defining Feature Extraction

Features for each token can include the word itself, prefixes/suffixes, word shape, and neighboring words.

The three functions used below (`word2features`, `sent2features`, and `sent2labels`, defined in full in Section 3) only *define* how to extract features and labels; they don't print anything until they are *called*. To see their output, apply them to a sample sentence and print the results. For example:
```python
# 1. Feature extraction on a sample sentence
# word2features reads sent[i][0], so it expects (word, tag) pairs;
# attach dummy tags to a plain list of words first
sent_words = [(w, 'O') for w in ["John", "lives", "in", "London"]]
features = sent2features(sent_words)
for i, feat in enumerate(features):
    print(f"Token {i} features:", feat)

# Expected output (approximate):
# Token 0 features: {'bias': 1.0, 'word.lower()': 'john', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'BOS': True, '+1:word.lower()': 'lives', ...}
# Token 1 features: {'bias': 1.0, 'word.lower()': 'lives', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'john', '+1:word.lower()': 'in', ...}
# …and so on for each token
# 2. Label extraction on a list of (word, tag) pairs
sent_labeled = [("John", "B-PER"), ("lives", "O"), ("in", "O"), ("London", "B-LOC")]
labels = sent2labels(sent_labeled)
print("Labels:", labels)
# Expected output:
# Labels: ['B-PER', 'O', 'O', 'B-LOC']
```

Output:

```
Token 0 features: {'bias': 1.0, 'word.lower()': 'john', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'BOS': True, '+1:word.lower()': 'lives', '+1:word.istitle()': False, '+1:word.isupper()': False}
Token 1 features: {'bias': 1.0, 'word.lower()': 'lives', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'john', '-1:word.istitle()': True, '-1:word.isupper()': False, '+1:word.lower()': 'in', '+1:word.istitle()': False, '+1:word.isupper()': False}
Token 2 features: {'bias': 1.0, 'word.lower()': 'in', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'lives', '-1:word.istitle()': False, '-1:word.isupper()': False, '+1:word.lower()': 'london', '+1:word.istitle()': True, '+1:word.isupper()': False}
Token 3 features: {'bias': 1.0, 'word.lower()': 'london', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, '-1:word.lower()': 'in', '-1:word.istitle()': False, '-1:word.isupper()': False, 'EOS': True}
Labels: ['B-PER', 'O', 'O', 'B-LOC']
```
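The extractor above uses the word itself, casing flags, and neighboring words, but not the prefix/suffix and word-shape features mentioned at the start of this section. Here is a minimal sketch of how they could be layered on top of `word2features` (the key names and the `_shape` helper are illustrative choices, not part of sklearn-crfsuite):

```python
import re

def _shape(word):
    """Coarse word shape, e.g. 'London' -> 'Xxxxxx', 'B-52' -> 'X-dd'."""
    shape = re.sub(r'[A-Z]', 'X', word)
    shape = re.sub(r'[a-z]', 'x', shape)
    return re.sub(r'[0-9]', 'd', shape)

def word2features_extended(sent, i):
    word = sent[i][0]
    features = word2features(sent, i)  # base features from Section 3
    features.update({
        'word[:2]':  word[:2],    # prefix: first two characters
        'word[-3:]': word[-3:],   # suffix: last three characters
        'word.shape': _shape(word),
    })
    return features
```

Suffix features are especially useful for morphologically regular tags (e.g. *-ing*, *-tion*), while the shape feature generalizes capitalization and digit patterns beyond the exact words seen in training.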
## 3. Preparing a Toy Dataset

```python
# ner_toy_dataset_demo.ipynb
import pprint

# 1. Toy dataset: each sentence is a list of (word, tag) tuples
train_sents = [
    [('John', 'B-PER'), ('lives', 'O'), ('in', 'O'), ('London', 'B-LOC')],
    [('Mary', 'B-PER'), ('works', 'O'), ('at', 'O'), ('Google', 'B-ORG')],
]
test_sents = [
    [('Alice', 'B-PER'), ('visited', 'O'), ('Paris', 'B-LOC')],
]

# 2. Feature extraction functions
def word2features(sent, i):
    """
    Extract features for the token at index i in sentence sent.
    sent: list of (word, tag) tuples
    """
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    # previous word features
    if i > 0:
        prev_word = sent[i - 1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        features['BOS'] = True  # beginning of sentence
    # next word features
    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        features['EOS'] = True  # end of sentence
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [tag for (_word, tag) in sent]

# 3. Prepare feature and label sets
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

# 4. Print a few examples
pp = pprint.PrettyPrinter(indent=2)
print("=== Training example 0 ===")
print("Input features (per token):")
pp.pprint(X_train[0])
print("Target labels:")
pp.pprint(y_train[0])

print("\n=== Training example 1 ===")
print("Input features (per token):")
pp.pprint(X_train[1])
print("Target labels:")
pp.pprint(y_train[1])

print("\n=== Test example 0 ===")
print("Input features (per token):")
pp.pprint(X_test[0])
print("Target labels:")
pp.pprint(y_test[0])
```
Output:

```
=== Training example 0 ===
Input features (per token):
[ { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'lives',
    'BOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'john'},
  { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'in',
    '-1:word.istitle()': True,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'john',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'lives'},
  { '+1:word.istitle()': True,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'london',
    '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'lives',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'in'},
  { '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'in',
    'EOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'london'}]
Target labels:
['B-PER', 'O', 'O', 'B-LOC']

=== Training example 1 ===
Input features (per token):
[ { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'works',
    'BOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'mary'},
  { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'at',
    '-1:word.istitle()': True,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'mary',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'works'},
  { '+1:word.istitle()': True,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'google',
    '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'works',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'at'},
  { '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'at',
    'EOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'google'}]
Target labels:
['B-PER', 'O', 'O', 'B-ORG']

=== Test example 0 ===
Input features (per token):
[ { '+1:word.istitle()': False,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'visited',
    'BOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'alice'},
  { '+1:word.istitle()': True,
    '+1:word.isupper()': False,
    '+1:word.lower()': 'paris',
    '-1:word.istitle()': True,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'alice',
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': False,
    'word.isupper()': False,
    'word.lower()': 'visited'},
  { '-1:word.istitle()': False,
    '-1:word.isupper()': False,
    '-1:word.lower()': 'visited',
    'EOS': True,
    'bias': 1.0,
    'word.isdigit()': False,
    'word.istitle()': True,
    'word.isupper()': False,
    'word.lower()': 'paris'}]
Target labels:
['B-PER', 'O', 'B-LOC']
```
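Before training, it is worth confirming the data has the shape `sklearn-crfsuite` expects: `X` is a list of sentences, each a list of per-token feature dicts, and `y` is a parallel list of label lists. A quick sanity check (hypothetical, not part of the notebook above):

```python
# Each sentence must have exactly one feature dict per label
assert len(X_train) == len(y_train)
for xs, ys in zip(X_train, y_train):
    assert len(xs) == len(ys)
    assert all(isinstance(d, dict) for d in xs)

print("Tokens per training sentence:", [len(xs) for xs in X_train])  # [4, 4]
print("Tokens per test sentence:", [len(xs) for xs in X_test])       # [3]
```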
## 4. Training and Prediction

```python
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,                         # L1 penalty
    c2=0.1,                         # L2 penalty
    max_iterations=100,
    all_possible_transitions=True,  # also learn transitions unseen in training
)
crf.fit(X_train, y_train)

# Predict tags for the test sentence
y_pred = crf.predict(X_test)
print("Input Sentence:", [w for w, _ in test_sents[0]])
print("Predicted Tags:", y_pred[0])
```
Output:

```
Input Sentence: ['Alice', 'visited', 'Paris']
Predicted Tags: ['B-PER', 'O', 'B-ORG']
```

Note the mistake on the last token: the gold label for *Paris* is `B-LOC`, but the model has never seen *Paris* (or the word *visited*) in its two training sentences, so it misclassifies it as `B-ORG`. Errors like this shrink as the training set grows.
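To understand a prediction like this, it helps to inspect what the model actually learned. A short optional sketch using the fitted `crf` object from above; `transition_features_` and `state_features_` are attributes that `sklearn_crfsuite.CRF` exposes after fitting:

```python
from collections import Counter

# Learned transition weights between adjacent tags
print("Transitions (from -> to : weight):")
for (tag_from, tag_to), weight in Counter(crf.transition_features_).most_common():
    print(f"  {tag_from} -> {tag_to} : {weight:.3f}")

# Strongest associations between input features and tags
print("\nTop state features (feature / tag : weight):")
for (attr, tag), weight in Counter(crf.state_features_).most_common(10):
    print(f"  {attr} / {tag} : {weight:.3f}")
```

On a toy corpus like this, expect high weights on features such as `word.istitle()` paired with entity tags: the model leans heavily on capitalization, which is exactly why an unseen title-case word can end up with the wrong entity type.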
Continue to Module 4.5: Evaluation Metrics – Precision, Recall & F₁-Score