Module 7: Language Modeling & Text Generation
This module demonstrates how to leverage pretrained causal language models for generation, explore sampling strategies, fine-tune on a small corpus, and evaluate via perplexity.
7.1 Loading & Sampling from a Pretrained LM
from transformers import AutoTokenizer, AutoModelForCausalLM
# 1. Load DistilGPT-2 (small, CPU-friendly)
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# 2. Generation helper
def generate_text(prompt, max_new_tokens=50, **gen_kwargs):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# 3. Demo
prompt = "In a world where machines dream,"
print(generate_text(prompt))
Output:
In a world where machines dream, we need to be able to do things that are not possible.
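Because generate_text forwards **gen_kwargs straight to model.generate, decoding flags can be passed through without modifying the helper. A minimal sketch (the flag values are illustrative, and sampled output varies from run to run):
# Pass sampling flags through the helper via **gen_kwargs
print(generate_text(prompt, do_sample=True, top_k=50, temperature=0.8))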
7.2 Sampling Strategies
# Make sure you have transformers installed:
# %pip install --quiet transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# 1. Load model & tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# 2. Build a generation pipeline (CPU)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1  # use CPU
)
# 3. A small helper that passes all flags through
def generate_text(prompt: str,
                  do_sample: bool = False,
                  top_k: int = None,
                  top_p: float = None,
                  max_new_tokens: int = 50) -> str:
    out = generator(
        prompt,
        do_sample=do_sample,
        top_k=top_k,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id
    )
    # out is a list of dicts; grab the generated text
    return out[0]["generated_text"]
# 4. Your prompt
prompt = "In a world where machines dream,"
# 5. Three decodings
print("Greedy :", generate_text(prompt, do_sample=False))
print("Top-k :", generate_text(prompt, do_sample=True, top_k=50))
print("Top-p :", generate_text(prompt, do_sample=True, top_p=0.9))
Output:
Greedy : In a world where machines dream, we need to be able to do things that are not possible in the real world.
The world is not a simulation. It is a reality.
The world is not a simulation. It is a reality.
The world is
Top-k : In a world where machines dream, we can find ourselves in a world where machines are always real.
It's easy to see why.
The world where machines dream has the potential to be a real world, where we can't escape our dreams, and where the future
Top-p : In a world where machines dream, and where we can live in a world where we can dream, we have to work with a new type of dream.
Think about what happens when you combine dreams with reality. When you're in a dream, you can see your vision and
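Because top-k and top-p decoding sample from a probability distribution, their output changes on every run. For a reproducible demo, one option (a sketch, not part of the original notebook) is to fix the random seed with transformers.set_seed before generating:
from transformers import set_seed

set_seed(42)   # fixes the Python, NumPy and torch RNGs used during sampling
print("Top-k :", generate_text(prompt, do_sample=True, top_k=50))

set_seed(42)   # resetting the seed reproduces the same sample
print("Top-k :", generate_text(prompt, do_sample=True, top_k=50))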
7.3 Fine-tuning on a Toy Corpus
# 1. Install required libraries (only needs to run once per kernel)
%pip install --quiet transformers datasets
# 2. Imports
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
# 3. Load model & tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# 4. Ensure we have a pad token (GPT-2 has none by default)
# We’ll just alias pad_token to the existing eos_token
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))
# 5. Prepare tiny dataset
texts = [
    "Hello world.",
    "NLP is fascinating.",
    "Transformers power modern AI.",
    "Text generation is creative."
]
ds = Dataset.from_dict({"text": texts})
# 6. Tokenization + label setup
def tokenize_fn(batch):
    enc = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=32
    )
    # For causal-LM fine-tuning, labels are just the input_ids.
    # Note: because pad_token == eos_token, padded positions also count towards the
    # loss here; that is fine for this toy demo, but a real run would typically set
    # padded label positions to -100 so they are ignored.
    enc["labels"] = enc["input_ids"].copy()
    return enc

tok_ds = ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)
tok_ds.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)
# 7. Training arguments
args = TrainingArguments(
    output_dir="./finetuned",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=5,
    no_cuda=True,        # force CPU
    save_strategy="no"   # don’t write checkpoints for this tiny demo
)
# 8. Trainer and fine-tune
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tok_ds
)
trainer.train()
Output:
TrainOutput(global_step=2, training_loss=7.136963844299316, metrics={'train_runtime': 2.4136, 'train_samples_per_second': 1.657, 'train_steps_per_second': 0.829, 'total_flos': 65323008000.0, 'train_loss': 7.136963844299316, 'epoch': 1.0})
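A quick sanity check (not part of the original notebook) is to generate from the freshly fine-tuned model; after one epoch on four sentences the output will still look much like base GPT-2, but it confirms the updated weights load and run:
# Generate from the fine-tuned model (reuses `model` and `tokenizer` from above)
model.eval()
inputs = tokenizer("Transformers power", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))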
7.4 Evaluation via Perplexity
# finetune_and_evaluate_legacy.ipynb
# 1. (Re-)install if you need a specific transformers version:
# %pip install --quiet "transformers<4.10.0" datasets
import math
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
# 2. Load GPT-2 and add pad token
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))
# 3. Tiny dataset
texts = [
    "Hello world.",
    "NLP is fascinating.",
    "Transformers power modern AI.",
    "Text generation is creative."
]
ds = Dataset.from_dict({"text": texts})
# 4. Tokenisation + labels
def tokenize_fn(batch):
    enc = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=32
    )
    enc["labels"] = enc["input_ids"].copy()
    return enc

tok_ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
tok_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# 5. TrainingArguments (legacy)
args = TrainingArguments(
    output_dir="./finetuned",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=5,
    no_cuda=True,
    # save_steps=10,  # optionally control checkpointing frequency
    # eval_steps=10,  # no evaluation schedule is set here; we call trainer.evaluate() manually below
)
# 6. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tok_ds,
    eval_dataset=tok_ds  # evaluate on the same tiny set; used by trainer.evaluate() below
)
# 7. Train
trainer.train()
# 8. Evaluate and compute perplexity
eval_results = trainer.evaluate()   # computes eval_loss on the eval_dataset
perplexity = math.exp(eval_results["eval_loss"])
print(f"Eval loss = {eval_results['eval_loss']:.4f}")
print(f"Perplexity = {perplexity:.2f}")
Output:
Eval loss = 4.6768
Perplexity = 107.43
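Perplexity here is simply exp(eval_loss), i.e. the exponential of the mean token-level cross-entropy. As a cross-check, the same quantity can be computed for a single sentence without the Trainer by passing labels directly to the model; a minimal sketch (the sentence is illustrative, and it reuses model, tokenizer and math from above):
import torch

text = "Transformers power modern AI."
enc = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    out = model(**enc, labels=enc["input_ids"])   # the model returns the shifted cross-entropy loss
print(f"Perplexity = {math.exp(out.loss.item()):.2f}")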
7.5 Prompt Engineering & Use Cases
- Zero-shot classification using prompts (see the sketch at the end of this section)
- Text continuation for storytelling
- Chatbot prototypes built by alternating user and model turns in the prompt
# Example: quick sentiment analysis with a pretrained pipeline (a fine-tuned classifier, not prompt-based zero-shot)
from transformers import pipeline
sentiment = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
print(sentiment("I love NLP tutorials!"))
Output:
[{'label': 'POSITIVE', 'score': 0.9996709823608398}]
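For the prompt-style zero-shot classification mentioned in the first bullet, the transformers zero-shot-classification pipeline scores arbitrary candidate labels without any task-specific fine-tuning. A minimal sketch (the facebook/bart-large-mnli checkpoint and the label set are illustrative choices):
from transformers import pipeline

# Zero-shot classification: candidate labels are scored by an NLI model
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = zero_shot("I love NLP tutorials!", candidate_labels=["positive", "negative"])
print(result["labels"][0], round(result["scores"][0], 3))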