
Module 7: Language Modeling & Text Generation

This module demonstrates how to leverage pretrained causal language models for generation, explore sampling strategies, fine-tune on a small corpus, and evaluate via perplexity.


7.1 Loading & Sampling from a Pretrained LM

from transformers import AutoTokenizer, AutoModelForCausalLM

# 1. Load DistilGPT-2 (small, CPU-friendly)
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model     = AutoModelForCausalLM.from_pretrained("distilgpt2")

# 2. Generation helper
def generate_text(prompt, max_new_tokens=50, **gen_kwargs):
    inputs  = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 3. Demo
prompt = "In a world where machines dream,"
print(generate_text(prompt))

Output:

In a world where machines dream, we need to be able to do things that are not possible.
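
With no sampling flags, generate decodes greedily, so the prompt above always yields the same continuation. Below is a minimal sketch, assuming the tokenizer, model and generate_text helper defined above, that turns on sampling and fixes the random seed so the more varied output stays reproducible.

from transformers import set_seed

set_seed(42)  # make the sampled continuation repeatable across runs
print(generate_text(prompt,
                    do_sample=True,     # sample from the distribution instead of taking the argmax
                    temperature=0.8))   # values below 1.0 slightly sharpen the distribution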

7.2 Sampling Strategies

# Make sure you have transformers installed:
# %pip install --quiet transformers

from typing import Optional
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# 1. Load model & tokenizer
model_name = "gpt2"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForCausalLM.from_pretrained(model_name)

# 2. Build a generation pipeline (CPU)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1           # use CPU
)

# 3. A small helper that passes all flags through
def generate_text(prompt: str,
                  do_sample: bool = False,
                  top_k: Optional[int] = None,
                  top_p: Optional[float] = None,
                  max_new_tokens: int = 50) -> str:
    out = generator(
        prompt,
        do_sample=do_sample,
        top_k=top_k,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id
    )
    # out is a list of dicts; grab the generated text
    return out[0]["generated_text"]

# 4. Your prompt
prompt = "In a world where machines dream,"

# 5. Three decodings
print("Greedy  :", generate_text(prompt, do_sample=False))
print("Top-k    :", generate_text(prompt, do_sample=True,  top_k=50))
print("Top-p    :", generate_text(prompt, do_sample=True,  top_p=0.9))

Output:

Greedy  : In a world where machines dream, we need to be able to do things that are not possible in the real world.

The world is not a simulation. It is a reality.

The world is not a simulation. It is a reality.

The world is
Top-k    : In a world where machines dream, we can find ourselves in a world where machines are always real.

It's easy to see why.

The world where machines dream has the potential to be a real world, where we can't escape our dreams, and where the future
Top-p    : In a world where machines dream, and where we can live in a world where we can dream, we have to work with a new type of dream.

Think about what happens when you combine dreams with reality. When you're in a dream, you can see your vision and
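
Notice that the greedy continuation loops ("The world is not a simulation. It is a reality."), a known failure mode of greedy decoding. Beyond top-k and top-p, generate accepts further controls; the sketch below reuses the generator pipeline defined above and tries temperature scaling, an n-gram repetition block, and beam search.

# Sketch of further decoding controls, passed through the same pipeline
sampled = generator(prompt,
                    do_sample=True,
                    temperature=0.7,            # <1.0 = more conservative sampling
                    max_new_tokens=50,
                    pad_token_id=tokenizer.eos_token_id)
beamed = generator(prompt,
                   do_sample=False,
                   num_beams=5,                 # keep 5 candidate continuations
                   no_repeat_ngram_size=2,      # block repeated bigrams
                   max_new_tokens=50,
                   pad_token_id=tokenizer.eos_token_id)
print("Temperature 0.7:", sampled[0]["generated_text"])
print("Beam search    :", beamed[0]["generated_text"])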

7.3 Fine-tuning on a Toy Corpus

# 1. Install required libraries (only needs to run once per kernel)
%pip install --quiet transformers datasets

# 2. Imports
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

# 3. Load model & tokenizer
model_name = "gpt2"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForCausalLM.from_pretrained(model_name)

# 4. Ensure we have a pad token (GPT-2 has none by default).
#    We simply alias pad_token to the existing eos_token; no new tokens are
#    added, so the embedding resize below is a harmless no-op kept for safety.
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# 5. Prepare tiny dataset
texts = [
    "Hello world.",
    "NLP is fascinating.",
    "Transformers power modern AI.",
    "Text generation is creative."
]
ds = Dataset.from_dict({"text": texts})

# 6. Tokenization + label setup
def tokenize_fn(batch):
    enc = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=32
    )
    # For causal-LM fine-tuning, labels are just the input_ids
    # (a real run would set padded positions to -100 so they are
    #  ignored by the loss; we skip that for this tiny demo)
    enc["labels"] = enc["input_ids"].copy()
    return enc

tok_ds = ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)
tok_ds.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

# 7. Training arguments
args = TrainingArguments(
    output_dir="./finetuned",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=5,
    no_cuda=True,            # force CPU
    save_strategy="no"       # don’t write checkpoints for this tiny demo
)

# 8. Trainer and fine-tune
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tok_ds
)
trainer.train()

Output:

TrainOutput(global_step=2, training_loss=7.136963844299316, metrics={'train_runtime': 2.4136, 'train_samples_per_second': 1.657, 'train_steps_per_second': 0.829, 'total_flos': 65323008000.0, 'train_loss': 7.136963844299316, 'epoch': 1.0})
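
After trainer.train() returns, the fine-tuned weights live in trainer.model. The sketch below saves and reloads them and samples from the fine-tuned model; the directory name ./finetuned-demo is just illustrative.

# Sketch: persist the fine-tuned weights and generate from them
trainer.save_model("./finetuned-demo")          # writes model weights + config
tokenizer.save_pretrained("./finetuned-demo")   # keep the tokenizer alongside

from transformers import AutoTokenizer, AutoModelForCausalLM

ft_tokenizer = AutoTokenizer.from_pretrained("./finetuned-demo")
ft_model     = AutoModelForCausalLM.from_pretrained("./finetuned-demo")

inputs  = ft_tokenizer("NLP is", return_tensors="pt")
outputs = ft_model.generate(**inputs, max_new_tokens=20,
                            pad_token_id=ft_tokenizer.eos_token_id)
print(ft_tokenizer.decode(outputs[0], skip_special_tokens=True))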

7.4 Evaluation via Perplexity

# finetune_and_evaluate_legacy.ipynb

# 1. (Re-)install if you need a specific transformers version:
# %pip install --quiet "transformers<4.10.0" datasets

import math
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

# 2. Load GPT-2 and add pad token
model_name = "gpt2"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# 3. Tiny dataset
texts = [
    "Hello world.",
    "NLP is fascinating.",
    "Transformers power modern AI.",
    "Text generation is creative."
]
ds = Dataset.from_dict({"text": texts})

# 4. Tokenisation + labels
def tokenize_fn(batch):
    enc = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=32
    )
    enc["labels"] = enc["input_ids"].copy()
    return enc

tok_ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
tok_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 5. TrainingArguments (legacy)
args = TrainingArguments(
    output_dir="./finetuned",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=5,
    no_cuda=True,
    # save_steps=10,       # optionally control checkpointing frequency
    # eval_steps=10,       # there's no built-in evaluation strategy—see below
)

# 6. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tok_ds,
    eval_dataset=tok_ds   # you can still pass eval_dataset
)

# 7. Train
trainer.train()

# 8. Evaluate and compute perplexity
eval_results = trainer.evaluate()   # evaluates on eval_dataset (here the same toy set)
perplexity = math.exp(eval_results["eval_loss"])
print(f"Eval loss  = {eval_results['eval_loss']:.4f}")
print(f"Perplexity = {perplexity:.2f}")

Output:

Eval loss  = 4.6768
Perplexity = 107.43
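
The perplexity reported above is simply the exponential of the mean cross-entropy loss, i.e. perplexity = exp(eval_loss). The same quantity can be computed for a single sentence directly from the model; below is a minimal sketch reusing the model and tokenizer loaded above.

# Sketch: per-sentence perplexity from the model's own loss.
# Passing labels=input_ids makes the model return the mean cross-entropy
# over the (shifted) tokens; exp() of that value is the perplexity.
import math
import torch

def sentence_perplexity(text: str) -> float:
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])
    return math.exp(out.loss.item())

print(sentence_perplexity("NLP is fascinating."))
print(sentence_perplexity("Fascinating NLP banana is."))  # scrambled text scores worse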

7.5 Prompt Engineering & Use Cases

  • Zero-shot classification using prompts
  • Text continuation for storytelling
  • Chatbot prototypes built by alternating user and model turns in the prompt

# Example: sentiment analysis with an off-the-shelf fine-tuned pipeline
# (this model is fine-tuned on SST-2, so it is not truly zero-shot;
#  a prompt-based zero-shot sketch follows at the end of this section)
from transformers import pipeline

sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)
print(sentiment("I love NLP tutorials!"))

Output:

[{'label': 'POSITIVE', 'score': 0.9996709823608398}]
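
For a genuinely prompt-based, zero-shot approach, the causal LM itself can score candidate label words as continuations of a cloze-style prompt. Below is a hedged sketch with plain GPT-2; prompt_sentiment is a hypothetical helper that picks whichever label makes the completed prompt more likely.

# Sketch of prompt-based zero-shot sentiment with a causal LM (no fine-tuning)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

zs_tokenizer = AutoTokenizer.from_pretrained("gpt2")
zs_model     = AutoModelForCausalLM.from_pretrained("gpt2")
zs_model.eval()

def prompt_sentiment(review: str) -> str:
    prompt = f'Review: "{review}"\nSentiment:'
    scores = {}
    for label in (" positive", " negative"):
        ids = zs_tokenizer(prompt + label, return_tensors="pt").input_ids
        with torch.no_grad():
            out = zs_model(ids, labels=ids)       # mean cross-entropy over tokens
        scores[label.strip()] = -out.loss.item()  # higher = more likely sequence
    return max(scores, key=scores.get)

print(prompt_sentiment("I love NLP tutorials!"))

Base GPT-2 is small, so this cloze-style scoring is fragile; larger or instruction-tuned models make prompt-based zero-shot classification considerably more reliable.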