# 05a2 GPT2
- GPT-2 had 48 layers and used 1600-dimensional vectors for word embeddings.
- A larger vocabulary of 50,257 tokens was used.
- A larger batch size of 512 and a larger context window of 1024 tokens were used.
- Layer normalisation was moved to the input of each sub-block, and an additional layer normalisation was added after the final self-attention block.
- At initialisation, the weights of the residual layers were scaled by 1/√N, where N is the number of residual layers (a minimal sketch of these last two changes follows this list).
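To make the pre-LN ordering and the 1/√N residual scaling concrete, here is a minimal sketch of a GPT-2 style block. It is not the code from this repository: the class name `PreNormBlock` is invented here, and `nn.MultiheadAttention` is used only as a stand-in for the attention implementation shown further down.

```python
import math
import torch
from torch import nn

class PreNormBlock(nn.Module):
    """
    Minimal sketch of a GPT-2 style transformer block: layer normalisation is applied
    to the input of each sub-block (pre-LN), and the output projections of the residual
    branches are scaled by 1/sqrt(N) at initialisation, where N is the number of
    residual layers in the model.
    """
    def __init__(self, emb, heads, n_residual_layers):
        super().__init__()
        self.ln1 = nn.LayerNorm(emb)
        # stand-in attention; the repo's own implementation is shown further down
        self.attn = nn.MultiheadAttention(emb, heads, batch_first=True)
        self.ln2 = nn.LayerNorm(emb)
        self.ff = nn.Sequential(
            nn.Linear(emb, 4 * emb),
            nn.GELU(),
            nn.Linear(4 * emb, emb),
        )

        # GPT-2 initialisation tweak: scale the residual-branch output weights by 1/sqrt(N)
        scale = 1.0 / math.sqrt(n_residual_layers)
        with torch.no_grad():
            self.attn.out_proj.weight.mul_(scale)
            self.ff[2].weight.mul_(scale)

    def forward(self, x):
        h = self.ln1(x)                                    # pre-LN: normalise before attention
        x = x + self.attn(h, h, h, need_weights=False)[0]  # residual connection
        x = x + self.ff(self.ln2(x))                       # pre-LN before the feed-forward block
        return x


if __name__ == "__main__":
    block = PreNormBlock(emb=64, heads=4, n_residual_layers=48)
    print(block(torch.randn(2, 16, 64)).shape)  # torch.Size([2, 16, 64])
```

In the full model, one additional `nn.LayerNorm(emb)` would then be applied to the output of the last block, matching the bullet above about the extra layer normalisation after the final self-attention block.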
The authors trained four language models with 117M (same as GPT-1), 345M, 762M and 1.5B (GPT-2) parameters. Each subsequent model had lower perplexity than the previous one.
```python
import torch
from torch import nn

class SelfAttentionGPT2(nn.Module):
    """
    This is the self-attention operation as implemented in the Huggingface port of GPT-2. The code has
    been simplified to remove several features not used here, but otherwise it should do exactly the
    same as GPT-2 when run with normal parameters.

    It is very similar to the default SelfAttention below, with the exception of the way it is
    initialized and some small speed improvements in the custom implementation of the linear layer
    (the Conv1D defined above).

    We include this primarily for comparison with our own canonical implementation, to check for
    performance differences.
    """
    def __init__(self, emb, heads, mask=False):
        super().__init__()

        self.nheads = heads
        self.emb = emb
        self.mask = mask

        # self.c_attn = Conv1D(3 * emb, emb)
        # -- (out_channels, in_channels):
        #    This is a very slight modification of a linear layer
        self.c_attn = nn.Linear(emb, 3 * emb)

        # self.c_proj = Conv1D(emb, emb)
        self.c_proj = nn.Linear(emb, emb)

    def _attn(self, q, k, v):
        dot = torch.matmul(q, k)                 # raw attention weights
        dot = dot / (float(v.size(-1)) ** 0.5)   # scaled attention weights

        if self.mask:  # apply the causal attention mask
            mask_(dot, maskval=float('-inf'), mask_diagonal=False)
            # -- mask_ is a helper defined elsewhere in the codebase; it overwrites the positions
            #    for future tokens in place. This is implemented differently in the Huggingface
            #    version, but the effect should be the same.

        dot = nn.Softmax(dim=-1)(dot)            # normalized attention weights

        return torch.matmul(dot, v)              # attention over values

    def merge_heads(self, x):
        # (batch, head, seq_length, head_features) -> (batch, seq_length, emb)
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)

    def split_heads(self, x, is_key=False):
        # (batch, seq_length, emb) -> (batch, seq_length, head, head_features)
        new_x_shape = x.size()[:-1] + (self.nheads, x.size(-1) // self.nheads)
        x = x.view(*new_x_shape)
        if is_key:
            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
        else:
            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(self, input_sequence):
        b, t, e = input_sequence.size()

        # one projection produces queries, keys and values, then the result is split in three
        query, key, value = self.c_attn(input_sequence).split(e, dim=2)
        query = self.split_heads(query)
        key   = self.split_heads(key, is_key=True)
        value = self.split_heads(value)

        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)

        return a
```
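The `mask_` call above refers to a helper defined elsewhere in the codebase and not shown on this page. As a hypothetical sketch, assuming it simply overwrites the positions corresponding to future tokens in place, it could look like the code below; the short check at the bottom assumes the `SelfAttentionGPT2` class above is in scope.

```python
import torch

def mask_(matrices, maskval=0.0, mask_diagonal=True):
    """
    Hypothetical sketch of the mask_ helper assumed above: fill the upper-triangular part of
    the last two dimensions in place, so that position i cannot attend to positions j > i
    (or j >= i when mask_diagonal is True).
    """
    h, w = matrices.size(-2), matrices.size(-1)
    indices = torch.triu_indices(h, w, offset=0 if mask_diagonal else 1)
    matrices[..., indices[0], indices[1]] = maskval


if __name__ == "__main__":
    # quick shape check with causal masking enabled
    attn = SelfAttentionGPT2(emb=64, heads=4, mask=True)
    print(attn(torch.randn(2, 16, 64)).shape)  # torch.Size([2, 16, 64])
```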