BACKUP_AI_LabReport_Week7 - TheEvergreenStateCollege/upper-division-cs-23-24 GitHub Wiki
Encoded text length: 101048
x: [3285, 546, 340, 11]
y: [546, 340, 11, 262]
[tensor([[1212, 1492, 373, 717]]), tensor([[1492, 373, 717, 3199]])]
[tensor([[1492, 373, 717, 3199]]), tensor([[ 373, 717, 3199, 287]])]
Inputs:
tensor([[ 1212, 1492, 373, 717],
[ 3199, 287, 27937, 416],
[ 7703, 11, 4373, 290],
[ 5834, 13, 628, 200],
[10970, 327, 11417, 1137],
[ 3268, 3336, 371, 48743],
[ 198, 198, 3886, 198],
[ 198, 41, 13, 35]])
Targets:
tensor([[ 1492, 373, 717, 3199],
[ 287, 27937, 416, 7703],
[ 11, 4373, 290, 5834],
[ 13, 628, 200, 10970],
[ 327, 11417, 1137, 3268],
[ 3336, 371, 48743, 198],
[ 198, 3886, 198, 198],
[ 41, 13, 35, 13]])
Parameter containing:
tensor([[ 0.5224, -1.7729, 0.1980],
[-0.3565, 0.6338, -1.6640],
[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[-0.3004, -1.2820, -0.0088],
[ 0.9215, -0.6833, 1.0089]], requires_grad=True)
tensor([[ 2.4581, 0.7263, -0.7000]], grad_fn=)
tensor([[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[ 0.9215, -0.6833, 1.0089],
[-0.3565, 0.6338, -1.6640]], grad_fn=)
token IDs:
tensor([[ 13, 4849, 3889, 198],
[ 198, 16224, 27937, 198],
[ 198, 41481, 352, 198],
[ 198, 1532, 345, 1107],
[ 765, 284, 3285, 546],
[ 340, 11, 262, 717],
[ 1517, 345, 1183, 2192],
[ 765, 284, 760, 318]])
Inputs shape:
torch.Size([8, 4])
torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256])
101048
x: [3285, 546, 340, 11]
y: [546, 340, 11, 262]
[3285] ----> 546
[3285, 546] ----> 340
[3285, 546, 340] ----> 11
[3285, 546, 340, 11] ----> 262
hear ----> about
hear about ----> it
hear about it ----> ,
hear about it, ----> the
First Batch :
[tensor([[1212, 1492, 373, 717]]), tensor([[1492, 373, 717, 3199]])]
Second Batch :
[tensor([[1492, 373, 717, 3199]]), tensor([[ 373, 717, 3199, 287]])]
----- EMBEDDING LAYER WEIGHT MATRIX -----
Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
[ 0.9178, 1.5810, 1.3010],
[ 1.2753, -0.2010, -0.1606],
[-0.4015, 0.9666, -1.1481],
[-1.1589, 0.3255, -0.6315],
[-2.8400, -0.7849, -1.4096]], requires_grad=True)
----- SINGLE TOKEN ID, TENSOR([3]) --> EMBEDDED VECTOR -----
tensor([[-0.4015, 0.9666, -1.1481]], grad_fn=)
----- FOUR INPUT ID VALUES: [2, 3, 5, 1] --> MATRIX EMBEDDING -----
tensor([[ 1.2753, -0.2010, -0.1606],
[-0.4015, 0.9666, -1.1481],
[-2.8400, -0.7849, -1.4096],
[ 0.9178, 1.5810, 1.3010]], grad_fn=)
Encoded text length: 101048
x: [3285, 546, 340, 11]
y: [546, 340, 11, 262]
[tensor([[1212, 1492, 373, 717]]), tensor([[1492, 373, 717, 3199]])]
[tensor([[1492, 373, 717, 3199]]), tensor([[ 373, 717, 3199, 287]])]
Inputs:
tensor([[ 1212, 1492, 373, 717],
[ 3199, 287, 27937, 416],
[ 7703, 11, 4373, 290],
[ 5834, 13, 628, 200],
[10970, 327, 11417, 1137],
[ 3268, 3336, 371, 48743],
[ 198, 198, 3886, 198],
[ 198, 41, 13, 35]])
Targets:
tensor([[ 1492, 373, 717, 3199],
[ 287, 27937, 416, 7703],
[ 11, 4373, 290, 5834],
[ 13, 628, 200, 10970],
[ 327, 11417, 1137, 3268],
[ 3336, 371, 48743, 198],
[ 198, 3886, 198, 198],
[ 41, 13, 35, 13]])
Parameter containing:
tensor([[ 0.5224, -1.7729, 0.1980],
[-0.3565, 0.6338, -1.6640],
[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[-0.3004, -1.2820, -0.0088],
[ 0.9215, -0.6833, 1.0089]], requires_grad=True)
tensor([[ 2.4581, 0.7263, -0.7000]], grad_fn=)
tensor([[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[ 0.9215, -0.6833, 1.0089],
[-0.3565, 0.6338, -1.6640]], grad_fn=)
token IDs:
tensor([[ 13, 4849, 3889, 198],
[ 198, 16224, 27937, 198],
[ 198, 41481, 352, 198],
[ 198, 1532, 345, 1107],
[ 765, 284, 3285, 546],
[ 340, 11, 262, 717],
[ 1517, 345, 1183, 2192],
[ 765, 284, 760, 318]])
Inputs shape:
torch.Size([8, 4])
torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256])
Token IDs:
tensor([[ 1212, 1492, 373, 717],
[ 3199, 287, 27937, 416],
[ 7703, 11, 4373, 290],
[ 5834, 13, 628, 200],
[10970, 327, 11417, 1137],
[ 3268, 3336, 371, 48743],
[ 198, 198, 3886, 198],
[ 198, 41, 13, 35]])
Inputs shape:
torch.Size([8, 4])
Token Embeddings Shape:
torch.Size([8, 4, 256])
Positional Embeddings:
torch.Size([4, 256])
Input Embeddings:
torch.Size([8, 4, 256])
Encoded text length: 101048
x: [3285, 546, 340, 11]
y: [546, 340, 11, 262]
[tensor([[1212, 1492, 373, 717]]), tensor([[1492, 373, 717, 3199]])]
[tensor([[1492, 373, 717, 3199]]), tensor([[ 373, 717, 3199, 287]])]
Inputs:
tensor([[ 1212, 1492, 373, 717],
[ 3199, 287, 27937, 416],
[ 7703, 11, 4373, 290],
[ 5834, 13, 628, 200],
[10970, 327, 11417, 1137],
[ 3268, 3336, 371, 48743],
[ 198, 198, 3886, 198],
[ 198, 41, 13, 35]])
Targets:
tensor([[ 1492, 373, 717, 3199],
[ 287, 27937, 416, 7703],
[ 11, 4373, 290, 5834],
[ 13, 628, 200, 10970],
[ 327, 11417, 1137, 3268],
[ 3336, 371, 48743, 198],
[ 198, 3886, 198, 198],
[ 41, 13, 35, 13]])
Parameter containing:
tensor([[ 0.5224, -1.7729, 0.1980],
[-0.3565, 0.6338, -1.6640],
[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[-0.3004, -1.2820, -0.0088],
[ 0.9215, -0.6833, 1.0089]], requires_grad=True)
tensor([[ 2.4581, 0.7263, -0.7000]], grad_fn=)
tensor([[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[ 0.9215, -0.6833, 1.0089],
[-0.3565, 0.6338, -1.6640]], grad_fn=)
token IDs:
tensor([[ 13, 4849, 3889, 198],
[ 198, 16224, 27937, 198],
[ 198, 41481, 352, 198],
[ 198, 1532, 345, 1107],
[ 765, 284, 3285, 546],
[ 340, 11, 262, 717],
[ 1517, 345, 1183, 2192],
[ 765, 284, 760, 318]])
Inputs shape:
torch.Size([8, 4])
torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256])
tensor([ 3199, 287, 27937, 416])
Attention Score For inputs[1]:
tensor([1.5024e+07, 7.9096e+08, 1.4693e+08, 3.6294e+07, 3.5462e+08, 4.2053e+07,
1.0934e+08, 1.0229e+06])
tensor(15024165)
Dot Product Computed Using torch.dot():
tensor([1.5024e+07, 7.9096e+08, 1.4693e+08, 3.6294e+07, 3.5462e+08, 4.2053e+07,
1.0934e+08, 1.0229e+06])
Attention Weights:
tensor([0.0100, 0.5286, 0.0982, 0.0243, 0.2370, 0.0281, 0.0731, 0.0007])
Sum:
tensor([1.5024e+07, 7.9096e+08, 1.4693e+08, 3.6294e+07, 3.5462e+08, 4.2053e+07,
1.0934e+08, 1.0229e+06])
Softmax functions are used to ensure that attention weights are always positive. This way, the output is interpretable as probabilities.
Using softmax_naive(): [NOT ADVISABLE]
Attention weights:
tensor([nan, nan, nan, nan, nan, nan, nan, nan])
Sum:
tensor(nan)
Using torch.softmax(): [ADVISABLE]
Attention weights: tensor([0., 1., 0., 0., 0., 0., 0., 0.])
Sum: tensor(1.)
The "context vector" is calculated by multiplying each input vector by its corresponding attention weight and summing the results.
Context Vector:
tensor([ 3199., 287., 27937., 416.])
Encoded text length: 101048 x: [3285, 546, 340, 11] y: [546, 340, 11, 262] [tensor(1212, 1492, 373, 717), tensor(1492, 373, 717, 3199)] [tensor(1492, 373, 717, 3199), tensor(373, 717, 3199, 287)] Inputs: tensor([[ 1212, 1492, 373, 717], [ 3199, 287, 27937, 416], [ 7703, 11, 4373, 290], [ 5834, 13, 628, 200], [10970, 327, 11417, 1137], [ 3268, 3336, 371, 48743], [ 198, 198, 3886, 198], [ 198, 41, 13, 35]])
Targets: tensor([[ 1492, 373, 717, 3199], [ 287, 27937, 416, 7703], [ 11, 4373, 290, 5834], [ 13, 628, 200, 10970], [ 327, 11417, 1137, 3268], [ 3336, 371, 48743, 198], [ 198, 3886, 198, 198], [ 41, 13, 35, 13]]) Parameter containing: tensor([[ 0.5224, -1.7729, 0.1980], [-0.3565, 0.6338, -1.6640], [ 0.8090, -0.0369, 0.7485], [ 2.4581, 0.7263, -0.7000], [-0.3004, -1.2820, -0.0088], [ 0.9215, -0.6833, 1.0089]], requires_grad=True) tensor(2.4581, 0.7263, -0.7000, grad_fn=) tensor([[ 0.8090, -0.0369, 0.7485], [ 2.4581, 0.7263, -0.7000], [ 0.9215, -0.6833, 1.0089], [-0.3565, 0.6338, -1.6640]], grad_fn=) token IDs: tensor([[ 13, 4849, 3889, 198], [ 198, 16224, 27937, 198], [ 198, 41481, 352, 198], [ 198, 1532, 345, 1107], [ 765, 284, 3285, 546], [ 340, 11, 262, 717], [ 1517, 345, 1183, 2192], [ 765, 284, 760, 318]])
Inputs shape: torch.Size([8, 4]) torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256]) tensor([[ 4348226, 15024165, 11191507, 7467848, 18857294, 44025242, 2126836, 331092], [ 15024165, 790964995, 146934195, 36294333, 354616600, 42053479, 109335778, 1022910], [ 11191507, 146934195, 78543559, 47743689, 134761778, 40967953, 18578270, 1592644], [ 7467848, 36294333, 47743689, 34470109, 71400507, 29090468, 3637714, 1170829], [ 18857294, 354616600, 134761778, 71400507, 252088487, 96597330, 46828394, 2373683], [ 44025242, 42053479, 40967953, 29090468, 96597330, 2397826410, 12400412, 2494668], [ 2126836, 109335778, 18578270, 3637714, 46828394, 12400412, 15218608, 104770], [ 331092, 1022910, 1592644, 1170829, 2373683, 2494668, 104770, 42279]]) <class 'torch.Tensor'>
Encoded text length: 101048 x: [3285, 546, 340, 11] y: [546, 340, 11, 262] [tensor(1212, 1492, 373, 717), tensor(1492, 373, 717, 3199)] [tensor(1492, 373, 717, 3199), tensor(373, 717, 3199, 287)] Inputs: tensor([[ 1212, 1492, 373, 717], [ 3199, 287, 27937, 416], [ 7703, 11, 4373, 290], [ 5834, 13, 628, 200], [10970, 327, 11417, 1137], [ 3268, 3336, 371, 48743], [ 198, 198, 3886, 198], [ 198, 41, 13, 35]])
Targets: tensor([[ 1492, 373, 717, 3199], [ 287, 27937, 416, 7703], [ 11, 4373, 290, 5834], [ 13, 628, 200, 10970], [ 327, 11417, 1137, 3268], [ 3336, 371, 48743, 198], [ 198, 3886, 198, 198], [ 41, 13, 35, 13]]) Parameter containing: tensor([[ 0.5224, -1.7729, 0.1980], [-0.3565, 0.6338, -1.6640], [ 0.8090, -0.0369, 0.7485], [ 2.4581, 0.7263, -0.7000], [-0.3004, -1.2820, -0.0088], [ 0.9215, -0.6833, 1.0089]], requires_grad=True) tensor(2.4581, 0.7263, -0.7000, grad_fn=) tensor([[ 0.8090, -0.0369, 0.7485], [ 2.4581, 0.7263, -0.7000], [ 0.9215, -0.6833, 1.0089], [-0.3565, 0.6338, -1.6640]], grad_fn=) token IDs: tensor([[ 13, 4849, 3889, 198], [ 198, 16224, 27937, 198], [ 198, 41481, 352, 198], [ 198, 1532, 345, 1107], [ 765, 284, 3285, 546], [ 340, 11, 262, 717], [ 1517, 345, 1183, 2192], [ 765, 284, 760, 318]])
Inputs shape: torch.Size([8, 4]) torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256]) tensor([[ 4348226, 15024165, 11191507, 7467848, 18857294, 44025242, 2126836, 331092], [ 15024165, 790964995, 146934195, 36294333, 354616600, 42053479, 109335778, 1022910], [ 11191507, 146934195, 78543559, 47743689, 134761778, 40967953, 18578270, 1592644], [ 7467848, 36294333, 47743689, 34470109, 71400507, 29090468, 3637714, 1170829], [ 18857294, 354616600, 134761778, 71400507, 252088487, 96597330, 46828394, 2373683], [ 44025242, 42053479, 40967953, 29090468, 96597330, 2397826410, 12400412, 2494668], [ 2126836, 109335778, 18578270, 3637714, 46828394, 12400412, 15218608, 104770], [ 331092, 1022910, 1592644, 1170829, 2373683, 2494668, 104770, 42279]]) <class 'torch.Tensor'>