BACKUP_AI_LabReport_Week7 - TheEvergreenStateCollege/upper-division-cs-23-24 GitHub Wiki
Encoded text length: 101048
x: [3285, 546, 340, 11]
y: [546, 340, 11, 262]
[tensor([[1212, 1492, 373, 717]]), tensor([[1492, 373, 717, 3199]])]
[tensor([[1492, 373, 717, 3199]]), tensor([[ 373, 717, 3199, 287]])]
Inputs:
tensor([[ 1212, 1492, 373, 717],
[ 3199, 287, 27937, 416],
[ 7703, 11, 4373, 290],
[ 5834, 13, 628, 200],
[10970, 327, 11417, 1137],
[ 3268, 3336, 371, 48743],
[ 198, 198, 3886, 198],
[ 198, 41, 13, 35]])
Targets:
tensor([[ 1492, 373, 717, 3199],
[ 287, 27937, 416, 7703],
[ 11, 4373, 290, 5834],
[ 13, 628, 200, 10970],
[ 327, 11417, 1137, 3268],
[ 3336, 371, 48743, 198],
[ 198, 3886, 198, 198],
[ 41, 13, 35, 13]])
Parameter containing:
tensor([[ 0.5224, -1.7729, 0.1980],
[-0.3565, 0.6338, -1.6640],
[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[-0.3004, -1.2820, -0.0088],
[ 0.9215, -0.6833, 1.0089]], requires_grad=True)
tensor([[ 2.4581, 0.7263, -0.7000]], grad_fn=)
tensor([[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[ 0.9215, -0.6833, 1.0089],
[-0.3565, 0.6338, -1.6640]], grad_fn=)
token IDs:
tensor([[ 13, 4849, 3889, 198],
[ 198, 16224, 27937, 198],
[ 198, 41481, 352, 198],
[ 198, 1532, 345, 1107],
[ 765, 284, 3285, 546],
[ 340, 11, 262, 717],
[ 1517, 345, 1183, 2192],
[ 765, 284, 760, 318]])
Inputs shape:
torch.Size([8, 4])
torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256])
101048
x: [3285, 546, 340, 11]
y: [546, 340, 11, 262]
[3285] ----> 546
[3285, 546] ----> 340
[3285, 546, 340] ----> 11
[3285, 546, 340, 11] ----> 262
hear ----> about
hear about ----> it
hear about it ----> ,
hear about it, ----> the
First Batch :
[tensor([[1212, 1492, 373, 717]]), tensor([[1492, 373, 717, 3199]])]
Second Batch :
[tensor([[1492, 373, 717, 3199]]), tensor([[ 373, 717, 3199, 287]])]
----- EMBEDDING LAYER WEIGHT MATRIX -----
Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
[ 0.9178, 1.5810, 1.3010],
[ 1.2753, -0.2010, -0.1606],
[-0.4015, 0.9666, -1.1481],
[-1.1589, 0.3255, -0.6315],
[-2.8400, -0.7849, -1.4096]], requires_grad=True)
----- SINGLE TOKEN ID, TENSOR([3]) --> EMBEDDED VECTOR -----
tensor([[-0.4015, 0.9666, -1.1481]], grad_fn=)
----- FOUR INPUT ID VALUES: [2, 3, 5, 1] --> MATRIX EMBEDDING -----
tensor([[ 1.2753, -0.2010, -0.1606],
[-0.4015, 0.9666, -1.1481],
[-2.8400, -0.7849, -1.4096],
[ 0.9178, 1.5810, 1.3010]], grad_fn=)
Encoded text length: 101048
x: [3285, 546, 340, 11]
y: [546, 340, 11, 262]
[tensor([[1212, 1492, 373, 717]]), tensor([[1492, 373, 717, 3199]])]
[tensor([[1492, 373, 717, 3199]]), tensor([[ 373, 717, 3199, 287]])]
Inputs:
tensor([[ 1212, 1492, 373, 717],
[ 3199, 287, 27937, 416],
[ 7703, 11, 4373, 290],
[ 5834, 13, 628, 200],
[10970, 327, 11417, 1137],
[ 3268, 3336, 371, 48743],
[ 198, 198, 3886, 198],
[ 198, 41, 13, 35]])
Targets:
tensor([[ 1492, 373, 717, 3199],
[ 287, 27937, 416, 7703],
[ 11, 4373, 290, 5834],
[ 13, 628, 200, 10970],
[ 327, 11417, 1137, 3268],
[ 3336, 371, 48743, 198],
[ 198, 3886, 198, 198],
[ 41, 13, 35, 13]])
Parameter containing:
tensor([[ 0.5224, -1.7729, 0.1980],
[-0.3565, 0.6338, -1.6640],
[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[-0.3004, -1.2820, -0.0088],
[ 0.9215, -0.6833, 1.0089]], requires_grad=True)
tensor([[ 2.4581, 0.7263, -0.7000]], grad_fn=)
tensor([[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[ 0.9215, -0.6833, 1.0089],
[-0.3565, 0.6338, -1.6640]], grad_fn=)
token IDs:
tensor([[ 13, 4849, 3889, 198],
[ 198, 16224, 27937, 198],
[ 198, 41481, 352, 198],
[ 198, 1532, 345, 1107],
[ 765, 284, 3285, 546],
[ 340, 11, 262, 717],
[ 1517, 345, 1183, 2192],
[ 765, 284, 760, 318]])
Inputs shape:
torch.Size([8, 4])
torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256])
Token IDs:
tensor([[ 1212, 1492, 373, 717],
[ 3199, 287, 27937, 416],
[ 7703, 11, 4373, 290],
[ 5834, 13, 628, 200],
[10970, 327, 11417, 1137],
[ 3268, 3336, 371, 48743],
[ 198, 198, 3886, 198],
[ 198, 41, 13, 35]])
Inputs shape:
torch.Size([8, 4])
Token Embeddings Shape:
torch.Size([8, 4, 256])
Positional Embeddings:
torch.Size([4, 256])
Input Embeddings:
torch.Size([8, 4, 256])
Encoded text length: 101048
x: [3285, 546, 340, 11]
y: [546, 340, 11, 262]
[tensor([[1212, 1492, 373, 717]]), tensor([[1492, 373, 717, 3199]])]
[tensor([[1492, 373, 717, 3199]]), tensor([[ 373, 717, 3199, 287]])]
Inputs:
tensor([[ 1212, 1492, 373, 717],
[ 3199, 287, 27937, 416],
[ 7703, 11, 4373, 290],
[ 5834, 13, 628, 200],
[10970, 327, 11417, 1137],
[ 3268, 3336, 371, 48743],
[ 198, 198, 3886, 198],
[ 198, 41, 13, 35]])
Targets:
tensor([[ 1492, 373, 717, 3199],
[ 287, 27937, 416, 7703],
[ 11, 4373, 290, 5834],
[ 13, 628, 200, 10970],
[ 327, 11417, 1137, 3268],
[ 3336, 371, 48743, 198],
[ 198, 3886, 198, 198],
[ 41, 13, 35, 13]])
Parameter containing:
tensor([[ 0.5224, -1.7729, 0.1980],
[-0.3565, 0.6338, -1.6640],
[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[-0.3004, -1.2820, -0.0088],
[ 0.9215, -0.6833, 1.0089]], requires_grad=True)
tensor([[ 2.4581, 0.7263, -0.7000]], grad_fn=)
tensor([[ 0.8090, -0.0369, 0.7485],
[ 2.4581, 0.7263, -0.7000],
[ 0.9215, -0.6833, 1.0089],
[-0.3565, 0.6338, -1.6640]], grad_fn=)
token IDs:
tensor([[ 13, 4849, 3889, 198],
[ 198, 16224, 27937, 198],
[ 198, 41481, 352, 198],
[ 198, 1532, 345, 1107],
[ 765, 284, 3285, 546],
[ 340, 11, 262, 717],
[ 1517, 345, 1183, 2192],
[ 765, 284, 760, 318]])
Inputs shape:
torch.Size([8, 4])
torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256])
tensor([ 3199, 287, 27937, 416])
Attention Score For inputs[1]:
tensor([1.5024e+07, 7.9096e+08, 1.4693e+08, 3.6294e+07, 3.5462e+08, 4.2053e+07,
1.0934e+08, 1.0229e+06])
tensor(15024165)
Dot Product Computed Using torch.dot():
tensor([1.5024e+07, 7.9096e+08, 1.4693e+08, 3.6294e+07, 3.5462e+08, 4.2053e+07,
1.0934e+08, 1.0229e+06])
Attention Weights:
tensor([0.0100, 0.5286, 0.0982, 0.0243, 0.2370, 0.0281, 0.0731, 0.0007])
Sum:
tensor([1.5024e+07, 7.9096e+08, 1.4693e+08, 3.6294e+07, 3.5462e+08, 4.2053e+07,
1.0934e+08, 1.0229e+06])
Softmax functions are used to ensure that attention weights are always positive. This way, the output is interpretable as probabilities.
Using softmax_naive(): [NOT ADVISABLE]
Attention weights:
tensor([nan, nan, nan, nan, nan, nan, nan, nan])
Sum:
tensor(nan)
Using torch.softmax(): [ADVISABLE]
Attention weights: tensor([0., 1., 0., 0., 0., 0., 0., 0.])
Sum: tensor(1.)
The "context vector" is calculated by multiplying each input vector by its corresponding attention weight and summing the results.
Context Vector:
tensor([ 3199., 287., 27937., 416.])
Encoded text length: 101048 x: [3285, 546, 340, 11] y: [546, 340, 11, 262] [tensor(1212, 1492, 373, 717), tensor(1492, 373, 717, 3199)] [tensor(1492, 373, 717, 3199), tensor(373, 717, 3199, 287)] Inputs: tensor([[ 1212, 1492, 373, 717], [ 3199, 287, 27937, 416], [ 7703, 11, 4373, 290], [ 5834, 13, 628, 200], [10970, 327, 11417, 1137], [ 3268, 3336, 371, 48743], [ 198, 198, 3886, 198], [ 198, 41, 13, 35]])
Targets: tensor([[ 1492, 373, 717, 3199], [ 287, 27937, 416, 7703], [ 11, 4373, 290, 5834], [ 13, 628, 200, 10970], [ 327, 11417, 1137, 3268], [ 3336, 371, 48743, 198], [ 198, 3886, 198, 198], [ 41, 13, 35, 13]]) Parameter containing: tensor([[ 0.5224, -1.7729, 0.1980], [-0.3565, 0.6338, -1.6640], [ 0.8090, -0.0369, 0.7485], [ 2.4581, 0.7263, -0.7000], [-0.3004, -1.2820, -0.0088], [ 0.9215, -0.6833, 1.0089]], requires_grad=True) tensor(2.4581, 0.7263, -0.7000, grad_fn=) tensor([[ 0.8090, -0.0369, 0.7485], [ 2.4581, 0.7263, -0.7000], [ 0.9215, -0.6833, 1.0089], [-0.3565, 0.6338, -1.6640]], grad_fn=) token IDs: tensor([[ 13, 4849, 3889, 198], [ 198, 16224, 27937, 198], [ 198, 41481, 352, 198], [ 198, 1532, 345, 1107], [ 765, 284, 3285, 546], [ 340, 11, 262, 717], [ 1517, 345, 1183, 2192], [ 765, 284, 760, 318]])
Inputs shape: torch.Size([8, 4]) torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256]) tensor([[ 4348226, 15024165, 11191507, 7467848, 18857294, 44025242, 2126836, 331092], [ 15024165, 790964995, 146934195, 36294333, 354616600, 42053479, 109335778, 1022910], [ 11191507, 146934195, 78543559, 47743689, 134761778, 40967953, 18578270, 1592644], [ 7467848, 36294333, 47743689, 34470109, 71400507, 29090468, 3637714, 1170829], [ 18857294, 354616600, 134761778, 71400507, 252088487, 96597330, 46828394, 2373683], [ 44025242, 42053479, 40967953, 29090468, 96597330, 2397826410, 12400412, 2494668], [ 2126836, 109335778, 18578270, 3637714, 46828394, 12400412, 15218608, 104770], [ 331092, 1022910, 1592644, 1170829, 2373683, 2494668, 104770, 42279]]) <class 'torch.Tensor'>
Encoded text length: 101048 x: [3285, 546, 340, 11] y: [546, 340, 11, 262] [tensor(1212, 1492, 373, 717), tensor(1492, 373, 717, 3199)] [tensor(1492, 373, 717, 3199), tensor(373, 717, 3199, 287)] Inputs: tensor([[ 1212, 1492, 373, 717], [ 3199, 287, 27937, 416], [ 7703, 11, 4373, 290], [ 5834, 13, 628, 200], [10970, 327, 11417, 1137], [ 3268, 3336, 371, 48743], [ 198, 198, 3886, 198], [ 198, 41, 13, 35]])
Targets: tensor([[ 1492, 373, 717, 3199], [ 287, 27937, 416, 7703], [ 11, 4373, 290, 5834], [ 13, 628, 200, 10970], [ 327, 11417, 1137, 3268], [ 3336, 371, 48743, 198], [ 198, 3886, 198, 198], [ 41, 13, 35, 13]]) Parameter containing: tensor([[ 0.5224, -1.7729, 0.1980], [-0.3565, 0.6338, -1.6640], [ 0.8090, -0.0369, 0.7485], [ 2.4581, 0.7263, -0.7000], [-0.3004, -1.2820, -0.0088], [ 0.9215, -0.6833, 1.0089]], requires_grad=True) tensor(2.4581, 0.7263, -0.7000, grad_fn=) tensor([[ 0.8090, -0.0369, 0.7485], [ 2.4581, 0.7263, -0.7000], [ 0.9215, -0.6833, 1.0089], [-0.3565, 0.6338, -1.6640]], grad_fn=) token IDs: tensor([[ 13, 4849, 3889, 198], [ 198, 16224, 27937, 198], [ 198, 41481, 352, 198], [ 198, 1532, 345, 1107], [ 765, 284, 3285, 546], [ 340, 11, 262, 717], [ 1517, 345, 1183, 2192], [ 765, 284, 760, 318]])
Inputs shape: torch.Size([8, 4]) torch.Size([8, 4, 256])
pos_embeddings.shape :torch.Size([4, 256])
token_embeddings.shape :torch.Size([8, 4, 256])
torch.Size([8, 4, 256]) tensor([[ 4348226, 15024165, 11191507, 7467848, 18857294, 44025242, 2126836, 331092], [ 15024165, 790964995, 146934195, 36294333, 354616600, 42053479, 109335778, 1022910], [ 11191507, 146934195, 78543559, 47743689, 134761778, 40967953, 18578270, 1592644], [ 7467848, 36294333, 47743689, 34470109, 71400507, 29090468, 3637714, 1170829], [ 18857294, 354616600, 134761778, 71400507, 252088487, 96597330, 46828394, 2373683], [ 44025242, 42053479, 40967953, 29090468, 96597330, 2397826410, 12400412, 2494668], [ 2126836, 109335778, 18578270, 3637714, 46828394, 12400412, 15218608, 104770], [ 331092, 1022910, 1592644, 1170829, 2373683, 2494668, 104770, 42279]]) <class 'torch.Tensor'>