AI Lab Report: Week 7

2.6 Sampling

Encoded text length: 101048

    x: [3285, 546, 340, 11]
    y:      [546, 340, 11, 262]

    [3285] ----> 546
    [3285, 546] ----> 340
    [3285, 546, 340] ----> 11
    [3285, 546, 340, 11] ----> 262

    hear ----> about
    hear about ----> it
    hear about it ----> ,
    hear about it, ----> the

First batch (input window and its target window, shifted one token to the right):

    [tensor([[1212, 1492,  373,  717]]), tensor([[1492,  373,  717, 3199]])]

Second batch:

    [tensor([[1492,  373,  717, 3199]]), tensor([[ 373,  717, 3199,  287]])]

With a batch size of 8 and a context length of 4:

    Inputs:
     tensor([[ 1212,  1492,   373,   717],
            [ 3199,   287, 27937,   416],
            [ 7703,    11,  4373,   290],
            [ 5834,    13,   628,   200],
            [10970,   327, 11417,  1137],
            [ 3268,  3336,   371, 48743],
            [  198,   198,  3886,   198],
            [  198,    41,    13,    35]])

    Targets:
     tensor([[ 1492,   373,   717,  3199],
            [  287, 27937,   416,  7703],
            [   11,  4373,   290,  5834],
            [   13,   628,   200, 10970],
            [  327, 11417,  1137,  3268],
            [ 3336,   371, 48743,   198],
            [  198,  3886,   198,   198],
            [   41,    13,    35,    13]])
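
These batches come from a sliding-window sampler over the encoded text. Below is a minimal sketch of that pattern, assuming tiktoken's gpt2 encoding; the class name and file path are illustrative, not the exact code behind this report.

```python
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """Sliding-window sampling: each input window of max_length tokens
    is paired with the same window shifted one token to the right."""
    def __init__(self, text, tokenizer, max_length=4, stride=4):
        token_ids = tokenizer.encode(text)
        self.inputs, self.targets = [], []
        for i in range(0, len(token_ids) - max_length, stride):
            self.inputs.append(torch.tensor(token_ids[i:i + max_length]))
            self.targets.append(torch.tensor(token_ids[i + 1:i + max_length + 1]))

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

tokenizer = tiktoken.get_encoding("gpt2")
with open("training_text.txt") as f:  # hypothetical path to the training text
    raw_text = f.read()

loader = DataLoader(GPTDataset(raw_text, tokenizer), batch_size=8, shuffle=False)
inputs, targets = next(iter(loader))
print("Inputs:\n", inputs)
print("Targets:\n", targets)
```

The outputs above are consistent with two settings of this sampler: the first/second batch pair uses batch_size=1 with stride=1 (the second window shifts by one token), while the 8x4 Inputs/Targets dump uses batch_size=8 with stride=4 (each row picks up where the previous row's target ends).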

2.7 Embeddings

----- EMBEDDING LAYER WEIGHT MATRIX -----

    Parameter containing:
    tensor([[ 0.3374, -0.1778, -0.1690],
            [ 0.9178,  1.5810,  1.3010],
            [ 1.2753, -0.2010, -0.1606],
            [-0.4015,  0.9666, -1.1481],
            [-1.1589,  0.3255, -0.6315],
            [-2.8400, -0.7849, -1.4096]], requires_grad=True)

----- SINGLE TOKEN ID, tensor([3]) --> EMBEDDED VECTOR -----

    tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

----- FOUR INPUT ID VALUES, [2, 3, 5, 1] --> MATRIX EMBEDDING -----

    tensor([[ 1.2753, -0.2010, -0.1606],
            [-0.4015,  0.9666, -1.1481],
            [-2.8400, -0.7849, -1.4096],
            [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)
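
An embedding lookup is just row selection from the weight matrix: ID 3 returns row 3, and the batch [2, 3, 5, 1] returns those rows in order. A minimal sketch of the demo, assuming a 6-token vocabulary, 3-dimensional vectors, and torch.manual_seed(123) (the seed is an assumption, chosen so the example is reproducible):

```python
import torch

torch.manual_seed(123)  # assumed seed, for reproducibility

# Toy embedding layer: 6-entry vocabulary, 3-dimensional vectors.
embedding_layer = torch.nn.Embedding(num_embeddings=6, embedding_dim=3)
print(embedding_layer.weight)                      # the full weight matrix

# A single token ID selects one row of the matrix (row 3 here).
print(embedding_layer(torch.tensor([3])))

# A batch of IDs selects the corresponding rows, preserving order.
print(embedding_layer(torch.tensor([2, 3, 5, 1])))
```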

2.8 Positional Embeddings

    Token IDs:
     tensor([[ 1212,  1492,   373,   717],
            [ 3199,   287, 27937,   416],
            [ 7703,    11,  4373,   290],
            [ 5834,    13,   628,   200],
            [10970,   327, 11417,  1137],
            [ 3268,  3336,   371, 48743],
            [  198,   198,  3886,   198],
            [  198,    41,    13,    35]])

    Inputs shape: torch.Size([8, 4])
    Token embeddings shape: torch.Size([8, 4, 256])
    Positional embeddings shape: torch.Size([4, 256])
    Input embeddings shape: torch.Size([8, 4, 256])
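
The shapes line up as follows: each of the 8x4 token IDs maps to a 256-dimensional vector, and one positional vector per position (4 of them) is broadcast across the batch and added. A minimal sketch, assuming the GPT-2 vocabulary size of 50257 and a stand-in batch of token IDs:

```python
import torch

vocab_size, output_dim, context_length = 50257, 256, 4

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Stand-in for the (8, 4) batch of token IDs shown above.
token_ids = torch.randint(0, vocab_size, (8, 4))

token_embeddings = token_embedding_layer(token_ids)                 # (8, 4, 256)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))  # (4, 256)

# Broadcasting adds the same positional vectors to every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings                # (8, 4, 256)
print(input_embeddings.shape)   # torch.Size([8, 4, 256])
```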

3.3.1 Untrainable

The simplified self-attention demo below uses the rows of the Inputs batch from section 2.6 as 4-dimensional input vectors (raw token IDs, not learned embeddings, which is why the numbers that follow are so large), with inputs[1] as the query:

    Query, inputs[1]: tensor([ 3199,   287, 27937,   416])

Attention scores for the query (one dot product of inputs[1] with each input row):

    tensor([1.5024e+07, 7.9096e+08, 1.4693e+08, 3.6294e+07, 3.5462e+08,
            4.2053e+07, 1.0934e+08, 1.0229e+06])

The first score, inputs[1] . inputs[0], computed element by element: tensor(15024165)

The scores computed with torch.dot() agree:

    tensor([1.5024e+07, 7.9096e+08, 1.4693e+08, 3.6294e+07, 3.5462e+08,
            4.2053e+07, 1.0934e+08, 1.0229e+06])

Attention weights, obtained by dividing each score by the sum of all scores (so they sum to 1):

    tensor([0.0100, 0.5286, 0.0982, 0.0243, 0.2370, 0.0281, 0.0731, 0.0007])
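
A minimal sketch of this score computation, reusing the Inputs rows as the (unrealistically large) input vectors; the variable names are illustrative:

```python
import torch

# Rows of the Inputs batch, treated as 4-dimensional input vectors.
inputs = torch.tensor([[ 1212.,  1492.,   373.,   717.],
                       [ 3199.,   287., 27937.,   416.],
                       [ 7703.,    11.,  4373.,   290.],
                       [ 5834.,    13.,   628.,   200.],
                       [10970.,   327., 11417.,  1137.],
                       [ 3268.,  3336.,   371., 48743.],
                       [  198.,   198.,  3886.,   198.],
                       [  198.,    41.,    13.,    35.]])

query = inputs[1]

# One attention score per row: the dot product with the query.
attn_scores = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores[i] = torch.dot(x_i, query)

# Naive normalization: divide by the sum so the weights add up to 1.
attn_weights = attn_scores / attn_scores.sum()
print(attn_scores)   # ~[1.5024e+07, 7.9096e+08, ...]
print(attn_weights)  # ~[0.0100, 0.5286, ...]
```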

The softmax function keeps every attention weight positive and makes the weights sum to 1, so the output can be interpreted as a probability distribution over the input tokens.

Using softmax_naive() [not advisable]:

    Attention weights: tensor([nan, nan, nan, nan, nan, nan, nan, nan])
    Sum: tensor(nan)

Using torch.softmax() [advisable]:

    Attention weights: tensor([0., 1., 0., 0., 0., 0., 0., 0.])
    Sum: tensor(1.)

The naive version fails here because exponentiating scores on the order of 10^8 overflows to inf, and inf/inf evaluates to nan. torch.softmax subtracts the maximum score before exponentiating, which keeps every term finite; with scores this far apart, the stable result saturates to a one-hot distribution on the largest score.
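
A sketch of the difference, assuming softmax_naive is the textbook one-liner:

```python
import torch

def softmax_naive(x):
    # exp() of values this large overflows to inf, and inf/inf -> nan.
    return torch.exp(x) / torch.exp(x).sum(dim=0)

scores = torch.tensor([1.5024e7, 7.9096e8, 1.4693e8, 3.6294e7,
                       3.5462e8, 4.2053e7, 1.0934e8, 1.0229e6])

print(softmax_naive(scores))         # tensor([nan, nan, nan, ...])
print(torch.softmax(scores, dim=0))  # tensor([0., 1., 0., ...]), numerically stable
```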

The "context vector" is calculated by multiply ea input vector by corresponding attention weight.

Context Vector: tensor([ 3199., 287., 27937., 416.])
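
Continuing the sketch above (it reuses `inputs` and `query` from the earlier block), the same result as a weighted sum:

```python
# Weighted sum of all input rows; with one-hot weights this just copies inputs[1].
attn_weights = torch.softmax(inputs @ query, dim=0)
context_vec = attn_weights @ inputs   # (8,) @ (8, 4) -> (4,)
print(context_vec)   # tensor([ 3199.,   287., 27937.,   416.])
```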

3.3.2 Trainable

Attention scores for every query at once, as a symmetric 8x8 matrix (row i holds the dot products of inputs[i] with every row; row 1 matches the per-query scores above):

    tensor([[   4348226,   15024165,   11191507,    7467848,   18857294,   44025242,    2126836,     331092],
            [  15024165,  790964995,  146934195,   36294333,  354616600,   42053479,  109335778,    1022910],
            [  11191507,  146934195,   78543559,   47743689,  134761778,   40967953,   18578270,    1592644],
            [   7467848,   36294333,   47743689,   34470109,   71400507,   29090468,    3637714,    1170829],
            [  18857294,  354616600,  134761778,   71400507,  252088487,   96597330,   46828394,    2373683],
            [  44025242,   42053479,   40967953,   29090468,   96597330, 2397826410,   12400412,    2494668],
            [   2126836,  109335778,   18578270,    3637714,   46828394,   12400412,   15218608,     104770],
            [    331092,    1022910,    1592644,    1170829,    2373683,    2494668,     104770,      42279]])

    <class 'torch.Tensor'>
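
Rather than looping over queries, all scores can be computed with one matrix multiplication. Continuing the earlier sketch (the integer values above come from doing this on the raw token-ID rows):

```python
# All-pairs attention scores in a single matmul, reusing `inputs` from
# the earlier sketch: attn_scores[i, j] = inputs[i] . inputs[j].
attn_scores = inputs @ inputs.T                    # (8, 8), symmetric
attn_weights = torch.softmax(attn_scores, dim=-1)  # normalize each row
all_context_vecs = attn_weights @ inputs           # (8, 4), one context vector per query
print(attn_scores)
```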

