huggingface compute_loss

  • Digging into the code to analyze compute_loss shows that the function call below does the main work.

    • compute_loss : https://github.com/huggingface/transformers/blob/984bc11b0882ff1e5b34ba717ea357e069ceced9/src/transformers/trainer_pt_utils.py#L545

    • https://huggingface.co/transformers/v4.4.2/_modules/transformers/trainer_pt_utils.html

      • LabelSmoother Class
      
      def __call__(self, model_output, labels):
          logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
          log_probs = -torch.nn.functional.log_softmax(logits, dim=-1)
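          # note the leading minus sign: log_probs holds -log p, so gathering at the label index below yields the NLL directly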
          if labels.dim() == log_probs.dim() - 1:
              labels = labels.unsqueeze(-1)
      
          padding_mask = labels.eq(self.ignore_index)
          # only entries equal to ignore_index (-100) become True; all others become False
          # padding_mask :  tensor([[True],  [True], [True],
          # ...,
          # [True], [True],[True]], device='cuda:0')
      
      
      
          # In case the ignore_index is -100, the gather will fail, so we replace labels by 0. The padding_mask
          # will ignore them in any case.
          labels.clamp_min_(0)
          # 0์ดํ•˜์˜ ๊ฐ’์€ 0์œผ๋กœ ์น˜ํ™˜ํ•œ๋‹ค
          # lamp label:   tensor([[0], [0], [0],
          # ...,
          # [0], [0], [0]], device='cuda:0')
      
      
          nll_loss = log_probs.gather(dim=-1, index=labels)
          # nll_loss :  tensor([[7.0193], [7.7626], [7.5791],
          # ...,
          # [7.0840], [7.2005], [7.5168]], device='cuda:0', grad_fn=<GatherBackward0>)
      
      
      
          smoothed_loss = log_probs.sum(dim=-1, keepdim=True)
          # smoothed_loss :  tensor([[1896513.1250], [1970930.8750], [1950668.2500],
          # ...,
          # [1780925.7500], [1792207.0000], [1794247.5000]], device='cuda:0', grad_fn=<SumBackward1>)
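          # summing -log p over the vocab dimension gives vocab_size times the cross-entropy against a
          # uniform distribution; the division by vocab_size happens further down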
      
      
      
          nll_loss.masked_fill_(padding_mask, 0.0)
          # tensor([[0.], [0.], [0.],
          # ...,
          # [0.], [0.], [0.]], device='cuda:0', grad_fn=<MaskedFillBackward0>)
      
      
      
          smoothed_loss.masked_fill_(padding_mask, 0.0)
      
          # Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
          num_active_elements = padding_mask.numel() - padding_mask.long().sum()
          # num_act :  tensor(407, device='cuda:0')
      
      
          nll_loss = nll_loss.sum() / num_active_elements
          # nll_loss.sum()/num_act :  tensor(11.9873, device='cuda:0', grad_fn=<DivBackward0>)
      
      
      
          smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
          #  smoothed_loss :  tensor(15.4860, device='cuda:0', grad_fn=<DivBackward0>)
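          # dividing by num_active_elements * vocab_size turns the summed -log p into a per-token, per-class mean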
      
      
          return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss
          # tensor(12.3371, device='cuda:0', grad_fn=<AddBackward0>)
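
      • Putting it together: the returned value is (1 - epsilon) * nll_loss + epsilon * smoothed_loss, i.e. the ordinary NLL blended with the cross-entropy against a uniform distribution over the vocabulary. A minimal sketch of calling the class directly (the shapes and epsilon here are illustrative):

        import torch
        from transformers.trainer_pt_utils import LabelSmoother

        smoother = LabelSmoother(epsilon=0.1)        # ignore_index defaults to -100
        logits = torch.randn(2, 5, 100)              # (batch, seq_len, vocab_size)
        labels = torch.randint(0, 100, (2, 5))
        labels[:, -1] = -100                         # mark the last position of each sequence as padding
        loss = smoother({"logits": logits}, labels)  # scalar tensor, as in the walkthrough above
        print(loss)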
      
      
      • Key functions
        • torch.eq()
          >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
          tensor([[ True, False],
                  [False,  True]])
          
        • torch.numel()
          • the number of elements in a tensor
          >>> a = torch.randn(1, 2, 3, 4, 5)
          >>> torch.numel(a)
          120
          >>> a = torch.zeros(4,4)
          >>> torch.numel(a)
          16
          
        • torch.gather()
          out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
          out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
          out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2        
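
          A minimal sketch of the dim=1 rule on a 2-D tensor (values chosen for illustration; the walkthrough at the bottom of this page traces the 3-D case by hand):
          >>> import torch
          >>> src = torch.tensor([[1, 2], [3, 4]])
          >>> idx = torch.tensor([[0, 0], [1, 0]])
          >>> torch.gather(src, 1, idx)  # out[i][j] = src[i][idx[i][j]]
          tensor([[1, 1],
                  [4, 3]])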
          
        • torch.clamp()
          >>> a = torch.randn(4)
          >>> a
          tensor([-1.7120,  0.1734, -0.0478, -0.0922])
          >>> torch.clamp(a, min=-0.5, max=0.5)
          tensor([-0.5000,  0.1734, -0.0478, -0.0922])
          
          >>> min = torch.linspace(-1, 1, steps=4)
          >>> torch.clamp(a, min=min)
          tensor([-1.0000,  0.1734,  0.3333,  1.0000])
          
        • torch.masked_fill()
          import torch

          sample = torch.FloatTensor([[1, 2, 3], [4, 5, 6]])
          mask = torch.FloatTensor([[0, 1, 1], [1, 1, 0]])
          out = sample.masked_fill(mask == 0, 10)
          print(out)
          # tensor([[10.,  2.,  3.],
          #         [ 4.,  5., 10.]])
          
          
          
  • torch.gather()

    import torch

    arr = torch.Tensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
    print(f"ARR={arr}, shape={arr.shape}")
    
    indices = torch.arange(2).expand(2,1,2)
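    # arange(2) -> tensor([0, 1]); expand() views it as shape (1, 1, 2) and broadcasts it (without copying) to (2, 1, 2)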
    print(f"indices={indices}, shape={indices.shape}")
    
    output1=torch.gather(arr, 1, indices )
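    # dim=1 means: output1[i][j][k] = arr[i][indices[i][j][k]][k]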
    print(f"output1={output1}, shape={output1.shape}")
    
    x, y, z = arr.shape
    print(x,y,z)
    
    for i in range(x):
       for j in range(y):
           for k in range(z):
               print(f"ARR[{i}][{j}][{k}]={arr[i][j][k]}")
    
    x, y, z = indices.shape
    print(x,y,z)
    
    for i in range(x):
       for j in range(y):
           for k in range(z):
               print(f"indices[{i}][{j}][{k}]={indices[i][j][k]}")
    
    
    x, y, z = indices.shape
    print(x,y,z)
    
    for i in range(x):
       for j in range(y):
           for k in range(z):
               dim = indices[i][j][k]
               print(f"output[{i}][{j}][{k}] = arr[{i}][{dim}][{k}] = {arr[i][dim][k]}")
    
    ########################################################
    ARR=tensor([[[ 1.,  2.,  3.],
             [ 4.,  5.,  6.]],
    
            [[ 7.,  8.,  9.],
             [10., 11., 12.]]]), shape=torch.Size([2, 2, 3])
    indices=tensor([[[0, 1]],

            [[0, 1]]]), shape=torch.Size([2, 1, 2])
    output1=tensor([[[ 1.,  5.]],

            [[ 7., 11.]]]), shape=torch.Size([2, 1, 2])
    2 2 3
    ARR[0][0][0]=1.0
    ARR[0][0][1]=2.0
    ARR[0][0][2]=3.0
    ARR[0][1][0]=4.0
    ARR[0][1][1]=5.0
    ARR[0][1][2]=6.0
    ARR[1][0][0]=7.0
    ARR[1][0][1]=8.0
    ARR[1][0][2]=9.0
    ARR[1][1][0]=10.0
    ARR[1][1][1]=11.0
    ARR[1][1][2]=12.0
    2 1 2
    indices[0][0][0]=0
    indices[0][0][1]=1
    indices[1][0][0]=0
    indices[1][0][1]=1
    2 1 2
    output[0][0][0] = arr[0][0][0] = 1.0
    output[0][0][1] = arr[0][1][1] = 5.0
    output[1][0][0] = arr[1][0][0] = 7.0
    output[1][0][1] = arr[1][1][1] = 11.0
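
    As a sanity check, the same values can be rebuilt by hand with the dim=1 index rule and compared against torch.gather (a sketch continuing the script above):

    manual = torch.empty_like(output1)
    x, y, z = indices.shape
    for i in range(x):
        for j in range(y):
            for k in range(z):
                manual[i][j][k] = arr[i][indices[i][j][k]][k]
    print(torch.equal(manual, output1))  # True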