huggingface compute_loss
- If you dig through the code to analyze compute_loss, you can see that the function call below is where the main work happens.
- compute_loss : https://github.com/huggingface/transformers/blob/984bc11b0882ff1e5b34ba717ea357e069ceced9/src/transformers/trainer_pt_utils.py#L545
- https://huggingface.co/transformers/v4.4.2/_modules/transformers/trainer_pt_utils.html
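For context, Trainer.compute_loss itself is short. Roughly, when label smoothing is enabled it hides the labels from the model, runs the forward pass, and hands the outputs plus labels to the LabelSmoother below. This is a simplified sketch from the general shape of transformers v4.x, not verbatim library code, and `compute_loss_sketch` is a hypothetical name:

```python
# Simplified sketch (not verbatim transformers code) of what Trainer.compute_loss
# does when a label smoother is configured.
def compute_loss_sketch(model, inputs, label_smoother=None):
    # Pop the labels so the model does not compute its own (unsmoothed) loss.
    labels = inputs.pop("labels") if label_smoother is not None and "labels" in inputs else None
    outputs = model(**inputs)
    if labels is not None:
        # This is the call analyzed below: LabelSmoother.__call__(outputs, labels)
        return label_smoother(outputs, labels)
    # Without label smoothing, fall back to the loss the model already returned.
    return outputs["loss"] if isinstance(outputs, dict) else outputs[0]
```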
- LabelSmoother Class
```python
def __call__(self, model_output, labels):
    logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
    log_probs = -torch.nn.functional.log_softmax(logits, dim=-1)
    if labels.dim() == log_probs.dim() - 1:
        labels = labels.unsqueeze(-1)

    padding_mask = labels.eq(self.ignore_index)
    # Only positions equal to ignore_index (-100) become True; everything else is False.
    # padding_mask : tensor([[True], [True], [True],
    #                        ...,
    #                        [True], [True], [True]], device='cuda:0')

    # In case the ignore_index is -100, the gather will fail, so we replace labels by 0.
    # The padding_mask will ignore them in any case.
    labels.clamp_min_(0)  # replace values below 0 with 0
    # clamped labels : tensor([[0], [0], [0],
    #                          ...,
    #                          [0], [0], [0]], device='cuda:0')

    nll_loss = log_probs.gather(dim=-1, index=labels)
    # nll_loss : tensor([[7.0193], [7.7626], [7.5791],
    #                    ...,
    #                    [7.0840], [7.2005], [7.5168]], device='cuda:0', grad_fn=<GatherBackward0>)

    smoothed_loss = log_probs.sum(dim=-1, keepdim=True)
    # smoothed_loss : tensor([[1896513.1250], [1970930.8750], [1950668.2500],
    #                         ...,
    #                         [1780925.7500], [1792207.0000], [1794247.5000]], device='cuda:0', grad_fn=<SumBackward1>)

    nll_loss.masked_fill_(padding_mask, 0.0)
    # tensor([[0.], [0.], [0.],
    #         ...,
    #         [0.], [0.], [0.]], device='cuda:0', grad_fn=<MaskedFillBackward0>)
    smoothed_loss.masked_fill_(padding_mask, 0.0)

    # Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
    num_active_elements = padding_mask.numel() - padding_mask.long().sum()
    # num_active_elements : tensor(407, device='cuda:0')

    nll_loss = nll_loss.sum() / num_active_elements
    # nll_loss.sum() / num_active_elements : tensor(11.9873, device='cuda:0', grad_fn=<DivBackward0>)

    smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
    # smoothed_loss : tensor(15.4860, device='cuda:0', grad_fn=<DivBackward0>)

    return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss
    # tensor(12.3371, device='cuda:0', grad_fn=<AddBackward0>)
```
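The traced numbers are consistent with each other: plugging the two averaged losses back into the return expression with epsilon = 0.1 (the LabelSmoother default; the value used in the run above is an assumption) reproduces the printed total.

```python
# Sanity check on the traced values above, assuming epsilon = 0.1 (LabelSmoother's default).
epsilon = 0.1
nll_loss = 11.9873        # traced nll_loss.sum() / num_active_elements
smoothed_loss = 15.4860   # traced smoothed_loss.sum() / (num_active_elements * vocab_size)
loss = (1 - epsilon) * nll_loss + epsilon * smoothed_loss
print(loss)  # ~12.3372, matching tensor(12.3371) up to rounding of the traced values
```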
- Key functions
- torch.eq()
```python
>>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
tensor([[ True, False],
        [False,  True]])
```
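In the loss above, the element-wise `eq` is what builds the padding mask from the ignore_index. A minimal sketch with made-up labels:

```python
import torch

labels = torch.tensor([[5], [-100], [2]])  # -100 marks padded positions
padding_mask = labels.eq(-100)             # True only where the label is the ignore_index
print(padding_mask)                        # tensor([[False], [ True], [False]])
```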
- torch.numel()
- the total number of elements in a tensor
```python
>>> a = torch.randn(1, 2, 3, 4, 5)
>>> torch.numel(a)
120
>>> a = torch.zeros(4, 4)
>>> torch.numel(a)
16
```
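In the loss above, `numel()` is combined with the padding mask to count the non-padded tokens. A small sketch with a made-up mask:

```python
import torch

# Count active (non-padded) positions the same way the loss code does.
padding_mask = torch.tensor([[True], [False], [False], [True]])
num_active_elements = padding_mask.numel() - padding_mask.long().sum()
print(num_active_elements)  # tensor(2)
```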
- torch.gather()
```python
out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
```
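For the last-dimension case that the nll_loss gather above relies on, here is a minimal 2-D sketch (values invented for illustration):

```python
import torch

# out[i][j] = input[i][index[i][j]] with dim=-1: the same pattern LabelSmoother
# uses to pick each token's gold-label log-probability.
log_probs = torch.tensor([[0.1, 0.2, 0.7],
                          [0.5, 0.3, 0.2]])
labels = torch.tensor([[2],
                       [0]])               # one class index per row
picked = log_probs.gather(dim=-1, index=labels)
print(picked)                              # tensor([[0.7000], [0.5000]])
```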
- torch.clamp()
```python
>>> a = torch.randn(4)
>>> a
tensor([-1.7120,  0.1734, -0.0478, -0.0922])
>>> torch.clamp(a, min=-0.5, max=0.5)
tensor([-0.5000,  0.1734, -0.0478, -0.0922])

>>> min = torch.linspace(-1, 1, steps=4)
>>> torch.clamp(a, min=min)
tensor([-1.0000,  0.1734,  0.3333,  1.0000])
```
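The loss above uses the in-place variant `clamp_min_`, which is what turns the -100 padding labels into a valid gather index of 0. A small sketch with made-up labels:

```python
import torch

labels = torch.tensor([[5], [-100], [2]])  # -100 marks padded positions
labels.clamp_min_(0)                       # in-place: anything below 0 becomes 0
print(labels)                              # tensor([[5], [0], [2]])
```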
- torch.masked_fill()
```python
import torch

sample = torch.FloatTensor([[1, 2, 3],
                            [4, 5, 6]])
mask = torch.FloatTensor([[0, 1, 1],
                          [1, 1, 0]])
out = sample.masked_fill(mask == 0, 10)  # fill positions where mask == 0 with 10
print(out)
# tensor([[10.,  2.,  3.],
#         [ 4.,  5., 10.]])
```
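The loss above uses the in-place variant `masked_fill_` to zero out the per-token losses at padded positions before summing. A minimal sketch with made-up values:

```python
import torch

nll_loss = torch.tensor([[7.0], [6.5], [7.2]])
padding_mask = torch.tensor([[False], [True], [False]])
nll_loss.masked_fill_(padding_mask, 0.0)   # in-place: zero out padded positions
print(nll_loss)                            # tensor([[7.0000], [0.0000], [7.2000]])
```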
- torch.gather()
```python
import torch

arr = torch.Tensor([[[1, 2, 3],
                     [4, 5, 6]],
                    [[7, 8, 9],
                     [10, 11, 12]]])
print(f"ARR={arr}, shape={arr.shape}")

indices = torch.arange(2).expand(2, 1, 2)
print(f"indices={indices}, shape={indices.shape}")

output1 = torch.gather(arr, 1, indices)
print(f"output1={output1}, shape={output1.shape}")

x, y, z = arr.shape
print(x, y, z)
for i in range(x):
    for j in range(y):
        for k in range(z):
            print(f"ARR[{i}][{j}][{k}]={arr[i][j][k]}")

x, y, z = indices.shape
print(x, y, z)
for i in range(x):
    for j in range(y):
        for k in range(z):
            print(f"indices[{i}][{j}][{k}]={indices[i][j][k]}")

x, y, z = indices.shape
print(x, y, z)
for i in range(x):
    for j in range(y):
        for k in range(z):
            dim = indices[i][j][k]
            print(f"output[{i}][{dim}][{k}]={arr[i][dim][k]}")

########################################################
# ARR=tensor([[[ 1.,  2.,  3.],
#              [ 4.,  5.,  6.]],
#             [[ 7.,  8.,  9.],
#              [10., 11., 12.]]]), shape=torch.Size([2, 2, 3])
# indices=tensor([[[0, 1]],
#                 [[0, 1]]]), shape=torch.Size([2, 1, 2])
# output1=tensor([[[ 1.,  5.]],
#                 [[ 7., 11.]]]), shape=torch.Size([2, 1, 2])
# arr.shape=[2,2,3]
# ARR[0][0][0]=1.0
# ARR[0][0][1]=2.0
# ARR[0][0][2]=3.0
# ARR[0][1][0]=4.0
# ARR[0][1][1]=5.0
# ARR[0][1][2]=6.0
# ARR[1][0][0]=7.0
# ARR[1][0][1]=8.0
# ARR[1][0][2]=9.0
# ARR[1][1][0]=10.0
# ARR[1][1][1]=11.0
# ARR[1][1][2]=12.0
# indices.shape=[2,1,2]
# indices[0][0][0]=0
# indices[0][0][1]=1
# indices[1][0][0]=0
# indices[1][0][1]=1
# indices.shape=[2,1,2]
# output[0][0][0]=1.0
# output[0][1][1]=5.0
# output[1][0][0]=7.0
# output[1][1][1]=11.0
```