huggingface compute_loss

  • Digging into the code to analyze compute_loss shows that the function call below does the main work.

    • compute_loss : https://github.com/huggingface/transformers/blob/984bc11b0882ff1e5b34ba717ea357e069ceced9/src/transformers/trainer_pt_utils.py#L545

    • https://huggingface.co/transformers/v4.4.2/_modules/transformers/trainer_pt_utils.html

      • LabelSmoother Class
      
      def __call__(self, model_output, labels):
          logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
          log_probs = -torch.nn.functional.log_softmax(logits, dim=-1)
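          # note the leading minus sign: log_probs holds -log p, so gathering at the label index below yields the NLL directly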
          if labels.dim() == log_probs.dim() - 1:
              labels = labels.unsqueeze(-1)
      
          padding_mask = labels.eq(self.ignore_index)
          # only entries equal to ignore_index (-100) become True; all others become False
          # padding_mask :  tensor([[True],  [True], [True],
          # ...,
          # [True], [True],[True]], device='cuda:0')
      
      
      
          # In case the ignore_index is -100, the gather will fail, so we replace labels by 0. The padding_mask
          # will ignore them in any case.
          labels.clamp_min_(0)
          # 0์ดํ•˜์˜ ๊ฐ’์€ 0์œผ๋กœ ์น˜ํ™˜ํ•œ๋‹ค
          # lamp label:   tensor([[0], [0], [0],
          # ...,
          # [0], [0], [0]], device='cuda:0')
      
      
          nll_loss = log_probs.gather(dim=-1, index=labels)
          # nll_loss :  tensor([[7.0193], [7.7626], [7.5791],
          # ...,
          # [7.0840], [7.2005], [7.5168]], device='cuda:0', grad_fn=<GatherBackward0>)
      
      
      
          smoothed_loss = log_probs.sum(dim=-1, keepdim=True)
          # smoothed_loss :  tensor([[1896513.1250], [1970930.8750], [1950668.2500],
          # ...,
          # [1780925.7500], [1792207.0000], [1794247.5000]], device='cuda:0', grad_fn=<SumBackward1>)
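          # summing -log p over the vocab dimension gives vocab_size times the cross-entropy against a
          # uniform distribution; the division by vocab_size happens further down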
      
      
      
          nll_loss.masked_fill_(padding_mask, 0.0)
          # tensor([[0.], [0.], [0.],
          # ...,
          # [0.], [0.], [0.]], device='cuda:0', grad_fn=<MaskedFillBackward0>)
      
      
      
          smoothed_loss.masked_fill_(padding_mask, 0.0)
      
          # Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
          num_active_elements = padding_mask.numel() - padding_mask.long().sum()
          # num_act :  tensor(407, device='cuda:0')
      
      
          nll_loss = nll_loss.sum() / num_active_elements
          # nll_loss.sum()/num_act :  tensor(11.9873, device='cuda:0', grad_fn=<DivBackward0>)
      
      
      
          smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
          #  smoothed_loss :  tensor(15.4860, device='cuda:0', grad_fn=<DivBackward0>)
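          # dividing by num_active_elements * vocab_size turns the summed -log p into a per-token, per-class mean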
      
      
          return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss
          # tensor(12.3371, device='cuda:0', grad_fn=<AddBackward0>)
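
      • Putting it together: the returned value is (1 - epsilon) * nll_loss + epsilon * smoothed_loss, i.e. the ordinary NLL blended with the cross-entropy against a uniform distribution over the vocabulary. A minimal sketch of calling the class directly (the shapes and epsilon here are illustrative):

        import torch
        from transformers.trainer_pt_utils import LabelSmoother

        smoother = LabelSmoother(epsilon=0.1)        # ignore_index defaults to -100
        logits = torch.randn(2, 5, 100)              # (batch, seq_len, vocab_size)
        labels = torch.randint(0, 100, (2, 5))
        labels[:, -1] = -100                         # mark the last position of each sequence as padding
        loss = smoother({"logits": logits}, labels)  # scalar tensor, as in the walkthrough above
        print(loss)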
      
      
      • Key functions
        • torch.eq()
          >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
          tensor([[ True, False],
                  [False,  True]])
          
        • torch.numel()
          • the number of elements in a tensor
          >>> a = torch.randn(1, 2, 3, 4, 5)
          >>> torch.numel(a)
          120
          >>> a = torch.zeros(4,4)
          >>> torch.numel(a)
          16
          
        • torch.gather()
          out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
          out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
          out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2        
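
          A minimal sketch of the dim=1 rule on a 2-D tensor (values chosen for illustration; the walkthrough at the bottom of this page traces the 3-D case by hand):
          >>> import torch
          >>> src = torch.tensor([[1, 2], [3, 4]])
          >>> idx = torch.tensor([[0, 0], [1, 0]])
          >>> torch.gather(src, 1, idx)  # out[i][j] = src[i][idx[i][j]]
          tensor([[1, 1],
                  [4, 3]])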
          
        • torch.clamp()
          >>> a = torch.randn(4)
          >>> a
          tensor([-1.7120,  0.1734, -0.0478, -0.0922])
          >>> torch.clamp(a, min=-0.5, max=0.5)
          tensor([-0.5000,  0.1734, -0.0478, -0.0922])
          
          >>> min = torch.linspace(-1, 1, steps=4)
          >>> torch.clamp(a, min=min)
          tensor([-1.0000,  0.1734,  0.3333,  1.0000])
          
        • torch.masked_fill()
          import torch

          sample = torch.FloatTensor([[1, 2, 3], [4, 5, 6]])
          mask = torch.FloatTensor([[0, 1, 1], [1, 1, 0]])
          out = sample.masked_fill(mask == 0, 10)
          print(out)
          # tensor([[10.,  2.,  3.],
          #         [ 4.,  5., 10.]])
          
          
          
  • torch.gather()

    import torch

    arr = torch.Tensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
    print(f"ARR={arr}, shape={arr.shape}")
    
    indices = torch.arange(2).expand(2,1,2)
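    # arange(2) -> tensor([0, 1]); expand() views it as shape (1, 1, 2) and broadcasts it (without copying) to (2, 1, 2)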
    print(f"indices={indices}, shape={indices.shape}")
    
    output1=torch.gather(arr, 1, indices )
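    # dim=1 means: output1[i][j][k] = arr[i][indices[i][j][k]][k]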
    print(f"output1={output1}, shape={output1.shape}")
    
    x, y, z = arr.shape
    print(x,y,z)
    
    for i in range(x):
       for j in range(y):
           for k in range(z):
               print(f"ARR[{i}][{j}][{k}]={arr[i][j][k]}")
    
    x, y, z = indices.shape
    print(x,y,z)
    
    for i in range(x):
       for j in range(y):
           for k in range(z):
               print(f"indices[{i}][{j}][{k}]={indices[i][j][k]}")
    
    
    x, y, z = indices.shape
    print(x,y,z)
    
    for i in range(x):
       for j in range(y):
           for k in range(z):
               dim = indices[i][j][k]
               print(f"output[{i}][{j}][{k}] = arr[{i}][{dim}][{k}] = {arr[i][dim][k]}")
    
    ########################################################
    ARR=tensor([[[ 1.,  2.,  3.],
             [ 4.,  5.,  6.]],
    
            [[ 7.,  8.,  9.],
             [10., 11., 12.]]]), shape=torch.Size([2, 2, 3])
    indices=tensor([[[0, 1]],

            [[0, 1]]]), shape=torch.Size([2, 1, 2])
    output1=tensor([[[ 1.,  5.]],

            [[ 7., 11.]]]), shape=torch.Size([2, 1, 2])
    2 2 3
    ARR[0][0][0]=1.0
    ARR[0][0][1]=2.0
    ARR[0][0][2]=3.0
    ARR[0][1][0]=4.0
    ARR[0][1][1]=5.0
    ARR[0][1][2]=6.0
    ARR[1][0][0]=7.0
    ARR[1][0][1]=8.0
    ARR[1][0][2]=9.0
    ARR[1][1][0]=10.0
    ARR[1][1][1]=11.0
    ARR[1][1][2]=12.0
    2 1 2
    indices[0][0][0]=0
    indices[0][0][1]=1
    indices[1][0][0]=0
    indices[1][0][1]=1
    2 1 2
    output[0][0][0] = arr[0][0][0] = 1.0
    output[0][0][1] = arr[0][1][1] = 5.0
    output[1][0][0] = arr[1][0][0] = 7.0
    output[1][0][1] = arr[1][1][1] = 11.0
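
    As a sanity check, the same values can be rebuilt by hand with the dim=1 index rule and compared against torch.gather (a sketch continuing the script above):

    manual = torch.empty_like(output1)
    x, y, z = indices.shape
    for i in range(x):
        for j in range(y):
            for k in range(z):
                manual[i][j][k] = arr[i][indices[i][j][k]][k]
    print(torch.equal(manual, output1))  # True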