Caption tutorial notes - HongkuanZhang/Technique-Notes GitHub Wiki

Data preprocessing: notes on utils.py

import os
import numpy as np
import h5py  # h5py creates HDF5 files (used here to store the images)
import json
import torch
from scipy.misc import imread, imresize # recent SciPy versions have removed these two functions; use imageio's imread and PIL's resize instead, see: https://blog.csdn.net/weekdawn/article/details/97777747
from tqdm import tqdm
from collections import Counter # Counter counts word occurrences conveniently, no need to hand-roll a loop that bumps a frequency dict entry for every word
from random import seed, choice, sample # choice picks one random element from an iterable; sample draws a subset (size <= len(iterable)) without replacement
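
# A minimal illustration (made-up values) of the three helpers used below:
#     word_freq = Counter(); word_freq.update(['a', 'cat', 'a'])   -> Counter({'a': 2, 'cat': 1})
#     choice(['x', 'y', 'z'])                                      -> one random element, e.g. 'y'
#     sample(['x', 'y', 'z'], k=2)                                 -> two distinct random elements, e.g. ['z', 'x']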


def create_input_files(dataset, karpathy_json_path, image_folder, captions_per_image, min_word_freq, output_folder,
                       max_len=100):
    """
    Creates input files for training, validation, and test data.
    :param dataset: name of dataset, one of 'coco', 'flickr8k', 'flickr30k'
    :param karpathy_json_path: path of Karpathy JSON file with splits and captions
    :param image_folder: folder with downloaded images
    :param captions_per_image: number of captions to sample per image
    :param min_word_freq: words occurring less frequently than this threshold are binned as <unk>s
    :param output_folder: folder to save files
    :param max_len: don't sample captions longer than this length
    """

    assert dataset in {'coco', 'flickr8k', 'flickr30k'}

    # Read Karpathy JSON
    with open(karpathy_json_path, 'r') as j: # the unzipped Karpathy split folder contains three JSON files, one per dataset, with splits and captions
        data = json.load(j) # what is loaded is a dict with two keys, dict_keys(['images', 'dataset']); 'images' maps to a list of dicts,
                            # one per image with its detailed info, and 'dataset' maps to the dataset name string such as 'coco'

    # Read image paths and captions for each image
    train_image_paths = []
    train_image_captions = []
    val_image_paths = []
    val_image_captions = []
    test_image_paths = []
    test_image_captions = []
    word_freq = Counter() # initialize a Counter to build the word -> frequency dictionary

    for img in data['images']: # read the multiple captions of each image
        captions = [] # captions holds the several annotations of one image
        for c in img['sentences']:
            # Update word frequency
            word_freq.update(c['tokens']) # Counter.update(token_list) updates the word -> frequency dict automatically
            if len(c['tokens']) <= max_len: # only captions (token lists) no longer than max_len are added to captions
                captions.append(c['tokens'])

        if len(captions) == 0: # if every caption of an image is too long, drop the image
            continue

        path = os.path.join(image_folder, img['filepath'], img['filename']) if dataset == 'coco' else os.path.join(
            image_folder, img['filename'])

        if img['split'] in {'train', 'restval'}:
            train_image_paths.append(path)
            train_image_captions.append(captions)
        elif img['split'] in {'val'}:
            val_image_paths.append(path)
            val_image_captions.append(captions)
        elif img['split'] in {'test'}:
            test_image_paths.append(path)
            test_image_captions.append(captions)

    # Sanity check
    # Each paths list holds one path per image and each captions list holds that image's caption lists, so their lengths must match (one image maps to several captions; the number of captions per image may differ, which is handled later)
    assert len(train_image_paths) == len(train_image_captions)
    assert len(val_image_paths) == len(val_image_captions)
    assert len(test_image_paths) == len(test_image_captions)

    # Create word map
    # Words whose frequency in the word -> frequency dict is not above min_word_freq are dropped; the remaining (relatively frequent) words form the word -> index map word_map, plus four special tokens; note that <pad> gets index 0
    words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    word_map = {k: v + 1 for v, k in enumerate(words)}
    word_map['<unk>'] = len(word_map) + 1
    word_map['<start>'] = len(word_map) + 1
    word_map['<end>'] = len(word_map) + 1
    word_map['<pad>'] = 0
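    # For illustration (hypothetical vocabulary), the resulting map looks like:
    #     word_map = {'a': 1, 'cat': 2, ..., '<unk>': n+1, '<start>': n+2, '<end>': n+3, '<pad>': 0}
    # where n is the number of kept words.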

    # Create a base/root name for all output files 
    # base name used in the word-map file name (and in all the other output files below)
    base_filename = dataset + '_' + str(captions_per_image) + '_cap_per_img_' + str(min_word_freq) + '_min_word_freq'

    # Save word map to a JSON
    with open(os.path.join(output_folder, 'WORDMAP_' + base_filename + '.json'), 'w') as j:
        json.dump(word_map, j)

    # Sample captions for each image, save images to HDF5 file, and captions and their lengths to JSON files
    # An HDF5 file is a tree-structured dictionary: the file itself is the root, and it contains groups/datasets as nodes (a group can in turn hold datasets or child groups).
    seed(123)
    for impaths, imcaps, split in [(train_image_paths, train_image_captions, 'TRAIN'),
                                   (val_image_paths, val_image_captions, 'VAL'),
                                   (test_image_paths, test_image_captions, 'TEST')]:

        with h5py.File(os.path.join(output_folder, split + '_IMAGES_' + base_filename + '.hdf5'), 'a') as h:
            # Make a note of the number of captions we are sampling per image 
            # store the captions_per_image parameter as an attribute of the HDF5 file
            h.attrs['captions_per_image'] = captions_per_image

            # Create dataset inside HDF5 file to store images
            # this HDF5 file holds only the train/test/val image dataset, shaped (number of images, 3 channels, 256, 256)
            images = h.create_dataset('images', (len(impaths), 3, 256, 256), dtype='uint8')

            print("\nReading %s images and captions, storing to file...\n" % split)

            enc_captions = [] # all captions of all images in this split (number of images * captions_per_image)
            caplens = [] # actual length of each caption (number of tokens + 2)

            for i, path in enumerate(tqdm(impaths)):

                # Sample captions
                if len(imcaps[i]) < captions_per_image: # if an image has fewer captions than required, repeat-sample from its own captions to top it up
                    captions = imcaps[i] + [choice(imcaps[i]) for _ in range(captions_per_image - len(imcaps[i]))]
                else: # otherwise (enough or more than enough), sample exactly captions_per_image of them
                    captions = sample(imcaps[i], k=captions_per_image)

                # Sanity check
                # make sure every image ends up with exactly captions_per_image captions
                assert len(captions) == captions_per_image

                # Read images
                # read the image; after the processing below its shape is (3, 256, 256)
                img = imread(impaths[i])
                if len(img.shape) == 2:
                    img = img[:, :, np.newaxis]
                    img = np.concatenate([img, img, img], axis=2)
                img = imresize(img, (256, 256))
                img = img.transpose(2, 0, 1)
                assert img.shape == (3, 256, 256)
                assert np.max(img) <= 255

                # Save image to HDF5 file
                # store each image in the images dataset (images has shape (image_num, 3, 256, 256))
                images[i] = img

                for j, c in enumerate(captions):  # replace each caption token with its index in word_map and add the special tokens
                    # Encode captions
                    # word_map.get(word, word_map['<unk>']) looks the word up in the dict and falls back to <unk> for unknown words
                    enc_c = [word_map['<start>']] + [word_map.get(word, word_map['<unk>']) for word in c] + [
                        word_map['<end>']] + [word_map['<pad>']] * (max_len - len(c))

                    # Find caption lengths
                    c_len = len(c) + 2

                    enc_captions.append(enc_c)
                    caplens.append(c_len) # note that each stored caption length is the token count plus 2 (<start> and <end>)

            # Sanity check
            # total number of captions in this split = number of images * captions_per_image
            assert images.shape[0] * captions_per_image == len(enc_captions) == len(caplens)

            # Save encoded captions and their lengths to JSON files
            # enc_captions holds all captions of all images, with shape (images.shape[0] * captions_per_image, padded_length)
            with open(os.path.join(output_folder, split + '_CAPTIONS_' + base_filename + '.json'), 'w') as j:
                json.dump(enc_captions, j)
            # caplens holds the actual length of every caption, with shape (images.shape[0] * captions_per_image,)
            with open(os.path.join(output_folder, split + '_CAPLENS_' + base_filename + '.json'), 'w') as j:
                json.dump(caplens, j)
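
# A minimal usage sketch of the function above (the paths are placeholders, adjust to your setup):
#
#     create_input_files(dataset='coco',
#                        karpathy_json_path='path/to/dataset_coco.json',
#                        image_folder='path/to/coco/images',
#                        captions_per_image=5,
#                        min_word_freq=5,
#                        output_folder='path/to/output',
#                        max_len=50)
#
# This writes one WORDMAP_*.json plus, per split, a *_IMAGES_*.hdf5, a *_CAPTIONS_*.json and a *_CAPLENS_*.json file.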


def init_embedding(embeddings): # initialize the embedding weights with a range derived from the embedding dimension
    """
    Fills embedding tensor with values from the uniform distribution.
    :param embeddings: embedding tensor
    """
    bias = np.sqrt(3.0 / embeddings.size(1))
    torch.nn.init.uniform_(embeddings, -bias, bias)


def load_embeddings(emb_file, word_map): # load embeddings from a GloVe file and overwrite the matching rows of the embedding tensor
    """
    Creates an embedding tensor for the specified word map, for loading into the model.
    :param emb_file: file containing embeddings (stored in GloVe format)
    :param word_map: word map
    :return: embeddings in the same order as the words in the word map, dimension of embeddings
    """

    # Find embedding dimension
    with open(emb_file, 'r') as f:
        emb_dim = len(f.readline().split(' ')) - 1 # minus 1 because the first field of each line is the word itself

    vocab = set(word_map.keys())

    # Create tensor to hold embeddings, initialize
    embeddings = torch.FloatTensor(len(vocab), emb_dim)
    init_embedding(embeddings) # initialize the embeddings (words missing from the GloVe file keep this initialization)

    # Read embedding file
    print("\nLoading embeddings...")
    for line in open(emb_file, 'r'):
        line = line.split(' ')

        emb_word = line[0]
        # Two built-ins are combined here: filter(function, iterable) keeps the elements for which the function is truthy
        # (e.g. wrapping it in list(filter(...)) returns the filtered list); here it drops empty strings "" and whitespace-only strings " ".
        # map(function, filtered) then converts every remaining element to float, and the outer list(...) materialises the result.
        # This map + filter combination is very common in other data-processing code as well.
        embedding = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1:])))

        # Ignore word if not in train_vocab
        # words that are not in the vocabulary do not change the embedding tensor (e.g. <unk> keeps its initialization)
        if emb_word not in vocab:
            continue

        embeddings[word_map[emb_word]] = torch.FloatTensor(embedding)

    return embeddings, emb_dim
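
# A small illustration (made-up line) of the map + filter idiom used above:
#     line = 'cat 0.1 0.2  0.3\n'.split(' ')
#     list(map(float, filter(lambda n: n and not n.isspace(), line[1:])))   -> [0.1, 0.2, 0.3]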


def clip_gradient(optimizer, grad_clip):
    """
    Clips gradients computed during backpropagation to avoid explosion of gradients.
    :param optimizer: optimizer with the gradients to be clipped
    :param grad_clip: clip value
    """
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)
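
# For reference, PyTorch also ships a helper that clips parameter gradients by value;
# the function above does essentially the same thing manually, per optimizer group:
#
#     torch.nn.utils.clip_grad_value_(decoder.parameters(), grad_clip)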


def save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer,
                    bleu4, is_best):
    """
    Saves model checkpoint.
    :param data_name: base name of processed dataset
    :param epoch: epoch number
    :param epochs_since_improvement: number of epochs since last improvement in BLEU-4 score
    :param encoder: encoder model
    :param decoder: decoder model
    :param encoder_optimizer: optimizer to update encoder's weights, if fine-tuning
    :param decoder_optimizer: optimizer to update decoder's weights
    :param bleu4: validation BLEU-4 score for this epoch
    :param is_best: is this checkpoint the best so far?
    """
    state = {'epoch': epoch,
             'epochs_since_improvement': epochs_since_improvement,
             'bleu-4': bleu4,
             'encoder': encoder,
             'decoder': decoder,
             'encoder_optimizer': encoder_optimizer,
             'decoder_optimizer': decoder_optimizer}
    filename = 'checkpoint_' + data_name + '.pth.tar'
    torch.save(state, filename)
    # If this checkpoint is the best so far, store a copy so it doesn't get overwritten by a worse checkpoint
    if is_best:
        torch.save(state, 'BEST_' + filename)


class AverageMeter(object):
    """
    Keeps track of most recent, average, sum, and count of a metric.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
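
# Usage sketch: update() takes the current (mean) value and how many samples it covers,
# so avg is always a sample-weighted running mean.
#
#     losses = AverageMeter()
#     losses.update(0.9, n=32)       # batch of 32 with mean loss 0.9
#     losses.update(0.7, n=16)       # batch of 16 with mean loss 0.7
#     print(losses.val, losses.avg)  # 0.7, (0.9*32 + 0.7*16) / 48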


def adjust_learning_rate(optimizer, shrink_factor):
    """
    Shrinks learning rate by a specified factor.
    :param optimizer: optimizer whose learning rate must be shrunk.
    :param shrink_factor: factor in interval (0, 1) to multiply learning rate with.
    """

    print("\nDECAYING learning rate.")
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("The new learning rate is %f\n" % (optimizer.param_groups[0]['lr'],))


def accuracy(scores, targets, k):
    """
    Computes top-k accuracy, from predicted and true labels.
    :param scores: scores from the model
    :param targets: true labels
    :param k: k in top-k accuracy
    :return: top-k accuracy
    """

    batch_size = targets.size(0)
    # Tensor.topk picks the k largest (or smallest) values along a dimension; here k values are taken along dim=1 (the vocabulary scores of each decoding step), and the two True flags mean "take the largest values" and "return them sorted"
    _, ind = scores.topk(k, 1, True, True) 
    # Because the model keeps the top k outputs per time step, the correct answer of each time step also has to be expanded to k columns before comparing.
    # For example, if a caption has 4 target tokens, targets has shape (4,), while ind has shape (4, 2) for top-k=2, so targets is expanded to (4, 2) as well.
    # correct has shape (b_s, k); here b_s is not the number of captions in a batch but the total number of valid time steps (the sum over the batch after pack_padded_sequence).
    # Because of eq(), every element is True/False (whether the elements at the same position are equal), e.g.:
    # tensor([[False,  True],
    #   [False,  True],
    #   [False,  True],
    #   [False,  True]])
    correct = ind.eq(targets.view(-1, 1).expand_as(ind)) 
    correct_total = correct.view(-1).float().sum()  # count the number of True entries (a 0-d tensor)
    return correct_total.item() * (100.0 / batch_size) # .item() extracts the scalar; batch_size equals the total number of valid time steps (i.e. valid tokens), so correct/batch_size is the accuracy
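
A tiny worked example (made-up scores) of the top-k bookkeeping inside accuracy(): with k=2, scores of shape (3, 5) and targets of shape (3,), topk returns the indices of the two highest scores per row, targets.view(-1, 1).expand_as(ind) repeats each target across the k columns, and eq marks the hits.

import torch

scores = torch.tensor([[0.1, 0.5, 0.2, 0.1, 0.1],
                       [0.3, 0.1, 0.4, 0.1, 0.1],
                       [0.2, 0.2, 0.1, 0.4, 0.1]])
targets = torch.tensor([1, 0, 2])
_, ind = scores.topk(2, 1, True, True)               # indices of the 2 best classes per row, e.g. [[1, 2], [2, 0], [3, 0]]
correct = ind.eq(targets.view(-1, 1).expand_as(ind)) # [[True, False], [False, True], [False, False]]
top2_acc = correct.view(-1).float().sum().item() * (100.0 / targets.size(0))  # 66.67: 2 of 3 targets are in the top 2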

Reading the preprocessed data and building the dataset: constructing the Dataset class

import torch
from torch.utils.data import Dataset
import h5py
import json
import os

class CaptionDataset(Dataset): # CaptionDataset subclasses PyTorch's Dataset; a DataLoader will use it later to read batches
    """
    A PyTorch Dataset class to be used in a PyTorch DataLoader to create batches.
    """

    def __init__(self, data_folder, data_name, split, transform=None):
        """
        :param data_folder: folder where data files are stored
        :param data_name: base name of processed datasets
        :param split: split, one of 'TRAIN', 'VAL', or 'TEST'
        :param transform: image transform pipeline
        """
        self.split = split
        assert self.split in {'TRAIN', 'VAL', 'TEST'}

        # Open hdf5 file where images are stored
        self.h = h5py.File(os.path.join(data_folder, self.split + '_IMAGES_' + data_name + '.hdf5'), 'r')
        self.imgs = self.h['images']

        # Captions per image
        self.cpi = self.h.attrs['captions_per_image']

        # Load encoded captions (completely into memory)
        with open(os.path.join(data_folder, self.split + '_CAPTIONS_' + data_name + '.json'), 'r') as j:
            self.captions = json.load(j)

        # Load caption lengths (completely into memory)
        with open(os.path.join(data_folder, self.split + '_CAPLENS_' + data_name + '.json'), 'r') as j:
            self.caplens = json.load(j)

        # PyTorch transformation pipeline for the image (normalizing, etc.)
        # In practice the transform argument is the standard normalization pipeline, e.g.:
        # import torchvision.transforms as transforms
        # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
        #                                  std=[0.229, 0.224, 0.225])
        # CaptionDataset(..., transform=transforms.Compose([normalize]))
        # transforms.Compose([a, b, c, d])(input) applies the operations a, b, c, d to the input in order; only normalization is used here, so the list has a single item.
        # The mean and std values are the usual ImageNet statistics and are essentially fixed; for training from scratch they could be changed to [0.5, 0.5, 0.5], but this is rarely done.
        self.transform = transform

        # Total number of datapoints
        # the dataset size is the total number of captions in the current split
        self.dataset_size = len(self.captions)

    def __getitem__(self, i):
        # Remember, the Nth caption corresponds to the (N // captions_per_image)th image
        # the Nth caption belongs to the (N // cpi)th image; read that image and divide by 255 so every value falls in [0, 1]
        img = torch.FloatTensor(self.imgs[i // self.cpi] / 255.)
        # normalize the image
        if self.transform is not None:
            img = self.transform(img)

        caption = torch.LongTensor(self.captions[i]) # note that captions[i] is a list holding all token indices of one caption

        caplen = torch.LongTensor([self.caplens[i]]) # caplens[i] is a scalar; wrapping it in [] turns it into a 1-element tensor,
                                                     # and the decoder later squeezes caption_lengths from (b_s, 1) back to (b_s,)

        if self.split == 'TRAIN':  # compare with ==, not "is": identity comparison against a string literal is unreliable
            return img, caption, caplen
        else:
            # For validation or testing, also return all 'captions_per_image' captions to find BLEU-4 score
            # for val and test, besides image / caption (one) / caption length (one integer), also return all captions of that image
            all_captions = torch.LongTensor(
                self.captions[((i // self.cpi) * self.cpi):(((i // self.cpi) * self.cpi) + self.cpi)])
            return img, caption, caplen, all_captions

    def __len__(self):
        return self.dataset_size
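
A minimal sketch (the data folder path is a placeholder) of how this class is typically consumed by a DataLoader, mirroring the train.py setup further below:

import torchvision.transforms as transforms
from torch.utils.data import DataLoader

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_set = CaptionDataset('path/to/data', 'coco_5_cap_per_img_5_min_word_freq', 'TRAIN',
                           transform=transforms.Compose([normalize]))
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=1, pin_memory=True)
imgs, caps, caplens = next(iter(train_loader))  # (32, 3, 256, 256), (32, max_len + 2), (32, 1)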

Building the attention-based Encoder-Decoder model: notes on models.py

import torch
from torch import nn
import torchvision

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Encoder(nn.Module):
    """
    Encoder. Essentially a ResNet-101 with the last two (classification) layers removed.
    Its output is the encoded image.
    """

    def __init__(self, encoded_image_size=14): 
        super(Encoder, self).__init__()

        # the encoded image has spatial size 14 x 14
        self.enc_image_size = encoded_image_size
 
        # load a pretrained ResNet-101 from torchvision
        resnet = torchvision.models.resnet101(pretrained=True)  # pretrained ImageNet ResNet-101

        # Remove linear and pool layers (since we're not doing classification)
        # Remove the last two classification layers of ResNet-101: the average-pooling layer and the fully connected layer.
        # The trick: model.children() yields the sub-modules, which are put into a list, the last two entries are dropped,
        # and nn.Sequential(*modules) stitches the remaining modules back into a network.
        # PS: the difference between building a network with nn.Sequential(*layers) and with nn.Module is that the former simply stacks
        # the layers and processes the input in stacking order, whereas with nn.Module you define the layers in __init__ and also
        # wire up how they are connected in forward().
        modules = list(resnet.children())[:-2]
        self.resnet = nn.Sequential(*modules)

        # Resize image to fixed size to allow input images of variable size
        # nn.AdaptiveAvgPool2d pools the feature map with automatically computed parameters,
        # so the output spatial size is always (encoded_image_size, encoded_image_size)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))

        self.fine_tune()

    def forward(self, images):
        """
        Forward propagation.
        :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
        :return: encoded images
        """
        # the final output shape is (batch_size, encoded_image_size, encoded_image_size, 2048),
        # where 2048 is the number of channels, referred to as encoder_dim in the later code
        out = self.resnet(images)  # (batch_size, 2048, image_size/32, image_size/32)
        out = self.adaptive_pool(out)  # (batch_size, 2048, encoded_image_size, encoded_image_size)
        out = out.permute(0, 2, 3, 1)  # (batch_size, encoded_image_size, encoded_image_size, 2048)
        return out

    def fine_tune(self, fine_tune=True):
        """
        fine-tune函数使得resnet中只有第五层至最后一层(2th卷积层到4th卷积层)的参数被fine-tune
        而第一到第四层参数不变动(保持预训练中学习到的基础知识)
        Allow or prevent the computation of gradients for convolutional blocks 2 through 4 of the encoder.
        :param fine_tune: Allow?
        """
        for p in self.resnet.parameters():
            p.requires_grad = False
        # If fine-tuning, only fine-tune convolutional blocks 2 through 4
        for c in list(self.resnet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune


class Attention(nn.Module):
    """
    Attention Network. It computes the attention between the decoder hidden state and the encoder output.
    It returns the attention-weighted encoding and the attention weights.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        """
        :param encoder_dim: feature size of encoded images
        :param decoder_dim: size of decoder's RNN
        :param attention_dim: size of the attention network
        """
        super(Attention, self).__init__()
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)  # project encoder_dim (2048) to attention_dim so attention scores can be computed
        self.decoder_att = nn.Linear(decoder_dim, attention_dim)  # likewise project decoder_dim to attention_dim
        self.full_att = nn.Linear(attention_dim, 1)  # the projected encoder and decoder features (both attention_dim) are summed and passed through this linear layer to get the attention score
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)  # softmax layer to calculate weights

    def forward(self, encoder_out, decoder_hidden):
        """
        Forward propagation.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
        Note: num_pixels = 14 * 14, because the encoder output is reshaped beforehand with view(b_s, -1, encoder_dim) into this shape.
        :param decoder_hidden: previous decoder output, a tensor of dimension (batch_size, decoder_dim)
        :return: attention weighted encoding, weights
        """
        att1 = self.encoder_att(encoder_out)  # (batch_size, num_pixels, attention_dim)
        att2 = self.decoder_att(decoder_hidden)  # (batch_size, attention_dim)
        att = self.full_att(self.relu(att1 + att2.unsqueeze(1))).squeeze(2)  # (batch_size, num_pixels)
        alpha = self.softmax(att)  # (batch_size, num_pixels); alpha is the attention matrix with batch_size rows and 14*14 columns, each entry being the attention weight on one pixel
        # multiply the encoder output by the attention weights and sum over the pixel dimension, giving the weighted encoder output of shape (batch_size, encoder_dim)
        attention_weighted_encoding = (encoder_out * alpha.unsqueeze(2)).sum(dim=1)  # (batch_size, encoder_dim)

        # return the attention-weighted encoder output and the attention weights
        return attention_weighted_encoding, alpha
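
# A quick shape check (made-up sizes) of the module above:
#
#     att = Attention(encoder_dim=2048, decoder_dim=512, attention_dim=512)
#     enc = torch.randn(4, 196, 2048)   # batch of 4 images, 14*14 = 196 pixels
#     hid = torch.randn(4, 512)         # previous decoder hidden state
#     awe, alpha = att(enc, hid)        # awe: (4, 2048), alpha: (4, 196)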


class DecoderWithAttention(nn.Module):
    """
    Decoder.
    It decodes the encoder output; at every step it takes the word predicted at the previous step (during training, teacher forcing feeds the gold previous word),
    computes attention between the previous decoder hidden state and the encoder output to get an attention-weighted encoding, and concatenates that encoding with the previous word's embedding as the full input of the current decoding step.
    """

    def __init__(self, attention_dim, embed_dim, decoder_dim, vocab_size, encoder_dim=2048, dropout=0.5):
        """
        :param attention_dim: size of attention network
        :param embed_dim: embedding size
        :param decoder_dim: size of decoder's RNN
        :param vocab_size: size of vocabulary
        :param encoder_dim: feature size of encoded images
        :param dropout: dropout
        """
        super(DecoderWithAttention, self).__init__()

        self.encoder_dim = encoder_dim
        self.attention_dim = attention_dim
        self.embed_dim = embed_dim
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size
        self.dropout = dropout

        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)  # attention network

        self.embedding = nn.Embedding(vocab_size, embed_dim)  # embedding layer
        self.dropout = nn.Dropout(p=self.dropout)
        self.decode_step = nn.LSTMCell(embed_dim + encoder_dim, decoder_dim, bias=True)  # decoding LSTMCell
        self.init_h = nn.Linear(encoder_dim, decoder_dim)  # linear layer to find initial hidden state of LSTMCell
        self.init_c = nn.Linear(encoder_dim, decoder_dim)  # linear layer to find initial cell state of LSTMCell
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)  # linear layer to create a sigmoid-activated gate
        self.sigmoid = nn.Sigmoid()
        self.fc = nn.Linear(decoder_dim, vocab_size)  # linear layer to find scores over vocabulary
        self.init_weights()  # initialize some layers with the uniform distribution

    def init_weights(self):
        """
        Initialize the parameters of the embedding and the fc layer via layer.weight/bias.data.method_(-XX, XX).
        Initializes some parameters with values from the uniform distribution, for easier convergence.
        """
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-0.1, 0.1)

    def load_pretrained_embeddings(self, embeddings):
        """
        Load pretrained embeddings such as GloVe.
        Loads embedding layer with pre-trained embeddings.
        :param embeddings: pre-trained embeddings
        """
        self.embedding.weight = nn.Parameter(embeddings)

    def fine_tune_embeddings(self, fine_tune=True):
        """
        To fine-tune the embedding layer, use this function to set requires_grad=True.
        Allow fine-tuning of embedding layer? (Only makes sense to not-allow if using pre-trained embeddings).
        :param fine_tune: Allow?
        """
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune

    def init_hidden_state(self, encoder_out):
        """
        Creates the initial hidden and cell states for the decoder's LSTM based on the encoded images.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
        :return: hidden state, cell state
        """
        mean_encoder_out = encoder_out.mean(dim=1) # average the encoder output over all pixels, giving a tensor of shape (b_s, encoder_dim)
        h = self.init_h(mean_encoder_out)  # (batch_size, decoder_dim); a linear layer maps it to (batch_size, decoder_dim) to serve as the decoder's h0
        c = self.init_c(mean_encoder_out)
        return h, c

    def forward(self, encoder_out, encoded_captions, caption_lengths):
        """
        Forward propagation.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, enc_image_size, enc_image_size, encoder_dim)
        :param encoded_captions: encoded captions, a tensor of dimension (batch_size, max_caption_length)
        :param caption_lengths: caption lengths, a tensor of dimension (batch_size, 1)
        :return: scores for vocabulary, sorted encoded captions, decode lengths, weights, sort indices
        """

        batch_size = encoder_out.size(0)
        encoder_dim = encoder_out.size(-1)
        vocab_size = self.vocab_size

        # Flatten image
        encoder_out = encoder_out.view(batch_size, -1, encoder_dim)  # (batch_size, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # Sort input data by decreasing lengths; why? apparent below
        # sort caption_lengths (shape (batch_size, 1)) from longest to shortest, keeping the corresponding sort indices
        caption_lengths, sort_ind = caption_lengths.squeeze(1).sort(dim=0, descending=True)
        # reorder the encoder output and the captions with those indices
        encoder_out = encoder_out[sort_ind]
        encoded_captions = encoded_captions[sort_ind]

        # Embedding
        # embed all the (gold) captions
        embeddings = self.embedding(encoded_captions)  # (batch_size, max_caption_length, embed_dim)

        # Initialize LSTM state
        h, c = self.init_hidden_state(encoder_out)  # (batch_size, decoder_dim)

        # We won't decode at the <end> position, since we've finished generating as soon as we generate <end>
        # So, decoding lengths are actual lengths - 1
        # each caption would normally be decoded for as many steps as its length, but once <end> is produced nothing needs to be fed back in, hence the minus 1
        decode_lengths = (caption_lengths - 1).tolist()

        # Create tensors to hold word predicion scores and alphas
        # predictions stores, for every caption in the batch, the decoder's vocabulary scores at every time step
        # (for captions shorter than the longest one, the unused positions stay zero)
        predictions = torch.zeros(batch_size, max(decode_lengths), vocab_size).to(device)
        # alphas stores, for every caption in the batch, the attention weights computed at every decoding step
        alphas = torch.zeros(batch_size, max(decode_lengths), num_pixels).to(device)

        # At each time-step, decode by
        # attention-weighing the encoder's output based on the decoder's previous hidden state output
        # then generate a new word in the decoder with the previous word and the attention weighted encoding
        for t in range(max(decode_lengths)):
            # compute the effective batch size batch_size_t at step t; because captions are sorted from long to short, batch_size_t shrinks as t grows (fewer and fewer captions are longer than t)
            batch_size_t = sum([l > t for l in decode_lengths])
            # take the encoder outputs and decoder hidden states of those batch_size_t images and compute the weighted encoder output and attention weights
            attention_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t],
                                                                h[:batch_size_t])
            # map the batch_size_t decoder hidden states to shape (batch_size_t, encoder_dim) and pass them through a sigmoid to form the gate
            gate = self.sigmoid(self.f_beta(h[:batch_size_t]))  # gating scalar, (batch_size_t, encoder_dim)
            # multiply the gate with the weighted encoder output just computed, giving the final attention-weighted encoding
            attention_weighted_encoding = gate * attention_weighted_encoding
            # the input of the current decoding step is the concatenation of the batch_size_t word embeddings embeddings[:batch_size_t, t, :] (shape (b_s_t, embed_dim))
            # and the attention-weighted encoder output attention_weighted_encoding (shape (b_s_t, encoder_dim)),
            # giving shape (b_s_t, embed_dim + encoder_dim);
            # this input, together with h and c, is fed to the LSTMCell, which returns the updated h and c (shape (batch_size_t, decoder_dim))
            h, c = self.decode_step(
                torch.cat([embeddings[:batch_size_t, t, :], attention_weighted_encoding], dim=1),
                (h[:batch_size_t], c[:batch_size_t]))  # (batch_size_t, decoder_dim)
            # the updated h is used to compute preds of shape (batch_size_t, vocab_size)
            preds = self.fc(self.dropout(h))  # (batch_size_t, vocab_size)
            # store the predictions and attention weights in the corresponding rows
            predictions[:batch_size_t, t, :] = preds
            alphas[:batch_size_t, t, :] = alpha

        return predictions, encoded_captions, decode_lengths, alphas, sort_ind
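
A small shape sketch of the forward pass above, using deliberately tiny made-up dimensions (encoder_dim=32 instead of 2048, a vocabulary of 20 words):

import torch

decoder = DecoderWithAttention(attention_dim=16, embed_dim=8, decoder_dim=16, vocab_size=20, encoder_dim=32)
encoder_out = torch.randn(2, 14, 14, 32)          # as if produced by Encoder(encoded_image_size=14)
encoded_captions = torch.randint(0, 20, (2, 12))  # two padded captions of maximum length 12
caption_lengths = torch.tensor([[12], [9]])       # actual lengths, shape (batch_size, 1)
preds, caps_sorted, dec_lens, alphas, sort_ind = decoder(encoder_out, encoded_captions, caption_lengths)
# preds:  (2, 11, 20)   one step fewer than the longest caption (no decoding at <end>)
# alphas: (2, 11, 196)  attention weights over the 14*14 = 196 pixels at every step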

Training the encoder-decoder model: notes on train.py

import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data # imported for its DataLoader class
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from datasets import * # imports the custom CaptionDataset class
from utils import *
from nltk.translate.bleu_score import corpus_bleu

# Data parameters
data_folder = '/media/ssd/caption data'  # folder with data files saved by create_input_files.py
data_name = 'coco_5_cap_per_img_5_min_word_freq'  # base name shared by data files

# Model parameters
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# Training parameters
start_epoch = 0
epochs = 120  # number of epochs to train for (if early stopping is not triggered)
epochs_since_improvement = 0  # number of epochs since the last improvement of the validation BLEU-4 score
batch_size = 32
workers = 1  # for data-loading; right now, only 1 works with h5py

# the decoder's learning rate is larger than the encoder's, because the decoder is randomly initialized and needs a larger learning rate to converge
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder

# gradient clipping prevents exploding gradients: whenever a gradient exceeds this hyperparameter it is clamped
grad_clip = 5.  # clip gradients at an absolute value of
# the 'doubly stochastic attention' regularization from the Show, Attend and Tell paper encourages the attention weights of each pixel to sum to roughly 1 over all decoding steps
alpha_c = 1.  # regularization parameter for 'doubly stochastic attention', as in the paper
# every validation records the best BLEU score so far and updates best_bleu4 and epochs_since_improvement (reset to 0) accordingly
best_bleu4 = 0.  # BLEU-4 score right now
# report training/validation status every 100 batches
print_freq = 100  # print training/validation stats every __ batches
# whether to fine-tune the encoder
fine_tune_encoder = False  # fine-tune encoder?
# whether to resume training from a previously saved checkpoint
checkpoint = None  # path to checkpoint, None if none


def main():
    """
    Training and validation.
    """
    # Python functions can read module-level globals, but assigning to them inside a function would not change the globals;
    # declaring them with global means assignments inside the function (e.g. to best_bleu4) also change the module-level variables
    global best_bleu4, epochs_since_improvement, checkpoint, start_epoch, fine_tune_encoder, data_name, word_map

    # load the word -> index dictionary word_map
    word_map_file = os.path.join(data_folder, 'WORDMAP_' + data_name + '.json')
    with open(word_map_file, 'r') as j:
        word_map = json.load(j)

    # initialize the models, or load them from a checkpoint
    if checkpoint is None:
        # initialize the decoder
        decoder = DecoderWithAttention(attention_dim=attention_dim,
                                       embed_dim=emb_dim,
                                       decoder_dim=decoder_dim,
                                       vocab_size=len(word_map),
                                       dropout=dropout)
        
        # initialize the decoder optimizer; filter keeps only the parameters that require gradients
        decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                             lr=decoder_lr)
        
        # initialize the encoder and its optimizer; encoder.fine_tune controls whether the encoder is fine-tuned by toggling requires_grad on the relevant parameters
        encoder = Encoder()
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=encoder_lr) if fine_tune_encoder else None

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_bleu4 = checkpoint['bleu-4']
        decoder = checkpoint['decoder']
        decoder_optimizer = checkpoint['decoder_optimizer']
        encoder = checkpoint['encoder']
        encoder_optimizer = checkpoint['encoder_optimizer']
        if fine_tune_encoder is True and encoder_optimizer is None:
            encoder.fine_tune(fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                                 lr=encoder_lr)

    # Move the encoder and decoder to the GPU. For a module, model = model.to(device) is equivalent to model.to(device),
    # but for a tensor a, a = a.to(device) and a.to(device) are NOT equivalent: for tensors, to(device) returns a copy on the GPU,
    # so with the former a becomes a CUDA tensor while with the latter a stays an ordinary CPU tensor; tensors must be reassigned to actually end up on the GPU.
    # Tensors are sometimes moved ad hoc, as in model(a.to(device), b.to(device), ...); the model output will then also be a CUDA tensor.
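    # A tiny illustration of the tensor case:
    #     x = torch.zeros(3)
    #     x.to(device)       # return value discarded, x is still on the CPU
    #     x = x.to(device)   # x is now on the GPU (if one is available)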
    decoder = decoder.to(device)
    encoder = encoder.to(device)

    # Loss function, also moved to the GPU
    criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders; image normalization
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    
    # build the DataLoaders for the train and val splits
    train_loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TRAIN', transform=transforms.Compose([normalize])),
        batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'VAL', transform=transforms.Compose([normalize])),
        batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)

    # Epochs: start training
    for epoch in range(start_epoch, epochs):

        # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
        # if there is no improvement for 8 consecutive epochs the learning rate is decayed; after 20 epochs without improvement training stops
        if epochs_since_improvement == 20:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(decoder_optimizer, 0.8)
            if fine_tune_encoder:
                adjust_learning_rate(encoder_optimizer, 0.8)

        # One epoch's training
        # the loader yields one batch of training data at a time, and each batch is trained on
        train(train_loader=train_loader,
              encoder=encoder,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch)

        # One epoch's validation
        # run the model over the validation data, then compute and return the BLEU score
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                decoder=decoder,
                                criterion=criterion)

        # Check if there was an improvement
        # if the current BLEU beats the previous best, update the best and reset epochs_since_improvement to zero;
        # otherwise increment epochs_since_improvement by 1
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        # Save checkpoint: a checkpoint is saved every epoch, best model or not, and since the file name never changes it overwrites the previous one.
        # The best-performing model additionally gets a copy prefixed with 'BEST_' so it cannot be overwritten by a non-best checkpoint (best checkpoints still overwrite each other).
        save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
                        decoder_optimizer, recent_bleu4, is_best)


def train(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch):
    """
    Performs one epoch's training.
    :param train_loader: DataLoader for training data
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param encoder_optimizer: optimizer to update encoder's weights (if fine-tuning)
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number
    """
    
    # put the encoder and decoder into training mode
    decoder.train()  # train mode (dropout and batchnorm is used)
    encoder.train()

    # initialize four meters: per-batch processing time, data loading time, loss, and accuracy
    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    start = time.time()

    # Batches
    # each batch provides the images, the captions, and the list of actual caption lengths
    for i, (imgs, caps, caplens) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to GPU, if available
        # move all three tensors to the GPU
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        # feed the data through the encoder and the decoder
        imgs = encoder(imgs)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        # the decoder's outputs start one position after <start> and run up to <end>, so the targets also start from position 1
        targets = caps_sorted[:, 1:]

        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        # This gathers the softmax outputs of all valid time steps of the batch into one tensor of shape (total_valid_decode_steps, num_class).
        # Note: the tutorial originally wrote "scores, _ = pack_padded_sequence(...)", which assumes the old two-field PackedSequence;
        # pack_padded_sequence actually returns a namedtuple like
        # PackedSequence(data=tensor([[0.3482, 0.2628, 0.3890],
        # [0.3099, 0.4163, 0.2738],
        # [0.3622, 0.4232, 0.2147],
        # [0.2595, 0.4771, 0.2634],
        # [0.2440, 0.5233, 0.2326],
        # [0.3541, 0.2688, 0.3771],
        # [0.3105, 0.4172, 0.2723]]), batch_sizes=tensor([2, 2, 2, 1]), sorted_indices=None, unsorted_indices=None)
        # which in recent PyTorch versions has four fields, so two-element unpacking fails; take the data field (equivalently, index [0]) instead:
        scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
        targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

        # Calculate loss; loss = sum(-log(probability of the correct label)) / total_decode_steps, i.e. the sum is averaged!
        loss = criterion(scores, targets)

        # Add doubly stochastic attention regularization: penalizes attention weights whose per-pixel sums over time stray from 1 (see the Show, Attend and Tell paper)
        loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

        # Back prop: zero the gradients after computing the loss, then backpropagate
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        if grad_clip is not None:
            clip_gradient(decoder_optimizer, grad_clip)
            if encoder_optimizer is not None:
                clip_gradient(encoder_optimizer, grad_clip)

        # Update weights (model parameters)
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()

        # Keep track of metrics
        # Note that the loss and accuracy meters are updated with n = the total number of valid decoding steps in the batch,
        # so the per-step average loss/accuracy times n gives the batch's total loss / total number of correct predictions plus the total step count;
        # the meters then use total loss / total steps and total correct / total steps to track the running average loss / accuracy over all data seen so far.
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        # print the meters every print_freq (100) batches
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
                                                                          batch_time=batch_time,
                                                                          data_time=data_time, loss=losses,
                                                                          top5=top5accs))


def validate(val_loader, encoder, decoder, criterion):
    """
    validate is basically the same as train, except that it additionally computes the BLEU score.
    Performs one epoch's validation.
    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :return: BLEU-4 score
    """
    # Switch to eval mode, which disables dropout and batchnorm updates, since both are training-time-only behaviours.
    # Dropout combats overfitting by dropping (not updating) a random subset of units in each layer, which amounts to training many different networks and averaging them.
    # For details see: https://www.jianshu.com/p/ef2a7a78aa83
    # BatchNorm speeds up convergence and fights vanishing gradients: it computes the mean and variance over the batch dimension and pulls the activations back towards a standard distribution
    # (after many layers the activations would otherwise drift too large or too small, into regions where the activation function's gradient is ~0). For details see:
    # https://www.cnblogs.com/guoyaohua/p/8724433.html
    decoder.eval()  
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # explicitly disable gradient calculation to avoid CUDA memory error
    # solves the issue #57
    # use with torch.no_grad() to disable gradient tracking; always do this for validation and testing!
    with torch.no_grad():
        # the loop below mirrors train(); note that each batch additionally carries all the reference captions of each image (allcaps)
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

            # Move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            if encoder is not None:
                imgs = encoder(imgs)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            # Note that what gets cloned is scores, i.e. shape (b_s, max_length (including padding), class_num);
            # pack_padded_sequence below then extracts the valid time steps; as in train(), take the data field of the
            # returned PackedSequence instead of two-element unpacking:
            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

            # Calculate loss
            loss = criterion(scores, targets)

            # Add doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time,
                                                                                loss=losses, top5=top5accs))

            # store the references (several captions per image) and the predictions, shaped as described below
            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            allcaps = allcaps[sort_ind]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                img_captions = list(
                    map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
                        img_caps))  # remove <start> and pads
                references.append(img_captions)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
            preds = temp_preds
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

        # Calculate BLEU-4 scores
        # NLTK's corpus_bleu does the computation; the inputs are shaped exactly as described above, several references per hypothesis
        bleu4 = corpus_bleu(references, hypotheses)

        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'.format(
                loss=losses,
                top5=top5accs,
                bleu=bleu4))

    return bleu4


if __name__ == '__main__':
    main()

Predicting image captions on the test set without teacher forcing: notes on eval.py

import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from datasets import *
from utils import *
from nltk.translate.bleu_score import corpus_bleu
import torch.nn.functional as F
from tqdm import tqdm

# Parameters
data_folder = '/media/ssd/caption data'  # folder with data files saved by create_input_files.py
data_name = 'coco_5_cap_per_img_5_min_word_freq'  # base name shared by data files
# this checkpoint is the best-performing model saved during training
checkpoint = '../BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar'  # model checkpoint
word_map_file = '/media/ssd/caption data/WORDMAP_coco_5_cap_per_img_5_min_word_freq.json'  # word map, ensure it's the same the data was encoded with and the model was trained with
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors

# About the cudnn.benchmark setting, the usual explanation is:
# if the model's input sizes/types vary little, setting torch.backends.cudnn.benchmark = True improves runtime efficiency,
# but if the input changes every iteration, cuDNN searches for the optimal configuration each time, which instead slows things down.
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# Load model
# note that both the encoder and the decoder must be switched to eval mode
checkpoint = torch.load(checkpoint)
decoder = checkpoint['decoder']
decoder = decoder.to(device)
decoder.eval()
encoder = checkpoint['encoder']
encoder = encoder.to(device)
encoder.eval()

# Load word map (word2ix)
# load word2index and build the reverse index2word dictionary
with open(word_map_file, 'r') as j:
    word_map = json.load(j)
rev_word_map = {v: k for k, v in word_map.items()}
vocab_size = len(word_map)

# Normalization transform
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])


def evaluate(beam_size):
    """
    Evaluation
    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    # Load the TEST split and its loader; note that it really is the TEST set!
    # The loader's batch size is 1, i.e. one image is fed in at a time and k (beam size) captions are generated for it.
    # During beam-search decoding every step produces k candidate words (much like processing a batch of k), so the batch size is kept at 1;
    # further down, the encoder output is expanded to k copies and treated as a batch of size k (described later).
    # At the end of inference, the highest-scoring of the k captions is kept as the single hypothesis.
    loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])),
        batch_size=1, shuffle=True, num_workers=1, pin_memory=True)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    # references and hypotheses are both lists shaped as described above; each item corresponds to one image's references / predicted hypothesis
    references = list()
    hypotheses = list()

    # For each image
    # tqdm's desc string is shown in front of the progress bar
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        # expand turns the batch size from 1 into k, because every decoding step has to consume k words;
        # the problem can then be treated as decoding a batch of k copies of the same encoded image
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        # k_prev_words holds the indices of the k highest-scoring words of the previous step, shape (k, 1);
        # it is initialized with k <start> indices
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        # seqs holds the current k partial sequences, initialized with shape (k, 1); once a sequence finishes decoding
        # (its last element is <end>), it is moved into complete_seqs
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        # top_k_scores holds the cumulative scores of the current k partial sequences.
        # For example, if a decoded sequence is ABC, its score is A's (log-)score a at step 1 plus B's score b at step 2 plus C's score c at step 3, i.e. a+b+c;
        # at every step beam search ranks candidates by this cumulative score, not by the current step's score c alone.
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        # once a sequence in seqs has decoded <end>, it is stored in complete_seqs
        # and its score is stored in complete_seqs_scores;
        # complete_seqs and complete_seqs_scores simply collect the finished sequences from seqs and their scores from top_k_scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        # h and c are initialized first; note that their batch dimension shrinks later on (described below)
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        # the s in the comments is a number <= k, because finished sequences are removed from the search and the beam width shrinks accordingly
        # (e.g. with a beam of 5, once one sequence finishes the remaining search width is 4),
        # so during decoding the effective batch size shrinks along with the beam width until it reaches 0
        while True:

            # embed the previously decoded words as part of the next decoding step's input
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)
            
            # compute the attention between the previous h and the encoder output to obtain the weighted encoding
            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)
            
            # compute the gate from the previous h and multiply it with the weighted encoding
            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            # concatenate the weighted encoding with the embeddings as the input of the next decoding step, and update h and c
            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # expand the scores of the k previous sequences and add the current log-softmax output over the vocabulary:
            # top_k_scores holds the scores of the previous s sequences, shape (s, 1); expanded to (s, v_s) and added to the log-softmax output (shape (s, v_s)),
            # this yields the scores of all s * v_s candidate continuations (v_s words per sequence); the beam then unrolls all of them and picks the top k
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            # at the first decoding step h, c and the previous word (<start>) are identical for all beams, so every row of scores is the same;
            # the top k can therefore be taken from the first row directly instead of unrolling everything,
            # hence scores[0].topk here, whereas later steps unroll all entries with view(-1) before taking the top k
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            # e.g. with vocab_size 10 and k 4 there are 40 scores, and top_k_words might be [5, 15, 25, 35];
            # these flat indices have to be mapped back to the un-unrolled layout, i.e. [0][5], [1][5], [2][5] and [3][5];
            # integer division by vocab_size gives the sequence index, and the remainder (%) gives the word index
            prev_word_inds = top_k_words // vocab_size  # (s); floor division, since true division returns floats in recent PyTorch
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            # using the seq and word indices just computed, pick out the corresponding rows of seqs (possibly repeated, since several top-k words can come from the same previous word),
            # append the index of the newly decoded word to each of them, and update seqs.
            # A small PyTorch syntax note: for a tensor x of shape (5, 10), x[[0, 1]] picks elements 0 and 1 while keeping the remaining dimension, giving shape (2, 10);
            # for a plain Python list you would normally write [lis for i, lis in enumerate(x) if i in {0, 1}].
            # The tensor[[index1, ...]] indexing really is convenient!
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            # check whether any of the current top-k words is <end>; store the indices of unfinished and finished sequences in the incomplete and complete lists respectively
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != word_map['<end>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            # if any sequences have finished decoding, store them in complete_seqs and their scores in complete_seqs_scores
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
                
            # note: if x sequences have finished, the beam width also shrinks by x, so that exactly k sequences are produced in total
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            # break once every sequence has finished decoding
            if k == 0:
                break
                
            # keep only the unfinished sequences in seqs;
            # h and c likewise keep only the entries belonging to the parent beams of the unfinished sequences, i.e. the entries indexed by prev_word_inds
            # (those entries may repeat, since prev_word_inds can contain duplicates);
            # the encoder output also keeps only the entries of those parent indices;
            # top_k_scores and k_prev_words keep only the non-<end> scores and words, with unsqueeze restoring the extra dimension
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            # if the sequence exceeds 50 steps it is too long, so break out
            if step > 50:
                break
            step += 1

        # once all k sentences are available, find the index i of the highest-scoring one;
        # a small syntax note: the index of the maximum of a list A is obtained with
        # A.index(max(A))
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        # take the single hypothesis at that index
        seq = complete_seqs[i]

        # References
        # references stores the references of all images, and each item itself contains several references (one image, several captions);
        # allcaps[0] is used rather than allcaps because allcaps has shape (1, cpi), where 1 is the batch size, and allcaps[0] holds the captions
        img_caps = allcaps[0].tolist()
        # strip the special tokens from every caption, then append them to references
        img_captions = list(
            map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],
                img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        # the hypothesis, with special tokens removed, is appended to hypotheses, which holds the predicted sentence of every image
        hypotheses.append([w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}])

        assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses)

    return bleu4


if __name__ == '__main__':
    beam_size = 1
    print("\nBLEU-4 score @ beam size of %d is %.4f." % (beam_size, evaluate(beam_size)))

Read in an image, generate its caption, and visualize the attention over the image while decoding each word: notes on caption.py

import torch
import torch.nn.functional as F
import numpy as np
import json
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import skimage.transform
import argparse
from scipy.misc import imread, imresize
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def caption_image_beam_search(encoder, decoder, image_path, word_map, beam_size=3):
    """
    This function takes a single image as input and generates a caption for it with beam search
    Reads an image and captions it with beam search.
    :param encoder: encoder model
    :param decoder: decoder model
    :param image_path: path to image
    :param word_map: word map
    :param beam_size: number of sequences to consider at each decode-step
    :return: caption, weights for visualization
    """

    k = beam_size
    vocab_size = len(word_map)

    # Read image and process
    # Since only a single image is fed in, a DataLoader cannot be used, and the image preprocessing has to be done here
    
    # Read the image
    img = imread(image_path)
    # If the image has only one channel, stack it into three channels
    if len(img.shape) == 2:
        img = img[:, :, np.newaxis]
        img = np.concatenate([img, img, img], axis=2)
    # Resize to a square (256, 256)
    img = imresize(img, (256, 256))
    # Rearrange the dimensions, (H, W, C) -> (C, H, W)
    img = img.transpose(2, 0, 1)
    # Scale values to the range 0 to 1
    img = img / 255.
    # Cast to float and move to the device (CUDA if available)
    img = torch.FloatTensor(img).to(device)
    # Normalize; the shape is (3, 256, 256)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([normalize])
    image = transform(img)  # (3, 256, 256)
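    # A minimal sketch (not the tutorial's code) of the same preprocessing without the scipy.misc functions,
    # assuming imageio and Pillow are installed; the names and steps mirror the block above:
    #
    #     import imageio
    #     from PIL import Image
    #     img = imageio.imread(image_path)
    #     if len(img.shape) == 2:
    #         img = np.stack([img] * 3, axis=2)
    #     img = np.array(Image.fromarray(img).resize((256, 256)))
    #     img = torch.FloatTensor(img.transpose(2, 0, 1) / 255.).to(device)
    #     image = transform(img)  # (3, 256, 256)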

    # Encode the image
    image = image.unsqueeze(0)  # (1, 3, 256, 256)
    encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten the encoding
    encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)

    # We'll treat the problem as having a batch size of k
    # Because of beam search, decoding is treated as processing a batch of size k
    encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    # Tensor to store top k previous words at each step; now they're just <start>
    # The k words decoded at the previous step
    k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

    # Tensor to store top k sequences; now they're just <start>
    # Holds all the sequences; at the first step there is only <start>, so the k <start> tokens from the previous step are assigned directly to seqs
    seqs = k_prev_words  # (k, 1)

    # Tensor to store top k sequences' scores; now they're just 0
    # Stores the scores of the k highest-scoring sequences
    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    # Tensor to store top k sequences' alphas; now they're just 1s
    # seqs_alpha holds the attention matrices corresponding to the top-k sequences
    # Its step dimension (dim 1) grows as decoding proceeds; once a sequence finishes decoding, its attention
    # matrices are appended to complete_seqs_alpha below and used later for visualization
    # Note that <start> is paired with an all-ones attention matrix, while every later word is paired with the
    # attention matrix over the input used to decode it, e.g. the matrix for the word "a" is the one computed
    # between the previous step's h and the encoder output (the matrix that was concatenated in to decode "a")
    seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device)  # (k, 1, enc_image_size, enc_image_size)

    # Lists to store completed sequences, their alphas and scores
    complete_seqs = list()
    complete_seqs_alpha = list()
    complete_seqs_scores = list()

    # Start decoding
    step = 1
    h, c = decoder.init_hidden_state(encoder_out)

    # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
    while True:

        embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)
        
        # Note that the attention matrix alpha is returned here as well, rather than being discarded as in eval.py
        awe, alpha = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)
        
        # Reshape alpha to (s, enc_image_size, enc_image_size)
        alpha = alpha.view(-1, enc_image_size, enc_image_size)  # (s, enc_image_size, enc_image_size)

        # The gate controls how much of the attended encoding is let through
        gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe
        
        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

        scores = decoder.fc(h)  # (s, vocab_size)
        scores = F.log_softmax(scores, dim=1)

        # Add: scores are log-probabilities, so adding each sequence's cumulative score to the current word's
        # log-probability gives the cumulative log-probability of every possible one-word extension
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

        # For the first step, all k points will have the same scores (since same k previous words, h, c)
        if step == 1:
            top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
        else:
            # Unroll and find top scores, and their unrolled indices
            top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

        # Convert unrolled indices to actual indices of scores
        prev_word_inds = top_k_words // vocab_size  # (s)  (floor division; plain / yields float indices in newer PyTorch)
        next_word_inds = top_k_words % vocab_size  # (s)

        # Add new words to sequences, alphas
        seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)
        # Note that seqs_alpha, which holds the attention matrices of the k sequences, is updated here by
        # concatenating the current step's attention matrix onto the tensor of the corresponding sequence
        seqs_alpha = torch.cat([seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
                               dim=1)  # (s, step+1, enc_image_size, enc_image_size)

        # Which sequences are incomplete (didn't reach <end>)?
        incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != word_map['<end>']]
        complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

        # Set aside complete sequences
        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            # Append the attention matrices of the finished sequences to complete_seqs_alpha
            complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
        seqs_alpha = seqs_alpha[incomplete_inds]
        h = h[prev_word_inds[incomplete_inds]]
        c = c[prev_word_inds[incomplete_inds]]
        encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
        top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
        k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

        # Break if things have been going on too long
        if step > 50:
            break
        step += 1

    i = complete_seqs_scores.index(max(complete_seqs_scores))
    seq = complete_seqs[i]
    alphas = complete_seqs_alpha[i]

    return seq, alphas


def visualize_att(image_path, seq, alphas, rev_word_map, smooth=True):
    """
    Visualizes caption with weights at every word.
    Adapted from paper authors' repo: https://github.com/kelvinxu/arctic-captions/blob/master/alpha_visualization.ipynb
    :param image_path: path to image that has been captioned
    :param seq: caption
    :param alphas: weights
    :param rev_word_map: reverse word mapping, i.e. ix2word
    :param smooth: smooth weights?
    """
    # Read the image with PIL's Image and resize it, using the LANCZOS resampling filter
    image = Image.open(image_path)
    image = image.resize([14 * 24, 14 * 24], Image.LANCZOS)

    # Convert seq, a list of word indices, into the list of corresponding words
    words = [rev_word_map[ind] for ind in seq]

    for t in range(len(words)):
        if t > 50:
            break
        # np.ceil(x) rounds up, e.g. np.ceil(4.1) gives 5.0
        # plt.subplot lays out a grid of np.ceil(len(words)/5) rows and 5 columns,
        # and the subplot currently being drawn has index t+1 (subplot numbering starts at 1)
        plt.subplot(int(np.ceil(len(words) / 5.)), 5, t + 1)  # cast to int: newer Matplotlib rejects non-integer subplot arguments
        # Use plt.text to add a label at position (0, 1): black text on a white background
        plt.text(0, 1, '%s' % (words[t]), color='black', backgroundcolor='white', fontsize=12)
        # imshow() takes an image and only draws it; nothing is displayed immediately.
        # After imshow you can still perform other drawing operations, such as adding scatter points.
        # Only once everything has been drawn does plt.show() display the result.
        plt.imshow(image) 
        current_alpha = alphas[t, :]
        # Here the attention matrix is reshaped: it is (14, 14), and to overlay it on the current (24*14, 24*14) image it must be upscaled
        # If smooth is used, it is upsampled with interpolation via skimage.transform.pyramid_expand (a small shape check follows this function)
        # If smooth is not used, it is simply resized to the target shape, at somewhat lower visual quality
        if smooth:
            alpha = skimage.transform.pyramid_expand(current_alpha.numpy(), upscale=24, sigma=8)
        else:
            alpha = skimage.transform.resize(current_alpha.numpy(), [14 * 24, 14 * 24])
        # The alpha argument sets the transparency of the overlay; 0 means fully transparent
        # The exact effect of different alpha settings is best understood by experimenting
        if t == 0:
            plt.imshow(alpha, alpha=0)
        else:
            plt.imshow(alpha, alpha=0.8)
        plt.set_cmap(cm.Greys_r) # use a grayscale colormap
        plt.axis('off') # hide the axes
    plt.show()
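
# A small standalone check (not part of caption.py) of the two upscaling paths used in visualize_att,
# assuming numpy and skimage are available; the array here is random dummy data:
#
#     import numpy as np
#     import skimage.transform
#     a = np.random.rand(14, 14)
#     skimage.transform.pyramid_expand(a, upscale=24, sigma=8).shape   # (336, 336), smoothed upsampling
#     skimage.transform.resize(a, [14 * 24, 14 * 24]).shape            # (336, 336), plain resize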


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Show, Attend, and Tell - Tutorial - Generate Caption')

    parser.add_argument('--img', '-i', help='path to image')
    parser.add_argument('--model', '-m', help='path to model')
    parser.add_argument('--word_map', '-wm', help='path to word map JSON')
    parser.add_argument('--beam_size', '-b', default=5, type=int, help='beam size for beam search')
    parser.add_argument('--dont_smooth', dest='smooth', action='store_false', help='do not smooth alpha overlay')

    args = parser.parse_args()

    # Load model
    checkpoint = torch.load(args.model, map_location=str(device))
    decoder = checkpoint['decoder']
    decoder = decoder.to(device)
    # Remember to set eval mode
    decoder.eval()
    encoder = checkpoint['encoder']
    encoder = encoder.to(device)
    encoder.eval()

    # Load word map (word2ix)
    with open(args.word_map, 'r') as j:
        word_map = json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # ix2word

    # Encode, decode with attention and beam search
    seq, alphas = caption_image_beam_search(encoder, decoder, args.img, word_map, args.beam_size)
    alphas = torch.FloatTensor(alphas)

    # Visualize caption and attention of best sequence
    visualize_att(args.img, seq, alphas, rev_word_map, args.smooth)
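
A typical invocation, using the flags defined above (the paths below are placeholders for your own image, checkpoint, and word-map JSON):

python caption.py --img path/to/image.jpg --model path/to/BEST_checkpoint.pth.tar --word_map path/to/WORDMAP.json --beam_size 5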