Caption tutorial notes - HongkuanZhang/Technique-Notes GitHub Wiki
Data preprocessing: notes on utils.py
import os
import numpy as np
import h5py   # used to create HDF5 files (for storing the images)
import json
import torch
from scipy.misc import imread, imresize   # recent SciPy versions removed these two functions; use imageio's imread and PIL's Image resize instead, see: https://blog.csdn.net/weekdawn/article/details/97777747
from tqdm import tqdm
from collections import Counter   # Counter counts word occurrences conveniently, no need to hand-roll a "+1 per word" frequency dict
from random import seed, choice, sample   # choice picks one random element from an iterable; sample draws a subset (size <= len(iterable))


def create_input_files(dataset, karpathy_json_path, image_folder, captions_per_image, min_word_freq, output_folder,
                       max_len=100):
    """
    Creates input files for training, validation, and test data.

    :param dataset: name of dataset, one of 'coco', 'flickr8k', 'flickr30k'
    :param karpathy_json_path: path of Karpathy JSON file with splits and captions
    :param image_folder: folder with downloaded images
    :param captions_per_image: number of captions to sample per image
    :param min_word_freq: words occurring less frequently than this threshold are binned as <unk>s
    :param output_folder: folder to save files
    :param max_len: don't sample captions longer than this length
    """
    assert dataset in {'coco', 'flickr8k', 'flickr30k'}
    # Read Karpathy JSON
    with open(karpathy_json_path, 'r') as j:   # the unzipped Karpathy folder contains three JSON files, one per dataset's captions
        data = json.load(j)   # loads a dict with keys dict_keys(['images', 'dataset']); 'images' maps to a list of dicts,
                              # one per image with its details, and 'dataset' maps to the dataset name string such as 'coco'

    # Read image paths and captions for each image
    train_image_paths = []
    train_image_captions = []
    val_image_paths = []
    val_image_captions = []
    test_image_paths = []
    test_image_captions = []
    word_freq = Counter()   # initialize the counter that will become the word-frequency dictionary
    for img in data['images']:   # read the multiple captions of every image
        captions = []   # captions stores the multiple annotations of one image
        for c in img['sentences']:
            # Update word frequency
            word_freq.update(c['tokens'])   # counter.update(token_list) updates the word-frequency dict automatically
            if len(c['tokens']) <= max_len:   # only captions (token lists) with at most max_len tokens are kept
                captions.append(c['tokens'])

        if len(captions) == 0:   # if every caption of an image is too long, drop the image
            continue

        path = os.path.join(image_folder, img['filepath'], img['filename']) if dataset == 'coco' else os.path.join(
            image_folder, img['filename'])
        if img['split'] in {'train', 'restval'}:
            train_image_paths.append(path)
            train_image_captions.append(captions)
        elif img['split'] in {'val'}:
            val_image_paths.append(path)
            val_image_captions.append(captions)
        elif img['split'] in {'test'}:
            test_image_paths.append(path)
            test_image_captions.append(captions)
    # Sanity check
    # Each *_paths list holds one path per image and each *_captions list holds that image's captions, so the two
    # lists must have the same length (one image maps to several captions; the per-image caption count may still
    # differ here and is evened out later)
    assert len(train_image_paths) == len(train_image_captions)
    assert len(val_image_paths) == len(val_image_captions)
    assert len(test_image_paths) == len(test_image_captions)

    # Create word map
    # Words below the minimum frequency are dropped; the remaining (relatively frequent) words form the
    # word-to-index map word_map, plus four special tokens; note that <pad> gets index 0
    words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    word_map = {k: v + 1 for v, k in enumerate(words)}
    word_map['<unk>'] = len(word_map) + 1
    word_map['<start>'] = len(word_map) + 1
    word_map['<end>'] = len(word_map) + 1
    word_map['<pad>'] = 0

    # Create a base/root name for all output files
    base_filename = dataset + '_' + str(captions_per_image) + '_cap_per_img_' + str(min_word_freq) + '_min_word_freq'

    # Save word map to a JSON
    with open(os.path.join(output_folder, 'WORDMAP_' + base_filename + '.json'), 'w') as j:
        json.dump(word_map, j)

    # Sample captions for each image, save images to HDF5 file, and captions and their lengths to JSON files
    # An HDF5 file is essentially a tree-shaped dictionary: the file itself is the root, it contains groups/datasets
    # as nodes, and each node holds datasets (and possibly child nodes)
    seed(123)
    for impaths, imcaps, split in [(train_image_paths, train_image_captions, 'TRAIN'),
                                   (val_image_paths, val_image_captions, 'VAL'),
                                   (test_image_paths, test_image_captions, 'TEST')]:
        with h5py.File(os.path.join(output_folder, split + '_IMAGES_' + base_filename + '.hdf5'), 'a') as h:
            # Make a note of the number of captions we are sampling per image
            # (store the captions-per-image parameter as an attribute of the HDF5 file)
            h.attrs['captions_per_image'] = captions_per_image

            # Create dataset inside HDF5 file to store images
            # this HDF5 file only holds the train/val/test image dataset, of shape (num_images, 3 channels, 256, 256)
            images = h.create_dataset('images', (len(impaths), 3, 256, 256), dtype='uint8')

            print("\nReading %s images and captions, storing to file...\n" % split)

            enc_captions = []   # all captions of all images in this split (num_images * captions_per_image)
            caplens = []        # the actual length of every caption (number of tokens + 2)

            for i, path in enumerate(tqdm(impaths)):

                # Sample captions
                if len(imcaps[i]) < captions_per_image:   # if an image has fewer captions than required, repeat-sample to fill up
                    captions = imcaps[i] + [choice(imcaps[i]) for _ in range(captions_per_image - len(imcaps[i]))]
                else:   # otherwise (enough or more than enough), sample exactly the required number
                    captions = sample(imcaps[i], k=captions_per_image)

                # Sanity check: every image must now have exactly captions_per_image captions
                assert len(captions) == captions_per_image

                # Read images, shape (3, 256, 256)
                img = imread(impaths[i])
                if len(img.shape) == 2:
                    img = img[:, :, np.newaxis]
                    img = np.concatenate([img, img, img], axis=2)
                img = imresize(img, (256, 256))
                img = img.transpose(2, 0, 1)
                assert img.shape == (3, 256, 256)
                assert np.max(img) <= 255

                # Save image to HDF5 file (images holds every image, shape (num_images, 3, 256, 256))
                images[i] = img

                for j, c in enumerate(captions):
                    # Encode captions: replace each token with its index in the word map and add the special tokens;
                    # word_map.get is used so that tokens missing from the map are treated as <unk>
                    enc_c = [word_map['<start>']] + [word_map.get(word, word_map['<unk>']) for word in c] + [
                        word_map['<end>']] + [word_map['<pad>']] * (max_len - len(c))

                    # Find caption lengths (note: the stored length is the actual token count plus 2)
                    c_len = len(c) + 2

                    enc_captions.append(enc_c)
                    caplens.append(c_len)

            # Sanity check: all captions of all images in this split (num_images * captions_per_image)
            assert images.shape[0] * captions_per_image == len(enc_captions) == len(caplens)

            # Save encoded captions and their lengths to JSON files
            # enc_captions holds all captions of all images, shape (images.shape[0] * captions_per_image, padded_length)
            with open(os.path.join(output_folder, split + '_CAPTIONS_' + base_filename + '.json'), 'w') as j:
                json.dump(enc_captions, j)

            # caplens holds the actual length of every caption, shape (images.shape[0] * captions_per_image,)
            with open(os.path.join(output_folder, split + '_CAPLENS_' + base_filename + '.json'), 'w') as j:
                json.dump(caplens, j)
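For reference, a minimal sketch of how create_input_files might be invoked (the paths below are placeholders, not the ones used in the tutorial; the 5/5 values match the coco_5_cap_per_img_5_min_word_freq name used later):

# Example invocation (a sketch; adapt all paths to your own setup)
if __name__ == '__main__':
    create_input_files(dataset='coco',
                       karpathy_json_path='/path/to/dataset_coco.json',
                       image_folder='/path/to/coco/images/',
                       captions_per_image=5,
                       min_word_freq=5,
                       output_folder='/path/to/output/',
                       max_len=50)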
def init_embedding(embeddings):   # initialize the embedding-layer parameters based on the embedding dimension
    """
    Fills embedding tensor with values from the uniform distribution.

    :param embeddings: embedding tensor
    """
    bias = np.sqrt(3.0 / embeddings.size(1))
    torch.nn.init.uniform_(embeddings, -bias, bias)
def load_embeddings(emb_file, word_map):   # load embeddings from GloVe and overwrite the matching rows of the embedding layer
    """
    Creates an embedding tensor for the specified word map, for loading into the model.

    :param emb_file: file containing embeddings (stored in GloVe format)
    :param word_map: word map
    :return: embeddings in the same order as the words in the word map, dimension of embeddings
    """
    # Find embedding dimension
    with open(emb_file, 'r') as f:
        emb_dim = len(f.readline().split(' ')) - 1   # minus 1 to discount the word itself at the start of the line

    vocab = set(word_map.keys())

    # Create tensor to hold embeddings, initialize
    embeddings = torch.FloatTensor(len(vocab), emb_dim)
    init_embedding(embeddings)   # initialize the embeddings

    # Read embedding file
    print("\nLoading embeddings...")
    for line in open(emb_file, 'r'):
        line = line.split(' ')

        emb_word = line[0]
        # Two built-ins are combined here: filter(function, iterable) keeps only the elements that satisfy the
        # condition (wrapping it in list(filter(...)) would give back a list of the matches); here filter drops the
        # empty string "" and whitespace-only strings " ". map(function, filtered) then converts every remaining
        # element to float, and the outer list() turns the result back into a list. This map+filter combination is
        # very common in data processing.
        embedding = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1:])))

        # Ignore word if not in train_vocab
        # (embedding rows of out-of-vocabulary words keep their initialized values, e.g. <unk>)
        if emb_word not in vocab:
            continue
        embeddings[word_map[emb_word]] = torch.FloatTensor(embedding)

    return embeddings, emb_dim


def clip_gradient(optimizer, grad_clip):
""" Clips gradients computed during backpropagation to avoid explosion of gradients. :param optimizer: optimizer with the gradients to be clipped :param grad_clip: clip value """forgroupinoptimizer.param_groups:
forparamingroup['params']:
ifparam.gradisnotNone:
param.grad.data.clamp_(-grad_clip, grad_clip)
def save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer,
                    bleu4, is_best):
    """
    Saves model checkpoint.

    :param data_name: base name of processed dataset
    :param epoch: epoch number
    :param epochs_since_improvement: number of epochs since last improvement in BLEU-4 score
    :param encoder: encoder model
    :param decoder: decoder model
    :param encoder_optimizer: optimizer to update encoder's weights, if fine-tuning
    :param decoder_optimizer: optimizer to update decoder's weights
    :param bleu4: validation BLEU-4 score for this epoch
    :param is_best: is this checkpoint the best so far?
    """
    state = {'epoch': epoch,
'epochs_since_improvement': epochs_since_improvement,
'bleu-4': bleu4,
'encoder': encoder,
'decoder': decoder,
'encoder_optimizer': encoder_optimizer,
'decoder_optimizer': decoder_optimizer}
    filename = 'checkpoint_' + data_name + '.pth.tar'
    torch.save(state, filename)
    # If this checkpoint is the best so far, store a copy so it doesn't get overwritten by a worse checkpoint
    if is_best:
torch.save(state, 'BEST_'+filename)
class AverageMeter(object):
    """
    Keeps track of most recent, average, sum, and count of a metric.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, shrink_factor):
    """
    Shrinks learning rate by a specified factor.

    :param optimizer: optimizer whose learning rate must be shrunk.
    :param shrink_factor: factor in interval (0, 1) to multiply learning rate with.
    """
    print("\nDECAYING learning rate.")
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("The new learning rate is %f\n" % (optimizer.param_groups[0]['lr'],))
def accuracy(scores, targets, k):
    """
    Computes top-k accuracy, from predicted and true labels.

    :param scores: scores from the model
    :param targets: true labels
    :param k: k in top-k accuracy
    :return: top-k accuracy
    """
    batch_size = targets.size(0)
    # tensor.topk selects the k largest/smallest values along a dimension; 1 means dimension 1 (i.e. each timestep's
    # softmax output), and the two True flags mean "take the largest k" and "return them in top-k order"
    _, ind = scores.topk(k, 1, True, True)
    # Because the model keeps the top k outputs at every timestep, the gold answer of every timestep is expanded to
    # k copies before counting correct predictions. E.g. if a caption has 4 gold tokens, targets has shape (4,), but
    # with top-k = 2 the model's ind has shape (4, 2), so targets is expanded to (4, 2) as well.
    # correct has shape (b_s, k); b_s here is not the number of captions in a batch but the total number of valid
    # timesteps (the sum of all valid timesteps after pack_padded_sequence).
    # Because of eq, every element is True/False (whether the elements at the same position are equal), e.g.:
    # tensor([[False, True],
    #         [False, True],
    #         [False, True],
    #         [False, True]])
    correct = ind.eq(targets.view(-1, 1).expand_as(ind))
    correct_total = correct.view(-1).float().sum()   # count the number of True entries (a 0-d tensor)
    # .item() extracts the scalar; b_s equals the total number of valid timesteps (i.e. valid tokens),
    # so correct / b_s is the accuracy
    return correct_total.item() * (100.0 / batch_size)
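As a sanity check of the topk/eq trick used in accuracy(), here is a tiny toy example (the numbers are made up purely for illustration):

import torch

scores = torch.tensor([[0.1, 0.7, 0.2],
                       [0.5, 0.3, 0.2]])    # 2 valid timesteps, vocabulary of 3
targets = torch.tensor([1, 0])              # gold label of each timestep
_, ind = scores.topk(2, 1, True, True)      # (2, 2): top-2 class indices per timestep
correct = ind.eq(targets.view(-1, 1).expand_as(ind))
acc = correct.view(-1).float().sum().item() * (100.0 / targets.size(0))
print(acc)  # 100.0 -- each gold label appears among the top-2 predictions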
Loading the preprocessed data as a dataset: building the Dataset class (datasets.py)
import torch
from torch.utils.data import Dataset
import h5py
import json
import os


class CaptionDataset(Dataset):   # CaptionDataset inherits from PyTorch's Dataset class; the DataLoader later uses it to read batches
    """
    A PyTorch Dataset class to be used in a PyTorch DataLoader to create batches.
    """

    def __init__(self, data_folder, data_name, split, transform=None):
        """
        :param data_folder: folder where data files are stored
        :param data_name: base name of processed datasets
        :param split: split, one of 'TRAIN', 'VAL', or 'TEST'
        :param transform: image transform pipeline
        """
        self.split = split
        assert self.split in {'TRAIN', 'VAL', 'TEST'}

        # Open hdf5 file where images are stored
        self.h = h5py.File(os.path.join(data_folder, self.split + '_IMAGES_' + data_name + '.hdf5'), 'r')
        self.imgs = self.h['images']

        # Captions per image
        self.cpi = self.h.attrs['captions_per_image']

        # Load encoded captions (completely into memory)
        with open(os.path.join(data_folder, self.split + '_CAPTIONS_' + data_name + '.json'), 'r') as j:
            self.captions = json.load(j)

        # Load caption lengths (completely into memory)
        with open(os.path.join(data_folder, self.split + '_CAPLENS_' + data_name + '.json'), 'r') as j:
            self.caplens = json.load(j)

        # PyTorch transformation pipeline for the image (normalizing, etc.)
        # In practice the transform argument is the normalization pipeline, i.e.:
        #   import torchvision.transforms as transforms
        #   normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        #   CaptionDataset(..., transform=transforms.Compose([normalize]))
        # transforms.Compose([a, b, c, d])(input) applies the operations a, b, c, d to the input in sequence; here
        # only normalization is used, so the list has a single item. The mean and std values are the usual ImageNet
        # statistics; to train from scratch they could all be changed to [0.5, 0.5, 0.5] (rarely done).
        self.transform = transform

        # Total number of datapoints: the dataset size is the number of captions in the current split
        self.dataset_size = len(self.captions)

    def __getitem__(self, i):
        # Remember, the Nth caption corresponds to the (N // captions_per_image)th image;
        # read the image and divide by 255 so that all values fall in [0, 1]
        img = torch.FloatTensor(self.imgs[i // self.cpi] / 255.)
        # normalize the data
        if self.transform is not None:
            img = self.transform(img)

        caption = torch.LongTensor(self.captions[i])   # captions[i] is a list holding all tokens of one caption
        caplen = torch.LongTensor([self.caplens[i]])   # caplens[i] is a scalar; it is wrapped in [] so it can become a tensor;
        # later, caption_lengths in the decoder is squeezed from (b_s, 1) back to (b_s,) with squeeze(1)

        if self.split == 'TRAIN':   # string comparison with == (comparing with `is` relies on string interning)
            return img, caption, caplen
        else:
            # For validation or testing, also return all 'captions_per_image' captions to find BLEU-4 score,
            # i.e. besides image, caption (one) and caption_length (one integer), return all captions of the image
            all_captions = torch.LongTensor(
                self.captions[((i // self.cpi) * self.cpi):(((i // self.cpi) * self.cpi) + self.cpi)])
            return img, caption, caplen, all_captions

    def __len__(self):
        return self.dataset_size
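A small sketch of how this dataset is meant to be consumed (it mirrors what train.py does below; the data folder and data name are placeholders):

import torchvision.transforms as transforms
import torch.utils.data

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_loader = torch.utils.data.DataLoader(
    CaptionDataset('/path/to/data', 'coco_5_cap_per_img_5_min_word_freq', 'TRAIN',
                   transform=transforms.Compose([normalize])),
    batch_size=32, shuffle=True)
imgs, caps, caplens = next(iter(train_loader))
# imgs: (32, 3, 256, 256), caps: (32, max_len + 2), caplens: (32, 1)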
Building the attention-based Encoder-Decoder model: notes on models.py
import torch
from torch import nn
import torchvision

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Encoder(nn.Module):
    """
    Encoder. Essentially a ResNet-101 with its last two classification layers removed;
    the output is the encoded image.
    """

    def __init__(self, encoded_image_size=14):
        super(Encoder, self).__init__()
        # the encoded image has size 14*14
        self.enc_image_size = encoded_image_size

        # load a pretrained ResNet-101 from torchvision
        resnet = torchvision.models.resnet101(pretrained=True)  # pretrained ImageNet ResNet-101

        # Remove linear and pool layers (since we're not doing classification)
        # i.e. remove the last two classification layers of ResNet-101: the average-pool layer and the softmax (fc) layer.
        # The trick is to put the model's layers into a list with model.children(), drop the last two entries to get the
        # modules we need, and then build a sequential network with nn.Sequential(*modules).
        # PS: the difference between building a network with nn.Sequential(*layers) and with nn.Module is that the former
        # simply stacks the layers and processes the input in stacking order, whereas with nn.Module you define the layers
        # in __init__ and additionally wire up how they are connected in forward.
        modules = list(resnet.children())[:-2]
        self.resnet = nn.Sequential(*modules)

        # Resize image to fixed size to allow input images of variable size
        # nn.AdaptiveAvgPool2d pools the input feature map with automatically computed pooling parameters,
        # guaranteeing an output of shape (encoded_image_size, encoded_image_size)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))

        self.fine_tune()

    def forward(self, images):
        """
        Forward propagation.

        :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
        :return: encoded images
        """
        # the final output has shape (batch_size, encoded_image_size, encoded_image_size, 2048),
        # where 2048 is the number of channels, i.e. the encoder_dim used later
        out = self.resnet(images)  # (batch_size, 2048, image_size/32, image_size/32)
        out = self.adaptive_pool(out)  # (batch_size, 2048, encoded_image_size, encoded_image_size)
        out = out.permute(0, 2, 3, 1)  # (batch_size, encoded_image_size, encoded_image_size, 2048)
        return out

    def fine_tune(self, fine_tune=True):
        """
        fine_tune makes only the parameters from the fifth child onward (conv blocks 2 through 4) trainable,
        while the earlier layers stay frozen (keeping the low-level knowledge learned during pretraining).

        Allow or prevent the computation of gradients for convolutional blocks 2 through 4 of the encoder.

        :param fine_tune: Allow?
        """
        for p in self.resnet.parameters():
            p.requires_grad = False
        # If fine-tuning, only fine-tune convolutional blocks 2 through 4
        for c in list(self.resnet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune


class Attention(nn.Module):
""" Attention Network. 这个attention主要是计算decoder中hidden state和encoder中hidden states的注意力 输出为attention_weighted_encoding和注意力矩阵 """def__init__(self, encoder_dim, decoder_dim, attention_dim):
""" :param encoder_dim: feature size of encoded images :param decoder_dim: size of decoder's RNN :param attention_dim: size of the attention network """super(Attention, self).__init__()
self.encoder_att=nn.Linear(encoder_dim, attention_dim) # 将encoder_dim(2048)转换为attention_dim以便后面计算注意力数值self.decoder_att=nn.Linear(decoder_dim, attention_dim) # 同理将decoder_dim转换为attention_dimself.full_att=nn.Linear(attention_dim, 1) # 变换后的encoder和decoder的维度为attention_dim, 两者相加之后通过这个线性层计算attention scoreself.relu=nn.ReLU()
self.softmax=nn.Softmax(dim=1) # softmax layer to calculate weightsdefforward(self, encoder_out, decoder_hidden):
""" Forward propagation. :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim) 注:这里的num_pixels=14*14,因为encoder的输出后面会通过view(b_s,-1,encoder_dim)变为上述这个形状 :param decoder_hidden: previous decoder output, a tensor of dimension (batch_size, decoder_dim) :return: attention weighted encoding, weights """att1=self.encoder_att(encoder_out) # (batch_size, num_pixels, attention_dim)att2=self.decoder_att(decoder_hidden) # (batch_size, attention_dim)att=self.full_att(self.relu(att1+att2.unsqueeze(1))).squeeze(2) # (batch_size, num_pixels)alpha=self.softmax(att) # (batch_size, num_pixels) alpha即为b_s行,14*14列的attention矩阵,每行的每个元素表示对每个pixel的注意力值# encoder输出*注意力矩阵并在pixel维度相加得到weighted ecoder output,形状为(batch_size, attention_dim)attention_weighted_encoding= (encoder_out*alpha.unsqueeze(2)).sum(dim=1) # (batch_size, attention_dim)# 返回注意力加权的encoder输出和注意力矩阵returnattention_weighted_encoding, alphaclassDecoderWithAttention(nn.Module):
""" Decoder. 接收encoder的输出进行解码,每一步解码时除了输入上一步得到的预测单词(训练时因为是teacher forcing所以是用golden的上一步的单词) 还要计算上一步decoder hidden state和encoder输出之间的注意力得到注意力encoder输出,并把注意力输出和上步预测词concat作为当前解码的总输入 """def__init__(self, attention_dim, embed_dim, decoder_dim, vocab_size, encoder_dim=2048, dropout=0.5):
""" :param attention_dim: size of attention network :param embed_dim: embedding size :param decoder_dim: size of decoder's RNN :param vocab_size: size of vocabulary :param encoder_dim: feature size of encoded images :param dropout: dropout """super(DecoderWithAttention, self).__init__()
self.encoder_dim=encoder_dimself.attention_dim=attention_dimself.embed_dim=embed_dimself.decoder_dim=decoder_dimself.vocab_size=vocab_sizeself.dropout=dropoutself.attention=Attention(encoder_dim, decoder_dim, attention_dim) # attention networkself.embedding=nn.Embedding(vocab_size, embed_dim) # embedding layerself.dropout=nn.Dropout(p=self.dropout)
self.decode_step=nn.LSTMCell(embed_dim+encoder_dim, decoder_dim, bias=True) # decoding LSTMCellself.init_h=nn.Linear(encoder_dim, decoder_dim) # linear layer to find initial hidden state of LSTMCellself.init_c=nn.Linear(encoder_dim, decoder_dim) # linear layer to find initial cell state of LSTMCellself.f_beta=nn.Linear(decoder_dim, encoder_dim) # linear layer to create a sigmoid-activated gateself.sigmoid=nn.Sigmoid()
self.fc=nn.Linear(decoder_dim, vocab_size) # linear layer to find scores over vocabularyself.init_weights() # initialize some layers with the uniform distributiondefinit_weights(self):
""" 初始化embedding和FFN层的参数,使用的是layer.weight/bias.data.method_(-XX,XX) Initializes some parameters with values from the uniform distribution, for easier convergence. """self.embedding.weight.data.uniform_(-0.1, 0.1)
self.fc.bias.data.fill_(0)
self.fc.weight.data.uniform_(-0.1, 0.1)
defload_pretrained_embeddings(self, embeddings):
""" 加载预训练如Golve的embedding Loads embedding layer with pre-trained embeddings. :param embeddings: pre-trained embeddings """self.embedding.weight=nn.Parameter(embeddings)
deffine_tune_embeddings(self, fine_tune=True):
""" 如果要fine-tune embedding层,则用这个函数控制requires_grad=True Allow fine-tuning of embedding layer? (Only makes sense to not-allow if using pre-trained embeddings). :param fine_tune: Allow? """forpinself.embedding.parameters():
p.requires_grad=fine_tunedefinit_hidden_state(self, encoder_out):
""" Creates the initial hidden and cell states for the decoder's LSTM based on the encoded images. :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim) :return: hidden state, cell state """mean_encoder_out=encoder_out.mean(dim=1) # encoder输出对全部pixel的向量取均值,得到形状为(b_s,encoder_dim)的tensorh=self.init_h(mean_encoder_out) # (batch_size, decoder_dim) 初始化线性层变为(batch_size, decoder_dim)的形状作为解码器的h0c=self.init_c(mean_encoder_out)
returnh, cdefforward(self, encoder_out, encoded_captions, caption_lengths):
""" Forward propagation. :param encoder_out: encoded images, a tensor of dimension (batch_size, enc_image_size, enc_image_size, encoder_dim) :param encoded_captions: encoded captions, a tensor of dimension (batch_size, max_caption_length) :param caption_lengths: caption lengths, a tensor of dimension (batch_size, 1) :return: scores for vocabulary, sorted encoded captions, decode lengths, weights, sort indices """batch_size=encoder_out.size(0)
encoder_dim=encoder_out.size(-1)
vocab_size=self.vocab_size# Flatten imageencoder_out=encoder_out.view(batch_size, -1, encoder_dim) # (batch_size, num_pixels, encoder_dim)num_pixels=encoder_out.size(1)
# Sort input data by decreasing lengths; why? apparent below# 对caption_lengths(形状为(batch_size, 1))根据每个caption长度进行排序,得到从长到短的排序以及相应的index顺序caption_lengths, sort_ind=caption_lengths.squeeze(1).sort(dim=0, descending=True)
# 根据index顺序对encoder输出和相应caption重新排列encoder_out=encoder_out[sort_ind]
encoded_captions=encoded_captions[sort_ind]
# Embedding# 得到全部captions(golden labelled的)的embeddingembeddings=self.embedding(encoded_captions) # (batch_size, max_caption_length, embed_dim)# Initialize LSTM stateh, c=self.init_hidden_state(encoder_out) # (batch_size, decoder_dim)# We won't decode at the <end> position, since we've finished generating as soon as we generate <end># So, decoding lengths are actual lengths - 1# 对于每个caption,解码步数正常来说为caption长度,但由于输出end时不用再输入end进行解码,所以需要减去1decode_lengths= (caption_lengths-1).tolist()
# Create tensors to hold word predicion scores and alphas# predictions中存储一个batch中所有caption对应的各个decoder时间步的softmax输出# 对于小于最大句长的caption,不足的部分为0tensorpredictions=torch.zeros(batch_size, max(decode_lengths), vocab_size).to(device)
# alphas存储一个batch中所有caption对应的各个decoder时间步输入时计算的attention矩阵alphas=torch.zeros(batch_size, max(decode_lengths), num_pixels).to(device)
# At each time-step, decode by# attention-weighing the encoder's output based on the decoder's previous hidden state output# then generate a new word in the decoder with the previous word and the attention weighted encodingfortinrange(max(decode_lengths)):
# 计算当前时间步t的动态batch_size_t,由于句长由长到短排列,随着t增大batch_size_t会逐渐减小(满足句长大于t的句子会越来越少)batch_size_t=sum([l>tforlindecode_lengths])
# 取对应batch_size_t个图像的encoder输出和相应的decoder hidden state,然后计算得到加权encoder输出和注意力矩阵attention_weighted_encoding, alpha=self.attention(encoder_out[:batch_size_t],
h[:batch_size_t])
# 把batch_size_t个decoder hidden state变为(batch_size_t, encoder_dim)形状并输入到sigmoid中作为gategate=self.sigmoid(self.f_beta(h[:batch_size_t])) # gating scalar, (batch_size_t, encoder_dim)# gate和刚刚得到的加权encoder输出相乘得到最终的加权encoder输出attention_weighted_encoding=gate*attention_weighted_encoding# 当前解码时间步输入为batch_size_t个词向量为embeddings[:batch_size_t, t, :](形状为(b_s_t,embed_dim))和# attention计算出的加权encoder输出attention_weighted_encoding(形状为(b_s_t,encoder_dim))的concat# 形状为((b_s_t,embed_dim+encoder_dim))# 然后输入和h与c输入到LSTMCell中得到更新的h和c(形状为(batch_size_t, decoder_dim))h, c=self.decode_step(
torch.cat([embeddings[:batch_size_t, t, :], attention_weighted_encoding], dim=1),
(h[:batch_size_t], c[:batch_size_t])) # (batch_size_t, decoder_dim)# 更新的h用于计算preds,形状为(batch_size_t, vocab_size)preds=self.fc(self.dropout(h)) # (batch_size_t, vocab_size)# 对应行存储预测值和attention矩阵predictions[:batch_size_t, t, :] =predsalphas[:batch_size_t, t, :] =alphareturnpredictions, encoded_captions, decode_lengths, alphas, sort_ind
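To make the shapes above concrete, a quick shape-check sketch (the vocab size and random inputs are made up; Encoder() downloads the pretrained ResNet-101 weights on first use):

import torch

encoder = Encoder()                                  # outputs (B, 14, 14, 2048)
decoder = DecoderWithAttention(attention_dim=512, embed_dim=512,
                               decoder_dim=512, vocab_size=10000)
imgs = torch.randn(4, 3, 256, 256)
caps = torch.randint(0, 10000, (4, 52))              # encoded captions incl. <start>/<end>/<pad>
caplens = torch.randint(5, 20, (4, 1))               # actual lengths (token count + 2)
enc_out = encoder(imgs)                              # (4, 14, 14, 2048)
preds, caps_sorted, dec_lens, alphas, sort_ind = decoder(enc_out, caps, caplens)
# preds:  (4, max(dec_lens), 10000)
# alphas: (4, max(dec_lens), 196)   -- 196 = 14*14 pixels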
Training the encoder-decoder model: notes on train.py
import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data   # imported for its DataLoader class
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from datasets import *   # imports the custom CaptionDataset class
from utils import *
from nltk.translate.bleu_score import corpus_bleu

# Data parameters
data_folder = '/media/ssd/caption data'  # folder with data files saved by create_input_files.py
data_name = 'coco_5_cap_per_img_5_min_word_freq'  # base name shared by data files

# Model parameters
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# Training parameters
start_epoch = 0
epochs = 120  # number of epochs to train for (if early stopping is not triggered)
epochs_since_improvement = 0  # number of epochs since the last improvement of the validation BLEU-4 score
batch_size = 32
workers = 1  # for data-loading; right now, only 1 works with h5py
# the decoder learning rate is larger than the encoder's, because the decoder's parameters are randomly
# initialized and need a larger learning rate to converge
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
# gradient clipping prevents exploding gradients: when a gradient exceeds this hyperparameter it is scaled down
grad_clip = 5.  # clip gradients at an absolute value of
# the Show, Attend and Tell paper uses 'doubly stochastic attention' as a regularizer (details not studied here)
alpha_c = 1.  # regularization parameter for 'doubly stochastic attention', as in the paper
# every validation run records the best BLEU score and updates it (resetting epochs_since_improvement to 0)
best_bleu4 = 0.  # BLEU-4 score right now
# report training/validation status every 100 batches
print_freq = 100  # print training/validation stats every __ batches
# whether to fine-tune the encoder
fine_tune_encoder = False  # fine-tune encoder?
# whether to resume training from a previously saved checkpoint
checkpoint = None  # path to checkpoint, None if none


def main():
""" Training and validation. """# 虽然python中外部的全局变量函数内可以调用,但是函数内改变变量值不会影响全局变量的值# 通过global声明变量则在函数内改变变量(如best_bleu4)则外部的全局变量的值也会跟着变globalbest_bleu4, epochs_since_improvement, checkpoint, start_epoch, fine_tune_encoder, data_name, word_map# 读取“单词-index”词典wordmapword_map_file=os.path.join(data_folder, 'WORDMAP_'+data_name+'.json')
withopen(word_map_file, 'r') asj:
word_map=json.load(j)
# 初始化/加载 checkpointifcheckpointisNone:
        # initialize the decoder
        decoder = DecoderWithAttention(attention_dim=attention_dim,
embed_dim=emb_dim,
decoder_dim=decoder_dim,
vocab_size=len(word_map),
dropout=dropout)
        # initialize the decoder's optimizer; filter keeps only the parameters that require gradient updates
        decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                             lr=decoder_lr)
        # initialize the encoder and its optimizer; encoder.fine_tune controls whether the encoder is fine-tuned
        # by setting requires_grad on the corresponding parameters
        encoder = Encoder()
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=encoder_lr) if fine_tune_encoder else None

    else:
checkpoint=torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
best_bleu4=checkpoint['bleu-4']
decoder=checkpoint['decoder']
decoder_optimizer=checkpoint['decoder_optimizer']
encoder=checkpoint['encoder']
encoder_optimizer=checkpoint['encoder_optimizer']
        if fine_tune_encoder is True and encoder_optimizer is None:
encoder.fine_tune(fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
lr=encoder_lr)
    # Move the encoder and decoder to the GPU. For modules (models), model = model.to(device) is equivalent to
    # model.to(device). For tensors it is NOT: to(device) copies the tensor to the GPU, so with a = a.to(device)
    # the name a becomes a cuda tensor, whereas a.to(device) alone leaves a as a cpu tensor -- always assign the
    # result when moving a tensor. Sometimes model(a.to(device), b.to(device), ...) is used to feed tensors into a
    # model on the GPU, in which case the model outputs are cuda tensors too.
    decoder = decoder.to(device)
encoder=encoder.to(device)
    # Loss function (also moved to the GPU)
    criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders (image normalization)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
    # initialize the train and val DataLoaders
    train_loader = torch.utils.data.DataLoader(
CaptionDataset(data_folder, data_name, 'TRAIN', transform=transforms.Compose([normalize])),
batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)
val_loader=torch.utils.data.DataLoader(
CaptionDataset(data_folder, data_name, 'VAL', transform=transforms.Compose([normalize])),
batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)
    # Epochs
    for epoch in range(start_epoch, epochs):

        # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
        # (no improvement for 8 epochs -> shrink the learning rate; no improvement for 20 epochs -> stop training)
        if epochs_since_improvement == 20:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
adjust_learning_rate(decoder_optimizer, 0.8)
iffine_tune_encoder:
adjust_learning_rate(encoder_optimizer, 0.8)
        # One epoch's training: the loader yields one batch of the training data at a time and the model is trained on it
        train(train_loader=train_loader,
encoder=encoder,
decoder=decoder,
criterion=criterion,
encoder_optimizer=encoder_optimizer,
decoder_optimizer=decoder_optimizer,
epoch=epoch)
        # One epoch's validation: run the model over the val data and compute/return the BLEU score
        recent_bleu4 = validate(val_loader=val_loader,
encoder=encoder,
decoder=decoder,
criterion=criterion)
        # Check if there was an improvement
        # if the current BLEU beats the best so far, update the best and reset epochs_since_improvement to 0;
        # otherwise increment epochs_since_improvement by 1
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
else:
            epochs_since_improvement = 0

        # Save checkpoint: a checkpoint is saved every epoch, best or not, and because the file name never changes it
        # overwrites the previous one; the best-performing checkpoint additionally gets a copy whose name is prefixed
        # with 'BEST_' so it cannot be overwritten by a non-best checkpoint (best checkpoints still overwrite each other)
        save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
decoder_optimizer, recent_bleu4, is_best)
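Since the `global` behaviour mentioned at the top of main() trips people up, here is a tiny standalone illustration (the names are made up):

best = 0.

def update_without_global():
    best = 1.          # creates a local variable; the global `best` is unchanged

def update_with_global():
    global best
    best = 1.          # now the global `best` itself is modified

update_without_global(); print(best)  # 0.0
update_with_global();    print(best)  # 1.0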
def train(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch):
    """
    Performs one epoch's training.

    :param train_loader: DataLoader for training data
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param encoder_optimizer: optimizer to update encoder's weights (if fine-tuning)
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number
    """
    # put the encoder and decoder in training mode
    decoder.train()  # train mode (dropout and batchnorm is used)
    encoder.train()

    # initialize four accumulators: time to process a batch, time to load a batch, loss value and accuracy value
    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    start = time.time()

    # Batches: load one batch of images, captions and the list of actual caption lengths
    for i, (imgs, caps, caplens) in enumerate(train_loader):
data_time.update(time.time() -start)
        # Move to GPU, if available (move all three tensors to the GPU)
        imgs = imgs.to(device)
caps=caps.to(device)
caplens=caplens.to(device)
        # Forward prop. (feed the data through the encoder and the decoder)
        imgs = encoder(imgs)
scores, caps_sorted, decode_lengths, alphas, sort_ind=decoder(imgs, caps, caplens)
        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        # (the decoder output starts one position after <start> and runs up to <end>, so the targets also start at index 1)
        targets = caps_sorted[:, 1:]

        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        # this gathers the softmax outputs of all valid timesteps in the batch into shape (total_valid_decode_steps, num_class).
        # I think this line is actually wrong, though: pack_padded_sequence returns a PackedSequence tuple such as
        # PackedSequence(data=tensor([[0.3482, 0.2628, 0.3890],
        #                             [0.3099, 0.4163, 0.2738],
        #                             [0.3622, 0.4232, 0.2147],
        #                             [0.2595, 0.4771, 0.2634],
        #                             [0.2440, 0.5233, 0.2326],
        #                             [0.3541, 0.2688, 0.3771],
        #                             [0.3105, 0.4172, 0.2723]]), batch_sizes=tensor([2, 2, 2, 1]),
        #                sorted_indices=None, unsorted_indices=None)
        # so the data should be extracted like this instead:
        # scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0]
        # targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0]
        scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
targets, _=pack_padded_sequence(targets, decode_lengths, batch_first=True)
        # Calculate loss: this loss = sum(-log(p_of_correct_label)) / total_decode_steps, i.e. summed and then averaged!
        loss = criterion(scores, targets)

        # Add doubly stochastic attention regularization (haven't read this part of the paper, skipping the details)
        loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

        # Back prop.: after computing the loss, zero the gradients first and then backpropagate
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
encoder_optimizer.zero_grad()
loss.backward()
        # Clip gradients
        if grad_clip is not None:
clip_gradient(decoder_optimizer, grad_clip)
        if encoder_optimizer is not None:
clip_gradient(encoder_optimizer, grad_clip)
        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
encoder_optimizer.step()
        # Keep track of metrics
        # note that the loss and accuracy accumulators are updated with the total number of valid decoding steps in
        # the batch as n; n times the per-step loss/accuracy gives the batch's total loss / total correct predictions,
        # and each update maintains total loss / total steps and total correct / total steps, i.e. the running average
        # loss and the accuracy over all data seen so far
        top5 = accuracy(scores, targets, 5)
losses.update(loss.item(), sum(decode_lengths))
top5accs.update(top5, sum(decode_lengths))
batch_time.update(time.time() -start)
start=time.time()
        # Print status (report the accumulators every print_freq batches)
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
batch_time=batch_time,
data_time=data_time, loss=losses,
top5=top5accs))
def validate(val_loader, encoder, decoder, criterion):
    """
    validate is basically the same as train, except that it also computes BLEU.

    Performs one epoch's validation.

    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :return: BLEU-4 score
    """
    # switch to eval mode: eval mode turns off dropout and batchnorm, which are only needed during training.
    # Dropout prevents overfitting; it skips the parameter updates of a subset of nodes in each layer, which amounts
    # to training several different networks and averaging them, see: https://www.jianshu.com/p/ef2a7a78aa83
    # BatchNorm speeds up convergence and prevents vanishing gradients: it computes the mean and variance over the
    # batch dimension and pulls the activations back towards a standard distribution (after many layers the values
    # become too large or too small, so the gradients after the activation function approach zero), see:
    # https://www.cnblogs.com/guoyaohua/p/8724433.html
    decoder.eval()
    if encoder is not None:
encoder.eval()
    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # explicitly disable gradient calculation to avoid CUDA memory error
    # solves the issue #57
    # use `with torch.no_grad()` to disable gradients -- always do this for validation and testing!
    with torch.no_grad():
        # the loop below is essentially the same as in train; note that each batch additionally contains all the
        # correct captions of each image, i.e. allcaps
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

            # Move to device, if available
            imgs = imgs.to(device)
caps=caps.to(device)
caplens=caplens.to(device)
            # Forward prop.
            if encoder is not None:
imgs=encoder(imgs)
scores, caps_sorted, decode_lengths, alphas, sort_ind=decoder(imgs, caps, caplens)
            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            # note that what gets cloned here is scores, of shape (b_s, max_length (incl. pad), class_num);
            # pack_padded_sequence is then used to extract the valid timesteps. As in train, I still think this
            # should be changed to:
            # scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0]
            # targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0]
            scores_copy = scores.clone()
            scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
            targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)

            # Calculate loss
            loss = criterion(scores, targets)

            # Add doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
top5=accuracy(scores, targets, 5)
top5accs.update(top5, sum(decode_lengths))
batch_time.update(time.time() -start)
start=time.time()
            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time,
loss=losses, top5=top5accs))
            # store the references (which include an image's multiple captions) and the predictions, shaped as follows:
            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            allcaps = allcaps[sort_ind]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                img_captions = list(
                    map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
                        img_caps))  # remove <start> and pads
                references.append(img_captions)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
            preds = temp_preds
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

        # Calculate BLEU-4 scores
        # NLTK's corpus_bleu is called here; the inputs have exactly the shapes described above,
        # several references paired with one hypothesis
        bleu4 = corpus_bleu(references, hypotheses)
print(
'\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'.format(
loss=losses,
top5=top5accs,
bleu=bleu4))
    return bleu4


if __name__ == '__main__':
main()
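Regarding the pack_padded_sequence question raised above, a tiny check of what the function actually returns (the toy tensor is made up; the exact fields of PackedSequence depend on the PyTorch version, so .data or indexing with [0] is the safer way to get the flattened valid timesteps):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

x = torch.arange(24, dtype=torch.float).view(2, 4, 3)  # (batch=2, T=4, C=3)
lengths = [4, 2]                                        # valid timesteps per sequence
packed = pack_padded_sequence(x, lengths, batch_first=True)
print(type(packed).__name__)  # PackedSequence
print(packed.data.shape)      # torch.Size([6, 3]) -- only the 4 + 2 valid timesteps remain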
Predicting image captions on the test set without teacher forcing: notes on eval.py
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from datasets import *
from utils import *
from nltk.translate.bleu_score import corpus_bleu
import torch.nn.functional as F
from tqdm import tqdm

# Parameters
data_folder = '/media/ssd/caption data'  # folder with data files saved by create_input_files.py
data_name = 'coco_5_cap_per_img_5_min_word_freq'  # base name shared by data files
# this checkpoint is the best-performing model saved during training
checkpoint = '../BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar'  # model checkpoint
word_map_file = '/media/ssd/caption data/WORDMAP_coco_5_cap_per_img_5_min_word_freq.json'  # word map, ensure it's the same the data was encoded with and the model was trained with
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
# about the cudnn benchmark setting, an explanation I found:
# if the input dimensions/types of the network vary little, torch.backends.cudnn.benchmark = True improves efficiency;
# if the inputs change at every iteration, cuDNN searches for the optimal configuration every time, which actually
# lowers efficiency
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# Load model (note that the encoder and decoder must be switched to eval mode)
checkpoint = torch.load(checkpoint)
decoder=checkpoint['decoder']
decoder=decoder.to(device)
decoder.eval()
encoder=checkpoint['encoder']
encoder=encoder.to(device)
encoder.eval()
# Load word map (word2ix): load word2index and build the reverse index2word dictionary
with open(word_map_file, 'r') as j:
word_map=json.load(j)
rev_word_map= {v: kfork, vinword_map.items()}
vocab_size=len(word_map)
# Normalization transform
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    # load the test data and its loader -- note that this is the TEST split!
    # The loader's batch size is set to 1, i.e. one image is fed in at a time and k (beam size) captions are generated
    # for it. Because beam search decodes k words at every step (similar to processing a batch), the batch size is
    # kept at 1; later the encoder output is expanded to k copies and treated as a batch of size k (see below).
    # At inference time the highest-scoring sentence among the k captions is chosen as the single hypothesis.
    loader = torch.utils.data.DataLoader(
CaptionDataset(data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])),
batch_size=1, shuffle=True, num_workers=1, pin_memory=True)
    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    # both references and hypotheses are lists shaped as above; each item corresponds to one image's references and
    # predicted hypothesis
    references = list()
hypotheses=list()
    # For each image (tqdm's desc string is shown in front of the progress bar)
    for i, (image, caps, caplens, allcaps) in enumerate(
tqdm(loader, desc="EVALUATING AT BEAM SIZE "+str(beam_size))):
        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        # expand turns the batch size from 1 into k; this is done because every decoding step feeds in k words,
        # so the problem can be viewed as decoding k copies of the same image's encoder output (batch size k)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        # k_prev_words holds the indices of the k highest-scoring words of the previous step, shape (k, 1);
        # at initialization it is k <start> indices
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        # seqs holds the current k predicted sentences, initialized with shape (k, 1); when a sentence finishes
        # decoding (its last element is <end>) it is moved into complete_seqs
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        # top_k_scores holds the scores of the current k predicted sentences; the score is cumulative:
        # e.g. if one decoded sequence is ABC, its score is the softmax value a of A at step one, plus b of B at step
        # two, plus c of C, i.e. a+b+c; at every step beam search compares this cumulative score when picking the
        # top k, not just the current value c
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        # when a sequence in seqs has decoded <end>, it is stored in complete_seqs and its score in
        # complete_seqs_scores; these two variables archive the finished sentences (and their scores) taken out of
        # seqs and top_k_scores
        complete_seqs = list()
complete_seqs_scores=list()
        # Start decoding
        # h and c are initialized first; note that their batch dimension shrinks during decoding (see below)
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        # (finished sentences are taken out and the beam width shrinks accordingly: if the beam is 5 and one sentence
        # finishes at the current step, the search space becomes 4), so during decoding the batch size shrinks with
        # the beam width and eventually reaches 0
        while True:

            # embed the previously decoded words as the input of the next decoding step
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            # compute the attention between the previous h and the encoder output to get the weighted encoding
            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            # compute the gate from the previous h and multiply it with the weighted encoding
            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            # concatenate the weighted encoding and the embeddings as the input of the next decoding step, and update h and c
            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # expand the scores of the k sequences from the previous step and add the current vocab_size-sized softmax
            # output to each: top_k_scores holds the scores of the previous s sequences, shape (s, 1), which is expanded
            # to (s, v_s) and added to the softmax output (shape (s, v_s)). The result is the score of all s*v_s candidate
            # sequences (v_s words per sequence), over which the beam then picks the top k after unrolling.
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            # at the first decoding step h, c and the previous word (<start>) are identical for all beams, so all rows
            # of scores are equal; there is no need to unroll everything to find the top k -- the top k of the first
            # row is already the global top k. That is why scores[0].topk is used here, whereas later steps use
            # view(-1) to unroll all scores before taking the top k (s of them).
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            # e.g. with vocab_size 10 and k 4 there are 40 scores and top_k_words could be [5, 15, 25, 35]; these
            # indices must be converted back to indices of the un-unrolled scores, i.e. [0][5], [1][5], [2][5] and
            # [3][5]; / (integer division) gives the list of sequence indices and % gives the list of word indices
            prev_word_inds = top_k_words / vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            # use the sequence indices just obtained to gather the corresponding rows of seqs (possibly with repetition,
            # since several of the top-k words may extend the same previous sequence), append the index of the word
            # decoded at this step to each row, and update seqs.
            # A PyTorch syntax note: for a tensor x of shape (5, 10), x[[0, 1]] picks elements 0 and 1 while keeping the
            # remaining dimensions, giving shape (2, 10); with a plain Python list you would need something like
            # [lis for i, lis in enumerate(x) if i in {0, 1}]. The tensor[[index1, ...]] indexing is really convenient!
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            # check whether any of the current top-k words is <end>, and store the indices of the unfinished and the
            # finished sequences in the incomplete and complete lists respectively
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>']]
complete_inds=list(set(range(len(next_word_inds))) -set(incomplete_inds))
            # Set aside complete sequences
            # if some sequences have finished decoding, store them in complete_seqs and their scores in complete_seqs_scores
            if len(complete_inds) > 0:
complete_seqs.extend(seqs[complete_inds].tolist())
complete_seqs_scores.extend(top_k_scores[complete_inds])
            # note: if x sequences finished, the beam width also shrinks by x, so that exactly k sequences are produced in the end
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences (break when all sentences have finished decoding)
            if k == 0:
                break

            # seqs keeps only the unfinished sequences.
            # h and c keep only the entries belonging to the root indices of the unfinished sequences (the root index
            # is the element of prev_word_inds; the kept h/c entries may repeat because prev_word_inds may contain
            # duplicates). The encoder output likewise keeps only the root-index entries, and top_k_scores and
            # k_prev_words keep only the non-<end> scores/words, with unsqueeze restoring the extra dimension.
            seqs = seqs[incomplete_inds]
h=h[prev_word_inds[incomplete_inds]]
c=c[prev_word_inds[incomplete_inds]]
encoder_out=encoder_out[prev_word_inds[incomplete_inds]]
top_k_scores=top_k_scores[incomplete_inds].unsqueeze(1)
k_prev_words=next_word_inds[incomplete_inds].unsqueeze(1)
            # Break if things have been going on too long (sentences longer than 50 tokens are cut off)
            if step > 50:
                break
            step += 1

        # after all k sentences are obtained, find the index i of the highest-scoring one;
        # a small syntax note: to find the index of the maximum of a list A, use A.index(max(A))
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        # use index i to extract the single hypothesis
        seq = complete_seqs[i]

        # References
        # references stores the references of all images, and each item again contains several references (one image,
        # several captions). allcaps[0] is used instead of allcaps because allcaps has shape (1, cpi), where 1 is the
        # batch size; the multiple captions live inside allcaps[0].
        img_caps = allcaps[0].tolist()
        # strip the special tokens from every caption and add them to references
        img_captions = list(
            map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],
                img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        # the hypothesis also has its special tokens stripped and is added to hypotheses, which holds the predicted
        # sentences of all images
        hypotheses.append([w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}])

        assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses)

    return bleu4


if __name__ == '__main__':
    beam_size = 1
    print("\nBLEU-4 score @ beam size of %d is %.4f." % (beam_size, evaluate(beam_size)))
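A toy illustration of the unrolled-index conversion used in the beam search above (vocab_size = 10 and the indices are made-up numbers; on recent PyTorch versions integer floor division should be written // rather than /):

import torch

vocab_size = 10
top_k_words = torch.tensor([5, 15, 25, 35])  # indices into the flattened (k * vocab_size,) score vector
prev_word_inds = top_k_words // vocab_size   # tensor([0, 1, 2, 3]) -> which beam each candidate extends
next_word_inds = top_k_words % vocab_size    # tensor([5, 5, 5, 5]) -> which vocabulary word it adds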
Captioning a single image and visualizing attention: notes on caption.py

import torch
import torch.nn.functional as F
import numpy as np
import json
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import skimage.transform
import argparse
from scipy.misc import imread, imresize
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def caption_image_beam_search(encoder, decoder, image_path, word_map, beam_size=3):
    """
    This function takes a single image as input and generates captions for it with beam search.

    Reads an image and captions it with beam search.

    :param encoder: encoder model
    :param decoder: decoder model
    :param image_path: path to image
    :param word_map: word map
    :param beam_size: number of sequences to consider at each decode-step
    :return: caption, weights for visualization
    """
    k = beam_size
    vocab_size = len(word_map)

    # Read image and process
    # since only one image is fed in, a loader cannot be used and the image preprocessing has to be done here
    # read the image
    img = imread(image_path)
    # if there is only one channel, expand it to three channels
    if len(img.shape) == 2:
img=img[:, :, np.newaxis]
img=np.concatenate([img, img, img], axis=2)
    # resize to a square
    img = imresize(img, (256, 256))
    # move the channel dimension to the front
    img = img.transpose(2, 0, 1)
    # scale to the range [0, 1]
    img = img / 255.
    # convert to a float tensor and move it to cuda
    img = torch.FloatTensor(img).to(device)
    # normalization, shape becomes (3, 256, 256)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
transform=transforms.Compose([normalize])
    image = transform(img)  # (3, 256, 256)

    # Encode the image
    image = image.unsqueeze(0)  # (1, 3, 256, 256)
    encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
encoder_dim=encoder_out.size(3)
    # Flatten encoding
    encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)

    # We'll treat the problem as having a batch size of k
    # because of beam search, decoding is treated as processing a batch of size k
    encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    # Tensor to store top k previous words at each step; now they're just <start>
    # the k words decoded at the previous step
    k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

    # Tensor to store top k sequences; now they're just <start>
    # holds all seqs; at the first step there is only <start>, so k_prev_words is assigned to seqs directly
    seqs = k_prev_words  # (k, 1)

    # Tensor to store top k sequences' scores; now they're just 0
    # the scores of the k highest-scoring sequences
    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    # Tensor to store top k sequences' alphas; now they're just 1s
    # seqs_alpha stores the attention matrices of the top-k sequences. Its first dimension grows during decoding, and
    # when a sequence finishes decoding its attention matrices are moved into complete_seqs_alpha below, for the later
    # visualization. Note that <start> gets an all-ones attention matrix, and every later word gets the attention
    # matrix used as its input, i.e. for a word a it is the attention matrix computed between the previous step's h
    # and the encoder output (the one that was concatenated in to decode a).
    seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device)  # (k, 1, enc_image_size, enc_image_size)

    # Lists to store completed sequences, their alphas and scores
    complete_seqs = list()
complete_seqs_alpha=list()
complete_seqs_scores=list()
    # Start decoding
    step = 1
    h, c = decoder.init_hidden_state(encoder_out)

    # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
    while True:

        embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

        # note that here the attention matrix alpha is also returned, unlike in eval.py where it is discarded
        awe, alpha = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

        # reshape alpha to (s, enc_image_size, enc_image_size)
        alpha = alpha.view(-1, enc_image_size, enc_image_size)  # (s, enc_image_size, enc_image_size)

        # the gate controls how much of the attention passes through
        gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe

        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

        scores = decoder.fc(h)  # (s, vocab_size)
        scores = F.log_softmax(scores, dim=1)

        # Add
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

        # For the first step, all k points will have the same scores (since same k previous words, h, c)
        if step == 1:
            top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
        else:
            # Unroll and find top scores, and their unrolled indices
            top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

        # Convert unrolled indices to actual indices of scores
        prev_word_inds = top_k_words / vocab_size  # (s)
        next_word_inds = top_k_words % vocab_size  # (s)

        # Add new words to sequences, alphas
        seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)
        # seqs_alpha (which holds the attention matrices of the k sequences) is updated by concatenating the current
        # step's attention matrix onto the corresponding sequence's tensor
        seqs_alpha = torch.cat([seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
                               dim=1)  # (s, step+1, enc_image_size, enc_image_size)

        # Which sequences are incomplete (didn't reach <end>)?
        incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>']]
complete_inds=list(set(range(len(next_word_inds))) -set(incomplete_inds))
        # Set aside complete sequences
        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            # add the attention matrices of the finished seqs to complete_seqs_alpha
            complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
seqs_alpha=seqs_alpha[incomplete_inds]
h=h[prev_word_inds[incomplete_inds]]
c=c[prev_word_inds[incomplete_inds]]
encoder_out=encoder_out[prev_word_inds[incomplete_inds]]
top_k_scores=top_k_scores[incomplete_inds].unsqueeze(1)
k_prev_words=next_word_inds[incomplete_inds].unsqueeze(1)
        # Break if things have been going on too long
        if step > 50:
            break
        step += 1

    i = complete_seqs_scores.index(max(complete_seqs_scores))
    seq = complete_seqs[i]
    alphas = complete_seqs_alpha[i]

    return seq, alphas


def visualize_att(image_path, seq, alphas, rev_word_map, smooth=True):
""" Visualizes caption with weights at every word. Adapted from paper authors' repo: https://github.com/kelvinxu/arctic-captions/blob/master/alpha_visualization.ipynb :param image_path: path to image that has been captioned :param seq: caption :param alphas: weights :param rev_word_map: reverse word mapping, i.e. ix2word :param smooth: smooth weights? """# 用PIL中的Image来读入图像,并resize,指定resize模式为LANCZOSimage=Image.open(image_path)
image=image.resize([14*24, 14*24], Image.LANCZOS)
# 把装着indexes的seq转换为装着单词的wordswords= [rev_word_map[ind] forindinseq]
fortinrange(len(words)):
ift>50:
break# np.ceil(x)会取上位正数,即np.ceil(4.1)会取5# plt.subplot就是画出np.ceil(len(words)/5)行,5列的图# 并且当前绘制的图像编号为t+1(即图像从编号1开始绘制)plt.subplot(np.ceil(len(words) /5.), 5, t+1)
# 用plt.text在(0,1)位置加标签,白底黑字plt.text(0, 1, '%s'% (words[t]), color='black', backgroundcolor='white', fontsize=12)
# imshow()接收一张图像,只是画出该图,并不会立刻显示出来。# imshow后还可以进行其他draw操作,比如scatter散点等。# 所有画完后使用plt.show()才能进行结果的显示。plt.imshow(image)
current_alpha=alphas[t, :]
        # reshape the attention matrix: it is (14, 14) but has to be blended with the (24*14, 24*14) image.
        # With smooth, upsampling (skimage.transform.pyramid_expand) interpolates it to the target size;
        # without smooth it is simply resized to the target shape, with somewhat lower visual quality.
        if smooth:
alpha=skimage.transform.pyramid_expand(current_alpha.numpy(), upscale=24, sigma=8)
else:
alpha=skimage.transform.resize(current_alpha.numpy(), [14*24, 14*24])
        # the alpha argument sets the transparency; 0 means fully transparent
        # (experimenting with this parameter makes the difference clearer)
        if t == 0:
plt.imshow(alpha, alpha=0)
else:
plt.imshow(alpha, alpha=0.8)
        plt.set_cmap(cm.Greys_r)  # use a grayscale colormap
        plt.axis('off')  # hide the axes
    plt.show()
if __name__ == '__main__':
parser=argparse.ArgumentParser(description='Show, Attend, and Tell - Tutorial - Generate Caption')
parser.add_argument('--img', '-i', help='path to image')
parser.add_argument('--model', '-m', help='path to model')
parser.add_argument('--word_map', '-wm', help='path to word map JSON')
parser.add_argument('--beam_size', '-b', default=5, type=int, help='beam size for beam search')
parser.add_argument('--dont_smooth', dest='smooth', action='store_false', help='do not smooth alpha overlay')
args=parser.parse_args()
    # Load model
    checkpoint = torch.load(args.model, map_location=str(device))
decoder=checkpoint['decoder']
decoder=decoder.to(device)
    # remember to switch to eval mode
    decoder.eval()
encoder=checkpoint['encoder']
encoder=encoder.to(device)
encoder.eval()
    # Load word map (word2ix)
    with open(args.word_map, 'r') as j:
word_map=json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # ix2word

    # Encode, decode with attention and beam search
    seq, alphas = caption_image_beam_search(encoder, decoder, args.img, word_map, args.beam_size)
alphas=torch.FloatTensor(alphas)
    # Visualize caption and attention of best sequence
    visualize_att(args.img, seq, alphas, rev_word_map, args.smooth)
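Assuming the script above is saved as caption.py, it would be run roughly like this (the image path is a placeholder, and the checkpoint/word map names are the ones produced by training above):

python caption.py --img /path/to/image.jpg \
    --model BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar \
    --word_map WORDMAP_coco_5_cap_per_img_5_min_word_freq.json \
    --beam_size 5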