Hi.
I’m implementing Label Smoothing for Transformer but the gradient of my model didn’t flow. What is a problem of my model?
def cal_loss(pred, gt, ignore_idx, smoothing=None):
'''
Calculate cross-entropy loss between very before layer of softmax and ground truth(gt)
param:
pred: predicted value (before applying softmax)
gt: grount truth
ignore_idx: index that will be ignored when calculating its probability
smoothing: if float is passed, then apply label smoothing
shape:
pred: [Batch_size, (Seq_len-1), Tgt_vocab_size]
gt: [Batch_size, (Seq_len-1)]
'''
if smoothing != None:
confidence = 1 - smoothing
target_vocab_size = pred.size(2)
# generate one-hot vector through the probability that model generates
one_hot = torch.zeros_like(pred, requires_grad=True).scatter(2, gt.unsqueeze(2), 1) # [Batch_size, (Seq_len-1), Tgt_vocab_size]
one_hot = one_hot * confidence + (1 - one_hot) * smoothing / (target_vocab_size - 2) # ['SOS', 'PAD']
log_prob = nn.functional.softmax(one_hot, dim=2)
non_pad_mask = gt.ne(ignore_idx) # where gt != ignore_index
# NLL-Loss: $\sum_x log(p_{theta}(y \rvert x)) q(w)
loss = - (one_hot * log_prob).sum(dim=2) # [Batch_size, (Seq_len-1)]
loss = loss.masked_select(non_pad_mask).mean() # gather all values where mask == True and return 1-D array
else:
loss = nn.functional.cross_entropy(pred, gt, ignore_idx=ignore_idx, reduction='sum')
return loss