Proper way to use torch.nn.CTCLoss

A follow-up with an additional observation and minimal code to reproduce it: I create a "perfect" prediction by one-hot-encoding the label y, and an all-blank prediction. Both are batches of a single datum. Strangely, the loss of the perfect prediction is higher than the all-blank one when input_length is high, but lower when input_length is low. Is this expected?

import torch
import torch.nn.functional as F

T = 189
n_class = 80
y = torch.tensor([[55, 43, 40, 62, 41, 44, 53, 40, 62, 41, 50, 53, 62, 58, 36, 54, 43, 44, 49, 42]])
output_length = torch.tensor(y.shape[1])

pred_model_idx = (n_class - 1) * torch.ones(T, dtype=torch.long)  # every frame predicts the blank class (index 79)
pred_perf_idx = torch.cat([y[0], (n_class - 1) * torch.ones(T - y.shape[1], dtype=torch.long)])  # the first 20 frames match y exactly, then padded with blanks
pred_model = torch.eye(n_class)[pred_model_idx].unsqueeze(1)  # one-hot encoding, shape (T, 1, n_class)
pred_perf = torch.eye(n_class)[pred_perf_idx].unsqueeze(1)  # one-hot encoding, shape (T, 1, n_class)

for input_length in [torch.tensor(y.shape[1]), torch.tensor(T)]:
    print("=============\ninput length:", input_length)
    print("perfect loss:", F.ctc_loss(F.log_softmax(pred_perf, dim=2), y, input_length, output_length, n_class-1, 'none', True))
    print("all_blank loss:", F.ctc_loss(F.log_softmax(pred_model, dim=2), y, input_length, output_length, n_class-1, 'none', True))

# OUTPUT
# =============
# input length: tensor(20)
# perfect loss: tensor([68.0656])
# all_blank loss: tensor([88.0655])
# =============
# input length: tensor(189)
# perfect loss: tensor([605.4802])
# all_blank loss: tensor([593.8168])
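
For what it's worth, the "perfect" prediction here is not actually confident: log_softmax of a one-hot logit row assigns the hot class a log-probability of only 1 - log(e + n_class - 1) ≈ -3.40 (about p ≈ 0.033), so even a frame-wise perfect argmax pays a sizeable per-frame cost. A quick sanity check, appended to the script above (per_frame_max is just a name I chose):

# How confident is the "perfect" one-hot prediction per frame?
# log_softmax of a one-hot row peaks at 1 - log(e + n_class - 1), not at 0.
per_frame_max = F.log_softmax(pred_perf, dim=2).max(dim=2).values.squeeze(1)
print(per_frame_max[0].item())  # ≈ -3.4033, i.e. p ≈ 0.033 for the argmax class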