I also encountered a similar problem (i.e. only predicting blank
). Additionally I found that a nearly perfect prediction has higher loss than the predict_all_blank
. (I also pre-pad the ylabel
s with blank). Hereâs the setup for replication, Iâm wondering if Iâve used it properly or there might be a bug? Please let me know if additional info is needed. Really appreciate it!
Here are the observations, note loss from perfect prediction is higher than that from all_blank, with/without reduction='none'
. Additionally see bottom for experiment setup:
# print loss
tloss = torch.nn.CTCLoss(blank=79, zero_infinity=False, reduction='none')
print('Perfect prediction:\n', tloss(pred_perf, batch_y_cat, inputls, outputls))
print('Model prediction:\n',tloss(pred_model, batch_y_cat, inputls, outputls))
# output
# Perfect prediction:
# tensor([110.0361, 109.6828, 107.2605], device='cuda:0')
# Model prediction:
# tensor([86.3294, 90.4917, 38.5629], device='cuda:0')
# print the predicted raw results
tloss = torch.nn.CTCLoss(blank=79, zero_infinity=False, reduction='mean')
for idx in range(3):
print('=========================================')
print('Prediction - perfect prediction')
print(pred_perf.argmax(dim=2).permute((1,0))[idx])
print("loss:", tloss(pred_perf[:,idx, :].unsqueeze(1), batch_y[idx].unsqueeze(0), inputls[idx], outputls[idx]))
print('--------')
print('Prediction - model')
print(pred_model.argmax(dim=2).permute(1,0)[idx])
print("loss:", tloss(pred_model[:,idx, :].unsqueeze(1), batch_y[idx].unsqueeze(0), inputls[idx], outputls[idx]))
print('--------')
print('Ground Truth')
print(batch_y[idx])
print('Unpadded ground truth')
unpad_y = batch_y[idx][: outputls[idx]]
print(unpad_y)
# output
# =========================================
# Prediction - perfect prediction
# tensor([55, 43, 40, 62, 41, 44, 53, 40, 62, 41, 50, 53, 62, 58, 36, 54, 43, 44,
# 49, 42, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79], device='cuda:0')
# loss: tensor(30.2797, device='cuda:0')
# --------
# Prediction - model
# tensor([79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79], device='cuda:0')
# loss: tensor(3.2668, device='cuda:0')
# --------
# Ground Truth
# tensor([55, 43, 40, 62, 41, 44, 53, 40, 62, 41, 50, 53, 62, 58, 36, 54, 43, 44,
# 49, 42, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79],
# device='cuda:0')
# Unpadded ground truth
# tensor([55, 43, 40, 62, 41, 44, 53, 40, 62, 41, 50, 53, 62, 58, 36, 54, 43, 44,
# 49, 42], device='cuda:0')
# =========================================
# Prediction - perfect prediction
# tensor([42, 50, 62, 48, 56, 38, 43, 62, 41, 56, 53, 55, 43, 40, 53, 62, 55, 43,
# 36, 49, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79], device='cuda:0')
# loss: tensor(30.3025, device='cuda:0')
# --------
# Prediction - model
# tensor([79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79], device='cuda:0')
# loss: tensor(3.4614, device='cuda:0')
# --------
# Ground Truth
# tensor([42, 50, 62, 48, 56, 38, 43, 62, 41, 56, 53, 55, 43, 40, 53, 62, 55, 43,
# 36, 49, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79],
# device='cuda:0')
# Unpadded ground truth
# tensor([42, 50, 62, 48, 56, 38, 43, 62, 41, 56, 53, 55, 43, 40, 53, 62, 55, 43,
# 36, 49], device='cuda:0')
# =========================================
# Prediction - perfect prediction
# tensor([36, 62, 38, 50, 48, 51, 47, 40, 55, 40, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79], device='cuda:0')
# loss: tensor(61.9394, device='cuda:0')
# --------
# Prediction - model
# tensor([79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79], device='cuda:0')
# loss: tensor(3.8248, device='cuda:0')
# --------
# Ground Truth
# tensor([36, 62, 38, 50, 48, 51, 47, 40, 55, 40, 79, 79, 79, 79, 79, 79, 79, 79,
# 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79],
# device='cuda:0')
# Unpadded ground truth
# tensor([36, 62, 38, 50, 48, 51, 47, 40, 55, 40], device='cuda:0')
# create perfect prediction and model prediction
# key parameters
time_steps = 189
n_class = 80
blank_idx = 79
# construct the perfect prediction based on ground truth
eps = 0.0001
B, curr_t = batch_y.shape
batch_y = batch_y.cpu()
m_fit = torch.cat([torch.zeros(time_steps-curr_t, blank_idx), torch.ones(time_steps-curr_t, 1), torch.zeros(time_steps-curr_t, n_class-1-blank_idx)], dim=1)
pred_perf_prob = torch.stack([(torch.cat([torch.eye(n_class)[batch_y[i]], m_fit], dim=0)*(1-eps*n_class)+eps) for i in range(len(batch_y))], dim=0).to(device) # (B, T, n_class)
pred_perf = torch.nn.functional.log_softmax(pred_perf_prob, dim=2).permute((1,0,2)) # (T, B, n_class)
# get model prediction, which predicts all blank
with torch.no_grad():
pred_model, inputs = model.network(batch_x) # (T, B, n_class)
inputls = torch.full(size=(B,), fill_value=time_steps, dtype=torch.long).to(device)
outputls = (torch.sum(batch_y != 79, dim=1)).to(torch.long).to(device) #tensor([20, 20, 10])
Env: PyTorch 1.1, CUDA 9