Hey there !
I’m trying to adapt some code that uses warp-ctc. When I replace it with the built-in PyTorch CTCLoss, I see strange behaviour during training: after a few batches in the first epoch, the network predicts only blank labels.
My PyTorch versions:
- pytorch = 1.4.0
- torchvision = 0.5.0
Here is the code that defines the network architecture:
import torch
from torch import nn
import torch.nn.functional as F
class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear projection at every time step.

    The LSTM is built with ``bidirectional=True`` and the default
    ``batch_first=False``, so the forward pass works on sequence-first
    tensors and the linear layer consumes ``2 * nHidden`` features
    (forward and backward states concatenated).

    Args:
        nIn: Number of input features per time step.
        nHidden: Hidden size of each LSTM direction.
        nOut: Number of output features per time step.
    """

    def __init__(self, nIn, nHidden, nOut):
        super(BidirectionalLSTM, self).__init__()
        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        # Both directions are concatenated on the feature axis.
        self.embedding = nn.Linear(2 * nHidden, nOut)

    def forward(self, input):
        """Run the LSTM and project each time step to ``nOut`` features.

        Args:
            input: Sequence tensor accepted by ``nn.LSTM``.

        Returns:
            Tensor with the same two leading dimensions as the LSTM
            output and ``nOut`` features in the last dimension.
        """
        lstm_out, _state = self.rnn(input)
        seq_len, batch, feat = lstm_out.size()

        # Collapse time and batch so the linear layer sees a 2-D matrix,
        # then restore the original (time, batch, features) layout.
        flat = lstm_out.view(seq_len * batch, feat)
        projected = self.embedding(flat)  # [seq_len * batch, nOut]
        return projected.view(seq_len, batch, -1)
class CRNN(nn.Module):
    """Convolutional-recurrent network producing per-column class scores.

    A seven-layer CNN extracts a feature map, whose channel and height
    axes are flattened into one feature vector per image column. Two
    stacked ``BidirectionalLSTM`` layers then map every column to
    ``nclass`` scores.

    Args:
        cnnOutSize: Feature size fed to the first recurrent layer. It must
            equal ``512 * h`` where ``h`` is the height of the CNN output,
            because ``forward`` flattens channels and height together.
        nc: Number of channels of the input images.
        nclass: Number of output classes per time step.
        nh: Hidden size of the recurrent layers.
        n_rnn: Unused; kept so existing callers do not break.
        leakyRelu: Use ``LeakyReLU(0.2)`` instead of ``ReLU`` after each
            convolution.
    """

    def __init__(self, cnnOutSize, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(CRNN, self).__init__()

        # Per-layer convolution hyper-parameters (index = layer number).
        kernel_sizes = [3, 3, 3, 3, 3, 3, 2]
        paddings = [1, 1, 1, 1, 1, 1, 0]
        strides = [1, 1, 1, 1, 1, 1, 1]
        channels = [64, 128, 256, 256, 512, 512, 512]

        cnn = nn.Sequential()

        def convRelu(i, batchNormalization=False):
            """Append conv (+ optional batch norm) + activation number ``i``."""
            nIn = nc if i == 0 else channels[i - 1]
            nOut = channels[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, kernel_sizes[i],
                                     strides[i], paddings[i]))
            if batchNormalization:
                cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            if leakyRelu:
                cnn.add_module('relu{0}'.format(i),
                               nn.LeakyReLU(0.2, inplace=True))
            else:
                cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        # Module names are kept stable so saved state_dicts stay loadable.
        convRelu(0)
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # H/2, W/2
        convRelu(1)
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # H/2, W/2
        convRelu(2, True)
        convRelu(3)
        # Stride (2, 1) with padding (0, 1): halves the height only, so the
        # horizontal resolution (the future time axis) is preserved.
        cnn.add_module('pooling{0}'.format(2),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))
        convRelu(4, True)
        convRelu(5)
        cnn.add_module('pooling{0}'.format(3),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))
        convRelu(6, True)  # 2x2 kernel without padding: H-1, W-1

        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(cnnOutSize, nh, nh),
            BidirectionalLSTM(nh, nclass if False else nh, nclass))

    def forward(self, input, log_probs=False):
        """Compute per-column class probabilities for a batch of images.

        Args:
            input: Image batch accepted by the CNN (4-D, ``nc`` channels).
            log_probs: When true, return log-probabilities computed with
                ``log_softmax`` instead of probabilities. This is the
                numerically stable input for ``nn.CTCLoss``; calling
                ``.log()`` on the softmax output can yield ``-inf`` and
                NaN gradients when a probability underflows to zero.
                Defaults to ``False`` (plain softmax probabilities).

        Returns:
            Tensor of shape ``[w, b, nclass]`` where ``w`` is the width of
            the CNN feature map and ``b`` the batch size.
        """
        # conv features
        conv = self.cnn(input)
        b, c, h, w = conv.size()
        # Merge channels and height into one feature axis per column.
        conv = conv.view(b, -1, w)
        conv = conv.permute(2, 0, 1)  # [w, b, c * h]

        # rnn features
        output = self.rnn(conv)

        # Normalise over the class axis.
        if log_probs:
            return F.log_softmax(output, dim=2)
        return F.softmax(output, dim=2)
And here is my training skeleton:
(...) # Dataloader etc.
# Set network
crnn = CRNN(cnnOutSize=512, nc=3, nclass=num_of_outputs, nh=512)

# Loss and optimizer
# The built-in CTC loss expects log-probabilities; it is referenced through
# the already-imported ``torch`` package rather than as a bare name.
criterion = torch.nn.CTCLoss(zero_infinity=True)
optimizer = torch.optim.Adam(crnn.parameters())

# Lower bound applied to probabilities before the log. The network returns
# softmax probabilities, and log(0) would give -inf and NaN gradients.
# NOTE(review): feeding log_softmax of the raw scores to the loss would be
# the more accurate fix if the model can expose them.
MIN_PROB = 1e-12

# Training
for epoch in range(1000):
    print("Epoch", epoch)
    for i, x in enumerate(train_dataloader):
        # Extract inputs from the batch
        line_imgs = x['line_imgs']
        labels = x['labels']
        label_lengths = x['label_lengths']

        # Compute nn prediction (time-major: preds.size(1) is the batch size)
        preds = crnn(line_imgs)

        # Compute loss; every sample uses the full output length.
        batch_size = preds.size(1)
        preds_lengths = torch.full((batch_size,), preds.size(0),
                                   dtype=torch.int32)
        log_preds = preds.clamp(min=MIN_PROB).log()
        loss = criterion(log_preds, labels, preds_lengths, label_lengths)

        # Update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # CER computation with naive Decode
        preds = preds.permute(1, 0, 2).detach().numpy()  # TNC -> NTC
        cers = []
        # A distinct index name keeps the outer batch index ``i`` intact.
        for line_idx, gt_line in enumerate(x['gt']):
            probs = preds[line_idx, :]
            pred, _raw_pred = string_utils.naive_decode(probs)
            pred_str = string_utils.label2str_single(pred, idx_to_char, False)
            print(pred_str, '/', gt_line)
            cers.append(error_rates.cer(gt_line, pred_str))

        # Report the batch mean rather than only the last line's CER, and
        # stay defined when the batch carries no ground-truth lines.
        mean_cer = sum(cers) / len(cers) if cers else float('nan')
        print("Loss", loss.item(), 'CER', mean_cer)
        print()
And here is a sample from my training set (from the READ dataset):
line_imgs : torch.Size([3, 32, 640])
labels : tensor([71, 75, 73, 73, 69, 79, 79, 69, 75, 74, 1, 61, 74, 67, 65, 66, 65, 78,
80, 69, 67, 80, 65, 74, 1, 41, 75, 79, 80, 65, 74, 82, 75, 78, 61, 74,
79, 63, 68, 72, 61, 67, 65, 1, 9, 34, 65, 86, 65, 73, 62, 65, 78, 1,
18, 25, 24, 23, 10, 1, 65, 74, 80, 68, 61, 72, 80, 65, 74],
dtype=torch.int32)
gt : kommission angefertigten Kostenvoranschlage (Dezember 1876) enthalten
I’m using a batch size of one to rule out any effect from padding, but with a batch size greater than one (and padding) the behaviour is the same (blank-label predictions). Changing the optimizer or the learning rate makes no difference either.
Any idea how to solve this?
Thank you in advance !