Replacing warp-ctc with PyTorch CTCLoss yields only blank label predictions

Hey there !

I’m trying to adapt some code that uses warp-ctc. When I replace it with the built-in PyTorch CTCLoss, I see strange behaviour during training: after a few batches in the first epoch, the network predicts only blank labels.


My PyTorch versions:

  • pytorch = 1.4.0
  • torchvision = 0.5.0

Here is the code which defines the architecture of the network :

import torch
from torch import nn
import torch.nn.functional as F

class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear projection of every time step.

    Args:
        nIn: feature size of each input step (first argument of ``nn.LSTM``).
        nHidden: hidden size of each LSTM direction.
        nOut: feature size produced by the linear layer.
    """

    def __init__(self, nIn, nHidden, nOut):
        super().__init__()

        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        # Forward and backward states are concatenated, hence nHidden * 2.
        self.embedding = nn.Linear(nHidden * 2, nOut)

    def forward(self, input):
        """Run the LSTM and project each step to ``nOut`` features.

        The LSTM output is unpacked as ``[T, b, h]``; the result keeps the
        first two dimensions and has ``nOut`` features in the last one.
        """
        lstm_out, _state = self.rnn(input)
        steps, batch, features = lstm_out.size()

        # Collapse time and batch so the linear layer sees a 2-D matrix,
        # then restore the [T, b, nOut] layout.
        projected = self.embedding(lstm_out.view(steps * batch, features))
        return projected.view(steps, batch, -1)

class CRNN(nn.Module):
    """Convolutional-recurrent network with a softmax output.

    A seven-layer CNN extracts features, two stacked ``BidirectionalLSTM``
    blocks process them as a sequence along the width axis, and a softmax
    is applied over the class dimension.

    Args:
        cnnOutSize: feature size handed to the first recurrent block. It has
            to equal channels * height of the CNN output, because ``forward``
            flattens those two axes together.
        nc: number of channels of the input images.
        nclass: number of output classes.
        nh: hidden size of the LSTMs (also the width between the two blocks).
        n_rnn: accepted but not used by this implementation.
        leakyRelu: use ``LeakyReLU(0.2)`` instead of ``ReLU`` after each conv.
    """

    def __init__(self, cnnOutSize, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super().__init__()

        # Per-layer convolution settings, indexed by conv layer number.
        kernel_sizes = (3, 3, 3, 3, 3, 3, 2)
        paddings = (1, 1, 1, 1, 1, 1, 0)
        strides = (1, 1, 1, 1, 1, 1, 1)
        out_channels = (64, 128, 256, 256, 512, 512, 512)

        # Conv layers that are followed by batch normalisation.
        with_batchnorm = (2, 4, 6)

        # conv index -> (pooling index, MaxPool2d positional arguments).
        # The last two poolings use stride 2 along height but 1 along width.
        pool_after = {
            0: (0, (2, 2)),
            1: (1, (2, 2)),
            3: (2, ((2, 2), (2, 1), (0, 1))),
            5: (3, ((2, 2), (2, 1), (0, 1))),
        }

        cnn = nn.Sequential()
        in_channels = nc
        for idx, width in enumerate(out_channels):
            cnn.add_module(
                'conv{0}'.format(idx),
                nn.Conv2d(in_channels, width, kernel_sizes[idx],
                          strides[idx], paddings[idx]))
            if idx in with_batchnorm:
                cnn.add_module('batchnorm{0}'.format(idx),
                               nn.BatchNorm2d(width))
            if leakyRelu:
                activation = nn.LeakyReLU(0.2, inplace=True)
            else:
                activation = nn.ReLU(True)
            cnn.add_module('relu{0}'.format(idx), activation)
            if idx in pool_after:
                pool_idx, pool_args = pool_after[idx]
                cnn.add_module('pooling{0}'.format(pool_idx),
                               nn.MaxPool2d(*pool_args))
            in_channels = width

        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(cnnOutSize, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        """Return per-step class probabilities laid out as [w, b, nclass].

        Note that the result is already softmax-normalised, i.e. these are
        probabilities, not logits or log-probabilities.
        """
        features = self.cnn(input)
        batch, _channels, _height, width = features.size()

        # Merge channels and height into one feature axis, then put the
        # width axis first so it acts as the sequence dimension: [w, b, c*h].
        sequence = features.view(batch, -1, width).permute(2, 0, 1)

        return F.softmax(self.rnn(sequence), dim=2)

And here is my training skeleton:

(...) # Dataloader etc.

# Set network
crnn = CRNN(cnnOutSize=512, nc=3, nclass=num_of_outputs, nh=512)

# Loss and optimizer (CTCLoss is the built-in torch.nn loss)
criterion = nn.CTCLoss(zero_infinity=True)
optimizer = torch.optim.Adam(crnn.parameters())

# Training
for epoch in range(1000):

    print("Epoch", epoch)

    for batch in train_dataloader:

        # Extract inputs from the batch
        line_imgs = batch['line_imgs']
        labels = batch['labels']
        label_lengths = batch['label_lengths']

        # Compute nn prediction: softmax probabilities laid out as [T, N, C]
        preds = crnn(line_imgs)

        # Compute loss
        batch_size = preds.size(1)
        # Every sample uses the full output length of the network.
        preds_lengths = torch.full((batch_size,), preds.size(0),
                                   dtype=torch.int32)
        # CTCLoss expects log-probabilities. CRNN.forward returns softmax
        # probabilities, and a probability that underflows to exactly 0 would
        # give log(0) = -inf and a NaN gradient through log(), so clamp to the
        # smallest positive normal value before taking the log.
        # NOTE(review): returning F.log_softmax from the model would be the
        # numerically cleaner option, but it changes what forward() returns.
        min_prob = torch.finfo(preds.dtype).tiny
        log_probs = preds.clamp(min=min_prob).log()
        loss = criterion(log_probs, labels, preds_lengths, label_lengths)

        # Update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # CER computation with naive Decode
        # TNC -> NTC; .cpu() keeps this working when the model is not on CPU.
        preds_np = preds.detach().permute(1, 0, 2).cpu().numpy()
        cers = []
        for line_probs, gt_line in zip(preds_np, batch['gt']):
            pred, raw_pred = string_utils.naive_decode(line_probs)
            pred_str = string_utils.label2str_single(pred, idx_to_char, False)
            print(pred_str,'/', gt_line)
            cers.append(error_rates.cer(gt_line, pred_str))

        # Report the mean CER over the batch rather than only the last line's;
        # NaN when the batch carries no ground-truth lines.
        cer = sum(cers) / len(cers) if cers else float('nan')

        print("Loss", loss.item(), 'CER', cer)
        print()

And here is a sample from my training set (from the READ dataset):

line_imgs : torch.Size([3, 32, 640])
labels : tensor([71, 75, 73, 73, 69, 79, 79, 69, 75, 74,  1, 61, 74, 67, 65, 66, 65, 78,
        80, 69, 67, 80, 65, 74,  1, 41, 75, 79, 80, 65, 74, 82, 75, 78, 61, 74,
        79, 63, 68, 72, 61, 67, 65,  1,  9, 34, 65, 86, 65, 73, 62, 65, 78,  1,
        18, 25, 24, 23, 10,  1, 65, 74, 80, 68, 61, 72, 80, 65, 74],
       dtype=torch.int32)
gt : kommission angefertigten Kostenvoranschlage (Dezember 1876) enthalten

I’m using a batch size of one to rule out any effect from padding. But with a batch size greater than one and padding, the behaviour is the same (only blank labels are predicted). Changing the optimizer and the learning rate makes no difference either.

Any ideas on how to solve this?
Thank you in advance !

I have the same problem, but when I increase the learning rate, the behaviour is different.