Why does my RNN model give a 'nan' loss even with a low lr and clip_grad?

My model looks like this:

import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

class CTCModel(nn.Module):
    def __init__(self,output_size,rnn_hidden_size=128, num_rnn_layers=1, dropout=0):
        super(CTCModel, self).__init__()
        self.num_rnn_layers = num_rnn_layers
        self.rnn_hidden_size = rnn_hidden_size
        self.output_size = output_size
        self.layer1 = nn.Sequential(
            nn.Conv2d(3,32, kernel_size=(3,3),stride=(1,1),padding=(1,1)),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=(3, 4), stride=(3, 2)),
            nn.ReLU(),
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(32,64, kernel_size=(3,3),stride=(1,1),padding=(1,1)),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=(4, 3), stride=(4, 2)),
            nn.ReLU(),
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=(3, 3),stride=(1,1),padding=(1,1)),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=(4, 2), stride=(2, 2)),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=(3, 2), stride=(1, 1)),
            nn.ReLU(),
        )
        self.gru = nn.GRU(128, rnn_hidden_size, num_rnn_layers,
                          batch_first=True,
                          dropout=dropout,bidirectional=True)
        self.linear = nn.Linear(rnn_hidden_size*2,output_size)

    def forward(self, x, hidden):
        h0 = hidden
        out = self.layer1(x)
        out = self.layer2(out)
        # after layer3 the height dim should be 1, so squeeze() leaves (batch, 128, width)
        out = self.layer3(out).squeeze()
        # (batch, 128, width) -> (batch, width, 128): width becomes the GRU's time axis
        out = out.transpose(1, 2)
        out, hidden = self.gru(out, h0)
        out = self.linear(out)
        return out

    def initHidden(self,batch_size,use_cuda=False):
        h0 = Variable(torch.zeros(self.num_rnn_layers*2,batch_size,self.rnn_hidden_size))
        if use_cuda:
            return (h0.cuda())
        else:
            return h0
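
For reference, here is a quick shape check of the model. The 96x128 input size and output_size=11 are placeholder values I picked only to illustrate the shapes; they are chosen so that the conv stack collapses the height to 1 before the GRU:

# Shape check with placeholder sizes: 96x128 input, 11 output classes.
# After layer3 the height is 1, so squeeze()/transpose() hand the GRU
# a (batch, seq_len, 128) tensor.
model = CTCModel(output_size=11)
x = Variable(torch.randn(4, 3, 96, 128))   # (batch, channels, height, width)
h0 = model.initHidden(batch_size=4)
out = model(x, h0)
print(out.size())                          # torch.Size([4, 14, 11])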

Every step of training looks like this:

def CTCtrain(inputs,targets,lens,ctc,ctc_optimizer,criterion,clip,use_cuda=False):
    if use_cuda:
        inputs = inputs.cuda()
    loss = 0
    ctc_optimizer.zero_grad()
    batch_size = inputs.size()[0]
    init_hidden = ctc.initHidden(batch_size,use_cuda=use_cuda)
    ctc_outputs = ctc(inputs,init_hidden)
    ctcloss_inputs = ctc_outputs.transpose(0, 1)  # SeqLen * BatchSize * NumClasses, as CTCLoss expects
    label_lens = lens
    # every sample in the batch is given the full output sequence length
    act_lens = Variable(torch.IntTensor(batch_size * [ctc_outputs.size()[1]]), requires_grad=False)
    loss = criterion(ctcloss_inputs,targets,act_lens,label_lens)

    loss.backward()
    torch.nn.utils.clip_grad_norm(ctc.parameters(), clip)
    ctc_optimizer.step()

    #TODO
    decoded_outputs = decode_ctc_outputs(ctc_outputs)
    decoded_targets = np.split(targets.data.numpy(),lens.data.numpy().cumsum())[:-1]
    accuracy = np.array([np.array_equal(decoded_targets[i],decoded_outputs[i])
                         for i in range(batch_size)]).mean()

    return loss.data[0],accuracy
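
To narrow down where the 'nan' first shows up, a check like the following could be dropped in right after loss.backward() (just a sketch, not part of the training step above):

# Sketch: flag the batch where the loss or any gradient first turns to nan.
if loss.data[0] != loss.data[0]:  # nan is the only value that is not equal to itself
    print('loss became nan on this batch')
for name, param in ctc.named_parameters():
    if param.grad is not None and np.isnan(param.grad.data.cpu().numpy()).any():
        print('nan gradient in', name)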

And the CTCLoss is from here.

I have tried to clip the gradients with torch.nn.utils.clip_grad_norm(ctc.parameters(), clip) with clip=0.5. What is more, my learning rate is 0.0001. However, none of this helps: after several batches the loss becomes 'nan', like below:

epoch train loss: 8023.35400391
epoch train accuracy: 0.0
epoch train loss: 8019.50976562
epoch train accuracy: 0.0
epoch train loss: nan
epoch train accuracy: 0.0
epoch train loss: nan
epoch train accuracy: 0.0

I have spent plenty of time trying to figure this out, for example by changing the structure of my model to use more or fewer channels or more Conv layers. Unfortunately, I am still stuck on this problem. Any advice will be appreciated!
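
One more thing worth checking: torch.nn.utils.clip_grad_norm returns the total gradient norm computed before clipping, so printing it every step would show whether the gradients explode right before the loss turns to 'nan' (sketch only, not in the code above):

# Sketch: log the pre-clip gradient norm returned by clip_grad_norm.
total_norm = torch.nn.utils.clip_grad_norm(ctc.parameters(), clip)
print('pre-clip gradient norm:', total_norm)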

What is your learning rate? It should be small enough to avoid the “loss nan” problem.
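
For example (the optimizer type below is an assumption, since the post does not show how ctc_optimizer is created):

# Assumed optimizer setup; try an even smaller learning rate than 0.0001.
ctc_optimizer = torch.optim.Adam(ctc.parameters(), lr=1e-5)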