Assertion error (assert cur_offset == offset) in loss.backward()

I am trying to train a GRU-attention model but am running into this assertion error at the point of loss.backward(). Here is my code along with the error. Any help will be appreciated; I am kind of at a loss (pun intended) here.

import torch
import torch.nn as nn
from torch.autograd import Variable

class AttendResistance(nn.Module):
    def __init__(self, nb_classes, nb_tokens, embedding_matrix, embed_dropout_rate=0,
                 final_dropout_rate=0, return_attention=False):
        super(AttendResistance, self).__init__()

        embedding_dim = 20
        hidden_size = 32

        self.embed_dropout_rate = embed_dropout_rate
        self.final_dropout_rate = final_dropout_rate
        self.return_attention = return_attention
        self.hidden_size = hidden_size
        self.nb_classes = nb_classes

        # Embedding layer initialized from a precomputed embedding matrix
        self.embed = nn.Embedding(nb_tokens, embedding_dim)
        self.embed.weight = nn.Parameter(embedding_matrix)
        self.embed_dropout = nn.Dropout2d(embed_dropout_rate)
        # Single-layer, unidirectional GRU; note that nn.GRU's dropout only
        # acts between stacked layers, so there is nothing for it to act on
        # with num_layers=1
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=1, batch_first=True,
                          dropout=0.5, bidirectional=False)
        self.final_drop = nn.Dropout(final_dropout_rate)
        self.linear = nn.Linear(hidden_size, nb_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_seqs):
        x = self.embed(input_seqs)
        x = nn.Tanh()(x)
        x = self.embed_dropout(x)
        x, _ = self.gru(x)
        x = self.final_drop(x)
        # Classify from the hidden state of the last time step
        x = self.linear(x[:, -1, :].float())
        outputs = self.softmax(x)

        if self.return_attention:
            # att_weights is never computed in this snippet
            return outputs, att_weights
        else:
            return outputs

attn_res = AttendResistance(268, 20, embedding_matrix, 0.5, 0.3, True)
attn_res = attn_res.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(attn_res.parameters())

num_epochs = 10
for epoch in range(num_epochs):
    for i, (prot_seqs, labels) in enumerate(train_loader):
        prot_seqs = Variable(prot_seqs.long()).cuda()
        labels = Variable(labels.long()).cuda()

        # Forward + backward + optimize
        optimizer.zero_grad()
        outputs, att_weights = attn_res(prot_seqs)
        # torch.max(labels, 1)[1] converts the label vectors to class indices
        loss = criterion(outputs, torch.max(labels, 1)[1])
        print(outputs)
        print(att_weights)
        print(torch.sum(att_weights))
        print(loss)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(X_train) // batch_size, loss.data[0]))

And here is the error:

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-89-a32cf2edb4cc> in <module>()
     17         print (torch.sum(att_weights))
     18         print (loss)
---> 19         loss.backward()
     20         optimizer.step()
     21 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_graph, create_graph, retain_variables)
    165                 Variable.
    166         """
--> 167         torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
    168 
    169     def register_hook(self, hook):

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
     97 
     98     Variable._execution_engine.run_backward(
---> 99         variables, grad_variables, retain_graph)
    100 
    101 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/function.py in _do_backward(self, gradients, retain_variables)
    333     def _do_backward(self, gradients, retain_variables):
    334         self.retain_variables = retain_variables
--> 335         result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
    336         if not retain_variables:
    337             del self._nested_output

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/function.py in backward(self, *gradients)
    341     def backward(self, *gradients):
    342         nested_gradients = _unflatten(gradients, self._nested_output)
--> 343         result = self.backward_extended(*nested_gradients)
    344         return tuple(_iter_None_tensors(result))
    345 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in backward_extended(self, grad_output, grad_hy)
    333                 output,
    334                 weight,
--> 335                 grad_weight)
    336         else:
    337             grad_weight = [(None,) * len(layer_weight) for layer_weight in weight]

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in backward_weight(fn, input, hx, output, weight, grad_weight)
    466 
    467         # copy the weights from the weight_buf into grad_weight
--> 468         grad_params = get_parameters(fn, handle, dw)
    469         _copyParams(grad_params, grad_weight)
    470         return grad_weight

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in get_parameters(fn, handle, weight_buf)
    169                     layer_params.append(param)
    170                 else:
--> 171                     assert cur_offset == offset
    172 
    173                 cur_offset = offset + filter_dim_a[0]

AssertionError: 

I am a newbie with PyTorch. Because the error does not give any explicit message, I don't know what I am doing wrong here. I am running:

PyTorch 0.3.0.post4
Cuda compilation tools, release 8.0, V8.0.61

The problem is probably the NaNs that are appearing. Do you know why they're showing up?
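You could check where they first appear with something like this (just a quick debugging sketch; has_nan is a hypothetical helper, not part of your code):

# NaN is the only value not equal to itself, so (t != t) flags NaN entries.
# torch.isnan does not exist yet in 0.3, hence the manual check.
def has_nan(t):
    return (t != t).sum() > 0

# e.g. right after the forward pass:
# print(has_nan(outputs.data), has_nan(att_weights.data))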

OK, maybe the NaNs were not related to the model. I ran the data processing pipeline again, and now there are numbers (you can see them in the edited post). But the original problem remains.

Could you please provide a runnable script? I can't really help debug right now because your code doesn't run due to some missing variables (embedding_matrix, train_loader, X_train, batch_size, and the att_weights returned from forward are all undefined in the snippet).

If it helps, it looks like an assert is being triggered in /home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py line 171.

Yeah, I am aware that it is an assertion error. But I don't know what these cur_offset and offset variables are; I have tried to read the code there but have not been able to figure it out.
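In case it helps to narrow things down, here is a minimal sketch of what I think is the relevant path: a single-layer GRU with a non-zero dropout going through the cuDNN backend. This is only my guess at the trigger, not a confirmed diagnosis, and the shapes below are made up for the repro:

import torch
import torch.nn as nn
from torch.autograd import Variable

# Same GRU configuration as in my model: one layer, batch_first, dropout=0.5
gru = nn.GRU(20, 32, num_layers=1, batch_first=True, dropout=0.5).cuda()

x = Variable(torch.randn(4, 10, 20)).cuda()  # (batch, seq_len, embedding_dim)
out, _ = gru(x)
# if the GRU is the culprit, this is where I would expect the assert to fire
out.sum().backward()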