LSTM error in second epoch only on GPU

I am running code that works totally fine on the CPU but fails in the second epoch when running on the GPU. I suspect it is a memory issue because it only happens in the second epoch, but I have not been able to pin down where exactly the issue is.

Error:

 for epoch in range(N_EPOCHS):
     30         train_loss = train(seq_model, train_iter, optimizer, criterion, CLIP)
---> 31         valid_loss = evaluate(seq_model, test_iter, criterion)
     32 
     33         if valid_loss < best_valid_loss:

<ipython-input-17-36056d622cf9> in evaluate(seq_model, test_iterator, criterion)
      7         question = batch.question
      8         answer = batch.answer
----> 9         output = seq_model(question,answer)
     10 
     11         loss=criterion(output[1:].view(-1, 2004), answer[1:].view(-1))

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    539             result = self._slow_forward(*input, **kwargs)
    540         else:
--> 541             result = self.forward(*input, **kwargs)
    542         for hook in self._forward_hooks.values():
    543             hook_result = hook(self, input, result)

<ipython-input-15-dc2f26ea3a12> in forward(self, question, answer, teacher_forcing_ratio)
     21         for i in range(1,100):
     22 
---> 23             prediction,enc_hidd_state,enc_hidd_cell = self.decoder(input, enc_hidd_state, enc_hidd_cell)
     24 
     25             outputs[i] = prediction

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    539             result = self._slow_forward(*input, **kwargs)
    540         else:
--> 541             result = self.forward(*input, **kwargs)
    542         for hook in self._forward_hooks.values():
    543             hook_result = hook(self, input, result)

<ipython-input-14-d16c62e83aec> in forward(self, answer, hidden, cell)
     19         f_embed = self.embed(answer)
     20         #print("embed"+str(f_embed.shape))
---> 21         f_lstm,(hidden,cell) = self.lstm(f_embed.float(),(hidden,cell))
     22         #print("lstm"+str(f_lstm.shape))
     23         f_linear = self.linear(f_lstm)

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    539             result = self._slow_forward(*input, **kwargs)
    540         else:
--> 541             result = self.forward(*input, **kwargs)
    542         for hook in self._forward_hooks.values():
    543             hook_result = hook(self, input, result)

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    562             return self.forward_packed(input, hx)
    563         else:
--> 564             return self.forward_tensor(input, hx)
    565 
    566 

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/rnn.py in forward_tensor(self, input, hx)
    541         unsorted_indices = None
    542 
--> 543         output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
    544 
    545         return output, self.permute_hidden(hidden, unsorted_indices)

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/rnn.py in forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices)
    524         if batch_sizes is None:
    525             result = _VF.lstm(input, hx, self._get_flat_weights(), self.bias, self.num_layers,
--> 526                               self.dropout, self.training, self.bidirectional, self.batch_first)
    527         else:
    528             result = _VF.lstm(input, batch_sizes, hx, self._get_flat_weights(), self.bias,

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

Decoder definition:

import torch
import torch.nn as nn


class Decoder(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()

        # 2004-word vocabulary, 600-dimensional pre-trained embeddings (frozen)
        self.embed = nn.Embedding(2004, 600)
        self.embed.weight = nn.Parameter(embedding_matrix, requires_grad=False)

        self.lstm = nn.LSTM(600, 256, bidirectional=False)
        self.linear = nn.Linear(256, 2004)

    def forward(self, answer, hidden, cell):
        f_embed = self.embed(answer)
        # print("embed" + str(f_embed.shape))
        f_lstm, (hidden, cell) = self.lstm(f_embed.float(), (hidden, cell))
        # print("lstm" + str(f_lstm.shape))
        f_linear = self.linear(f_lstm)
        # print("linear" + str(f_linear.shape))
        f_relu = torch.relu(f_linear)

        return f_relu, hidden, cell
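
For context, a single decoding step on this decoder can be exercised like this (a minimal sketch; the random embedding matrix, batch size, and zero initial states are assumptions, not part of the original code):

# Minimal sketch with assumed shapes: one decoding step.
embedding_matrix = torch.randn(2004, 600)                  # assumed pre-trained vectors
decoder = Decoder(embedding_matrix)

batch_size = 32                                            # assumed
input_tok = torch.zeros(1, batch_size, dtype=torch.long)   # (seq_len=1, batch) token indices
hidden = torch.zeros(1, batch_size, 256)                   # (num_layers, batch, hidden_size)
cell = torch.zeros(1, batch_size, 256)

prediction, hidden, cell = decoder(input_tok, hidden, cell)
print(prediction.shape)                                    # torch.Size([1, 32, 2004])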

Any insight on how to resolve this would be highly appreciated.

Do you re-initialize your hidden state after each batch? See this Seq2Seq PyTorch tutorial, specifically the use of the initHidden() methods of the encoder and decoder.

Without re-initializing (or using detach() at the right spot), the backpropagation path of your RNN grows continuously, which will definitely lead to memory issues.
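
For illustration, the state can be handled at the start of each batch either as fresh zeros or by cutting the graph with detach() (a sketch with assumed sizes matching the decoder above: 1 layer, batch size 32, hidden size 256):

import torch

num_layers, batch_size, hidden_size = 1, 32, 256   # assumed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Option 1: start every batch from a fresh zero state
hidden = torch.zeros(num_layers, batch_size, hidden_size, device=device)
cell = torch.zeros(num_layers, batch_size, hidden_size, device=device)

# Option 2: keep the state values but cut the autograd graph,
# so backpropagation cannot reach into earlier batches
hidden, cell = hidden.detach(), cell.detach()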

Thanks for the quick response, Chris.

I did not know about the initialization of the hidden state of the encoder, and I realized I had somehow deleted the torch.no_grad() context during evaluation. It seems both things combined led to the failure.
Corrected both and the code works fine :slight_smile:
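
For anyone who hits the same error, the corrected evaluation loop looks roughly like this (a sketch reconstructed from the traceback above; the eval() call and loss averaging are assumptions):

def evaluate(seq_model, test_iterator, criterion):
    seq_model.eval()
    total_loss = 0.0
    with torch.no_grad():   # the context manager that had accidentally been removed
        for batch in test_iterator:
            question = batch.question
            answer = batch.answer
            output = seq_model(question, answer)
            loss = criterion(output[1:].view(-1, 2004), answer[1:].view(-1))
            total_loss += loss.item()
    return total_loss / len(test_iterator)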