I am running code that works totally fine on CPU but fails in the second epoch when running on GPU. I suspect it is a memory issue because it happens in the second epoch, but I am not really able to pin down where exactly the issue is.
Error:
for epoch in range(N_EPOCHS):
30 train_loss = train(seq_model, train_iter, optimizer, criterion, CLIP)
---> 31 valid_loss = evaluate(seq_model, test_iter, criterion)
32
33 if valid_loss < best_valid_loss:
<ipython-input-17-36056d622cf9> in evaluate(seq_model, test_iterator, criterion)
7 question = batch.question
8 answer = batch.answer
----> 9 output = seq_model(question,answer)
10
11 loss=criterion(output[1:].view(-1, 2004), answer[1:].view(-1))
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
<ipython-input-15-dc2f26ea3a12> in forward(self, question, answer, teacher_forcing_ratio)
21 for i in range(1,100):
22
---> 23 prediction,enc_hidd_state,enc_hidd_cell = self.decoder(input, enc_hidd_state, enc_hidd_cell)
24
25 outputs[i] = prediction
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
<ipython-input-14-d16c62e83aec> in forward(self, answer, hidden, cell)
19 f_embed = self.embed(answer)
20 #print("embed"+str(f_embed.shape))
---> 21 f_lstm,(hidden,cell) = self.lstm(f_embed.float(),(hidden,cell))
22 #print("lstm"+str(f_lstm.shape))
23 f_linear = self.linear(f_lstm)
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
562 return self.forward_packed(input, hx)
563 else:
--> 564 return self.forward_tensor(input, hx)
565
566
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/rnn.py in forward_tensor(self, input, hx)
541 unsorted_indices = None
542
--> 543 output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
544
545 return output, self.permute_hidden(hidden, unsorted_indices)
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/rnn.py in forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices)
524 if batch_sizes is None:
525 result = _VF.lstm(input, hx, self._get_flat_weights(), self.bias, self.num_layers,
--> 526 self.dropout, self.training, self.bidirectional, self.batch_first)
527 else:
528 result = _VF.lstm(input, batch_sizes, hx, self._get_flat_weights(), self.bias,
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Decoder definition:
class Decoder(nn.Module):
    """One-step seq2seq decoder: frozen pretrained embedding -> LSTM -> linear.

    Vocabulary size is 2004, embedding dim 600, LSTM hidden dim 256.
    """

    def __init__(self, embedding_matrix):
        """embedding_matrix: (2004, 600) tensor of pretrained embeddings."""
        super().__init__()
        # Freeze the pretrained embeddings and cast them to float32 ONCE here,
        # instead of casting the embedding output on every forward step. A
        # float64 embedding matrix fed into a float32 LSTM is one common source
        # of opaque cuDNN failures on GPU.
        self.embed = nn.Embedding.from_pretrained(embedding_matrix.float(), freeze=True)
        self.lstm = nn.LSTM(600, 256, bidirectional=False)
        self.linear = nn.Linear(256, 2004)

    def forward(self, answer, hidden, cell):
        """Run one decoding step.

        answer: LongTensor of token indices, shape (seq_len, batch).
            Indices MUST lie in [0, 2003]; an out-of-range index raises a
            clear error on CPU but on GPU often surfaces later as an opaque
            "cuDNN error: CUDNN_STATUS_EXECUTION_FAILED" — worth verifying
            for data that first appears in the second epoch.
        hidden, cell: LSTM state, each of shape (1, batch, 256).
        Returns (scores, hidden, cell); scores has shape (seq_len, batch, 2004).
        """
        f_embed = self.embed(answer)
        # Weights are already float32, so no per-step .float() cast is needed.
        f_lstm, (hidden, cell) = self.lstm(f_embed, (hidden, cell))
        f_linear = self.linear(f_lstm)
        # NOTE(review): applying ReLU to logits that later feed a
        # CrossEntropyLoss zeroes every negative score and usually hurts
        # training — consider returning f_linear directly. Kept here to
        # preserve the existing output contract.
        f_relu = torch.relu(f_linear)
        return f_relu, hidden, cell
Any insight on how to resolve this would be highly appreciated.