CUDA-related errors while training seq2seq

Hi, I’m encountering various CUDA-related errors such as CUDNN_STATUS_INTERNAL_ERROR: 73, cublas runtime error: 54 and core dumped: 68 while running the same code. These errors occur after training for unfixed (seemingly random) numbers of mini-batches. For the mini-batches I manage to run before the errors appear, the loss decreases, and everything looks normal. Any help would be extremely appreciated!

I am trying to train a seq2seq model, which concatenates the decoder’s output to the encoder’s context vector (the encoder’s final output), and uses this as the input for the decoder during the next time step.

class EncoderRNN(torch.nn.Module):
    def __init__(self, vocab_size, wordvec_dim, hidden_size, batch_size):
        super(EncoderRNN, self).__init__()
        self.vocab_size = vocab_size
        self.wordvec_dim = wordvec_dim
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.h = Variable(torch.randn(2, self.batch_size, self.hidden_size), requires_grad=True)
        if use_cuda:
            self.h = self.h.cuda()
        self.embedding = torch.nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.wordvec_dim)
        self.gru = torch.nn.GRU(input_size=self.wordvec_dim, hidden_size=self.hidden_size, num_layers=2)
    def forward(self, input, input_hidden):
        embedded = self.embedding(input).view(1,self.batch_size,self.wordvec_dim) #(seq_len, batch, input_size)
        output, output_hidden = self.gru(embedded, input_hidden)
        return output, output_hidden #output_hidden will have 2 layers of hidden states

class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, input_size, out_vocab_size, batch_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.out_vocab_size = out_vocab_size
        self.batch_size = batch_size
        self.h = Variable(torch.randn(2,self.batch_size, self.hidden_size), requires_grad=True)
        self.c = Variable(torch.randn(2,self.batch_size, self.hidden_size), requires_grad=True)
        if use_cuda:
            self.h = self.h.cuda()
            self.c = self.c.cuda()
        initial_output = np.zeros((1,self.batch_size,self.out_vocab_size))
        initial_output[:,:,0]= 1 # initialize with 'PAD' token
        initial_output = torch.FloatTensor(initial_output)
        initial_output = Variable(initial_output, requires_grad=False)
        self.initial_output = initial_output
        if use_cuda:
            self.initial_output = self.initial_output.cuda()

        self.LSTM = torch.nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=2)
        self.linear = torch.nn.Linear(self.hidden_size, self.out_vocab_size)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, context_vector, prev_output, input_h, input_c):
        decoder_input =,prev_output), 2)        
        decoder_output, (decoder_hn, decoder_cn) = self.LSTM(decoder_input, (input_h, input_c))
        softmax_output = self.softmax(self.linear(decoder_output)[0])
        return softmax_output, decoder_hn, decoder_cn

And this is how I train the model:

batch_size = 256

vocab_size = 11868
wordvec_dim = 256
encoder_hidden_size = 512

decoder_hidden_size = 512
decoder_output_size = 17
decoder_input_size = decoder_hidden_size + decoder_output_size

learning_rate = 0.01
use_cuda = torch.cuda.is_available()
print('using gpu:', use_cuda)

encoder = EncoderRNN(vocab_size, wordvec_dim, encoder_hidden_size, batch_size)
decoder = DecoderRNN(hidden_size=decoder_hidden_size, input_size=decoder_input_size,
                         out_vocab_size=decoder_output_size, batch_size=batch_size)  
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

criterion = torch.nn.CrossEntropyLoss()
encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

train_loader =, batch_size=batch_size, shuffle=True, drop_last=True)
for i, (input, target) in enumerate(train_loader):
    encoder_h = encoder.h
    input_var = Variable(input).type(torch.LongTensor)
    target_var = Variable(target).type(torch.LongTensor)
    if use_cuda:
        input_var = input_var.cuda()
        target_var = target_var.cuda()
    loss = 0
    for t in range(tokenized_question_text_list.shape[1]): # up to 115
        encoder_output, encoder_hidden = encoder(input_var[:,t], encoder_h)
        encoder_h = encoder_hidden
    decoder_h, decoder_c, prev_output = decoder.h, decoder.c, decoder.initial_output
    for t in range(tokenized_equations.shape[1]): # up to 30
        prev_outputn, decoder_hn, decoder_cn = decoder(encoder_output, prev_output, decoder_h, decoder_c)
        decoder_h = decoder_hn
        decoder_c = decoder_cn
        prev_output = prev_outputn.view(1,batch_size,decoder_output_size)
        loss += criterion(prev_output[0], target_var[:,t])

So I found out that when I set DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=True), my model trains without CUDA errors… maybe an issue with DataLoader's shuffle argument?

Could you run your script again with CUDA_LAUNCH_BLOCKING=1 python This might point to the line, where the error occurred. Could you also post the stack trace after it crashed again?