Hi, I’m encountering various CUDA-related errors, such as CUDNN_STATUS_INTERNAL_ERROR, cublas runtime error, and core dumped, while running the same code. The errors occur after training for an unfixed (seemingly random) number of mini-batches. For the mini-batches I do manage to run before an error appears, the loss decreases and everything looks normal. Any help would be extremely appreciated!
I am trying to train a seq2seq model that concatenates the decoder’s previous output with the encoder’s context vector (the encoder’s final output) and feeds the result to the decoder as its input at the next time step.
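To make the intended shapes concrete, here is a minimal stand-alone sketch of just that concatenation step (the sizes mirror the settings I use further down, and the random tensors are only placeholders):

import torch

batch_size, hidden_size, out_vocab_size = 256, 512, 17

# encoder context vector (the encoder's output at its last time step): (seq_len=1, batch, hidden_size)
context_vector = torch.randn(1, batch_size, hidden_size)
# decoder output from the previous time step: (seq_len=1, batch, out_vocab_size)
prev_output = torch.randn(1, batch_size, out_vocab_size)

# concatenating along the feature dimension gives (1, batch, hidden_size + out_vocab_size),
# which is why the decoder's input_size is hidden_size + out_vocab_size below
decoder_input = torch.cat((context_vector, prev_output), 2)
print(decoder_input.size())  # torch.Size([1, 256, 529])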
import numpy as np
import torch
from torch.autograd import Variable


class EncoderRNN(torch.nn.Module):
    def __init__(self, vocab_size, wordvec_dim, hidden_size, batch_size):
        super(EncoderRNN, self).__init__()
        self.vocab_size = vocab_size
        self.wordvec_dim = wordvec_dim
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        # initial hidden state for the 2-layer GRU (use_cuda is set in the training script below)
        self.h = Variable(torch.randn(2, self.batch_size, self.hidden_size), requires_grad=True)
        if use_cuda:
            self.h = self.h.cuda()
        self.embedding = torch.nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.wordvec_dim)
        self.gru = torch.nn.GRU(input_size=self.wordvec_dim, hidden_size=self.hidden_size, num_layers=2)

    def forward(self, input, input_hidden):
        # input holds one token per example; embed it and add a seq_len dimension of 1
        embedded = self.embedding(input).view(1, self.batch_size, self.wordvec_dim)  # (seq_len, batch, input_size)
        output, output_hidden = self.gru(embedded, input_hidden)
        return output, output_hidden  # output_hidden contains the hidden states of both GRU layers
class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, input_size, out_vocab_size, batch_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.out_vocab_size = out_vocab_size
        self.batch_size = batch_size
        # initial hidden and cell states for the 2-layer LSTM
        self.h = Variable(torch.randn(2, self.batch_size, self.hidden_size), requires_grad=True)
        self.c = Variable(torch.randn(2, self.batch_size, self.hidden_size), requires_grad=True)
        if use_cuda:
            self.h = self.h.cuda()
            self.c = self.c.cuda()
        # the first "previous output" fed to the decoder is initialized to the 'PAD' token
        initial_output = np.zeros((1, self.batch_size, self.out_vocab_size))
        initial_output[:, :, 0] = 1
        initial_output = torch.FloatTensor(initial_output)
        self.initial_output = Variable(initial_output, requires_grad=False)
        if use_cuda:
            self.initial_output = self.initial_output.cuda()
        self.LSTM = torch.nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=2)
        self.linear = torch.nn.Linear(self.hidden_size, self.out_vocab_size)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, context_vector, prev_output, input_h, input_c):
        # concatenate the encoder context vector with the previous decoder output along the feature dimension
        decoder_input = torch.cat((context_vector, prev_output), 2)
        decoder_output, (decoder_hn, decoder_cn) = self.LSTM(decoder_input, (input_h, input_c))
        softmax_output = self.softmax(self.linear(decoder_output)[0])
        return softmax_output, decoder_hn, decoder_cn
And this is how I train the model:
batch_size = 256
vocab_size = 11868
wordvec_dim = 256
encoder_hidden_size = 512
decoder_hidden_size = 512
decoder_output_size = 17
decoder_input_size = decoder_hidden_size + decoder_output_size
learning_rate = 0.01
use_cuda = torch.cuda.is_available()
print('using gpu:', use_cuda)
encoder = EncoderRNN(vocab_size, wordvec_dim, encoder_hidden_size, batch_size)
decoder = DecoderRNN(hidden_size=decoder_hidden_size, input_size=decoder_input_size,
                     out_vocab_size=decoder_output_size, batch_size=batch_size)
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
criterion = torch.nn.CrossEntropyLoss()
encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
for i, (input, target) in enumerate(train_loader):
    # reset the encoder hidden state to its initial value for each mini-batch
    encoder_h = encoder.h
    input_var = Variable(input).type(torch.LongTensor)
    target_var = Variable(target).type(torch.LongTensor)
    if use_cuda:
        input_var = input_var.cuda()
        target_var = target_var.cuda()
    loss = 0
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    # feed the question tokens through the encoder one time step at a time
    for t in range(tokenized_question_text_list.shape[1]):  # up to 115
        encoder_output, encoder_hidden = encoder(input_var[:, t], encoder_h)
        encoder_h = encoder_hidden
    # decode, feeding each previous output back in together with the context vector
    decoder_h, decoder_c, prev_output = decoder.h, decoder.c, decoder.initial_output
    for t in range(tokenized_equations.shape[1]):  # up to 30
        prev_outputn, decoder_hn, decoder_cn = decoder(encoder_output, prev_output, decoder_h, decoder_c)
        decoder_h = decoder_hn
        decoder_c = decoder_cn
        prev_output = prev_outputn.view(1, batch_size, decoder_output_size)
        loss += criterion(prev_output[0], target_var[:, t])
    print(i, loss.data[0])
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()