I am getting the following error while experimenting with a simple lstm layer. The forward propagation is fine but while computing loss.backward() I am getting the following error.
raise RuntimeError(‘Gradients aren't CUDA tensors’)
RuntimeError: Gradients aren’t CUDA tensors
# Zero gradients of both optimizers
encoderchar_optimizer.zero_grad()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
encoder_input = Variable(torch.FloatTensor(len(input_batch),batch_size,500))
for ix , w in enumerate(input_batch):
w = w.contiguous().view(15,batch_size)
reshaped_input_length = [ x[ix] for x in input_batch_length] # [15 ,.. 30 times] * 128
if USE_CUDA:
w = w.cuda()
#reshaped_input_length = Variable(torch.LongTensor(reshaped_input_length )).cuda()
hidden_all , output = encoderchar(w,reshaped_input_length)
encoder_input[ix] = output.transpose(0,1).contiguous().view(batch_size,-1)
if USE_CUDA:
encoder_input = encoder_input.cuda()
temporary_target_batch_length = [15] * batch_size
#if USE_CUDA:
#target_batch_length = Variable(torch.LongTensor(target_batch_length )).cuda()
encoder_hidden_all, encoder_output = encoder(encoder_input, target_batch_length)
decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
decoder_hidden = encoder_output
max_target_length = max(temporary_target_batch_length)
all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))
# Move new Variables to CUDA
if USE_CUDA:
decoder_input = decoder_input.cuda()
all_decoder_outputs = all_decoder_outputs.cuda()
target_batch = target_batch.cuda()
##Added by Satish
encoder_hidden_all = encoder_hidden_all.cuda()
encoder_output = encoder_output.cuda()
decoder_hidden = decoder_hidden.cuda()
# Run through decoder one time step at a time
for t in range(max_target_length):
decoder_output, decoder_hidden, decoder_attn = decoder(
decoder_input, decoder_hidden, encoder_hidden_all
)
all_decoder_outputs[t] = decoder_output
decoder_input = target_batch[t] # Next input is current target
if USE_CUDA:
decoder_input = decoder_input.cuda()
if USE_CUDA:
all_decoder_outputs = all_decoder_outputs.cuda()
Loss calculation and backpropagation
loss = masked_cross_entropy(
all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
target_batch.transpose(0, 1).contiguous(), # -> batch x seq
target_batch_length
)
loss.backward()
# Clip gradient norms
ecc = torch.nn.utils.clip_grad_norm(encoderchar.parameters(), clip)
ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
# Update parameters with optimizers
encoderchar_optimizer.step()
encoder_optimizer.step()
decoder_optimizer.step()
return loss.data[0], ec, dc