RuntimeError : Gradients are not CUDA tensors

I am getting the following error while experimenting with a simple lstm layer. The forward propagation is fine but while computing loss.backward() I am getting the following error.

raise RuntimeError(‘Gradients aren’t CUDA tensors’)
RuntimeError: Gradients aren’t CUDA tensors

any suggestions about why I am doing wrong?

the gradients are not CUDA tensors :slight_smile:

Can you post your simple script to see what you are doing wrong?

I’m encountering the same error too. Followed is my train() function.

def train(input_batch, input_batch_length, target_batch,target_batch_length, batch_size):

# Zero gradients of both optimizers
encoderchar_optimizer.zero_grad()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

encoder_input = Variable(torch.FloatTensor(len(input_batch),batch_size,500))

for ix , w in enumerate(input_batch): 
    w = w.contiguous().view(15,batch_size)
    reshaped_input_length = [ x[ix] for x in input_batch_length] # [15 ,.. 30 times] * 128
    if USE_CUDA: 
        w = w.cuda()
        #reshaped_input_length =  Variable(torch.LongTensor(reshaped_input_length )).cuda()
    hidden_all , output = encoderchar(w,reshaped_input_length)
    encoder_input[ix] = output.transpose(0,1).contiguous().view(batch_size,-1)
    if USE_CUDA: 
        encoder_input = encoder_input.cuda()
  
temporary_target_batch_length = [15] * batch_size

#if USE_CUDA: 
    #target_batch_length =  Variable(torch.LongTensor(target_batch_length )).cuda()

encoder_hidden_all, encoder_output = encoder(encoder_input, target_batch_length)
decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
decoder_hidden = encoder_output

max_target_length = max(temporary_target_batch_length)
all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))

# Move new Variables to CUDA
if USE_CUDA:
    decoder_input = decoder_input.cuda()
    all_decoder_outputs = all_decoder_outputs.cuda()
    target_batch =  target_batch.cuda()
    ##Added by Satish 
    encoder_hidden_all  = encoder_hidden_all.cuda()
    encoder_output = encoder_output.cuda()
    decoder_hidden = decoder_hidden.cuda()
# Run through decoder one time step at a time
for t in range(max_target_length):
    decoder_output, decoder_hidden, decoder_attn = decoder(
        decoder_input, decoder_hidden, encoder_hidden_all
    )

    all_decoder_outputs[t] = decoder_output
    decoder_input = target_batch[t] # Next input is current target
    if USE_CUDA:
        decoder_input = decoder_input.cuda()

if USE_CUDA:
    all_decoder_outputs = all_decoder_outputs.cuda()

Loss calculation and backpropagation

loss = masked_cross_entropy(
    all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
    target_batch.transpose(0, 1).contiguous(), # -> batch x seq
    target_batch_length
)
loss.backward()

# Clip gradient norms
ecc = torch.nn.utils.clip_grad_norm(encoderchar.parameters(), clip)
ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

# Update parameters with optimizers
encoderchar_optimizer.step()
encoder_optimizer.step()
decoder_optimizer.step()

return loss.data[0], ec, dc

any inputs what I’m doing wrong?