RuntimeError: element 0 of variables does not require grad


I am trying to implement the following dice loss but get a RuntimeError on loss.backward(). The error says:

RuntimeError: element 0 of variables does not require grad and does not have a grad_fn

Any help is much appreciated.

import torch
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss
from torch.autograd import Function, Variable
class DiceCoeff(Function):
    """Dice coeff for individual examples"""
    def forward(self, input, target):
        _,input = input.max(0)
        #print 'input:'+str(input.size())        
        #print 'target:'+str(target.size())
        self.save_for_backward(input, target)
        self.inter =,target.view(-1).float()) + 0.0001        
        self.union = torch.sum(input) + torch.sum(target) + 0.0001
        t = 2*self.inter.float()/self.union.float()
        return t
    # This function has only a single output, so it gets only one gradient
    def backward(self, grad_output):
        input, target = self.saved_variables
        grad_input = grad_target = None
        if self.needs_input_grad[0]:
            grad_input = grad_output * 2 * (target * self.union + self.inter) \
                         / self.union * self.union
        if self.needs_input_grad[1]:
            grad_target = None
        return grad_input, grad_target       
def dice_coeff(input, target):
    """Dice coeff for batches"""
    if input.is_cuda:
        s = Variable(torch.FloatTensor(1).cuda().zero_())
        s = Variable(torch.FloatTensor(1).zero_())

    for i, c in enumerate(zip(input, target)):
        s = s + DiceCoeff().forward(c[0], c[1])
    s = s / (i+1)
    return s
class DiceLoss(_Loss):
    def forward(self, input, target):
        return 1 - dice_coeff(F.sigmoid(input), target)

Traceback (most recent call last):
File “”, line 560, in
File “”, line 335, in main
train(train_loader, net, criterion, optimizer, epoch, train_args)
File “”, line 372, in train
File “/usr/lib64/python2.7/site-packages/torch/autograd/”, line 167, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
File “/usr/lib64/python2.7/site-packages/torch/autograd/”, line 99, in backward
variables, grad_variables, retain_graph)
RuntimeError: element 0 of variables does not require grad and does not have a grad_fn

Could you include the full script, format your code and paste the trace please? :slight_smile:

Also check that your input has requires_grad=True: Variable(..., requires_grad=True)

I tried with the following command, but got the same error.

inputs = Variable(inputs,requires_grad=True).cuda()

I have formatted the full code for the loss function and included the Traceback. I have no idea about this error. Help is highly requested. Thanks.

Could you post the part where you construct loss? It’s hard to see the issue without seeing the graph.

Following is how I create the loss. Everything works fine when I use the CrossEntropyLoss, but with the DiceLoss, I get the RuntimeError

from myloss import DiceLoss

criterion = DiceLoss().cuda()
for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs = Variable(inputs).cuda()
        labels = Variable(labels).cuda()
        loss = criterion(outputs, labels)

You are not supposed to do this. Follow the doc and use DiceCoeff.apply.

I have the same problem. Here is my code. I’m trying to implement a differentiable version of Beam Search and run into the exact same error. My code is as follows:

def sample_beam_differentiable(self, netW, input, hidden_state, opt={}):
        beam_size = opt.get('beam_size', 10)
        batch_size = input.size(1)

        #assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. can be dealt with in future if needed'
        seq_all = Variable(torch.LongTensor(self.seq_length, batch_size, beam_size).zero_().cuda())
        seq = Variable(torch.LongTensor(self.seq_length, batch_size).zero_().cuda())
        seqLogprobs = Variable(torch.FloatTensor(self.seq_length, batch_size).cuda())
        # lets process every image independently for now, for simplicity

        self.done_beams = [[] for _ in range(batch_size)]
        for k in range(batch_size):
            # copy the hidden state for beam_size time.
            state = []
            for state_tmp in hidden_state:
                state.append(state_tmp[:,k,:].view(1,1,-1).expand(1,beam_size, self.nhid).clone())

            state = tuple(state)

            beam_seq = Variable(torch.LongTensor(self.seq_length, beam_size).zero_().cuda())
            beam_seq_logprobs = Variable(torch.FloatTensor(self.seq_length, beam_size).zero_().cuda())
            beam_logprobs_sum = Variable(torch.zeros(beam_size).cuda()) # running sum of logprobs for each beam
            for t in range(self.seq_length + 1):
                if t == 0: # input <bos>
                    it =, beam_size).fill_(self.vocab_size)
                    xt = netW(Variable(it, requires_grad=False))
                    """perform a beam merge. that is,
                    for every previous beam we now many new possibilities to branch out
                    we need to resort our beams to maintain the loop invariant of keeping
                    the top beam_size most likely sequences."""
                    logprobsf = logprobs.float() # lets go to CPU for more efficiency in indexing operations
                    ys,ix = torch.sort(logprobsf,1,True) # sorted array of logprobs along each previous beam (last true = descending)
                    candidates = []
                    cols = min(beam_size, ys.size(1))
                    rows = beam_size
                    if t == 1:  # at first time step only the first beam is active
                        rows = 1
                    for cc in range(cols): # for each column (word, essentially)
                        for qq in range(rows): # for each beam expansion
                            # compute logprob of expanding beam q with word in (sorted) position c
                            local_logprob = ys[qq,cc]
                            if (beam_seq[t-2, qq] == self.vocab_size).cpu().data.numpy():

                            candidate_logprob = beam_logprobs_sum[qq] + local_logprob
                            candidates.append({'c'[qq,cc], 'q':qq, 'p'[0], 'r'[0]})

                    candidates = sorted(candidates, key=lambda x: -x['p'])

                    # construct new beams
                    new_state = [_.clone() for _ in state]
                    if t > 1:
                        # well need these as reference when we fork beams around
                        beam_seq_prev = beam_seq[:t-1].clone()
                        beam_seq_logprobs_prev = beam_seq_logprobs[:t-1].clone()
                    for vix in range(beam_size):
                        v = candidates[vix]
                        # fork beam index q into index vix
                        if t > 1:
                            beam_seq[:t-1, vix] = beam_seq_prev[:, v['q']]
                            beam_seq_logprobs[:t-1, vix] = beam_seq_logprobs_prev[:, v['q']]

                        # rearrange recurrent states
                        for state_ix in range(len(new_state)):
                            # copy over state in previous beam q to new beam at vix
                            new_state[state_ix][0, vix] = state[state_ix][0, v['q']] # dimension one is time step

                        # append new end terminal at the end of this beam
                        beam_seq[t-1, vix] = int(v['c']) # c'th word is the continuation
                        beam_seq_logprobs[t-1, vix] = float(v['r']) # the raw logprob here
                        beam_logprobs_sum[vix] = float(v['p']) # the new (sum) logprob along this beam

                        if v['c'] == self.vocab_size or t == self.seq_length:
                            # END token special case here, or we reached the end.
                            # add the beam to a set of done beams
                            self.done_beams[k].append({'seq': beam_seq[:, vix].clone(),
                                                'logps': beam_seq_logprobs[:, vix].clone(),
                                                'p': beam_logprobs_sum[vix].clone()

                    # encode as vectors
                    it = beam_seq[t-1].clone().view(1,-1)
                    xt = netW(it.cuda())

                if t >= 1:
                    state = new_state

                output, state = self.rnn(xt, state)

                output = F.dropout(output, self.d,
                decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
                logprobs = F.log_softmax(self.beta * decoded)

            self.done_beams[k] = sorted(self.done_beams[k], key=lambda x: -x['p'].cpu().data.numpy())
            seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
            seqLogprobs[:, k] = self.done_beams[k][0]['logps']
            for ii in range(beam_size):
                seq_all[:,k,ii] = self.done_beams[k][ii]['seq']

        # return the samples and their log likelihoods
        return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)