Beam Search: RuntimeError: element 0 of variables does not require grad

I am trying to implement a differentiable version of beam search and am running into the error:
RuntimeError: element 0 of variables does not require grad

I am not sure what am I doing wrong here.

def sample_beam_differentiable(self, netW, input, hidden_state, opt={}):
        beam_size = opt.get('beam_size', 10)
        batch_size = input.size(1)

        #assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. can be dealt with in future if needed'
        seq_all = Variable(torch.LongTensor(self.seq_length, batch_size, beam_size).zero_().cuda())
        seq = Variable(torch.LongTensor(self.seq_length, batch_size).zero_().cuda())
        seqLogprobs = Variable(torch.FloatTensor(self.seq_length, batch_size).cuda())
        # lets process every image independently for now, for simplicity

        self.done_beams = [[] for _ in range(batch_size)]
        for k in range(batch_size):
            # copy the hidden state for beam_size time.
            state = []
            for state_tmp in hidden_state:
                state.append(state_tmp[:,k,:].view(1,1,-1).expand(1,beam_size, self.nhid).clone())

            state = tuple(state)

            beam_seq = Variable(torch.LongTensor(self.seq_length, beam_size).zero_().cuda())
            beam_seq_logprobs = Variable(torch.FloatTensor(self.seq_length, beam_size).zero_().cuda())
            beam_logprobs_sum = Variable(torch.zeros(beam_size).cuda()) # running sum of logprobs for each beam
            for t in range(self.seq_length + 1):
                if t == 0: # input <bos>
                    it =, beam_size).fill_(self.vocab_size)
                    xt = netW(Variable(it, requires_grad=False))
                    """perform a beam merge. that is,
                    for every previous beam we now many new possibilities to branch out
                    we need to resort our beams to maintain the loop invariant of keeping
                    the top beam_size most likely sequences."""
                    logprobsf = logprobs.float() # lets go to CPU for more efficiency in indexing operations
                    ys,ix = torch.sort(logprobsf,1,True) # sorted array of logprobs along each previous beam (last true = descending)
                    candidates = []
                    cols = min(beam_size, ys.size(1))
                    rows = beam_size
                    if t == 1:  # at first time step only the first beam is active
                        rows = 1
                    for cc in range(cols): # for each column (word, essentially)
                        for qq in range(rows): # for each beam expansion
                            # compute logprob of expanding beam q with word in (sorted) position c
                            local_logprob = ys[qq,cc]
                            if (beam_seq[t-2, qq] == self.vocab_size).cpu().data.numpy():

                            candidate_logprob = beam_logprobs_sum[qq] + local_logprob
                            candidates.append({'c'[qq,cc], 'q':qq, 'p'[0], 'r'[0]})

                    candidates = sorted(candidates, key=lambda x: -x['p'])

                    # construct new beams
                    new_state = [_.clone() for _ in state]
                    if t > 1:
                        # well need these as reference when we fork beams around
                        beam_seq_prev = beam_seq[:t-1].clone()
                        beam_seq_logprobs_prev = beam_seq_logprobs[:t-1].clone()
                    for vix in range(beam_size):
                        v = candidates[vix]
                        # fork beam index q into index vix
                        if t > 1:
                            beam_seq[:t-1, vix] = beam_seq_prev[:, v['q']]
                            beam_seq_logprobs[:t-1, vix] = beam_seq_logprobs_prev[:, v['q']]

                        # rearrange recurrent states
                        for state_ix in range(len(new_state)):
                            # copy over state in previous beam q to new beam at vix
                            new_state[state_ix][0, vix] = state[state_ix][0, v['q']] # dimension one is time step

                        # append new end terminal at the end of this beam
                        beam_seq[t-1, vix] = int(v['c']) # c'th word is the continuation
                        beam_seq_logprobs[t-1, vix] = float(v['r']) # the raw logprob here
                        beam_logprobs_sum[vix] = float(v['p']) # the new (sum) logprob along this beam

                        if v['c'] == self.vocab_size or t == self.seq_length:
                            # END token special case here, or we reached the end.
                            # add the beam to a set of done beams
                            self.done_beams[k].append({'seq': beam_seq[:, vix].clone(),
                                                'logps': beam_seq_logprobs[:, vix].clone(),
                                                'p': beam_logprobs_sum[vix].clone()

                    # encode as vectors
                    it = beam_seq[t-1].clone().view(1,-1)
                    xt = netW(it.cuda())

                if t >= 1:
                    state = new_state

                output, state = self.rnn(xt, state)

                output = F.dropout(output, self.d,
                decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
                logprobs = F.log_softmax(self.beta * decoded)

            self.done_beams[k] = sorted(self.done_beams[k], key=lambda x: -x['p'].cpu().data.numpy())
            seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
            seqLogprobs[:, k] = self.done_beams[k][0]['logps']
            for ii in range(beam_size):
                seq_all[:,k,ii] = self.done_beams[k][ii]['seq']

        # return the samples and their log likelihoods
        return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)