RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor [256, 3]],

I am implementing the PPO RL algorithm in PyTorch and getting the error above. However, I am unable to find out why this error is occurring.

Below is my code.

#main loop for generating data into replaybuffer.

# Rollout loop: steps the environment with the current policy and assembles
# per-step transitions. NOTE(review): this snippet is reproduced as posted;
# several statements are truncated and the list appends are not shown.
while count<max_timesteps-1:
        episode_length += 1
        # When the previous episode ended, the LSTM state tensors are re-created as zeros.
        if done:
            cx = Variable(torch.zeros(params.lstm_layers, 1, params.lstm_size))
            hx = Variable(torch.zeros(params.lstm_layers, 1, params.lstm_size))
            # NOTE(review): the next two assignments are cut off in the post
            # (presumably the non-terminal branch that carries cx/hx over — confirm).
            cx = Variable(
            hx = Variable(

        # Per-rollout buffers; the code that fills them is not visible in this snippet.
        values = []
        log_probs = []
        rewards = []
        entropies = []
        adv = []
        st = []
        rew = []
        act = []
        while count<max_timesteps-1:
            # Add a leading batch dimension before feeding the state to the model.
            St = (Variable(state.unsqueeze(0)))
            value, action_values = model(St)
            # Logits are shifted by their max before the softmax
            # (presumably for numerical stability — confirm).
            prob = F.softmax(action_values - max(action_values), dim = -1)
            log_prob = torch.log(prob).reshape(-1,)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            # Sample an action from the categorical policy and look up its log-probability.
            m = categorical.Categorical(prob)
            action = m.sample().reshape(-1,)
            log_prob_a = log_prob.gather(0, Variable(action))
            state, reward, done = env.step(action)
            # Clip the reward to the range [-1, 1].
            reward = max(min(reward, 1), -1)
            count +=1
            if done:
                episode_length = 0
                state = env.reset()
            print("rank ",rank," action:",action, "reward ",reward)

            # NOTE(review): the body of this `if` is missing from the post.
            if done:

        # Bootstrap return R starts at zero; when the rollout did not end in a
        # terminal state it is taken from the critic (assignment truncated in the post).
        R = torch.zeros(1, 1)
        if not done:
            St = Variable(state.unsqueeze(0))
            value, _ = model(Variable(St))
            R =
        R = Variable(R)
        gae = torch.zeros(1, 1)
        # Walk the rollout backwards, accumulating the discounted return R and
        # the generalized-advantage term gae.
        for i in reversed(range(len(rewards))):
            R = params.gamma * R + rewards[i]
            # advantage = R - values[i]
            # The TD residual uses .data, so no gradient flows through the value estimates here.
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            gae = gae * params.gamma * params.tau + TD

        # NOTE(review): log_probs[i] and values[i] are stored as-is. If they still
        # carry autograd history from the rollout, a later backward() through them
        # will traverse this old graph — consider storing detached copies (confirm).
        for i in reversed(range(len(rewards))):
            transition = [st[i], adv[i], rew[i], act[i], log_probs[i], values[i]]

#ActorCritic Class

class ActorCritic(torch.nn.Module):
    """Actor-critic network: a two-layer ELU trunk with separate value and policy heads.

    The module also owns an LSTM, but ``forward`` does not call it; the
    recurrent encoder is exposed separately through ``get_state``.

    ``params`` must provide ``num_inputs``, ``action_dim``, ``hidden_size``
    and ``lstm_layers``.
    """

    def __init__(self, params):
        super().__init__()

        # Keep the configuration values available on the module.
        self.num_inputs = params.num_inputs
        self.action_space = params.action_dim
        self.hidden_size = params.hidden_size

        # Recurrent encoder with 8 hidden units; its width matches the input size of fc1.
        self.lstm = nn.LSTM(
            params.num_inputs,
            8,
            num_layers=params.lstm_layers,
        )

        # Shared trunk.
        self.fc1 = nn.Linear(8, 256)
        self.fc2 = nn.Linear(256, 256)

        # Output heads: scalar state value and one logit per action.
        self.critic_linear = nn.Linear(256, 1)
        self.actor_linear = nn.Linear(256, self.action_space)

    def forward(self, inputs):
        """Return ``(critic, actor)``: the value estimate and the action logits."""
        trunk = F.elu(self.fc2(F.elu(self.fc1(inputs))))
        state_value = self.critic_linear(trunk)
        action_logits = self.actor_linear(trunk)
        return state_value, action_logits

    def get_state(self, inputs):
        """Run the LSTM on ``(sequence, (hx, cx))`` and return ``(output, (hx, cx))``."""
        sequence, (h_prev, c_prev) = inputs
        encoded, (h_next, c_next) = self.lstm(sequence, (h_prev, c_prev))
        return encoded, (h_next, c_next)

#code for training part

# PPO update step. NOTE(review): the snippet is reproduced as posted; the replay
# buffer's name is missing from the first and third lines.
# Draw random indices, 20% of the buffer length in number.
ind = np.random.randint(0, len(, size=int(0.2*len(
                    for i in ind:
                        state, adv, reward, action, old_log_prob, value =[i]
                        # Re-evaluate the stored state with the current network.
                        V, act_val = model(Variable(state))
                        prob = F.softmax(act_val - max(act_val), dim = -1)
                        log_prob = torch.log(prob).reshape(-1,)
                        entropy = -(log_prob * prob).sum(1, keepdim=True)
                        action_log_prob = log_prob.gather(0, Variable(action))
                        # Probability ratio between the current policy and the stored log-probability.
                        # NOTE(review): old_log_prob comes from the buffer; if it was stored without
                        # .detach(), backward() below also runs through the rollout-time graph, whose
                        # saved weights are changed in place by any optimizer step. This is a likely
                        # cause of the reported "modified by an inplace operation" error — confirm.
                        ratio = torch.exp(action_log_prob - old_log_prob)
                        # Clipped surrogate objective; the ratio is clamped to [0.8, 1.2].
                        surr1 = ratio * adv
                        surr2 = torch.clamp(ratio, 1.0 - 0.2,1.0 + 0.2) * adv
                        actor_loss = -torch.min(surr1, surr2).mean()
                        value_loss = 0.5 * (reward - V).pow(2).mean()
                        # entr_loss is computed but not used by either backward() call below.
                        entr_loss = 0.01 * entropy.mean()
                        actor_loss.mean().backward(retain_graph = True)
                        # value_loss already includes a 0.5 factor, so it is scaled by 0.25 overall here.
                        (0.5 * value_loss).mean().backward(retain_graph = True)

                        # Clip the total gradient norm to 40.
                        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

Full traceback:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor [256, 3]], which is output 0 of TBackward, is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

The traceback points to a DoubleTensor of shape [256, 3]; the actor_linear fully connected layer is of size 256 x 3.

I am unable to figure out what is wrong with the code.

Try to add .clone() operations to tensors to isolate the offending line of code.
I cannot find anything suspicious by skimming through your code.

PS: Variables are deprecated since PyTorch 0.4, so you can use tensors now :wink:


I tried cloning everything and removed Variable, but I am still getting this error. I have encountered this error before, but in that case I was able to identify the variable that was updated in place.

But I am unable to get which variable is getting updated inplace here.

Please help