One of the variables needed for gradient computation has been modified by an inplace operation, but I haven't used an inplace operation

Hi there! I ran into a problem while building a policy gradient example. My value estimator is a simple DNN. However, after making a small change to this DNN, I hit the error below. I don't think I am doing any inplace operation. Here is my code. Thank you in advance!

import itertools
from collections import namedtuple

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as f
from tqdm import tqdm


class PolicyEstimator():
    def __init__(self, state: torch.tensor, env: gym.Env, learning_rate: float = 0.01, device=torch.device('cpu'), name: str = "PolicyEstimator"):
        self.state = state
        self.env = env
        self.learning_rate = learning_rate
        self.trans_mu = nn.Linear(state.shape[0], 16, bias=True, device=device)
        self.trans_sigma = nn.Linear(state.shape[0], 16, bias=True, device=device)
        self.model = nn.Sequential(self.trans_mu, self.trans_sigma)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def predict(self, state: torch.tensor):
        if type(state) is np.ndarray: state = torch.tensor(state.astype(np.float32))
        self.state = state
        self.mu = torch.sigmoid(torch.sum(self.trans_mu(self.state)))
        self.sigma = f.softplus(torch.sum(self.trans_sigma(self.state))) + 1e-5
        self.normal_dist = torch.distributions.Normal(loc=self.mu, scale=self.sigma)
        self.action = self.normal_dist.sample(sample_shape=(1, ))
        self.action = torch.clip(self.action, min=self.env.action_space.low[0], max=self.env.action_space.high[0])
        return self.action

    def loss(self, state: torch.tensor, target: torch.tensor, action: torch.tensor):
        if type(state) is np.ndarray: state = torch.tensor(state.astype(np.float32))
        self.state = state
        self.mu = torch.sigmoid(torch.sum(self.trans_mu(self.state)))
        self.sigma = f.softplus(torch.sum(self.trans_sigma(self.state))) + 1e-5
        self.normal_dist = torch.distributions.Normal(loc=self.mu, scale=self.sigma)
        loss = -self.normal_dist.log_prob(action) * target - 1e-1 * self.normal_dist.entropy()
        return loss

    def update(self, state: torch.tensor, target: torch.tensor, action: torch.tensor):
        self.optimizer.zero_grad()
        loss = self.loss(state, target, action)
        loss.backward()
        self.optimizer.step()
        return loss


class ValueEstimator():
    def __init__(self, state: torch.tensor, env: gym.Env, learning_rate: float = 0.01, device=torch.device('cpu'), name: str = "value_estimater"):
        self.state = state
        self.env = env
        self.learning_rate = learning_rate
        self.classifier = nn.Sequential(nn.Linear(state.shape[0], 32, bias=True, device=device), nn.Sigmoid(), nn.Linear(32, 1, bias=True, device=device),nn.ReLU(inplace=False))
        # When I replace the line above with the following one, everything is fine and there is no error.
        # self.classifier = nn.Sequential(nn.Linear(state.shape[0], 32, bias=True, device=device), nn.ReLU())
        self.optimizer = torch.optim.Adam(self.classifier.parameters(), lr=self.learning_rate)

    def predict(self, state: torch.tensor):
        if type(state) is np.ndarray: state = torch.tensor(state.astype(np.float32))
        value_estimate = torch.sum(self.classifier(state))
        return value_estimate

    def loss(self, state: torch.tensor, target: torch.tensor):
        value_estimate = self.predict(state)
        loss_fn = nn.MSELoss()
        loss = loss_fn(value_estimate, target)
        return loss

    def update(self, state: torch.tensor, target: torch.tensor):
        self.optimizer.zero_grad()
        loss = self.loss(state, target)
        loss.backward(retain_graph=True)
        self.optimizer.step()
        return loss


EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])


def actor_critic(env: gym.Env, estimator_policy: PolicyEstimator, estimator_value: ValueEstimator, num_episodes: int, discount_factor: float = 1.0):
    Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state', 'done'])
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))
    for i_episode in range(num_episodes):
        state = env.reset()
        episode = []
        for t in tqdm(itertools.count()):
            env.render()

            action = estimator_policy.predict(state)
            next_state, reward, done, _ = env.step(action)
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            episode.append(Transition(state=state, action=action, reward=reward, next_state=next_state, done=done))

            # calculate TD Target

            value_next = estimator_value.predict(next_state)
            td_target = reward + value_next * discount_factor
            td_error = td_target - estimator_value.predict(state)

            # update the value estimator

            estimator_value.update(state, td_target)

            # update the policy estimator
            # using the td_error as our advantage estimate

            estimator_policy.update(state, td_error, action)

            # print out which step we are on.
            print(f"\rstep{t} @ Episode {i_episode+1}/{num_episodes} ({stats.episode_rewards[i_episode - 1]})", end='')

            if done: break

            state = next_state

    return stats


if __name__ == "__main__":
    torch.autograd.set_detect_anomaly(True)
    env = gym.envs.make("MountainCarContinuous-v0")
    state = torch.tensor(env.observation_space.sample())
    pe = PolicyEstimator(state, env, learning_rate=0.001)
    ve = ValueEstimator(state, env, learning_rate=0.1)
    stats = actor_critic(env, pe, ve, 50, discount_factor=0.99)

The error output is:

(hw) ➜ rllearning python -u "/Users/hehaoyuan/rllearning/policyGradient.py"
0it [00:00, ?it/s]/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/autograd/__init__.py:147: UserWarning: Error detected in MmBackward. Traceback of forward call that caused the error:
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 218, in <module>
    stats = actor_critic(env, pe, ve, 50, discount_factor=0.99)
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 191, in actor_critic
    td_error = td_target - estimator_value.predict(state)
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 152, in predict
    value_estimate = torch.sum(self.classifier(state))
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 96, in forward
    return F.linear(input, self.weight, self.bias)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/functional.py", line 1847, in linear
    return torch._C._nn.linear(input, weight, bias)
 (Triggered internally at …/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
  Variable._execution_engine.run_backward(
0it [00:00, ?it/s]
Traceback (most recent call last):
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 218, in <module>
    stats = actor_critic(env, pe, ve, 50, discount_factor=0.99)
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 200, in actor_critic
    estimator_policy.update(state, td_error, action)
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 135, in update
    loss.backward()
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [32, 1]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Inplace operations use this syntax: a += b, where the operation could also be a subtraction, multiplication, etc.
In your code you are using it e.g. here:

stats.episode_rewards[i_episode] += reward

which could cause the issue.
I would recommend replacing these inplace ops with their out-of-place versions.
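
For illustration, here is a minimal, self-contained sketch (independent of your training loop; the tensors are made up) that reproduces this error with an in-place op on a tensor that autograd saved for the backward pass, and the out-of-place version that works:

import torch

x = torch.ones(3, requires_grad=True)

# sigmoid saves its output for the backward pass
y = torch.sigmoid(x)
y += 1                  # inplace op bumps y's version counter
try:
    y.sum().backward()  # RuntimeError: ... modified by an inplace operation
except RuntimeError as e:
    print(e)

# out-of-place version leaves the saved tensor untouched
y = torch.sigmoid(x)
y = y + 1
y.sum().backward()      # works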

Thank you for your reply.
But when I replace this line with

stats.episode_rewards[i_episode] = stats.episode_rewards[i_episode] + reward

I still get the same error, so I think this line might not be the real cause of the error.
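
To sanity-check that, here is a small standalone sketch (not the training loop above; the names are only illustrative) suggesting that an in-place update of a plain NumPy buffer is invisible to autograd and does not trigger this error:

import numpy as np
import torch

# the NumPy stats buffer lives completely outside the autograd graph
episode_rewards = np.zeros(5)

w = torch.ones(3, requires_grad=True)
loss = torch.sigmoid(w).sum()   # sigmoid saves a tensor for the backward pass

episode_rewards[0] += 1.0       # in-place update of the NumPy array
loss.backward()                 # no error: autograd never saved episode_rewards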