How do I debug this? RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

I am training an RL agent on the "CartPole-v0" environment from Gym. This is my network:

import torch
import torch.nn as nn

class DiscreteNet(nn.Module):
    """For discrete action space"""
    def __init__(self, obs_size, n_actions):
        super(DiscreteNet, self).__init__()
        
        self.net = nn.Sequential(
            nn.Linear(obs_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, n_actions),
            # Use a sigmoid output so that the values for every action are between 0 and 1
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)
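
As far as I understand, Categorical renormalizes its probs argument along the last dimension, so the sigmoid outputs not summing to 1 should be fine. A quick sanity check I ran (4 and 2 are just CartPole's observation and action sizes):

# The sigmoid outputs are each in (0, 1) but do not sum to 1;
# Categorical(probs=...) renormalizes them, so sampling still works.
import torch
from torch.distributions import Categorical

test_net = DiscreteNet(4, 2)
with torch.no_grad():
    probs = test_net(torch.randn(1, 4))
print(probs, probs.sum())  # two values in (0, 1); the sum is usually not 1
dist = Categorical(probs)
print(dist.sample())       # tensor([0]) or tensor([1])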

Now I create the network and the environment:

import gym
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make("CartPole-v0")

n_obs = env.observation_space.shape[0]
n_actions = env.action_space.n

print(f"n_obs: {n_obs}, n_actions: {n_actions}")

# The neural network
net = DiscreteNet(n_obs, n_actions).to(device)

# The optimizer to optimize the network parameters
optimizer = optim.Adam(params=net.parameters(), lr=0.01)

And this is how I train it:

from collections import namedtuple

import numpy as np
from torch.distributions import Categorical

Episode = namedtuple("Episode", field_names=["reward", "steps"])
EpisodeStep = namedtuple("EpisodeStep", field_names=["obs_v", "action_v", "log_prob_v"])
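
# N_EPISODES, BATCH_SIZE and PERCENTILE are defined earlier in my notebook;
# these are placeholder values of the same shape (the exact numbers do not
# seem to matter for the error):
N_EPISODES = 50
BATCH_SIZE = 16
PERCENTILE = 70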

for batch_iter_no in range(N_EPISODES * BATCH_SIZE):
    
    # New episode
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    
    # Obtain a batch of episodes
    while True:
        obs_v = torch.FloatTensor([obs]).to(device)
        act_v = net(obs_v).to(device)
        distribution = Categorical(act_v)
        action_v = distribution.sample().to(device)
        log_prob_v = distribution.log_prob(action_v).to(device)
        next_obs, reward, is_done, _ = env.step(action_v.item())
        episode_reward += reward
        episode_steps.append(EpisodeStep(obs_v=obs_v, action_v=action_v, log_prob_v=log_prob_v))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == BATCH_SIZE:
                break
        obs = next_obs    
        
    # Filter the batch for elite episodes
    rewards = [e.reward for e in batch]
    reward_bound = np.percentile(rewards, PERCENTILE)
    reward_mean = float(np.mean(rewards))

    for episode_iter_no, e in enumerate(batch):
        if e.reward < reward_bound:
            continue
        
        # Train the NN on the elite episode
        cumulated_loss = 0
        episode_len = 0
        for s in e.steps:
            
            # Loss is: - log_prob 
            loss_v = (- s.log_prob_v).to(device)
            
            # Logging variables
            cumulated_loss += loss_v.item()
            episode_len += 1
            
            # For some reason this next line is needed
            #loss_v.requires_grad = True

            optimizer.zero_grad()                          
            loss_v.backward()
            optimizer.step()
        
        # Print the logging info after each episode
        print(f"{batch_iter_no}, {episode_iter_no}: avg_loss={cumulated_loss/episode_len:.3f},"
              f"episode_reward={e.reward}, episode_boundary={reward_bound}")

However, this throws the error below. I have looked at other questions with this error, but I do not quite understand the error message itself. How do I know which variables are needed for computing the gradients in my code, and which one is causing this error?

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-216-320c323f8526> in <module>
     55 
     56             optimizer.zero_grad()
---> 57             loss_v.backward()
     58             optimizer.step()
     59 

~/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    219                 retain_graph=retain_graph,
    220                 create_graph=create_graph)
--> 221         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    222 
    223     def register_hook(self, hook):

~/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    128         retain_graph = create_graph
    129 
--> 130     Variable._execution_engine.run_backward(
    131         tensors, grad_tensors_, retain_graph, create_graph,
    132         allow_unreachable=True)  # allow_unreachable flag

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 2]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
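
While trying to narrow this down, I noticed that the message talks about tensor "versions". Tensors seem to carry an internal (undocumented) counter _version that every in-place operation increments. The standalone snippet below raises what looks like the same error for me. I am not sure it has the same root cause as my training loop, but the structure is similar: both graphs are built first, and optimizer.step() runs between the two backward() calls.

import torch

net = torch.nn.Linear(4, 2)
opt = torch.optim.SGD(net.parameters(), lr=0.1)

x = torch.randn(1, 4)
loss_a = net(x).sum()  # graph 1 saves the current weight tensors for backward
loss_b = net(x).sum()  # graph 2 saves the same weight tensors

print([p._version for p in net.parameters()])  # versions before the update

opt.zero_grad()
loss_a.backward()
opt.step()  # modifies the weights in place, bumping their version counters

print([p._version for p in net.parameters()])  # each version went up by one

opt.zero_grad()
loss_b.backward()  # RuntimeError: ... modified by an inplace operation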

I did enable torch.autograd.set_detect_anomaly(True) for the run that produced the error above, but it does not change the traceback. I am working in a Jupyter notebook, in case that is important.
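
For reference, this is how I am enabling it. My understanding is that anomaly mode has to be active while the graph is being built (i.e. during the forward passes in the rollout loop), not just around backward(), so maybe the placement is my problem:

# Run in the first cell of the notebook, before any forward pass,
# so the graph is recorded together with stack traces:
import torch
torch.autograd.set_detect_anomaly(True)

# There is also a context-manager form; as far as I can tell it only helps
# if it wraps the forward pass that builds the failing graph:
# with torch.autograd.detect_anomaly():
#     ...  (forward + backward)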