MADDPG RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

Hey everyone, I'm trying to implement MADDPG, but when the second agent tries to do a backward pass on the critic loss I am met with the following error:

File "", line 536, in <module>
maddppg_agents.learn(memory, i)
File “”, line 448, in learn
critic_loss.backward(retain_graph = True)
File “”, line 522, in backward
File "", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 2]], which is output 0 of AsStridedBackward0, is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!


This is a snippet of my code:
def learn(self, memory, episode):
if not memory.ready():
actor_states, states, actions, rewards, actor_new_states, states_, terminations = memory.sample_buffer()

device = self.agents[0].actor.device
# print("terms", terminations)
states = T.tensor(states, dtype=T.float32).to(device)
actions_np = np.array(actions, dtype=np.float32)  # Convert the list of NumPy arrays to a single NumPy array
actions = T.tensor(actions_np, dtype=T.float32).to(device)
rewards = T.tensor(rewards, dtype = T.float32).to(device)
states_ = T.tensor(states_, dtype = T.float32).to(device)
terminations = T.tensor(terminations).to(device)

all_agents_new_actions = []
all_agents_new_mu_actions = []
old_agents_actions = []

for agent_idx, agent in enumerate(self.agents):
  #estimate action values for the next state according to actor network
  new_states = T.tensor(actor_new_states[agent_idx], dtype = T.float32).to(device)
  new_charge_rate, new_charge_decision = agent.target_actor.forward(new_states)
  #Action for current state from actor network
  mu_states = T.tensor(actor_states[agent_idx], dtype = T.float32).to(device)
  charge_rate, charge_decision = agent.actor.forward(mu_states)
  #actions agent actually took
new_actions =[acts for acts in all_agents_new_actions], dim = 1)
mu =[acts for acts in all_agents_new_mu_actions], dim = 1)
old_actions =[acts for acts in old_agents_actions], dim = 1)

#Cost functions
for agent_idx, agent in enumerate(self.agents):
  #get the states and new action for the target critic network and flatten them.
  #critic values with target critic
  #One-step lookahead TD-error:
  critic_value_ = agent.target_critic.forward(states_, new_actions).flatten()
  #ensure that terminal states are not include in future rewards
  critic_value_[terminations[:,0]] = 0.0
  #critic values using the local critic 
  # network, how good the action actually was
  critic_value = agent.critic.forward(states, old_actions).flatten()

  mean_rewards = T.mean(rewards[:, agent_idx])
  std_rewards = T.std(rewards[:, agent_idx])
  normalized_rewards = (rewards[:, agent_idx] - mean_rewards) / (std_rewards + 1e-8)

  #target = normalized_rewards + agent.gamma*critic_value_
  target = rewards[:, agent_idx] + agent.gamma*critic_value_
  #calculate the loss of the current critic value 
  critic_loss = F.mse_loss(target, critic_value)

  # print("critic_loss: ", critic_loss)
  self.writer.add_scalar(f"EV_{agent_idx}/Loss/Critic", critic_loss, episode)
  critic_loss.backward(retain_graph = True)

  actor_loss = agent.critic.forward(states, mu).flatten()
  actor_loss = -T.mean(actor_loss)
  self.writer.add_scalar(f"EV_{agent_idx}/Loss/Actor", actor_loss, episode)
  actor_loss.backward(retain_graph = True)