MADDPG RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

Hey everyone, I'm trying to implement MADDPG, but when the second agent tries to do a backward pass on the critic loss I am met with the following error:

File "", line 536, in <module>
maddppg_agents.learn(memory, i)
File “”, line 448, in learn
critic_loss.backward(retain_graph = True)
File “”, line 522, in backward
File "", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 2]], which is output 0 of AsStridedBackward0, is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!


This is a snippet of my code:
def learn(self, memory, episode):
if not memory.ready():
actor_states, states, actions, rewards, actor_new_states, states_, terminations = memory.sample_buffer()

device = self.agents[0].actor.device
# print("terms", terminations)
states = T.tensor(states, dtype=T.float32).to(device)
actions_np = np.array(actions, dtype=np.float32)  # Convert the list of NumPy arrays to a single NumPy array
actions = T.tensor(actions_np, dtype=T.float32).to(device)
rewards = T.tensor(rewards, dtype = T.float32).to(device)
states_ = T.tensor(states_, dtype = T.float32).to(device)
terminations = T.tensor(terminations).to(device)

all_agents_new_actions = []
all_agents_new_mu_actions = []
old_agents_actions = []

for agent_idx, agent in enumerate(self.agents):
  #estimate action values for the next state according to actor network
  new_states = T.tensor(actor_new_states[agent_idx], dtype = T.float32).to(device)
  new_charge_rate, new_charge_decision = agent.target_actor.forward(new_states)
  #Action for current state from actor network
  mu_states = T.tensor(actor_states[agent_idx], dtype = T.float32).to(device)
  charge_rate, charge_decision = agent.actor.forward(mu_states)
  #actions agent actually took
new_actions =[acts for acts in all_agents_new_actions], dim = 1)
mu =[acts for acts in all_agents_new_mu_actions], dim = 1)
old_actions =[acts for acts in old_agents_actions], dim = 1)

#Cost functions
for agent_idx, agent in enumerate(self.agents):
  #get the states and new action for the target critic network and flatten them.
  #critic values with target critic
  #One-step lookahead TD-error:
  critic_value_ = agent.target_critic.forward(states_, new_actions).flatten()
  #ensure that terminal states are not include in future rewards
  critic_value_[terminations[:,0]] = 0.0
  #critic values using the local critic 
  # network, how good the action actually was
  critic_value = agent.critic.forward(states, old_actions).flatten()

  mean_rewards = T.mean(rewards[:, agent_idx])
  std_rewards = T.std(rewards[:, agent_idx])
  normalized_rewards = (rewards[:, agent_idx] - mean_rewards) / (std_rewards + 1e-8)

  #target = normalized_rewards + agent.gamma*critic_value_
  target = rewards[:, agent_idx] + agent.gamma*critic_value_
  #calculate the loss of the current critic value 
  critic_loss = F.mse_loss(target, critic_value)

  # print("critic_loss: ", critic_loss)
  self.writer.add_scalar(f"EV_{agent_idx}/Loss/Critic", critic_loss, episode)
  critic_loss.backward(retain_graph = True)

  actor_loss = agent.critic.forward(states, mu).flatten()
  actor_loss = -T.mean(actor_loss)
  self.writer.add_scalar(f"EV_{agent_idx}/Loss/Actor", actor_loss, episode)
  actor_loss.backward(retain_graph = True)