I am training a multi-agent actor-critic model with PyTorch. The learning process is as follows (`T` is `torch`, `np` is `numpy`, and `F` is `torch.nn.functional`):
def learn(self, memory, batch_size):
    # Skip learning until the replay memory holds at least one full batch.
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose the batch: a list of Transitions becomes one Transition of batched fields.
    batch = Transition(*zip(*transitions))
    state = np.array(batch.state)
    actions = np.array(batch.actions)
    stateNext = np.array(batch.next_state)
    rewards = np.array(batch.rewards)
    # Build the joint critic inputs by collecting each agent's observation
    # slice and the matching actor / target-actor outputs.
    for agent in range(self.alg_agent_num):
        if agent == 0:
            # Agent 0 (the HAP) owns the first HAP_obs_num entries of the state.
            agent_obs = state[:, :self.HAP_obs_num]
            agent_obsNext = stateNext[:, :self.HAP_obs_num]
            critic_state = list(agent_obs)
            critic_stateNext = list(agent_obsNext)
            agent_obsNext_tensor = T.tensor(agent_obsNext, dtype=T.float, requires_grad=False).to(self.device)
            agent_act_target = self.alg_agents[agent].target_actor.forward(agent_obsNext_tensor)
            actions_next = list(agent_act_target.cpu().detach().numpy())
            agent_obs_tensor = T.tensor(agent_obs, dtype=T.float, requires_grad=False).to(self.device)
            actor_action = self.alg_agents[agent].actor.forward(agent_obs_tensor)
            actions_main = list(actor_action.cpu().detach().numpy())
        else:
            # The remaining agents (UAVs) each own a UAV_obs_num-wide slice
            # after the HAP observation.
            agent_obs = state[:, self.HAP_obs_num + (agent - 1) * self.UAV_obs_num: self.HAP_obs_num + agent * self.UAV_obs_num]
            agent_obsNext = stateNext[:, self.HAP_obs_num + (agent - 1) * self.UAV_obs_num: self.HAP_obs_num + agent * self.UAV_obs_num]
            agent_obsNext_tensor = T.tensor(agent_obsNext, dtype=T.float, requires_grad=False).to(self.device)
            agent_act_target = self.alg_agents[agent].target_actor.forward(agent_obsNext_tensor)
            agent_obs_tensor = T.tensor(agent_obs, dtype=T.float, requires_grad=False).to(self.device)
            actor_action = self.alg_agents[agent].actor.forward(agent_obs_tensor)
            for b in range(batch_size):
                # Append the UAV position (first two observation entries) and this agent's
                # actions to the joint inputs: existing full IoTD positions + UAV position.
                critic_state[b] = np.append(critic_state[b], agent_obs[b, :2])
                critic_stateNext[b] = np.append(critic_stateNext[b], agent_obsNext[b, :2])
                actions_next[b] = np.append(actions_next[b], agent_act_target[b].cpu().detach().numpy())
                actions_main[b] = np.append(actions_main[b], actor_action[b].cpu().detach().numpy())
    critic_state = T.tensor(critic_state, dtype=T.float, requires_grad=False).to(self.device)
    critic_stateNext = T.tensor(critic_stateNext, dtype=T.float, requires_grad=False).to(self.device)
    actions = T.tensor(actions, dtype=T.float, requires_grad=False).to(self.device)
    actions_next = T.tensor(actions_next, dtype=T.float, requires_grad=False).to(self.device)
    actions_main = T.tensor(actions_main, dtype=T.float, requires_grad=False).to(self.device)
    sum_policy_loss = 0.
    for agent in range(self.alg_agent_num):
        agent_reward = rewards[:, agent]
        agent_reward = T.tensor(agent_reward, dtype=T.float, requires_grad=False).to(self.device)
        agent_reward = T.reshape(agent_reward, (batch_size, 1))

        # Critic update: regress Q(s, a) towards the TD target built from the target networks.
        critic_value = self.alg_agents[agent].critic.forward(critic_state, actions)
        critic_valueNext = self.alg_agents[agent].target_critic.forward(critic_stateNext, actions_next)
        target = agent_reward + self.alg_agents[agent].gamma * critic_valueNext
        target = target.view(self.alg_agents[agent].batch_size, 1)
        self.alg_agents[agent].critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()

        # Actor update: maximise the critic's value of the current policy's actions.
        self.alg_agents[agent].actor.optimizer.zero_grad()
        actor_loss = -self.alg_agents[agent].critic.forward(critic_state, actions_main).flatten()
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.alg_agents[agent].actor.optimizer.step()
        self.alg_agents[agent].critic.optimizer.step()
        sum_policy_loss += actor_loss
    # Soft-update every agent's target networks.
    for agent in range(self.alg_agent_num):
        self.alg_agents[agent].update_network_parameters(tau=self.alg_agents[agent].tau)
    return sum_policy_loss
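For completeness, `Transition` and `memory` follow the usual replay-buffer pattern. Simplified, they look roughly like this (the class name `ReplayMemory` here is just illustrative; the field names match the attributes accessed in `learn()`):

import random
from collections import namedtuple, deque

# Field names match batch.state, batch.actions, batch.next_state, batch.rewards above.
Transition = namedtuple('Transition', ('state', 'actions', 'next_state', 'rewards'))

class ReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        # Store one transition per environment step.
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)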
However, the weights of the actor model are not being updated. Could you help me figure out why?
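For reference, this is the sanity check I use to confirm that the actor weights stay unchanged after a call to `learn()` (a minimal sketch; it runs inside the same class, and `self.alg_agents[0].actor` is one of the actor networks from the code above):

# Snapshot the first actor's parameters, run one learning step, and compare.
# Every printed delta is 0.0, i.e. no update happened.
before = [p.detach().clone() for p in self.alg_agents[0].actor.parameters()]
self.learn(memory, batch_size)
after = [p.detach() for p in self.alg_agents[0].actor.parameters()]
for b, a in zip(before, after):
    print((a - b).abs().max().item())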
Thanks in advance.