I tried to implement multi-env DDPG in PyTorch and somehow ended up implementing it with a stochastic policy instead. And it worked. Here is the code:
[main loop from DDPG file]
for epoch in trange(1000):
    for step in range(1000):
        # collect one transition from every parallel environment
        actions = online_policy_network_cpu(states['observation'], states['desired_goal'], states['achieved_goal'])
        next_states, rewards, dones, info = envs.step(actions.data.numpy())
        replay_buffer.put(states, actions, rewards, next_states, dones)
        for i in range(n_envs):
            if dones[i]:
                envs.reset_env(env_index=i)
        states = next_states

        if len(replay_buffer) > batch_size:
            # Training
            # for i in range(10 if len(replay_buffer) < 100000 else 100):
            obs, des_goals, ach_goals, actions, rewards, next_obs, next_des_goals, next_ach_goals, dones = \
                replay_buffer.sample(batch_size)

            # critic update: fit Q(s, a) to the one-step TD target
            q_values = online_value_network(obs, des_goals, ach_goals, actions)
            next_actions = target_policy_network(next_obs, next_des_goals, next_ach_goals)
            q_n_values = target_value_network(next_obs, next_des_goals, next_ach_goals, next_actions)
            y = rewards + gamma * (1 - dones) * q_n_values

            value_loss = torch.mean((q_values - y) ** 2)
            writer.add_scalar("Value loss", value_loss.cpu().data.numpy(), epoch)
            value_opt.zero_grad()
            value_loss.backward()
            value_opt.step()

            # actor update: maximize Q(s, mu(s))
            policy_loss = -torch.mean(online_value_network(obs, des_goals, ach_goals,
                                                           online_policy_network_gpu(obs, des_goals, ach_goals)))
            writer.add_scalar("Policy loss", -policy_loss.cpu().data.numpy(), epoch)
            policy_loss.backward()
            policy_opt.step()
            policy_opt.zero_grad()

            # soft-update the target networks
            synchronize_value_networks_params(eps=EPS)
            synchronize_policy_networks_params(eps=EPS)
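(synchronize_value_networks_params and synchronize_policy_networks_params just do a soft Polyak update of the target networks with coefficient EPS; roughly, a simplified sketch of what each of them does:)

import torch

# Simplified sketch of the synchronize_* helpers: soft (Polyak) update of the
# target network parameters, target <- eps * online + (1 - eps) * target.
def soft_update(online_net, target_net, eps):
    with torch.no_grad():
        for online_p, target_p in zip(online_net.parameters(), target_net.parameters()):
            target_p.mul_(1.0 - eps).add_(eps * online_p)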
[from utils file with networks code]
class PolicyNetwork(nn.Module):
    def __init__(self, obs_shape, goal_shape, output_shape, action_ranges):
        super(PolicyNetwork, self).__init__()
        self.action_ranges = action_ranges
        self.layer_obs = nn.Linear(obs_shape, 200)
        self.layer_goal = nn.Linear(2 * goal_shape, 200)
        self.layer1 = nn.Linear(400, 100)
        self.layer2 = nn.Linear(100, output_shape)

    def forward(self, observation, desired_goal, achieved_goal):
        # check if observation, desired_goal and achieved_goal are torch tensors
        if not isinstance(observation, torch.Tensor):
            observation = torch.FloatTensor(observation)
            desired_goal = torch.FloatTensor(desired_goal)
            achieved_goal = torch.FloatTensor(achieved_goal)

        processed_obs = F.leaky_relu(self.layer_obs(observation))
        concat_goal = torch.cat([desired_goal, achieved_goal], dim=-1)
        processed_goal = F.leaky_relu(self.layer_goal(concat_goal))

        out = torch.cat([processed_obs, processed_goal], dim=-1)
        out = F.leaky_relu(self.layer1(out))
        mu = torch.tanh(self.layer2(out))

        # TODO: think about noise added to action to improve exploration. Do we really need this?
        action = mu + torch.randn_like(mu) * 0.1
        action = torch.clamp(action, self.action_ranges[0], self.action_ranges[1])
        return action
class ValueNetwork(nn.Module):
    def __init__(self, obs_shape, goal_shape, action_shape):
        super(ValueNetwork, self).__init__()
        self.layer_obs = nn.Linear(obs_shape, 200)
        self.layer_goal = nn.Linear(2 * goal_shape, 200)
        self.layer_action = nn.Linear(action_shape, 200)
        self.layer1 = nn.Linear(400, 200)
        self.layer2 = nn.Linear(400, 64)
        self.layer3 = nn.Linear(64, 1)

    def forward(self, observations, desired_goals, achieved_goals, actions):
        processed_obs = F.leaky_relu(self.layer_obs(observations))
        concat_goal = torch.cat([desired_goals, achieved_goals], dim=-1)
        processed_goals = F.leaky_relu(self.layer_goal(concat_goal))

        out = torch.cat([processed_obs, processed_goals], dim=-1)
        out = F.leaky_relu(self.layer1(out))

        processed_actions = F.leaky_relu(self.layer_action(actions))
        out = torch.cat([out, processed_actions], dim=-1)
        out = F.leaky_relu(self.layer2(out))
        out = F.leaky_relu(self.layer3(out))
        return out
item = namedtuple(
    "Item",
    ["obs", "des_goal", "ach_goal", "action", "reward", "next_obs", "next_des_goal", "next_ach_goal", "done"]
)
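(The replay buffer itself is just a deque of these Item tuples; the real one lives in the utils file, but a simplified sketch of how put/sample behave is below. It stores whatever it is given per environment and batches the fields on sample.)

import random
from collections import deque

import numpy as np
import torch

# Simplified sketch of the replay buffer; `item` is the namedtuple defined above.
class ReplayBuffer:
    def __init__(self, capacity=1_000_000):
        self.buffer = deque(maxlen=capacity)

    def put(self, states, actions, rewards, next_states, dones):
        # split the batched multi-env transition into one Item per environment
        for i in range(len(dones)):
            self.buffer.append(item(
                states['observation'][i], states['desired_goal'][i], states['achieved_goal'][i],
                actions[i], rewards[i],
                next_states['observation'][i], next_states['desired_goal'][i], next_states['achieved_goal'][i],
                dones[i],
            ))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        cols = [list(col) for col in zip(*batch)]
        out = []
        for col in cols:
            if isinstance(col[0], torch.Tensor):
                # tensor fields (e.g. stored actions) are stacked as they are
                out.append(torch.stack(col))
            else:
                # numpy/scalar fields are converted to FloatTensors
                out.append(torch.FloatTensor(np.array(col)))
        return tuple(out)

    def __len__(self):
        return len(self.buffer)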
But then I realized that this is not a deterministic policy gradient, so I commented out the lines with the distribution in PolicyNetwork; the forward method now looks like this:
def forward(self, observation, desired_goal, achieved_goal):
    # check if observation, desired_goal and achieved_goal are torch tensors
    if not isinstance(observation, torch.Tensor):
        observation = torch.FloatTensor(observation)
        desired_goal = torch.FloatTensor(desired_goal)
        achieved_goal = torch.FloatTensor(achieved_goal)

    processed_obs = F.leaky_relu(self.layer_obs(observation))
    concat_goal = torch.cat([desired_goal, achieved_goal], dim=-1)
    processed_goal = F.leaky_relu(self.layer_goal(concat_goal))

    out = torch.cat([processed_obs, processed_goal], dim=-1)
    out = F.leaky_relu(self.layer1(out))
    mu = torch.tanh(self.layer2(out))

    # mu = mu + torch.randn_like(mu) * 0.1
    # sigma = torch.relu(self.layer3(out))
    # distribution = torch.distributions.Normal(mu, sigma)
    # TODO: think about noise added to action to improve exploration. Do we really need this?
    # action = distribution.sample() + torch.randn_like(mu) * 0.1
    # action = torch.clamp(mu, self.action_ranges[0], self.action_ranges[1])
    return mu
And now this code throws a RuntimeError saying that retain_graph=True is required in value_loss.backward(), on the second inner-loop iteration (when step = 1). Why is that happening? Which variables are freed after the backward pass? After all, the forward pass of the policy network is certainly called again after the backward passes of the value and policy networks, so each iteration should build a fresh graph! Please help me, I have been fighting with this for two days already!
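For reference, I can reproduce the exact same message with this tiny snippet, so I understand the error is about backwarding through the same graph twice after its buffers have been freed; I just don't see where my code does that:

import torch

x = torch.randn(3, requires_grad=True)
y = (x * 2).sum()

y.backward()  # the first backward pass frees the graph's intermediate buffers
y.backward()  # RuntimeError: Trying to backward through the graph a second time
              # (... Specify retain_graph=True ...)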