Can not understand what variables are freed and why retain_graph is required

nik31096 · April 7, 2020, 7:29am

I tried to implement multi-env DDPG in PyTorch and (I don’t know how) implemented it with a stochastic policy. And it worked. Here is the code:
[main loop from DDPG file]


for epoch in trange(1000):
    for step in range(1000):
        actions = online_policy_network_cpu(states['observation'], states['desired_goal'], states['achieved_goal'])
        next_states, rewards, dones, info = envs.step(actions.data.numpy())
        replay_buffer.put(states, actions, rewards, next_states, dones)
        for i in range(n_envs):
            if dones[i]:
                envs.reset_env(env_index=i)
        states = next_states

        if len(replay_buffer) > batch_size:
            # Training
            # for i in range(10 if len(replay_buffer) < 100000 else 100):
            obs, des_goals, ach_goals, actions, rewards, next_obs, next_des_goals, next_ach_goals, dones = \
                replay_buffer.sample(batch_size)
            q_values = online_value_network(obs, des_goals, ach_goals, actions)
            next_actions = target_policy_network(next_obs, next_des_goals, next_ach_goals)
            q_n_values = target_value_network(next_obs, next_des_goals, next_ach_goals, next_actions)

            y = rewards + gamma*(1 - dones)*q_n_values

            value_loss = torch.mean((q_values - y) ** 2)
            writer.add_scalar("Value loss", value_loss.cpu().data.numpy(), epoch)
            value_opt.zero_grad()
            value_loss.backward()
            value_opt.step()

            policy_loss = -torch.mean(online_value_network(obs, des_goals, ach_goals,
                                                           online_policy_network_gpu(obs, des_goals, ach_goals)))
            writer.add_scalar("Policy loss", -policy_loss.cpu().data.numpy(), epoch)

            policy_loss.backward()
            policy_opt.step()
            policy_opt.zero_grad()

            synchronize_value_networks_params(eps=EPS)
            synchronize_policy_networks_params(eps=EPS)

[from utils file with networks code]

class PolicyNetwork(nn.Module):
    def __init__(self, obs_shape, goal_shape, output_shape, action_ranges):
        super(PolicyNetwork, self).__init__()
        self.action_ranges = action_ranges
        self.layer_obs = nn.Linear(obs_shape, 200)
        self.layer_goal = nn.Linear(2 * goal_shape, 200)
        self.layer1 = nn.Linear(400, 100)
        self.layer2 = nn.Linear(100, output_shape)

    def forward(self, observation, desired_goal, achieved_goal):
        # check if observation, desired_goal and achieved_goal are torch tensors
        if not isinstance(observation, torch.Tensor):
            observation = torch.FloatTensor(observation)
            desired_goal = torch.FloatTensor(desired_goal)
            achieved_goal = torch.FloatTensor(achieved_goal)

        processed_obs = F.leaky_relu(self.layer_obs(observation))
        concat_goal = torch.cat([desired_goal, achieved_goal], dim=-1)
        processed_goal = F.leaky_relu(self.layer_goal(concat_goal))

        out = torch.cat([processed_obs, processed_goal], dim=-1)
        out = F.leaky_relu(self.layer1(out))

        mu = torch.tanh(self.layer2(out))

        # TODO: think about noise added to action to improve exploration. Do we really need this?
        action = mu + torch.randn_like(mu) * 0.1
        action = torch.clamp(action, self.action_ranges[0], self.action_ranges[1])

        return action

class ValueNetwork(nn.Module):
    def __init__(self, obs_shape, goal_shape, action_shape):
        super(ValueNetwork, self).__init__()
        self.layer_obs = nn.Linear(obs_shape, 200)
        self.layer_goal = nn.Linear(2 * goal_shape, 200)
        self.layer_action = nn.Linear(action_shape, 200)
        self.layer1 = nn.Linear(400, 200)
        self.layer2 = nn.Linear(400, 64)
        self.layer3 = nn.Linear(64, 1)

    def forward(self, observations, desired_goals, achieved_goals, actions):
        processed_obs = F.leaky_relu(self.layer_obs(observations))
        concat_goal = torch.cat([desired_goals, achieved_goals], dim=-1)
        processed_goals = F.leaky_relu(self.layer_goal(concat_goal))
        out = torch.cat([processed_obs, processed_goals], dim=-1)
        out = F.leaky_relu(self.layer1(out))

        processed_actions = F.leaky_relu(self.layer_action(actions))
        out = torch.cat([out, processed_actions], dim=-1)
        out = F.leaky_relu(self.layer2(out))
        out = F.leaky_relu(self.layer3(out))

        return out


item = namedtuple(
    "Item",
    ["obs", "des_goal", "ach_goal", "action", "reward", "next_obs", "next_des_goal", "next_ach_goal", "done"]
)

But after I realized that it is not deterministic PG I commented out lines with distribution in PolicyNetwork, so forward method for PolicyNetwork now looks like this:

    def forward(self, observation, desired_goal, achieved_goal):
        # check if observation, desired_goal and achieved_goal are torch tensors
        if not isinstance(observation, torch.Tensor):
            observation = torch.FloatTensor(observation)
            desired_goal = torch.FloatTensor(desired_goal)
            achieved_goal = torch.FloatTensor(achieved_goal)

        processed_obs = F.leaky_relu(self.layer_obs(observation))
        concat_goal = torch.cat([desired_goal, achieved_goal], dim=-1)
        processed_goal = F.leaky_relu(self.layer_goal(concat_goal))

        out = torch.cat([processed_obs, processed_goal], dim=-1)
        out = F.leaky_relu(self.layer1(out))

        mu = torch.tanh(self.layer2(out))
        # mu = mu + torch.randn_like(mu) * 0.1
        # sigma = torch.relu(self.layer3(out))

        # distribution = torch.distributions.Normal(mu, sigma)

        # TODO: think about noise added to action to improve exploration. Do we really need this?
        #action = distribution.sample() + torch.randn_like(mu) * 0.1
        # action = torch.clamp(mu, self.action_ranges[0], self.action_ranges[1])

        return mu

And now this code throw out the RuntimeError and say that retain_graph is required in value_loss.backward() on the second inner loop iteration (when step = 1). Why that is happening? What variables are freed after the backward pass? Nevertheless, the forward pass of policy network certainly was called after backward pass of value and policy networks! Please help me, I have been fighting with that for 2 days already!

albanD · April 7, 2020, 2:43pm

Hi,

What most likely happens is that some elements from the first iteration are re-used in the second in a differentiable manner. So when computing gradients, it tries to backward through the first step again leading to the error you see.
You need to make sure that when you save Tensors and gradient should not flow back, you want to save a detached version with .detach().

nik31096 · April 7, 2020, 5:05pm

Thanks for the answer. I am trying to double check the code, and the only other place where backward is called is backward pass for policy network. But I can not understand why removing distributions from PolicyNetwork model start causing that error. Is there a way to know what variable causes that issue?

nik31096 · April 7, 2020, 6:19pm

Maybe do you have any guesses about what variables (or their gradients or something else) I should look for in the debug mode in PyCharm? I can not even imagine what causes this issue! Please help! Would be very grateful!

albanD · April 7, 2020, 6:42pm

You mentioned it happens on the second iteration of the step loop. So some Tensor in that for-loop? Not sure

nik31096 · April 7, 2020, 6:56pm

Ok, I got some updates: the first code snippet with distribution worked because I have called distributions.sample(), which does not allow gradient flow though distribution. I changed .sample() to be .rsample() (in line action = distribution.sample() + torch.randn_like(mu) * 0.1 ) and error was thrown out. So now the problem is the following code snippet requires retain_graph=True and I do not understand why:

class PolicyNetwork(nn.Module):
    def __init__(self, obs_shape, goal_shape, output_shape, action_ranges):
        super(PolicyNetwork, self).__init__()
        self.action_ranges = action_ranges
        self.layer_obs = nn.Linear(obs_shape, 200)
        self.layer_goal = nn.Linear(2 * goal_shape, 200)
        self.layer1 = nn.Linear(400, 100)
        self.layer2 = nn.Linear(100, output_shape)
        self.layer3 = nn.Linear(100, output_shape)

    def forward(self, observation, desired_goal, achieved_goal):
        # check if observation, desired_goal and achieved_goal are torch tensors
        if not isinstance(observation, torch.Tensor):
            observation = torch.FloatTensor(observation)
            desired_goal = torch.FloatTensor(desired_goal)
            achieved_goal = torch.FloatTensor(achieved_goal)

        processed_obs = F.leaky_relu(self.layer_obs(observation))
        concat_goal = torch.cat([desired_goal, achieved_goal], dim=-1)
        processed_goal = F.leaky_relu(self.layer_goal(concat_goal))

        out = torch.cat([processed_obs, processed_goal], dim=-1)
        out = F.leaky_relu(self.layer1(out))

        self.mu = torch.tanh(self.layer2(out))
        self.sigma = torch.relu(self.layer3(out))

        distribution = torch.distributions.Normal(self.mu, self.sigma)

        # TODO: think about noise added to action to improve exploration. Do we really need this?
        action = distribution.rsample() + torch.randn_like(self.mu) * 0.1
        action = torch.clamp(action, self.action_ranges[0], self.action_ranges[1])

        return action


class ValueNetwork(nn.Module):
    def __init__(self, obs_shape, goal_shape, action_shape):
        super(ValueNetwork, self).__init__()
        self.layer_obs = nn.Linear(obs_shape, 200)
        self.layer_goal = nn.Linear(2 * goal_shape, 200)
        self.layer_action = nn.Linear(action_shape, 200)
        self.layer1 = nn.Linear(400, 200)
        self.layer2 = nn.Linear(400, 64)
        self.layer3 = nn.Linear(64, 1)

    def forward(self, observations, desired_goals, achieved_goals, actions):
        processed_obs = F.leaky_relu(self.layer_obs(observations))
        concat_goal = torch.cat([desired_goals, achieved_goals], dim=-1)
        processed_goals = F.leaky_relu(self.layer_goal(concat_goal))
        out = torch.cat([processed_obs, processed_goals], dim=-1)
        out = F.leaky_relu(self.layer1(out))

        processed_actions = F.leaky_relu(self.layer_action(actions))
        out = torch.cat([out, processed_actions], dim=-1)
        out = F.leaky_relu(self.layer2(out))
        out = F.leaky_relu(self.layer3(out))

        return out


item = namedtuple(
    "Item",
    ["obs", "des_goal", "ach_goal", "action", "reward", "next_obs", "next_des_goal", "next_ach_goal", "done"]
)

obs, des_goals, ach_goals, actions, rewards, next_obs, next_des_goals, next_ach_goals, dones = \
                replay_buffer.sample(batch_size)
q_values = online_value_network(obs, des_goals, ach_goals, actions)
            next_actions = target_policy_network(next_obs, next_des_goals, next_ach_goals)
q_n_values = target_value_network(next_obs, next_des_goals, next_ach_goals, next_actions)

y = rewards + gamma*(1 - dones)*q_n_values

value_loss = torch.mean((q_values - y) ** 2)
writer.add_scalar("Value loss", value_loss.cpu().data.numpy(), epoch)
value_opt.zero_grad()
value_loss.backward()
value_opt.step()

policy_loss = -torch.mean(online_value_network(obs, des_goals, ach_goals,
                                                           online_policy_network_gpu(obs, des_goals, ach_goals)))
writer.add_scalar("Policy loss", -policy_loss.cpu().data.numpy(), epoch)

policy_loss.backward()
policy_opt.step()
policy_opt.zero_grad()

synchronize_value_networks_params(eps=EPS)
synchronize_policy_networks_params(eps=EPS)

albanD · April 7, 2020, 7:11pm

If you don’t want gradients to flow back as before, you can use .detach().
Otherwise you want to make sure that your observation/desired_goal/achieved_goal are Tensors that do not depend on the previous iteration (in a differentiable way) and use .detach() where needed to break these links if they currently exist.

nik31096 · April 7, 2020, 7:11pm

Another update. On stackoverflow I found very similar (seems to be the same) about retain_graph=True problem in DDPG: https://stackoverflow.com/questions/51349272/pytorch-backward-runtimeerror-trying-to-backward-through-the-graph-a-second. There was said that error might be caused by input tensors. But I checked (in debug mode) and none of (obs, actions, rewards, etc) has any gradients…

[Update]
This is exactly what albanD comment above is saying.
But I checked that each of the Tensors observation/desired_goal/etc has requires_grad=False.

albanD · April 7, 2020, 7:20pm

Another approach is to use torchviz: https://github.com/szagoruyko/pytorchviz/
This can be used to plot the graph used to compute gradients.
You can use that just before the .backward() call.
You want to make sure that the graph printed by the first iteration and the second one are the same.
If they are not, then you want to find where the second one links to part of the first one to know where to add the .detach().
If Let me know if you need help understanding these graphs.

nik31096 · April 7, 2020, 7:24pm

Finally! The solution is to put in the replay buffer only data from Tensors obs/action/etc. Because they participate in backward pass for value_loss and in the next iteration they are not in the graph any more.
In my case such a problem was caused by actions Tensor, which has required_grad=True and was missed on the next iteration of backward. Thanks to albanD!