I am having issues trying to train my policy networks. I currently have two policy networks, one for steering my car and the other for controlling the acceleration. I would like to update both under the same loss function. I am then passing both parameters of each network into an Adam optimizer and updating accordingly. However, it seems after the first pass my network returns nan.

Below is a reference to my training loop:

```
policy_optimizer = optim.Adam(list(accel_policy.parameters())+list(turn_policy.parameters()),lr=0.0001)
value_optimizer = optim.Adam(value_function.parameters(),lr=0.0001)
epsilon = 0.2 # Clipping factor
policy_losses = []
value_losses = []
epochs = 1
for epoch in range(epochs):
for inputs_batch, outputs_batch, actions_batch in dataloader:
# Move batches to GPU
inputs_batch, outputs_batch, actions_batch = inputs_batch.to(device), outputs_batch.to(device), actions_batch.to(device)
# Clone current policy and value networks
accel_policy_old = copy.deepcopy(accel_policy)
turn_policy_old = copy.deepcopy(turn_policy)
value_function_old = copy.deepcopy(value_function)
values = value_function(inputs_batch)
# Freeze clone networks
for param in accel_policy_old.parameters(): param.requires_grad = False
for param in turn_policy_old.parameters(): param.requires_grad = False
for param in value_function_old.parameters(): param.requires_grad = False
# Calculate advantages
advantages = outputs_batch.cpu() - value_function_old(inputs_batch).cpu().numpy()
advantages = advantages.detach()
advantages = advantages.to(device)
# Get old policy distribution from policy networks
accel_mean_old, accel_stdev_old = accel_policy_old(inputs_batch)[:,0],accel_policy_old(inputs_batch)[:,1]
turn_mean_old, turn_stdev_old = turn_policy_old(inputs_batch)[:,0],turn_policy_old(inputs_batch)[:,1]
# Get new policy distribution from policy networks
accel_mean, accel_stdev = accel_policy(inputs_batch)[:,0],accel_policy(inputs_batch)[:,1]
turn_mean, turn_stdev = turn_policy(inputs_batch)[:,0],turn_policy(inputs_batch)[:,1]
# Get acceleration and turn from actions taken
accelerations = torch.tensor(actions_batch[:,0])
turns = torch.tensor(actions_batch[:,1])
# Calulate probability given action using density function
pi = (1 / (accel_stdev * torch.sqrt(torch.tensor(2 * torch.pi)))) * torch.exp(-0.5 * ((accelerations - accel_mean) / accel_stdev) ** 2) * (1 / (turn_stdev * torch.sqrt(torch.tensor(2 * torch.pi)))) * torch.exp(-0.5 * ((turns - turn_mean) / turn_stdev) ** 2)
pi_old = (1 / (accel_stdev_old * torch.sqrt(torch.tensor(2 * torch.pi)))) * torch.exp(-0.5 * ((accelerations - accel_mean_old) / accel_stdev_old) ** 2) * (1 / (turn_stdev_old * torch.sqrt(torch.tensor(2 * torch.pi)))) * torch.exp(-0.5 * ((turns - turn_mean_old) / turn_stdev_old) ** 2)
# Calculate probability ratios
ratios = pi / pi_old
# Compute PPO loss
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1 - epsilon, 1 + epsilon) * advantages # Clipping
ppo_loss = -torch.min(surr1,surr2).mean()
# Update policy net
policy_optimizer.zero_grad()
ppo_loss.backward()
policy_optimizer.step()
# Update value network
value_loss = nn.MSELoss()(values,outputs_batch)
value_optimizer.zero_grad()
value_loss.backward()
value_optimizer.step()
policy_losses.append(ppo_loss.item())
value_losses.append(value_loss.item())
# Graph loss
plt.plot(policy_losses,color='r',label='Policy Loss')
plt.plot(value_losses,color='b',label='Value Loss')
plt.legend()
plt.show()
```

Here is a screenshot of the output before and after the first update:

Any advice or insights would be greatly appreciated!