Hi, I’m quite new to pytorch (and ML/RL at all, for that matter). I’m trying to adapt an Actor Critic algorithm I’ve copied from one of Machine Learning with Phil’s YouTube videos to my own environment. I’ve gotten his version working with gym’s cartpole. My environment has a continuous action space, and I want to learn the mean of a 2D gaussian policy (later, I may also learn the covariance matrix, but for now I’m just leaving it fixed). The policy means should be bounded on the intervals (0, 2 pi) and (0, pi/2), so I put a sigmoid activation function on the output layer and multiply them by 2 pi and pi/2, respectively. So this is how my agent chooses an action (I commented the original cartpole version for comparison):

```
def choose_action(self, observation):
# probabilities = F.softmax(self.actor.forward(observation))
# action_probs = T.distributions.Categorical(probabilities)
policy_thetas = torch.sigmoid(self.actor.forward(observation))
mu, S = get_params_from_policy_thetas(policy_thetas.detach.numpy())
action_probs = mn.MultivariateNormal(torch.FloatTensor(mu),
torch.FloatTensor(S))
action = action_probs.sample()
self.log_probs = action_probs.log_prob(action)
return action.item()
```

And then my agent learns according to

```
def learn(self, state, reward, new_state, done):
self.actor.optimizer.zero_grad()
self.critic.optimizer.zero_grad()
critic_value = self.critic.forward(state)
critic_value_ = self.critic.forward(new_state)
delta = ((reward+self.gamma*critic_value_*(1-int(done)))-critic_value)
actor_loss = -self.log_probs * delta
critic_loss = delta**2
(actor_loss + critic_loss).backward()
self.actor.optimizer.step()
self.critic.optimizer.step()
```

My code will run just fine for any number of episodes, but neither of the network weights ever seem to be updated. I thought this might be because actor_loss.grad and critic_loss.grad are both None, but this is also the case for the (properly working) cartpole. Then I thought maybe the DCG was broken somewhere, and most likely in the choose_action() function, because of some sub-optimal way I’m getting the parameters of the policy from the network output?

I’ve spent a long while searching around the forums but I’m just lost in approaching debugging this. Any help would be much appreciated.

Full code below (without my environment, it will work if you uncomment the lines in choose_action and comment out my multivariate normal stuff):

```
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym
from torch.distributions import multivariate_normal as mn
def get_params_from_policy_thetas(policy_thetas):
s11 = 10**-2
s22 = 10**-2
mu1 = (policy_thetas[0])*2*np.pi
mu2 = (policy_thetas[1])*np.pi/2
S = np.diag([s11,s22])
mu = np.asarray([mu1,mu2])
return(mu,S)
class GenericNetwork(nn.Module):
def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
super(GenericNetwork, self).__init__()
self.input_dims = input_dims
self.fc1_dims = fc1_dims
self.fc2_dims = fc2_dims
self.n_actions = n_actions
self.lr = lr
self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
self.optimizer = optim.Adam(self.parameters(), lr=self.lr)
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
self.to(self.device)
def forward(self, observation):
state = T.Tensor(observation).to(self.device)
x = F.relu(self.fc1(state))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
class Agent(object):
def __init__(self, alpha, beta, input_dims, gamma = 0.99,
l1_size = 256, l2_size=256, n_actions=2):
self.gamma = gamma
self.log_probs = None
self.actor = GenericNetwork(alpha,input_dims,l1_size,l2_size,n_actions)
self.critic = GenericNetwork(beta,input_dims,l1_size,l2_size,n_actions=1)
def choose_action(self, observation):
# probabilities = F.softmax(self.actor.forward(observation))
# action_probs = T.distributions.Categorical(probabilities)
policy_thetas = torch.sigmoid(self.actor.forward(observation))
mu, S = get_params_from_policy_thetas(policy_thetas.detach.numpy())
action_probs = mn.MultivariateNormal(torch.FloatTensor(mu),
torch.FloatTensor(S))
action = action_probs.sample()
self.log_probs = action_probs.log_prob(action)
return action.item()
def learn(self, state, reward, new_state, done):
self.actor.optimizer.zero_grad()
self.critic.optimizer.zero_grad()
critic_value = self.critic.forward(state)
critic_value_ = self.critic.forward(new_state)
delta = ((reward+self.gamma*critic_value_*(1-int(done)))-critic_value)
actor_loss = -self.log_probs * delta
critic_loss = delta**2
(actor_loss + critic_loss).backward()
self.actor.optimizer.step()
self.critic.optimizer.step()
if __name__ == '__main__':
agent = Agent(alpha = 0.00001, beta = 0.0005, input_dims = [4],
gamma = 0.99, l1_size = 32, l2_size = 32, n_actions = 2)
env = gym.make('CartPole-v1')
score_history = []
n_episodes = 2500
for i in range(n_episodes):
done = False
score = 0
observation = env.reset()
while not done:
action = agent.choose_action(observation)
observation_,reward,done,info = env.step(action)
score+=reward
agent.learn(observation, reward, observation_, done)
observation = observation_
print('episode ', i, 'score %.3f' % score)
score_history.append(score)
```