# Actor Critic with Multivariate Normal - network weights fail to update

Hi, I’m quite new to pytorch (and ML/RL at all, for that matter). I’m trying to adapt an Actor Critic algorithm I’ve copied from one of Machine Learning with Phil’s YouTube videos to my own environment. I’ve gotten his version working with gym’s cartpole. My environment has a continuous action space, and I want to learn the mean of a 2D gaussian policy (later, I may also learn the covariance matrix, but for now I’m just leaving it fixed). The policy means should be bounded on the intervals (0, 2 pi) and (0, pi/2), so I put a sigmoid activation function on the output layer and multiply them by 2 pi and pi/2, respectively. So this is how my agent chooses an action (I commented the original cartpole version for comparison):

`````` def choose_action(self, observation):
# probabilities = F.softmax(self.actor.forward(observation))
# action_probs = T.distributions.Categorical(probabilities)
policy_thetas = torch.sigmoid(self.actor.forward(observation))
mu, S = get_params_from_policy_thetas(policy_thetas.detach.numpy())
action_probs = mn.MultivariateNormal(torch.FloatTensor(mu),
torch.FloatTensor(S))
action = action_probs.sample()
self.log_probs = action_probs.log_prob(action)

return action.item()
``````

And then my agent learns according to

``````def learn(self, state, reward, new_state, done):

critic_value = self.critic.forward(state)
critic_value_ = self.critic.forward(new_state)

delta = ((reward+self.gamma*critic_value_*(1-int(done)))-critic_value)

actor_loss = -self.log_probs * delta
critic_loss = delta**2

(actor_loss + critic_loss).backward()

self.actor.optimizer.step()
self.critic.optimizer.step()
``````

My code will run just fine for any number of episodes, but neither of the network weights ever seem to be updated. I thought this might be because actor_loss.grad and critic_loss.grad are both None, but this is also the case for the (properly working) cartpole. Then I thought maybe the DCG was broken somewhere, and most likely in the choose_action() function, because of some sub-optimal way I’m getting the parameters of the policy from the network output?

I’ve spent a long while searching around the forums but I’m just lost in approaching debugging this. Any help would be much appreciated.

Full code below (without my environment, it will work if you uncomment the lines in choose_action and comment out my multivariate normal stuff):

``````import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym
from torch.distributions import multivariate_normal as mn

def get_params_from_policy_thetas(policy_thetas):
s11 = 10**-2
s22 = 10**-2
mu1 = (policy_thetas)*2*np.pi
mu2 = (policy_thetas)*np.pi/2
S = np.diag([s11,s22])
mu = np.asarray([mu1,mu2])
return(mu,S)

class GenericNetwork(nn.Module):
def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
super(GenericNetwork, self).__init__()
self.input_dims = input_dims
self.fc1_dims = fc1_dims
self.fc2_dims = fc2_dims
self.n_actions = n_actions
self.lr = lr

self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)

self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
self.to(self.device)

def forward(self, observation):
state = T.Tensor(observation).to(self.device)
x = F.relu(self.fc1(state))
x = F.relu(self.fc2(x))
x = self.fc3(x)

return x

class Agent(object):
def __init__(self, alpha, beta, input_dims, gamma = 0.99,
l1_size = 256, l2_size=256, n_actions=2):
self.gamma = gamma
self.log_probs = None
self.actor = GenericNetwork(alpha,input_dims,l1_size,l2_size,n_actions)
self.critic = GenericNetwork(beta,input_dims,l1_size,l2_size,n_actions=1)

def choose_action(self, observation):
# probabilities = F.softmax(self.actor.forward(observation))
# action_probs = T.distributions.Categorical(probabilities)
policy_thetas = torch.sigmoid(self.actor.forward(observation))
mu, S = get_params_from_policy_thetas(policy_thetas.detach.numpy())
action_probs = mn.MultivariateNormal(torch.FloatTensor(mu),
torch.FloatTensor(S))
action = action_probs.sample()
self.log_probs = action_probs.log_prob(action)

return action.item()

def learn(self, state, reward, new_state, done):

critic_value = self.critic.forward(state)
critic_value_ = self.critic.forward(new_state)

delta = ((reward+self.gamma*critic_value_*(1-int(done)))-critic_value)

actor_loss = -self.log_probs * delta
critic_loss = delta**2

(actor_loss + critic_loss).backward()

self.actor.optimizer.step()
self.critic.optimizer.step()

if __name__ == '__main__':
agent = Agent(alpha = 0.00001, beta = 0.0005, input_dims = ,
gamma = 0.99, l1_size = 32, l2_size = 32, n_actions = 2)
env = gym.make('CartPole-v1')
score_history = []
n_episodes = 2500
for i in range(n_episodes):
done = False
score = 0
observation = env.reset()
while not done:
action = agent.choose_action(observation)
observation_,reward,done,info = env.step(action)
score+=reward
agent.learn(observation, reward, observation_, done)
observation = observation_
print('episode ', i, 'score %.3f' % score)
score_history.append(score)
``````

In order to use A-C series algorithms to train your actor network, you must make sure log probabilities are differentiable, right? Then the `action_probs.log_prob(action)` must be differentiable, therefore you cannot put detached `torch.FloatTensor(mu), torch.FloatTensor(S)` into the distribution since gradient will not flow through your `torch->numpy->torch` conversion. Instead, you must directly give the output of your actor to the distribution.

An example of an agent used in the 4-dim continuous gym bipedal walker environment:

``````class Actor(nn.Module):
# naive actor for env.walker
def __init__(self, state_dim, action_dim, max_action):
super(Actor, self).__init__()

self.fc1 = nn.Linear(state_dim, 128)
self.fc2 = nn.Linear(128, 128)
self.fc_mu = nn.Linear(128, action_dim)
self.fc_sigma = nn.Linear(128, action_dim)
self.max_action = max_action

def forward(self, state, action=None):
a = t.relu(self.fc1(state))
a = t.relu(self.fc2(a))

mu = self.max_action * t.tanh(self.fc_mu(a))

# we assume that each dimension of your action is not correlated
# therefore the covariance matrix is a positive definite diagonal matrix

# static, preset standard error
# diag = t.full(mu.shape, 0.5, device=mu.device)

# dynamic, trainable standard error
diag = softplus(self.fc_sigma(a))
cov = t.diag_embed(diag)
a_dist = MultivariateNormal(mu, cov)
action = action if action is not None else a_dist.sample()
action_log_prob = a_dist.log_prob(action)
entropy = a_dist.entropy()
return action.detach(), action_log_prob.unsqueeze(1), entropy.mean()
``````

I hope this answer is not too late though.