Actor Critic with Multivariate Normal - network weights fail to update

Hi, I’m quite new to PyTorch (and to ML/RL in general, for that matter). I’m trying to adapt an Actor Critic algorithm I copied from one of Machine Learning with Phil’s YouTube videos to my own environment, and I’ve gotten his version working with gym’s cartpole. My environment has a continuous action space, and I want to learn the mean of a 2D Gaussian policy (later I may also learn the covariance matrix, but for now I’m leaving it fixed). The policy means should be bounded on the intervals (0, 2 pi) and (0, pi/2), so I put a sigmoid activation on the output layer and multiply the two outputs by 2 pi and pi/2, respectively. This is how my agent chooses an action (the original cartpole version is commented out for comparison):

def choose_action(self, observation):
        # probabilities = F.softmax(self.actor.forward(observation))
        # action_probs = T.distributions.Categorical(probabilities)
        policy_thetas = T.sigmoid(self.actor.forward(observation))
        mu, S = get_params_from_policy_thetas(policy_thetas.detach().numpy())
        action_probs = mn.MultivariateNormal(T.FloatTensor(mu),
                                             T.FloatTensor(S))
        action = action_probs.sample()
        self.log_probs = action_probs.log_prob(action)
        
        return action.item()

And then my agent learns according to

def learn(self, state, reward, new_state, done):
        self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()
        
        critic_value = self.critic.forward(state)
        critic_value_ = self.critic.forward(new_state)
        
        delta = ((reward+self.gamma*critic_value_*(1-int(done)))-critic_value)
        
        actor_loss = -self.log_probs * delta
        critic_loss = delta**2
        
        (actor_loss + critic_loss).backward()
        
        self.actor.optimizer.step()
        self.critic.optimizer.step()

My code will run just fine for any number of episodes, but neither network's weights ever seem to update. I thought this might be because actor_loss.grad and critic_loss.grad are both None, but that is also the case for the (properly working) cartpole version. Then I thought maybe the dynamic computation graph is broken somewhere, most likely in choose_action(), because of some sub-optimal way I'm getting the policy parameters from the network output?
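
In case it helps, this is roughly the kind of check I've been doing to see whether the actor's weights move between learn() calls; just a sketch using the agent and the training-loop variables from the full code below:

# (sketch) assumes agent, observation, reward, observation_, done from the loop below
before = [p.detach().clone() for p in agent.actor.parameters()]
agent.learn(observation, reward, observation_, done)

# compare parameters before/after the update and look at their gradients
for p_before, p_after in zip(before, agent.actor.parameters()):
    print('changed:', not T.equal(p_before, p_after),
          '| grad is None:', p_after.grad is None)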

I’ve spent a long while searching the forums, but I’m lost on how to approach debugging this. Any help would be much appreciated.

Full code below (without my environment; it will run if you uncomment the cartpole lines in choose_action and comment out my multivariate normal code):

import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym
from torch.distributions import multivariate_normal as mn

def get_params_from_policy_thetas(policy_thetas):
    s11 = 10**-2
    s22 = 10**-2
    mu1 = (policy_thetas[0])*2*np.pi
    mu2 = (policy_thetas[1])*np.pi/2
    S = np.diag([s11,s22])
    mu = np.asarray([mu1,mu2])
    return(mu,S)

class GenericNetwork(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super(GenericNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.lr = lr
        
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
        
        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
        self.to(self.device)
        
    def forward(self, observation):
        state = T.Tensor(observation).to(self.device)
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        
        return x

class Agent(object):
    def __init__(self, alpha, beta, input_dims, gamma = 0.99, 
                 l1_size = 256, l2_size=256, n_actions=2):
        self.gamma = gamma
        self.log_probs = None
        self.actor = GenericNetwork(alpha,input_dims,l1_size,l2_size,n_actions)
        self.critic = GenericNetwork(beta,input_dims,l1_size,l2_size,n_actions=1)
        
    def choose_action(self, observation):
        # probabilities = F.softmax(self.actor.forward(observation))
        # action_probs = T.distributions.Categorical(probabilities)
        policy_thetas = T.sigmoid(self.actor.forward(observation))
        mu, S = get_params_from_policy_thetas(policy_thetas.detach().numpy())
        action_probs = mn.MultivariateNormal(T.FloatTensor(mu),
                                             T.FloatTensor(S))
        action = action_probs.sample()
        self.log_probs = action_probs.log_prob(action)
        
        return action.item()
    
    def learn(self, state, reward, new_state, done):
        self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()
        
        critic_value = self.critic.forward(state)
        critic_value_ = self.critic.forward(new_state)
        
        delta = ((reward+self.gamma*critic_value_*(1-int(done)))-critic_value)
        
        actor_loss = -self.log_probs * delta
        critic_loss = delta**2
        
        (actor_loss + critic_loss).backward()
        
        self.actor.optimizer.step()
        self.critic.optimizer.step()

if __name__ == '__main__':
    agent = Agent(alpha = 0.00001, beta = 0.0005, input_dims = [4], 
                  gamma = 0.99, l1_size = 32, l2_size = 32, n_actions = 2)
    env = gym.make('CartPole-v1')
    score_history = []
    n_episodes = 2500
    for i in range(n_episodes):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_,reward,done,info = env.step(action)
            score+=reward
            agent.learn(observation, reward, observation_, done)
            observation = observation_
        print('episode ', i, 'score %.3f' % score)
        score_history.append(score)        

To train the actor with actor-critic style algorithms, the log probability of the chosen action has to be differentiable with respect to the actor's parameters. That means action_probs.log_prob(action) must be part of the computation graph, so you cannot build the distribution from detached tensors: the torch -> numpy -> torch round trip (and the .detach() before it) cuts the graph, and no gradient ever flows back into your actor. Instead, give the output of your actor to the distribution directly, as tensors.
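
For instance, here is a minimal sketch of how your choose_action could be rewritten so that the gradient reaches the actor, keeping your fixed diagonal covariance; the 2*pi and pi/2 scaling and the 1e-2 variances are taken from your get_params_from_policy_thetas, and the 2D action is returned as a numpy array instead of via .item():

def choose_action(self, observation):
    # sketch of a differentiable choose_action (fixed diagonal covariance)
    policy_thetas = T.sigmoid(self.actor.forward(observation))
    # scale the sigmoid outputs to (0, 2*pi) and (0, pi/2) with tensor ops,
    # so mu stays connected to the actor's computation graph
    scale = T.tensor([2 * np.pi, np.pi / 2],
                     dtype=T.float32, device=policy_thetas.device)
    mu = policy_thetas * scale
    cov = T.diag(T.tensor([1e-2, 1e-2],
                          dtype=T.float32, device=policy_thetas.device))
    action_probs = mn.MultivariateNormal(mu, cov)
    action = action_probs.sample()                   # sampling itself is not differentiated
    self.log_probs = action_probs.log_prob(action)   # differentiable w.r.t. mu
    return action.cpu().numpy()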

Here is an example of an actor used in gym's BipedalWalker environment, which has a 4-dimensional continuous action space:

import torch as t
import torch.nn as nn
from torch.nn.functional import softplus
from torch.distributions import MultivariateNormal


class Actor(nn.Module):
    # naive actor for the walker environment
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()

        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc_mu = nn.Linear(128, action_dim)
        self.fc_sigma = nn.Linear(128, action_dim)
        self.max_action = max_action

    def forward(self, state, action=None):
        a = t.relu(self.fc1(state))
        a = t.relu(self.fc2(a))

        mu = self.max_action * t.tanh(self.fc_mu(a))

        # we assume the action dimensions are uncorrelated,
        # so the covariance matrix is a positive definite diagonal matrix

        # static, preset diagonal variances:
        # diag = t.full(mu.shape, 0.5, device=mu.device)

        # dynamic, trainable diagonal variances:
        diag = softplus(self.fc_sigma(a))
        cov = t.diag_embed(diag)
        a_dist = MultivariateNormal(mu, cov)
        action = action if action is not None else a_dist.sample()
        action_log_prob = a_dist.log_prob(action)
        entropy = a_dist.entropy()
        return action.detach(), action_log_prob.unsqueeze(1), entropy.mean()
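
For reference, a rough sketch of how the returned log probability plugs into the same kind of actor-critic update you already have. It assumes actor, critic, their optimizers, gamma, and tensor-valued state/next_state/reward are defined elsewhere; detaching delta in the actor loss is optional but keeps the critic trained only by the squared TD error:

# assumed: actor/critic networks, actor_optimizer/critic_optimizer, gamma,
# tensors state/next_state, scalar reward, bool done
action, log_prob, entropy = actor(state)
value = critic(state)
value_ = critic(next_state)

delta = reward + gamma * value_ * (1 - int(done)) - value
actor_loss = -(log_prob * delta.detach()).mean()   # log_prob is attached to the actor's graph
critic_loss = (delta ** 2).mean()

actor_optimizer.zero_grad()
critic_optimizer.zero_grad()
(actor_loss + critic_loss).backward()
actor_optimizer.step()
critic_optimizer.step()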

I hope this answer is not too late though.