Torch/RL newbie: Trying to do PPO

Hi,

I am trying to implement PPO, actor-critic style, based on the paper by Schulman et al. (paper).

Long story short: my code doesn't learn and I do not know why. I suspect my hyperparameters are poorly chosen.

GitHub link to my code
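For reference, these are the quantities from the paper that the code below is meant to compute: the probability ratio and clipped surrogate objective, plus the GAE advantage built from TD residuals.

r_t(\theta) = \pi_\theta(a_t \mid s_t) / \pi_{\theta_\text{old}}(a_t \mid s_t)

L^{CLIP}(\theta) = \hat{E}_t\left[\min\left(r_t(\theta)\hat{A}_t,\ \mathrm{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon)\hat{A}_t\right)\right]

\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad \hat{A}_t = \delta_t + (\gamma\lambda)\delta_{t+1} + (\gamma\lambda)^2\delta_{t+2} + \dots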

# Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import gym

# Hyperparameters
env = "Pendulum-v0"
env = gym.make(env)
numInputs = env.observation_space.shape[0]
numOutputs = env.action_space.shape[0]

HSzIn = 10 * numInputs    # size of the first hidden layer
HSzOut = 10 * numOutputs  # size of the second hidden layer
GAMMA = 0.95              # discount factor
LAMBDA = 0.95             # lambda for GAE
SampleSz = 2**10          # number of generated samples per iteration
BatchSz = 2**5            # size of minibatch for the update
numIterations = 10000     # number of iterations
LearningRate = 1e-4
numUpdates = 2**5         # minibatch updates per iteration
Epsilon = 0.2             # clipping parameter of the surrogate objective
CRITIC_DISCOUNT = 0.5     # weight of the value loss in the total loss
ENTROPY_BETA = 0.001      # weight of the entropy bonus

# run one episode with the current policy and print the total reward
def test(env, net):
    state = env.reset()
    done = False
    rew = 0
    while not done:
        state = torch.FloatTensor(np.transpose(state))
        dist, value = net(state)
        action = dist.sample()
        nextState, reward, done, _ = env.step(action.detach().numpy())
        rew += reward
        state = nextState  # advance the observation; otherwise the policy keeps acting on the initial state
    print(rew)


class ActorCritic(nn.Module):
    def __init__(self, numInputs, numOutputs, hiddenSz=HSzIn, hiddenSzOut=HSzOut, std=0.0):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(
            nn.Linear(numInputs, hiddenSz),
            nn.Tanh(),
            nn.Linear(hiddenSz, hiddenSzOut),
            nn.Tanh(),
            nn.Linear(hiddenSzOut,1)
            )

        self.actor = nn.Sequential(
            nn.Linear(numInputs, hiddenSz),
            nn.Tanh(),
            nn.Linear(hiddenSz, hiddenSzOut),
            nn.Tanh(),
            nn.Linear(hiddenSzOut, numOutputs),
            )

        self.logStd = nn.Parameter(torch.ones(1,numOutputs) * std) # how to update variance ? 

    def forward(self,x):
        value = self.critic(x)
        mu = self.actor(x)
        std = self.logStd.exp()
        dist = Normal(mu,std)
        return dist, value

# Initialize Net and optimizer
net = ActorCritic(numInputs,numOutputs)
optimizer = optim.Adam(net.parameters(), lr=LearningRate)

# generate samples and data necessary for the update
for ite in range(numIterations):
    logProbs  = torch.zeros(1, SampleSz)
    values    = torch.zeros(1, SampleSz)
    states    = torch.zeros(SampleSz,numInputs)
    actions   = torch.zeros(SampleSz, numOutputs)
    rewards   = torch.zeros(1, SampleSz)
    dones     = torch.zeros(1, SampleSz)

    state = env.reset()
    for step in range(SampleSz):
        state = torch.Tensor(np.transpose(state)) # transpose to avoid size-mismatch type-error
        dist, value = net(state)
        action = dist.sample()
        nextState, reward, done, _ = env.step(action.detach().numpy())

        logProbs[0][step] = dist.log_prob(action).detach()  # old log-probs are treated as constants during the update
        values[0][step] = value.detach()                    # detach so the rollout graph is not kept around
        rewards[0][step] = torch.Tensor(reward)  # reward is a one-element array of type float32
        dones[0][step] = 1 - done                # 1 while the episode is running, 0 at terminal steps
        states[step] = state
        actions[step] = action
        
        if done:
            state = env.reset()
        else:
            state = nextState

    # GAE
    _, finalValue = net(torch.Tensor(np.transpose(nextState)))  # bootstrap with V(s') for the final state
    allValues = torch.cat((values, finalValue.detach().view(1, 1)), 1)

    advan_t = 0
    reward = 0
    advantages = torch.zeros(1, SampleSz)
    sumDiscRew = torch.zeros(1, SampleSz)
    for i in reversed(range(SampleSz)):
        # delta is the TD residual
        delta = rewards[0][i] + GAMMA*allValues[0][i+1]*dones[0][i] - allValues[0][i]
        advan_t = delta + GAMMA*LAMBDA*advan_t*dones[0][i] # implicit reset of advan_t when a new trajectory starts
        advantages[0][i] = advan_t
        # running discounted return; dones[0][i] is 0 at terminal steps, which resets the sum
        reward = rewards[0][i] + GAMMA*reward*dones[0][i]
        sumDiscRew[0][i] = reward

        
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    
    # Update Network
    for _ in range(numUpdates):
        randIndices = np.random.randint(0, SampleSz, BatchSz)  # sample a minibatch of indices (with replacement)
        # Compute loss functions
        surr1 = torch.zeros(1, BatchSz)
        surr2 = torch.zeros(1, BatchSz)
        valueDiffs = torch.zeros(1, BatchSz)
        entropies = torch.zeros(1, BatchSz)
        ind = 0
        for i in randIndices:
            dist, value = net(states[i])
            entropy = dist.entropy()

            entropies[0][ind] = entropy
            
            new_log_prob = dist.log_prob(actions[i])
            ratio = (new_log_prob - logProbs[0][i]).exp()
            surr1[0][ind] = ratio * advantages[0][i]
            surr2[0][ind] = torch.clamp(ratio, 1.0 - Epsilon, 1.0 + Epsilon) * advantages[0][i]

            valueDiffs[0][ind] = sumDiscRew[0][i] - value

            ind += 1
            
        actor_loss  = - torch.min(surr1, surr2).mean()

        # MSE between the discounted sum of future rewards and the value predictions
        critic_loss = valueDiffs.pow(2).mean()

        loss = CRITIC_DISCOUNT * critic_loss + actor_loss  - ENTROPY_BETA * entropies.mean()

        optimizer.zero_grad()
        loss.backward()  # the stored rollout tensors are detached, so retain_graph is not needed
        optimizer.step()
        
    
test(env,net)
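Side note: the inner minibatch loop could also be written with tensor indexing instead of the per-sample Python loop. A minimal sketch, assuming the rollout tensors defined above (states, actions, logProbs, advantages, sumDiscRew) and the same hyperparameters:

idx = torch.randint(0, SampleSz, (BatchSz,))         # minibatch indices (sampled with replacement)
dist, value = net(states[idx])                       # batched forward pass; value has shape (BatchSz, 1)
newLogProbs = dist.log_prob(actions[idx])            # shape (BatchSz, 1)
ratio = (newLogProbs - logProbs[0][idx].unsqueeze(1)).exp()
adv = advantages[0][idx].unsqueeze(1)
surr1 = ratio * adv
surr2 = torch.clamp(ratio, 1.0 - Epsilon, 1.0 + Epsilon) * adv
actor_loss = -torch.min(surr1, surr2).mean()
critic_loss = (sumDiscRew[0][idx].unsqueeze(1) - value).pow(2).mean()
loss = CRITIC_DISCOUNT * critic_loss + actor_loss - ENTROPY_BETA * dist.entropy().mean()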