Hi,
I am trying to implement PPO in actor-critic style, based on the paper by Schulman et al. (paper).
Long story short: my code does not learn and I cannot figure out why. I suspect my hyperparameters are poorly chosen.
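For reference, the quantity I am trying to maximize is the clipped surrogate objective from the paper. Here is a minimal standalone sketch of just that term, so it is clear what I am aiming for (ratio, advantage and eps are only placeholders here, not variables from the script below):

import torch

def ppo_clip_objective(ratio, advantage, eps=0.2):
    # L_CLIP = E[ min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A) ]
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantage
    return torch.min(surr1, surr2).mean()

The full script is below.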
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import gym
# Hyperparameters
env = "Pendulum-v0"
env = gym.make(env)
numInputs = env.observation_space.shape[0]
numOutputs = env.action_space.shape[0]
HSzIn = 10 * numInputs
HSzOut = 10 * numOutputs
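# For Pendulum-v0 this gives numInputs = 3 and numOutputs = 1, so HSzIn = 30 and HSzOut = 10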
GAMMA = 0.95 # discount factor
LAMBDA = 0.95 # Lambda for GAE
SampleSz = 2**10 # Number of generated samples per Iteration
BatchSz = 2**5 # Size of Minibatch for update
numIterations = 10000 # Number of iterations
LearningRate = 1e-4
numUpdates = 2**5 # Number of minibatch updates per iteration
Epsilon = 0.2 # PPO clipping parameter
CRITIC_DISCOUNT = 0.5 # Weight of the value loss in the total loss
ENTROPY_BETA = 0.001 # Weight of the entropy bonus
def test(env, net):
    # Run one episode with the current policy and print the total reward
    state = env.reset()
    done = False
    rew = 0
    while not done:
        state = torch.FloatTensor(np.transpose(state))
        dist, value = net(state)
        action = dist.sample()
        nextState, reward, done, _ = env.step(action.detach().numpy())
        rew += reward
        state = nextState  # advance to the next observation; without this the policy keeps acting on the initial state
    print(rew)

class ActorCritic(nn.Module):
    def __init__(self, numInputs, numOutputs, hiddenSz=HSzIn, hiddenSzOut=HSzOut, std=0.0):
        super(ActorCritic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(numInputs, hiddenSz),
            nn.Tanh(),
            nn.Linear(hiddenSz, hiddenSzOut),
            nn.Tanh(),
            nn.Linear(hiddenSzOut, 1)
        )
        self.actor = nn.Sequential(
            nn.Linear(numInputs, hiddenSz),
            nn.Tanh(),
            nn.Linear(hiddenSz, hiddenSzOut),
            nn.Tanh(),
            nn.Linear(hiddenSzOut, numOutputs),
        )
        self.logStd = nn.Parameter(torch.ones(1, numOutputs) * std)  # how to update variance ?

    def forward(self, x):
        value = self.critic(x)
        mu = self.actor(x)
        std = self.logStd.exp()
        dist = Normal(mu, std)
        return dist, value

# Initialize Net and optimizer
net = ActorCritic(numInputs,numOutputs)
optimizer = optim.Adam(net.parameters(), lr=LearningRate)
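# Quick shape sanity check I did once by hand (not part of the training run):
#   s = torch.FloatTensor(env.reset())   # Pendulum observation
#   dist, v = net(s)                     # dist is a Normal over the action, v is the value estimate
#   print(dist.sample().shape, v.shape)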
# generate samples and data necessary for the update
for ite in range(numIterations):
    logProbs = torch.zeros(1, SampleSz)
    values = torch.zeros(1, SampleSz)
    states = torch.zeros(SampleSz, numInputs)
    actions = torch.zeros(SampleSz, numOutputs)
    rewards = torch.zeros(1, SampleSz)
    dones = torch.zeros(1, SampleSz)
    state = env.reset()
    for step in range(SampleSz):
        state = torch.Tensor(np.transpose(state))  # transpose to avoid size-mismatch type error
        dist, value = net(state)
        action = dist.sample()
        nextState, reward, done, _ = env.step(action.detach().numpy())
        logProbs[0][step] = dist.log_prob(action)
        values[0][step] = value
        rewards[0][step] = torch.Tensor(reward)  # reward is a one-element float32 array
        dones[0][step] = 1 - done  # stored as a mask: 1 while the episode continues, 0 on the terminal step
        states[step] = state
        actions[step] = action
        if done:
            state = env.reset()
        else:
            state = nextState
    # GAE
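    # For reference, this is the recursion I am trying to implement (GAE from the paper),
    # with both running quantities cut off at episode boundaries via the dones mask:
    #   delta_t = r_t + GAMMA * V(s_{t+1}) - V(s_t)
    #   A_t     = delta_t + GAMMA * LAMBDA * A_{t+1}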
    _, finalValue = net(torch.Tensor(np.transpose(nextState)))  # add V(s') for the final s
    allValues = torch.cat((values, finalValue), 1)
    advan_t = 0
    reward = 0
    advantages = torch.zeros(1, SampleSz)
    sumDiscRew = torch.zeros(1, SampleSz)
    for i in reversed(range(SampleSz)):
        if not dones[0][i]:  # dones holds the mask, so 0 marks a terminal step: restart the running return there
            reward = 0
        # delta is the TD residual
        delta = rewards[0][i] + GAMMA * allValues[0][i + 1] * dones[0][i] - allValues[0][i]
        advan_t = delta + GAMMA * LAMBDA * advan_t * dones[0][i]  # the mask implicitly resets advan_t when a new trajectory starts
        advantages[0][i] = advan_t
        sumDiscRew[0][i] = rewards[0][i] + GAMMA * reward
        reward = sumDiscRew[0][i]  # carry the discounted return backwards through the trajectory
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    # Update Network
    for _ in range(numUpdates):
        randIndices = np.random.randint(0, SampleSz, BatchSz)
        # Compute loss functions
        surr1 = torch.zeros(1, BatchSz)
        surr2 = torch.zeros(1, BatchSz)
        valueDiffs = torch.zeros(1, BatchSz)
        entropies = torch.zeros(1, BatchSz)
        ind = 0
        for i in randIndices:
            dist, value = net(states[i])
            entropy = dist.entropy()
            entropies[0][ind] = entropy
            new_log_prob = dist.log_prob(actions[i])
            ratio = (new_log_prob - logProbs[0][i]).exp()  # pi_new(a|s) / pi_old(a|s)
            surr1[0][ind] = ratio * advantages[0][i]
            surr2[0][ind] = torch.clamp(ratio, 1.0 - Epsilon, 1.0 + Epsilon) * advantages[0][i]
            valueDiffs[0][ind] = sumDiscRew[0][i] - value
            ind += 1
        actor_loss = -torch.min(surr1, surr2).mean()
        # MSE between the discounted sum of future rewards and the critic's predictions
        critic_loss = valueDiffs.pow(2).mean()
        loss = CRITIC_DISCOUNT * critic_loss + actor_loss - ENTROPY_BETA * entropies.mean()
        optimizer.zero_grad()
        loss.backward(retain_graph=True)  # retain_graph because logProbs/advantages still reference the graph built while sampling
        optimizer.step()
    test(env, net)  # print one episode's return with the current policy after each iteration