Hi! First time posting here!
I’ve been learning RL this summer and this week I’ve tried to make a PPO implementation on Pytorch with the help of some repositories from github with similiar algorithms.
The code runs OpenAI’s Lunar Lander but I have several errors that I have not been able to fix, the biggest one being that the algorithm quickly converges to doing the same action regardless of the state. The other major problem I’ve found is that even though I’m using backwards() only once, I get an error asking me to set retain_graph to True.
Because of that, I see no improvement of the rewards obtained over 1000 steps, I don’t know if the algorithm needs more steps to be able to see an improvement.
I’m really sorry if this kind of problems have no place in this forums, I just didn’t know where to post this.
Also I’m sorry for the messy code, it’s my first time doing this kind of algorithms, and I’m fairly new with pytorch and machine learning in general.
Tanks a lot in advance!!
Code:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.distributions import Categorical
import gym
class actorCritic(nn.Module):
def __init__(self):
super(actorCritic, self).__init__()
self.fc = nn.Sequential(
nn.Linear(8, 16),
nn.Linear(16, 32),
nn.Linear(32, 64),
nn.ReLU(inplace=True)
)
self.pi = nn.Linear(64, 4)
self.value = nn.Linear(64, 1)
def forward(self, x):
x = self.fc(x)
pi_1 = self.pi(x)
pi_out = F.softmax(pi_1, dim=-1)
value_out = self.value(x)
return pi_out, value_out
def GAE(rewards, values, masks):
gamma = 0.99
lamb = 0.95
advan_t = 0
sizes = rewards.size()
advantages = torch.zeros(1, sizes[1])
for t in reversed(range(sizes[1])):
delta = rewards[0][t] + gamma*values[0][t+1]*masks[0][t] - values[0][t]
advan_t = delta + gamma*lamb*advan_t*mask[0][t]
advantages[0][t] = advan_t
real_values = values[:,:sizes[1]] + advantages
return advantages, real_values
def plot_rewards(rewards):
plt.figure(2)
plt.clf()
plt.plot(rewards)
plt.pause(0.001)
plt.savefig('TruePPO 500 steps.png')
def interact(times, states):
rewards = torch.zeros(1, times)
actions = torch.zeros(1, times)
mask = torch.ones(1, times)
for steps in range(times):
action_probs, _ = network(states[steps])
m = Categorical(action_probs)
action = int(m.sample())
obs, reward, done, _ = env.step(action)
if done:
obs = env.reset()
mask[0][steps] = 0
states[steps+1] = torch.from_numpy(obs).float()
rewards[0][steps] = reward
actions[0][steps] = action
return states, rewards, actions, mask
#Parameters
total_steps = 1000
batch_size = 10
env = gym.make('LunarLander-v2')
network = actorCritic()
old_network = actorCritic()
optimizer = torch.optim.Adam(network.parameters(), lr = 0.001)
states = torch.zeros(batch_size+1, 8)
steps = 0
obs_ = env.reset()
obs = torch.from_numpy(obs_).float()
states[0] = obs
reward_means = []
nn_paramD = network.state_dict()
old_network.load_state_dict(nn_paramD)
while steps < total_steps:
print (steps)
states, rewards, actions, mask = interact(batch_size, states)
#calculate values, GAE, normalize advantages, randomize, calculate loss, backprop,
_, values = network(states)
values = values.view(-1, batch_size+1)
advantages, v_targ = GAE(rewards, values, mask)
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
optimizer.zero_grad()
for n in range(rewards.size()[1]):
probabilities, _ = network(states[n])
print (probabilities)
m = Categorical(probabilities)
action_prob = m.probs[int(actions[0][n])]
entropia = m.entropy()
old_probabilities, _ = old_network(states[n])
m_old = Categorical(old_probabilities)
old_action_prob = m.probs[actions[0][n].int()]
old_action_prob.detach()
ratio = action_prob / old_action_prob
surr1 = ratio*advantages[0][n]
surr2 = torch.clamp(ratio, min = (1.-0.2), max = (1.+0.2))
policy_loss = -torch.min(surr1, surr2)
value_loss = 0.5*(values[0][n]-v_targ[0][n])**2
entropy_loss = -entropia
total_loss = policy_loss + value_loss + 0.01*entropy_loss
total_loss.backward(retain_graph = True)
optimizer.step()
reward_means.append(rewards.numpy().mean())
old_network.load_state_dict(nn_paramD)
nn_paramD = network.state_dict()
steps += 1
plot_rewards(reward_means)