Hi! First time posting here!
I’ve been learning RL this summer and this week I’ve tried to make a PPO implementation on Pytorch with the help of some repositories from github with similiar algorithms.
The code runs OpenAI’s Lunar Lander but I have several errors that I have not been able to fix, the biggest one being that the algorithm quickly converges to doing the same action regardless of the state. The other major problem I’ve found is that even though I’m using backwards() only once, I get an error asking me to set retain_graph to True.
Because of that, I see no improvement of the rewards obtained over 1000 steps, I don’t know if the algorithm needs more steps to be able to see an improvement.
I’m really sorry if this kind of problems have no place in this forums, I just didn’t know where to post this.
Also I’m sorry for the messy code, it’s my first time doing this kind of algorithms, and I’m fairly new with pytorch and machine learning in general.
Tanks a lot in advance!!
import torch import numpy as np import torch.nn as nn import torch.nn.functional as F import matplotlib.pyplot as plt from torch.distributions import Categorical import gym class actorCritic(nn.Module): def __init__(self): super(actorCritic, self).__init__() self.fc = nn.Sequential( nn.Linear(8, 16), nn.Linear(16, 32), nn.Linear(32, 64), nn.ReLU(inplace=True) ) self.pi = nn.Linear(64, 4) self.value = nn.Linear(64, 1) def forward(self, x): x = self.fc(x) pi_1 = self.pi(x) pi_out = F.softmax(pi_1, dim=-1) value_out = self.value(x) return pi_out, value_out def GAE(rewards, values, masks): gamma = 0.99 lamb = 0.95 advan_t = 0 sizes = rewards.size() advantages = torch.zeros(1, sizes) for t in reversed(range(sizes)): delta = rewards[t] + gamma*values[t+1]*masks[t] - values[t] advan_t = delta + gamma*lamb*advan_t*mask[t] advantages[t] = advan_t real_values = values[:,:sizes] + advantages return advantages, real_values def plot_rewards(rewards): plt.figure(2) plt.clf() plt.plot(rewards) plt.pause(0.001) plt.savefig('TruePPO 500 steps.png') def interact(times, states): rewards = torch.zeros(1, times) actions = torch.zeros(1, times) mask = torch.ones(1, times) for steps in range(times): action_probs, _ = network(states[steps]) m = Categorical(action_probs) action = int(m.sample()) obs, reward, done, _ = env.step(action) if done: obs = env.reset() mask[steps] = 0 states[steps+1] = torch.from_numpy(obs).float() rewards[steps] = reward actions[steps] = action return states, rewards, actions, mask #Parameters total_steps = 1000 batch_size = 10 env = gym.make('LunarLander-v2') network = actorCritic() old_network = actorCritic() optimizer = torch.optim.Adam(network.parameters(), lr = 0.001) states = torch.zeros(batch_size+1, 8) steps = 0 obs_ = env.reset() obs = torch.from_numpy(obs_).float() states = obs reward_means =  nn_paramD = network.state_dict() old_network.load_state_dict(nn_paramD) while steps < total_steps: print (steps) states, rewards, actions, mask = interact(batch_size, states) #calculate values, GAE, normalize advantages, randomize, calculate loss, backprop, _, values = network(states) values = values.view(-1, batch_size+1) advantages, v_targ = GAE(rewards, values, mask) advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) optimizer.zero_grad() for n in range(rewards.size()): probabilities, _ = network(states[n]) print (probabilities) m = Categorical(probabilities) action_prob = m.probs[int(actions[n])] entropia = m.entropy() old_probabilities, _ = old_network(states[n]) m_old = Categorical(old_probabilities) old_action_prob = m.probs[actions[n].int()] old_action_prob.detach() ratio = action_prob / old_action_prob surr1 = ratio*advantages[n] surr2 = torch.clamp(ratio, min = (1.-0.2), max = (1.+0.2)) policy_loss = -torch.min(surr1, surr2) value_loss = 0.5*(values[n]-v_targ[n])**2 entropy_loss = -entropia total_loss = policy_loss + value_loss + 0.01*entropy_loss total_loss.backward(retain_graph = True) optimizer.step() reward_means.append(rewards.numpy().mean()) old_network.load_state_dict(nn_paramD) nn_paramD = network.state_dict() steps += 1 plot_rewards(reward_means)