Hi,
I am implementing a simple policy gradient algorithm, but the model does not seem to be learning anything: its actions are still almost random even after about 2000 episodes on CartPole-v1. I have stepped through the code to make sure I am storing the action log probabilities and episode rewards correctly, and that does not seem to be the problem. Is there something I'm missing?
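For reference, this is roughly the kind of per-step check I did while stepping through (toy logits standing in for the network output, just to show what I inspected):

import torch
import torch.nn.functional as F
from torch.distributions import Categorical

logits = torch.tensor([0.2, -0.1])  # stand-in for net(obs)
dist = Categorical(F.softmax(logits, dim=-1))
action = dist.sample()
print(action.item(), dist.log_prob(action))  # I store one such log-prob per step

The full code: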
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import gym
import numpy as np
class PGN(nn.Module):
    def __init__(self, obs_size, n_actions):
        super(PGN, self).__init__()
        self.fc1 = nn.Linear(obs_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, n_actions)

    def forward(self, obs):
        x = F.relu(self.fc1(obs))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
def train():
    # hyperparameters
    lr = 0.001
    GAMMA = 0.99

    # make environment
    env = gym.make('CartPole-v1')
    obs = env.reset()
    done = False

    net = PGN(len(obs), env.action_space.n)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    action_list = []  # log-probs of the actions taken this episode
    reward_list = []  # raw per-step rewards of this episode
    count = 0  # number of completed episodes
    num_episodes = 2000  # max number of episodes
    episode_reward = 0  # running reward of the current episode, printed at episode end
    while True:
        output = net(torch.from_numpy(obs).float())
        action_prob = Categorical(F.softmax(output, dim=-1))
        action = action_prob.sample()
        obs, reward, done, info = env.step(action.item())
        # env.render()
        episode_reward += reward
        action_list.append(action_prob.log_prob(action))
        reward_list.append(reward)
        if done:
            print(episode_reward)  # print total episode reward at the end of each episode
            episode_reward = 0
            obs = env.reset()

            total_reward = 0
            reward_list_calc = []
            # discount rewards; traversing backward makes this linear time
            for i in reversed(reward_list):
                total_reward *= GAMMA
                total_reward += i
                reward_list_calc.append(total_reward)
            reward_list = list(reversed(reward_list_calc))
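            # e.g. with GAMMA = 0.5 and rewards [1, 1, 1] this yields
            # [1.75, 1.5, 1.0]: each entry is the discounted return
            # G_t = r_t + GAMMA * G_{t+1} from that step onward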
            # normalize the returns to z-scores to reduce the variance of the gradient estimate
            reward_list = np.asarray(reward_list)
            mean = np.mean(reward_list)
            std = np.std(reward_list) if np.std(reward_list) > 0 else 1
            reward_list = (reward_list - mean) / std
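            # e.g. the returns [1.75, 1.5, 1.0] from above become roughly
            # [1.07, 0.27, -1.34] after standardization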
            action_list = torch.tensor(action_list, requires_grad=True)
            reward_list = torch.tensor(reward_list, requires_grad=True)
            loss = -reward_list * action_list
            loss = torch.sum(loss)
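            # intended objective: loss = -sum_t G_t * log pi(a_t | s_t),
            # i.e. the standard REINFORCE loss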
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            reward_list = []
            action_list = []
            count += 1  # one more episode finished
            if count == num_episodes:
                print("done")
                break
train()
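One thing I have not been able to rule out is whether gradients actually flow from the loss back into the network. A minimal check along these lines should tell me that (a sketch reusing the PGN class from above; 4 and 2 are CartPole-v1's observation and action sizes):

import torch
from torch.distributions import Categorical

net = PGN(4, 2)  # CartPole-v1: 4 observation dims, 2 actions
dist = Categorical(logits=net(torch.zeros(4)))
action = dist.sample()
loss = -dist.log_prob(action)
loss.backward()
print(net.fc1.weight.grad.abs().sum())  # nonzero means grads reach the net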