I am implementing a simple policy gradient algorithm, but the model does not seem to be learning anything: its actions are still almost random even after about 2000 episodes on CartPole-v1. I have stepped through the code to make sure I am storing the action log probabilities and episode rewards correctly, and that does not seem to be the problem. Is there something I'm missing?
import torch
import torch.autograd as autograd
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import categorical
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter


class PGN(nn.Module):
    def __init__(self, obs_size, n_actions):
        super(PGN, self).__init__()
        self.fc1 = nn.Linear(obs_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, n_actions)

    def forward(self, obs):
        x = F.relu(self.fc1(obs))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def train():
    # hyperparameters
    lr = 0.001
    GAMMA = 0.99

    # make environment
    env = gym.make('CartPole-v1')
    obs = env.reset()
    done = False

    net = PGN(len(obs), env.action_space.n)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    action_list = []
    reward_list = []
    count = 0              # count number of episodes
    num_episodes = 2000    # max number of episodes
    max_r = 0              # for visualization purposes

    while True:
        output = net(torch.from_numpy(obs).float())
        action_prob = torch.distributions.Categorical(F.softmax(output, dim=-1))
        action = action_prob.sample()
        obs, reward, done, info = env.step(action.item())
        # env.render()
        max_r += reward
        action_list.append(action_prob.log_prob(action))
        reward_list.append(reward)

        if done:
            print(max_r)  # print episode reward at end of episode
            max_r = 0
            obs = env.reset()

            # discount rewards; to do this in linear time, we traverse backward
            total_reward = 0
            reward_list_calc = []
            for i in reversed(reward_list):
                total_reward *= GAMMA
                total_reward += i
                reward_list_calc.append(total_reward)
            reward_list = list(reversed(reward_list_calc))

            # apparently z-scores are used to normalize data; we want to reduce variance here
            reward_list = np.asarray(reward_list)
            mean = np.mean(reward_list)
            std = np.std(reward_list) if np.std(reward_list) > 0 else 1
            reward_list = (reward_list - mean) / std

            action_list = torch.tensor(action_list, requires_grad=True)
            reward_list = torch.tensor(reward_list, requires_grad=True)
            loss = -reward_list * action_list
            loss = torch.sum(loss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            reward_list = []
            action_list = []
            count += 1  # number of episodes
            if count == num_episodes:
                print("done")
                break


train()
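For reference, here is the discounting and normalization step in isolation (a minimal standalone sketch of the same loop that runs inside train(), with a toy reward list I made up), which behaves as I expect from G_t = r_t + GAMMA * G_{t+1}:

import numpy as np

rewards = [1.0, 1.0, 1.0]  # toy episode, just for checking
GAMMA = 0.99

returns = []
total = 0.0
for r in reversed(rewards):   # same backward traversal as in train()
    total = total * GAMMA + r  # G_t = r_t + GAMMA * G_{t+1}
    returns.append(total)
returns.reverse()
print(returns)  # -> approximately [2.9701, 1.99, 1.0]

norm = (np.asarray(returns) - np.mean(returns)) / np.std(returns)
print(norm)  # zero mean, unit std

So the discounted returns and the z-score normalization both look correct on their own; whatever is wrong seems to be elsewhere.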