Hi,

I am implementing a simple Policy Gradient algorithm, but the model does not seem to be learning anything. Its actions are still almost random even after about 2000 episodes on CartPole-v1. I have stepped through the code to make sure I am storing the action log probabilities and episode rewards correctly, and that does not seem to be the problem. Is there something I'm missing?
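
For reference, the update I'm trying to implement is (as I understand it) the usual REINFORCE loss:

loss = -Σ_t G_t · log π(a_t | s_t)

where G_t is the discounted return from step t, z-score normalized across the episode. The code computes this sum once per episode and then takes a single Adam step.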

```
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import gym
import numpy as np

class PGN(nn.Module):
    def __init__(self, obs_size, n_actions):
        super(PGN, self).__init__()
        self.fc1 = nn.Linear(obs_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, n_actions)

    def forward(self, obs):
        x = F.relu(self.fc1(obs))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train():
    # hyperparameters
    lr = 0.001
    GAMMA = 0.99

    # make environment
    env = gym.make('CartPole-v1')
    obs = env.reset()
    done = False

    net = PGN(len(obs), env.action_space.n)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    action_list = []     # log probs of the actions taken this episode
    reward_list = []     # raw per-step rewards for this episode
    count = 0            # count number of finished episodes
    num_episodes = 2000  # max number of episodes
    episode_r = 0        # running episode reward, for visualization purposes

    while True:
        output = net(torch.from_numpy(obs).float())
        action_prob = Categorical(F.softmax(output, dim=-1))
        action = action_prob.sample()
        obs, reward, done, info = env.step(action.item())
        # env.render()
        episode_r += reward
        action_list.append(action_prob.log_prob(action))
        reward_list.append(reward)

        if done:
            print(episode_r)  # print episode reward at end of episode
            episode_r = 0
            obs = env.reset()

            # discount rewards; to do this in linear time, traverse backward
            total_reward = 0
            reward_list_calc = []
            for i in reversed(reward_list):
                total_reward *= GAMMA
                total_reward += i
                reward_list_calc.append(total_reward)
            reward_list = list(reversed(reward_list_calc))

            # normalize the returns to z-scores to reduce variance
            reward_list = np.asarray(reward_list)
            mean = np.mean(reward_list)
            std = np.std(reward_list) if np.std(reward_list) > 0 else 1
            reward_list = (reward_list - mean) / std

            # policy gradient loss: -sum_t G_t * log pi(a_t | s_t)
            action_list = torch.tensor(action_list, requires_grad=True)
            reward_list = torch.tensor(reward_list, requires_grad=True)
            loss = -reward_list * action_list
            loss = torch.sum(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # reset episode buffers
            reward_list = []
            action_list = []
            count += 1
            if count == num_episodes:
                print("done")
                break


train()
```
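
In case it helps, here is a standalone sketch of just the discount/normalize steps on made-up rewards (three steps of reward 1, GAMMA = 0.99, same loop as above). It produces the values I would expect, so I don't think the return computation is the issue:

```
import numpy as np

GAMMA = 0.99
reward_list = [1.0, 1.0, 1.0]  # toy rewards; CartPole gives +1 per step

# same backward traversal as in train()
total_reward = 0
reward_list_calc = []
for i in reversed(reward_list):
    total_reward *= GAMMA
    total_reward += i
    reward_list_calc.append(total_reward)
returns = list(reversed(reward_list_calc))
print(returns)  # [2.9701, 1.99, 1.0], i.e. [1 + 0.99 + 0.99**2, 1 + 0.99, 1]

# z-score normalization, as in train()
returns = np.asarray(returns)
std = np.std(returns) if np.std(returns) > 0 else 1
print((returns - np.mean(returns)) / std)  # mean ~0, std ~1
```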