# REINFORCE does not converge when I make this change

I am following the REINFORCE implementation on CartPole from here:

The network does a very good job of solving CartPole, but when I change line 98 from

``````
log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]
``````

to

``````
log_prob_actions_v = batch_qvals_v * log_prob_v[:, batch_actions_t]
``````

the code works, but mysteriously the algorithm does not converge at all… why?
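To make the difference concrete, here is a small standalone snippet (dummy tensors, not from the tutorial) comparing what the two indexing forms return:

``````
import torch

# Pretend log-probabilities for a batch of 4 states and 2 actions.
log_prob_v = torch.randn(4, 2)
batch_actions_t = torch.tensor([0, 1, 1, 0])

# Pairing a row range with the action indices picks one entry per state:
# the log-probability of the action actually taken, shape [4].
print(log_prob_v[range(4), batch_actions_t].shape)  # torch.Size([4])

# A bare slice keeps every row and indexes the columns once per entry
# of batch_actions_t, producing a [4, 4] matrix instead.
print(log_prob_v[:, batch_actions_t].shape)  # torch.Size([4, 4])
``````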
Also, can somebody show me how to implement this without using ptan? I am not getting how the library works… it would help a lot…

I modified the code like this so it does not use ptan, but I have not gotten it to converge yet (I am new to reinforcement learning, PyTorch, and Python in general):

``````
import gym
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

GAMMA = 0.99
LEARNING_RATE = 0.01
EPISODES_PER_BATCH = 4


class PGN(nn.Module):
    def __init__(self, input_size, n_actions):
        super(PGN, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
        )

    def forward(self, x):
        return self.net(x)


def calc_qvals(rewards):
    # Walk the rewards backwards, accumulating the discounted return
    # for every step of the episode.
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r *= GAMMA
        sum_r += r
        res.append(sum_r)
    return list(reversed(res))


def play_episode(net):
    states, actions, rewards = [], [], []
    obs = env.reset()
    while True:
        state = np.asarray(obs, dtype=np.float32)
        # The action is picked greedily with argmax here; this turned out
        # to be the convergence problem (see the fix at the end of the post).
        with torch.no_grad():
            probs = F.softmax(net(torch.from_numpy(state).view(1, -1)), dim=1)
        action = torch.argmax(probs).item()
        # Record the transition before checking done, so the final step
        # of the episode is not thrown away.
        states.append(state)
        actions.append(action)
        obs, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
            return states, actions, rewards


env = gym.make("CartPole-v0")

# nn.Linear expects an int, not the whole observation_space.shape tuple.
net = PGN(env.observation_space.shape[0], env.action_space.n)
print(net)

# The snippet called optimizer.step() without ever creating an optimizer;
# Adam is assumed here.
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

batch_states, batch_actions, batch_qvals = [], [], []

mean_reward = 0

batch_episodes_played = 0
episodes_played = 0
while True:
    states, actions, rewards = play_episode(net)
    batch_states.extend(states)
    batch_actions.extend(actions)
    batch_qvals.extend(calc_qvals(rewards))
    mean_reward += sum(rewards)
    batch_episodes_played += 1

    if batch_episodes_played < EPISODES_PER_BATCH:
        continue

    mean_reward /= EPISODES_PER_BATCH

    episodes_played += EPISODES_PER_BATCH
    print(episodes_played, " episodes, mean reward: ", mean_reward)
    mean_reward = 0

    torch_batch_states = torch.FloatTensor(batch_states)
    torch_batch_qvals = torch.FloatTensor(batch_qvals)
    torch_batch_actions = torch.LongTensor(batch_actions)

    log_prob_v = F.log_softmax(net(torch_batch_states), dim=1)

    # Pick the log-probability of the action actually taken in each state
    # and scale it by that step's discounted return.
    log_prob_actions_v = torch_batch_qvals * log_prob_v[range(len(batch_states)), torch_batch_actions]
    loss_v = -log_prob_actions_v.mean()

    optimizer.zero_grad()
    loss_v.backward()
    optimizer.step()

    batch_episodes_played = 0
    batch_states.clear()
    batch_actions.clear()
    batch_qvals.clear()
``````
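As a quick sanity check on calc_qvals: with GAMMA = 0.99, a three-step episode with rewards [1.0, 1.0, 1.0] should give the discounted returns below (my numbers, not from the tutorial):

``````
print(calc_qvals([1.0, 1.0, 1.0]))
# roughly [2.9701, 1.99, 1.0]: each entry is r_t + GAMMA * q_{t+1}
``````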

I now found the problem why my program was not converging: I used argmax instead of sampling from the probability distribution produced by the network (in the play_episode function). This change should fix that problem:

``````
from torch.distributions import Categorical

def play_episode(net):
    states, actions, rewards = [], [], []
    obs = env.reset()
    while True:
        state = np.asarray(obs, dtype=np.float32)
        # Sample from the policy's action distribution instead of taking
        # the argmax, so the agent actually explores.
        with torch.no_grad():
            probs = F.softmax(net(torch.from_numpy(state).view(1, -1)), dim=1)
        action = Categorical(probs).sample().item()
        states.append(state)
        actions.append(action)
        obs, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
            return states, actions, rewards
``````
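As a side note, Categorical can also take raw logits, which skips the explicit softmax; a minimal sketch, assuming the same net and state as above:

``````
with torch.no_grad():
    logits = net(torch.from_numpy(state).view(1, -1))
# Categorical(logits=...) applies the softmax internally.
action = Categorical(logits=logits).sample().item()
``````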