REINFORCE does not converge when I make this change

I am following the REINFORCE implementation on cartpole from here:


The network does a very good job of solving CartPole, but when I change line 98 from

log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]

to

log_prob_actions_v = batch_qvals_v * log_prob_v[:, batch_actions_t]

the code still runs, but mysteriously the algorithm does not converge at all. Why is that?
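For reference, this is roughly what the two indexing forms select (a small sketch, assuming log_prob_v has shape [batch_size, n_actions] and the actions tensor has length batch_size); the original line picks one log-probability per transition, while the changed line picks whole columns:

import torch

log_prob_v = torch.log_softmax(torch.randn(3, 2), dim=1)  # [3, 2]: 3 transitions, 2 actions
actions = torch.tensor([0, 1, 1])

print(log_prob_v[range(3), actions].shape)  # torch.Size([3])    -> one log-prob per transition
print(log_prob_v[:, actions].shape)         # torch.Size([3, 3]) -> a full column per action index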
Also, can somebody show me how to implement this without using ptan? I don't really understand how the library works, and it would help a lot.
Thanks in advance!

I modified the code like this so it does not use ptan, but I have not gotten it to converge yet (I am new to reinforcement learning, PyTorch, and Python in general):

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

GAMMA = 0.99             # discount factor for future rewards
LEARNING_RATE = 0.01     # Adam learning rate
EPISODES_PER_BATCH = 4   # complete episodes collected before each gradient update

class PGN(nn.Module):
    def __init__(self, input_size, n_actions):
        super(PGN, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
        )

    def forward(self, x):
        # The network returns raw action logits; softmax / log_softmax is applied by the caller
        return self.net(x)


def calc_qvals(rewards):
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r *= GAMMA
        sum_r += r
        res.append(sum_r)
    return list(reversed(res))
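
# Quick sanity check for calc_qvals (with GAMMA = 0.99):
#   calc_qvals([1.0, 1.0, 1.0]) -> [2.9701, 1.99, 1.0]
# i.e. res[t] = r_t + GAMMA * res[t+1], the discounted return from step t onward.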

def play_episode(net):
    states = []
    actions = []
    rewards = []
    obs = env.reset()
    done = False
    while True:
        state = np.asarray(obs, dtype=np.float32)
        # Pick the action greedily (argmax over the softmax probabilities)
        action = torch.argmax(F.softmax(net(torch.from_numpy(state).view(1,-1)), dim=1)).item()
        obs, reward, done, _ = env.step(action)
        if done:
            return states, actions, rewards
        states.append(state)
        actions.append(action)
        rewards.append(reward)


env = gym.make("CartPole-v0")

net = PGN(env.observation_space.shape[0], env.action_space.n)
print(net)

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

batch_states, batch_actions, batch_qvals = [], [], []

mean_reward = 0

batch_episodes_played = 0
episodes_played = 0
while True:
    states, actions, rewards = play_episode(net)
    batch_states.extend(states)
    batch_actions.extend(actions)
    batch_qvals.extend(calc_qvals(rewards))
    mean_reward += sum(rewards)
    batch_episodes_played += 1

    if batch_episodes_played < EPISODES_PER_BATCH:
        continue

    mean_reward /= EPISODES_PER_BATCH

    episodes_played += EPISODES_PER_BATCH
    print(episodes_played, " episodes, mean reward: ", mean_reward)
    mean_reward = 0

    optimizer.zero_grad()

    torch_batch_states = torch.FloatTensor(batch_states)
    torch_batch_qvals = torch.FloatTensor(batch_qvals)
    torch_batch_actions = torch.LongTensor(batch_actions)

    log_prob_v = F.log_softmax(net(torch_batch_states), dim=1)

    # Select the log-probability of the action actually taken in each transition
    # and weight it by that transition's discounted return (the REINFORCE objective)
    log_prob_actions_v = torch_batch_qvals * log_prob_v[range(len(batch_states)), torch_batch_actions]
    loss_v = -log_prob_actions_v.mean()

    loss_v.backward()
    optimizer.step()

    batch_episodes_played = 0
    batch_states.clear()
    batch_actions.clear()
    batch_qvals.clear()

I have now found why my program was not converging: I used argmax instead of sampling from the probability distribution produced by the network (in play_episode), so the agent never explored and the log-probability weighting no longer matched how the actions were actually chosen. This change should fix that problem:

from torch.distributions import Categorical

def play_episode(net):
    states, actions, rewards = [], [], []
    obs = env.reset()
    done = False
    while True:
        state = np.asarray(obs, dtype=np.float32)
        # Sample the action from the policy distribution instead of taking the argmax
        action = Categorical(F.softmax(net(torch.from_numpy(state).view(1,-1)), dim=1)).sample().item()
        obs, reward, done, _ = env.step(action)
        if done:
            return states, actions, rewards
        states.append(state)
        actions.append(action)
        rewards.append(reward)
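
For what it's worth, Categorical can also be constructed from logits directly, so the sampling line could be written a bit more simply. This is just an equivalent variant of the same fix, assuming the same net and state as above:

from torch.distributions import Categorical

with torch.no_grad():  # no gradients are needed while collecting experience
    logits = net(torch.from_numpy(state).view(1, -1))
    action = Categorical(logits=logits).sample().item()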