I am following the REINFORCE implementation on CartPole from here:
#!/usr/bin/env python3
import gym
import ptan
import numpy as np
from tensorboardX import SummaryWriter
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
GAMMA = 0.99
LEARNING_RATE = 0.01
EPISODES_TO_TRAIN = 4

class PGN(nn.Module):
    def __init__(self, input_size, n_actions):
        super(PGN, self).__init__()
This file has been truncated.
The network does a very good job of solving CartPole, but when I change line 98 from
log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]
to
log_prob_actions_v = batch_qvals_v * log_prob_v[:, batch_actions_t]
the code still runs, but mysteriously the algorithm does not converge at all. Why?
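For context, the two indexing forms do not even produce the same shape (a quick standalone check, not from the book's code):

import torch

log_prob_v = torch.randn(5, 2)                 # 5 steps, 2 actions
actions = torch.tensor([0, 1, 1, 0, 1])
print(log_prob_v[range(5), actions].shape)     # torch.Size([5]): one log-prob per step
print(log_prob_v[:, actions].shape)            # torch.Size([5, 5]): each listed column, for every row

So the second version scales an N x N matrix of log-probabilities by the returns instead of picking one entry per visited state.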
Also, can somebody show me how to implement this without using ptan? I am not getting how the library works; it would help a lot.
Thanks in advance!
Tim2 (Tim Kellermann) | April 13, 2020, 12:25pm | #2
I modified the code like this so it does not use ptan, but I have not gotten it to converge yet (I am new to reinforcement learning, PyTorch, and Python in general):
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
GAMMA = 0.99
LEARNING_RATE = 0.01
EPISODES_PER_BATCH = 4

class PGN(nn.Module):
    def __init__(self, input_size, n_actions):
        super(PGN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
        )

    def forward(self, x):
        return self.net(x)

def calc_qvals(rewards):
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r *= GAMMA
        sum_r += r
        res.append(sum_r)
    return list(reversed(res))
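
# Example: with GAMMA = 0.99, calc_qvals([1.0, 1.0, 1.0]) returns
# [2.9701, 1.99, 1.0] -- the discounted return-to-go of each step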

def play_episode(net):
    states = []
    actions = []
    rewards = []
    obs = env.reset()
    while True:
        state = np.asarray(obs, dtype=np.float32)
        action = torch.argmax(F.softmax(net(torch.from_numpy(state).view(1, -1)), dim=1)).item()
        obs, reward, done, _ = env.step(action)
        # record the transition before checking for termination, so the
        # episode's final step is not dropped
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        if done:
            return states, actions, rewards
env = gym.make("CartPole-v0")
net = PGN(env.observation_space.shape[0], env.action_space.n)
print(net)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
batch_states, batch_actions, batch_qvals = [], [], []
mean_reward = 0
batch_episodes_played = 0
episodes_played = 0
while True:
    states, actions, rewards = play_episode(net)
    batch_states.extend(states)
    batch_actions.extend(actions)
    batch_qvals.extend(calc_qvals(rewards))
    mean_reward += sum(rewards)
    batch_episodes_played += 1
    if batch_episodes_played < EPISODES_PER_BATCH:
        continue
    mean_reward /= EPISODES_PER_BATCH
    episodes_played += EPISODES_PER_BATCH
    print(episodes_played, "episodes, mean reward:", mean_reward)
    mean_reward = 0

    optimizer.zero_grad()
    torch_batch_states = torch.FloatTensor(batch_states)
    torch_batch_qvals = torch.FloatTensor(batch_qvals)
    torch_batch_actions = torch.LongTensor(batch_actions)
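    # REINFORCE loss: weight each taken action's log-probability by the
    # discounted return from its state, then minimize the negative mean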
    log_prob_v = F.log_softmax(net(torch_batch_states), dim=1)
    log_prob_actions_v = torch_batch_qvals * log_prob_v[range(len(batch_states)), torch_batch_actions]
    loss_v = -log_prob_actions_v.mean()
    loss_v.backward()
    optimizer.step()

    batch_episodes_played = 0
    batch_states.clear()
    batch_actions.clear()
    batch_qvals.clear()
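
As a side note, this loop never terminates on its own. A rough stopping check (my own addition; CartPole-v0 counts as solved at an average reward of 195) could go right after the print and before mean_reward is reset:

    if mean_reward > 195:
        print("Solved!")
        break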
Tim2 (Tim Kellermann) | April 14, 2020, 2:43pm | #3
I now found out why my program was not converging: I used argmax instead of sampling from the probability distribution produced by the network (in play_episode). With argmax the policy is deterministic, so the agent never explores and the gradient carries no useful signal. This change should fix that problem (note the added Categorical import):
from torch.distributions import Categorical

def play_episode(net):
    states, actions, rewards = [], [], []
    obs = env.reset()
    while True:
        state = np.asarray(obs, dtype=np.float32)
        # sample from the policy's distribution instead of taking the argmax
        action = Categorical(F.softmax(net(torch.from_numpy(state).view(1, -1)), dim=1)).sample().item()
        obs, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        if done:
            return states, actions, rewards
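
Side note: torch.distributions.Categorical can also be constructed from raw logits, which skips the explicit softmax and is a bit more numerically stable:

action = Categorical(logits=net(torch.from_numpy(state).view(1, -1))).sample().item()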