Issue with REINFORCE implementation

Hi all,
I’ve been playing around with the REINFORCE algorithm and decided to modify the example found here to include a replay memory buffer. For some reason, with those changes the network no longer learns well, even though the memory buffer is used in exactly the same way as the original lists holding the actions and log probs (as currently implemented below). I can’t for the life of me figure out what’s causing this, though. It’s driving me insane. Full code below. Please help!

> import argparse
> import gym
> import numpy as np
> from itertools import count
> 
> import torch
> import torch.nn as nn
> import torch.nn.functional as F
> import torch.optim as optim
> from torch.autograd import Variable
> from torch.distributions import Categorical
> 
> import random
> from collections import namedtuple
> 
> parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
> 
> parser.add_argument('--gamma', type=float, default=0.99, metavar='G', help='discount factor (default: 0.99)')
> parser.add_argument('--seed', type=int, default=543, metavar='N', help='random seed (default: 543)')
> parser.add_argument('--render', action='store_true', help='render the environment')
> parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='interval between training status logs (default: 10)')
> parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
> parser.add_argument('--memory-capacity', type=int, default=None, metavar='mc', help='batch replay memory capacity')
> parser.add_argument('--batch-size', type=int, default=64, metavar='b', help='batchsize for update function')
> 
> args = parser.parse_args()
> 
> args.cuda = not args.no_cuda and torch.cuda.is_available()
> torch.manual_seed(args.seed)
> if args.cuda:
>     torch.cuda.manual_seed(args.seed)
> 
> env = gym.make('CartPole-v0')
> env.seed(args.seed)
> torch.manual_seed(args.seed)
> 
> def discounted_reward(reward, gamma=0.99):
>     reward = reward[::-1]
>     disc_reward = []
>     cumulative = 0
>     for r in reward:
>         val = cumulative*gamma + r
>         disc_reward.insert(0, val)
>         cumulative = val
>     return disc_reward
> 
> 
> Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'log_prob'))
> class ReplayMemory(object):
>     def __init__(self, capacity=None):
>         self.capacity = capacity
>         self.memory = []
>         self.position = 0
> 
>     def push(self, *args):
>         if self.capacity is not None:
>             if len(self.memory) < self.capacity:
>                 self.memory.append(None)
>             self.memory[self.position] = Transition(*args)
>             self.position = (self.position + 1) % self.capacity
>         else:
>             self.memory.append(Transition(*args))
> 
>     def sample(self, batch_size):
>         return random.sample(self.memory, batch_size)
> 
>     def pull(self):
>         return self.memory
> 
>     def __len__(self):
>         return len(self.memory)
> 
>     def clear(self):
>         self.memory = []
>         self.position = 0
> 
> 
> class Policy(nn.Module):
>     def __init__(self, input_dim, hidden_dim, output_dim, capacity, use_gpu=True):
>         super(Policy, self).__init__()
>         self.input_dim = input_dim
>         self.hidden_dim = hidden_dim
>         self.output_dim = output_dim
>         self.use_gpu = use_gpu
> 
>         self.affine1 = nn.Linear(input_dim, hidden_dim)
>         self.affine2 = nn.Linear(hidden_dim, output_dim)
> 
>         self.memory = ReplayMemory(capacity)
> 
>         self.state = []
>         self.action = []
>         self.next_state = []
>         self.reward = []
>         self.log_prob = []
> 
>         if use_gpu:
>             self.Tensor = torch.cuda.FloatTensor
>         else:
>             self.Tensor = torch.FloatTensor
> 
>     def forward(self, x):
>         x = F.relu(self.affine1(x))
>         action_scores = self.affine2(x)
>         return F.softmax(action_scores, dim=1)
> 
>     def select_action(self, state):
>         probs = self.forward(state)
>         m = Categorical(probs)
>         action = m.sample()
>         log_prob = m.log_prob(action)
>         return action.data[0], log_prob.data[0]
> 
>     def update(self, batch, optimizer):
>         rewards = torch.cat(batch.reward)
>         log_probs = torch.cat(batch.log_prob)
>         rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
>         optimizer.zero_grad()
>         loss = (-log_probs*rewards).sum()
>         loss = Variable(torch.Tensor([loss]), requires_grad=True)
>         loss.backward()
>         optimizer.step()
> 
> def main(args):
> 
>     policy = Policy(4, 128, 2, args.memory_capacity, args.cuda)
>     
>     if args.cuda:
>         policy.cuda()
> 
>     optimizer = optim.Adam(policy.parameters(), lr=1e-2)
>     running_reward = 10
>     for ep in count(1):
>         
>         policy.state = []
>         policy.action = []
>         policy.next_state = []
>         policy.reward = []
>         policy.log_prob = []
> 
>         state = env.reset()
>         state = policy.Tensor([state])
>         
>         for t in range(10000):
>             action, log_prob = policy.select_action(Variable(state))
>             next_state, reward, done, _ = env.step(action)
>             
>             policy.state.append(state)
>             policy.action.append(policy.Tensor([action]))
>             policy.next_state.append(policy.Tensor([next_state]))
>             policy.reward.append(policy.Tensor([reward]))
>             policy.log_prob.append(policy.Tensor([log_prob]))
>             
>             if done:
>                 break
> 
>             state = policy.Tensor([next_state])
> 
>         running_reward = running_reward*0.99+t*0.01
>         disc_reward = discounted_reward(policy.reward)
>         
>         for s, a, ns, r, lp in zip(policy.state, policy.action, policy.next_state, disc_reward, policy.log_prob):
>             policy.memory.push(s, a, ns, r, lp)
>         
>         if args.memory_capacity is not None:
>             transitions = policy.memory.sample(args.batch_size)
>         else:
>             transitions = policy.memory.pull()
>         batch = Transition(*zip(*transitions))
>         policy.update(batch, optimizer)
>         
>         if args.memory_capacity is None:
>             policy.memory.clear()
>         
>         if ep % args.log_interval == 0:
>             print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(ep, t, running_reward))
>         if running_reward > env.spec.reward_threshold:
>             print("Solved! Running reward is now {} and the last episode runs to {} time steps!".format(running_reward, t))
>             break
> 
> 
> if __name__ == '__main__':
>     main(args)

I haven’t looked through all of your code, but I’d recommend splitting your main() function into some smaller helper functions so it’s easier to debug :).
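For example, something like this, which is just the body of your main() cut into two helpers (run_episode and finish_episode are names I made up; the code itself is otherwise unchanged from your listing):

> def run_episode(policy, env, max_steps=10000):
>     # reset the per-episode buffers, then roll out one episode,
>     # appending each transition to the policy's lists
>     policy.state, policy.action, policy.next_state = [], [], []
>     policy.reward, policy.log_prob = [], []
>     state = policy.Tensor([env.reset()])
>     for t in range(max_steps):
>         action, log_prob = policy.select_action(Variable(state))
>         next_state, reward, done, _ = env.step(action)
>         policy.state.append(state)
>         policy.action.append(policy.Tensor([action]))
>         policy.next_state.append(policy.Tensor([next_state]))
>         policy.reward.append(policy.Tensor([reward]))
>         policy.log_prob.append(policy.Tensor([log_prob]))
>         if done:
>             break
>         state = policy.Tensor([next_state])
>     return t
> 
> def finish_episode(policy, optimizer, args):
>     # discount the episode's rewards, push everything into memory,
>     # pick the transitions to train on, and run one policy update
>     disc_reward = discounted_reward(policy.reward)
>     for s, a, ns, r, lp in zip(policy.state, policy.action, policy.next_state, disc_reward, policy.log_prob):
>         policy.memory.push(s, a, ns, r, lp)
>     if args.memory_capacity is not None:
>         transitions = policy.memory.sample(args.batch_size)
>     else:
>         transitions = policy.memory.pull()
>     policy.update(Transition(*zip(*transitions)), optimizer)
>     if args.memory_capacity is None:
>         policy.memory.clear()

main() then shrinks to the episode loop plus the running-reward bookkeeping and logging, which makes it much easier to poke at each stage on its own.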

I eventually figured it out. Originally, select_action was returning the raw FloatTensor contents (action.data[0] and log_prob.data[0]), so I was only storing the values rather than the Variables that carry the graph back to the network. When backpropagating, I wrapped the stored loss in a fresh Variable and called backward(), so no gradient ever reached the policy parameters; the reward would increase, but nowhere near what it should have. I fixed it by returning and storing the log-prob Variable itself (rather than its contents) and building the loss directly from those, and it worked as expected.
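In case anyone else runs into this, here is roughly what the fix looks like against the code above (still the old Variable-style API; treat it as a sketch rather than a tested drop-in):

> # in Policy.select_action: hand back the log-prob Variable itself,
> # not log_prob.data[0], so the graph to the parameters survives
> def select_action(self, state):
>     probs = self.forward(state)
>     m = Categorical(probs)
>     action = m.sample()
>     return action.data[0], m.log_prob(action)
> 
> # in Policy.update: the stored log probs are now Variables, so the
> # loss already carries gradients; no re-wrapping in a new Variable
> def update(self, batch, optimizer):
>     rewards = torch.cat(batch.reward)
>     rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
>     log_probs = torch.cat(batch.log_prob)
>     loss = (-log_probs * Variable(rewards)).sum()
>     optimizer.zero_grad()
>     loss.backward()
>     optimizer.step()

The matching change in main() is to store the returned Variable directly, i.e. policy.log_prob.append(log_prob) instead of policy.log_prob.append(policy.Tensor([log_prob])).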