Given an input, I want to predict the correct output using policy gradients. The input is a number between 0 and 9, and the output is also a number between 0 and 9. I defined the reward as -abs(input - output) — effectively a negative loss — and using gradient ascent I expected the predicted outputs to converge to the inputs, but unfortunately they don’t.

I’m basing my code on the PyTorch REINFORCE example implementation:

```
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
from torch.autograd import Variable
import numpy as np
class Policy(nn.Module):
    """Two-layer MLP mapping a state vector to a categorical action distribution.

    Args:
        hidden_size: width of the hidden layer.
        num_inputs: dimensionality of the input state.
        action_space: number of discrete actions (output units).
    """

    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, action_space)

    def forward(self, inputs):
        """Return action probabilities of shape (batch, action_space)."""
        hidden = F.relu(self.linear1(inputs))
        action_scores = self.linear2(hidden)
        # softmax needs an explicit dim: the dimension-less form is deprecated
        # and may normalize over the wrong axis for batched input.
        return F.softmax(action_scores, dim=-1)
class REINFORCE:
    """Vanilla policy-gradient (REINFORCE) agent with an entropy bonus.

    Buffers (reward, log_prob, entropy) triples via ``add_experience`` and
    performs one gradient-ascent step on the discounted return when
    ``update_parameters`` is called, then clears the buffers.
    """

    def __init__(self, hidden_size, num_inputs, action_space, gamma=0.99):
        torch.manual_seed(0)
        np.random.seed(0)
        self.action_space = action_space
        # Use the GPU when one exists; the original hard-coded .cuda(),
        # which crashes on CPU-only machines.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()
        self.gamma = gamma
        self.rewards = []
        self.log_probs = []
        self.entropies = []

    def select_action(self, state):
        """Sample an action for ``state`` from the current policy.

        Returns:
            (action, log_prob, entropy) where ``action`` is a LongTensor of
            shape (1,), ``log_prob`` has shape (1, 1) and carries gradients,
            and ``entropy`` is a scalar tensor.
        """
        probs = self.model(state.to(self.device))
        # torch.multinomial requires an explicit num_samples; the old
        # argument-less probs.multinomial() no longer exists. detach() keeps
        # the sampled index out of the autograd graph (the old .data idiom).
        action = torch.multinomial(probs, num_samples=1).detach()
        # Small epsilon guards against log(0) -> -inf when the policy
        # saturates.
        log_prob = torch.log(probs[:, action[0, 0]] + 1e-8).view(1, -1)
        entropy = -(probs * torch.log(probs + 1e-8)).sum()
        return action[0], log_prob, entropy

    def update_parameters(self):
        """One gradient step maximizing the discounted return.

        NOTE(review): the backward loop assumes the buffered steps form ONE
        sequential episode. If each step is an independent one-shot decision
        (as in the demo below), update after every step or use gamma=0 —
        otherwise the discounted return leaks credit between unrelated
        actions.
        """
        ret = 0.0
        loss = 0
        for i in reversed(range(len(self.rewards))):
            # float() accepts both plain numbers and 1-element tensors, and
            # keeps the (constant) return out of the autograd graph.
            ret = self.gamma * ret + float(self.rewards[i])
            loss = loss - self.log_probs[i].sum() * ret \
                   - 0.0001 * self.entropies[i]
        loss = loss / len(self.rewards)
        self.optimizer.zero_grad()
        loss.backward()
        # clip_grad_norm was renamed clip_grad_norm_ (in-place) in PyTorch 0.4.
        utils.clip_grad_norm_(self.model.parameters(), 40)
        self.optimizer.step()
        self.rewards = []
        self.log_probs = []
        self.entropies = []

    def add_experience(self, reward, log_prob, entropy):
        """Buffer one step's reward, action log-probability, and entropy."""
        self.rewards.append(reward)
        self.log_probs.append(log_prob)
        self.entropies.append(entropy)
if __name__ == '__main__':
    # Demo: learn the identity mapping digit -> digit.
    agent = REINFORCE(128, 1, 10)
    episode = 0
    while True:
        total_reward = 0.0
        for digit in range(10):  # avoid shadowing the builtin `input`
            state = torch.FloatTensor([digit]).unsqueeze(0)
            action, log_prob, entropy = agent.select_action(state)
            reward = -abs(float(action.item()) - digit)
            total_reward += reward
            agent.add_experience(reward, log_prob, entropy)
            # Each digit is an INDEPENDENT one-step decision, not part of a
            # sequential episode: updating once per step keeps the return
            # from discounting rewards across unrelated inputs (the original
            # batched all 10 steps into one discounted episode, which
            # mis-assigns credit and is why training never converged).
            agent.update_parameters()
        episode += 1
        if episode % 100 == 0:
            print(f'episode {episode}: total reward {total_reward:.1f}')
```

What am I doing wrong?

Thanks