Hi,
I want to know why [torch.ones(1)] should be the first gradient passed to autograd.backward in this example: https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py#L77-L79
final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
gradients = [torch.ones(1)] + [None] * len(saved_actions)
autograd.backward(final_nodes, gradients)
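My understanding so far: autograd.backward(variables, grad_variables) seeds each output with the matching gradient, so the scalar value_loss gets torch.ones(1) (the same seed value_loss.backward() would use implicitly), while the reinforce()'d actions are stochastic nodes that supply their own gradient and therefore take None. A minimal sketch of the differentiable case, assuming the 0.x Variable API used in the example:

import torch
from torch import autograd
from torch.autograd import Variable

x = Variable(torch.randn(3), requires_grad=True)
y = (x * 2).sum()  # shape (1,) under the 0.x API

# seeding the scalar output with ones(1) is exactly what y.backward() does
autograd.backward([y], [torch.ones(1)])
print(x.grad)  # dy/dx = 2 for every element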
I expect that Variable.reinforce in the example is just syntactic sugar for Variable.backward, like this:
from collections import namedtuple

import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable

SavedAction = namedtuple('SavedAction', ['action', 'value', 'logp'])

def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs, state_value = model(Variable(state))
    action = probs.multinomial().detach()  # sample, then cut the stochastic node out of the graph
    logp = probs.index_select(dim=1, index=action.squeeze(1)).log()
    model.saved_actions.append(SavedAction(action, state_value, logp))
    return action.data
def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    rewards = []
    for r in model.rewards[::-1]:  # accumulate discounted returns backwards
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    loss = 0.0
    for (action, value, logp), r in zip(saved_actions, rewards):
        reward = r - value.data[0, 0]  # advantage: return minus value baseline
        loss += logp * -reward         # NOTE: stands in for action.reinforce(reward)
        loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]
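The line loss += logp * -reward is where the sugar would live: d/dθ [ -reward * log p_θ(action) ] = -reward * ∇_θ log p_θ(action), which (if I read the docs right) is the same REINFORCE estimator that action.reinforce(reward) injects at the stochastic node, so backpropagating the surrogate loss should produce identical parameter gradients.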
I confirmed that the two versions produce approximately matching results.
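For reference, here is roughly the kind of check I mean (the helper below is illustrative, not part of either version): run the two finish_episode variants on identically seeded models and compare the resulting gradients.

def max_grad_diff(params_a, params_b):
    # largest absolute difference between corresponding parameter
    # gradients of two models after one backward pass each
    return max((pa.grad.data - pb.grad.data).abs().max()
               for pa, pb in zip(params_a, params_b))

# e.g. print(max_grad_diff(model_a.parameters(), model_b.parameters()))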