I’m trying to implement a simple DQN, and I want to confirm that I’ve understood it correctly: is it fine to apply the loss function only to the difference between the (scalar) target and a single element of the network’s output, something like this?
def fn_reinforce(self, batch):
    """Run one DQN optimization step over a batch of transitions.

    Each element of ``batch`` is a transition with fields
    ``(state, action, reward, next_state)``; ``next_state is None`` marks a
    terminal transition. Reads ``self.batch_size``, ``self.policy_net``,
    ``self.target_net``, ``self.loss_fn`` and ``self.optimizer``; updates
    ``self.policy_net`` in place via one optimizer step. Returns ``None``.

    Note: applying the loss to a single indexed element of the network
    output is correct — autograd propagates gradients only through the
    selected element, which is exactly the standard DQN update.
    """
    losses = []
    for i in range(self.batch_size):
        transition = batch[i]
        # loss_fn (e.g. MSELoss) requires tensor targets; a raw Python
        # float reward would raise, so normalize it here.
        reward = torch.as_tensor(transition.reward, dtype=torch.float32)
        if transition.next_state is None:
            # Terminal transition: the target is just the observed reward.
            q_target = reward
        else:
            # Bootstrap from the frozen target network; no_grad ensures no
            # gradients flow through the target (standard DQN).
            # NOTE(review): no discount factor is applied here — this
            # assumes gamma == 1; confirm that is intended.
            with torch.no_grad():
                next_q = torch.max(self.target_net(transition.next_state))
            q_target = reward + next_q
        # Q-value predicted for the action actually taken (single element
        # of the [1, n_actions] output).
        q_predict = self.policy_net(transition.state)[0, transition.action]
        losses.append(self.loss_fn(q_predict, q_target))

    if not losses:
        return  # empty batch: nothing to optimize

    # BUG FIX: the original overwrote `loss` on every iteration and called
    # backward() once after the loop, so only the LAST transition was ever
    # trained on. Average the per-sample losses and take one step instead.
    loss = torch.stack(losses).mean()
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        # Clip gradients element-wise to [-1, 1]; skip params that did not
        # receive a gradient. (Avoids the deprecated `.data` idiom.)
        if param.grad is not None:
            param.grad.clamp_(-1, 1)
    self.optimizer.step()