I am having trouble training an agent to play Doom through the ViZDoom library, using the game frame as the state and a convolutional neural network as the policy. Below is my implementation of the training loop; I would like to know whether everything is right and whether the gradients are computed correctly.
def _discounted_returns(rewards, gamma):
    """Return the discounted reward-to-go G_t for every step of a trajectory."""
    returns = np.zeros(len(rewards))
    running = 0.0
    # Walk backwards so each G_t reuses G_{t+1}: G_t = r_t + gamma * G_{t+1}.
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def train_policy_gradient_net(env, net, optimizer, num_episodes, gamma, max_trajectory_len, standardize):
    """Train a policy network with the REINFORCE (Monte-Carlo policy gradient) rule.

    One trajectory is sampled per episode and followed by one optimizer step
    on ``-sum_t log pi(a_t | s_t) * G_t``.

    Args:
        env: Environment exposing ``reset() -> state``,
            ``step(action) -> (next_state, reward, done)`` and
            ``game.get_available_buttons_size()``.
        net: Callable mapping a ``[1, 1, *state.shape]`` tensor to
            log-probabilities of shape ``[1, num_actions]``.
        optimizer: Optimizer over the parameters of ``net``.
        num_episodes: Number of episodes (and gradient steps) to run.
        gamma: Discount factor used for the returns.
        max_trajectory_len: Maximum number of steps sampled per episode.
        standardize: If true, returns are shifted to zero mean and scaled to
            unit standard deviation before weighting the log-probabilities.

    Returns:
        A tuple ``(rewards_per_episode, steps_per_episode, loss_per_epoch)``
        of lists holding one entry per episode.
    """
    rewards_per_episode = []
    steps_per_episode = []
    loss_per_epoch = []

    # Run the forward pass on whatever device the network lives on instead of
    # depending on a module-level ``device`` variable; fall back to the CPU
    # for parameter-free callables.
    parameters = net.parameters() if hasattr(net, "parameters") else iter(())
    first_param = next(parameters, None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Guards the standardisation against a zero standard deviation.
    eps = 1e-8

    for _ in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0
        steps = 0
        log_probabilities = []
        rewards = []

        while steps < max_trajectory_len and not done:
            # Add batch and channel axes: state.shape -> [1, 1, *state.shape].
            # NOTE(review): the state is passed with its own dtype; confirm the
            # environment returns the floating-point type the network expects.
            observation = torch.tensor(
                np.expand_dims(np.expand_dims(state, axis=0), axis=0)
            ).to(device)

            # Network output is [1, num_actions]; drop the batch axis.
            log_prob = net(observation).squeeze(0)

            # Sampling happens outside the autograd graph. Renormalise in
            # float64 so the probabilities sum to 1 within NumPy's tolerance.
            prob = torch.exp(log_prob).detach().cpu().numpy().astype(np.float64)
            action = np.random.choice(
                env.game.get_available_buttons_size(), p=prob / np.sum(prob)
            )

            # Keep the graph-attached log-probability of the chosen action.
            log_probabilities.append(log_prob[action])

            next_state, reward, done = env.step(action)
            total_reward += reward
            rewards.append(reward)
            steps += 1
            state = next_state

        rewards_per_episode.append(total_reward)
        steps_per_episode.append(steps)

        if not log_probabilities:
            # Nothing was sampled (e.g. max_trajectory_len <= 0): there is no
            # gradient signal, so skip the update but keep the lists aligned.
            loss_per_epoch.append(0.0)
            continue

        # If the episode was cut off by max_trajectory_len, the returns are
        # truncated at the last sampled step.
        g_vals = _discounted_returns(rewards, gamma)
        if standardize and len(g_vals) > 1:
            # eps keeps constant returns (std == 0) from producing NaNs.
            g_vals = (g_vals - np.mean(g_vals)) / (np.std(g_vals) + eps)

        log_probabilities_tensor = torch.stack(log_probabilities)
        # Match the dtype/device of the log-probabilities so the loss is not
        # silently promoted to float64.
        g_vals = torch.as_tensor(
            g_vals,
            dtype=log_probabilities_tensor.dtype,
            device=log_probabilities_tensor.device,
        )

        # Gradient ascent on sum(log pi * G) == gradient descent on its negative.
        loss = -torch.sum(log_probabilities_tensor * g_vals)
        loss_per_epoch.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return rewards_per_episode, steps_per_episode, loss_per_epoch