Implementation of vanilla policy gradient (reinforce) method

Ategantos · May 14, 2023, 1:38pm

I am having trouble training an agent to play Doom with the ViZDoom library, using the game frame as the state and a convolutional neural network as the policy. Below is my implementation of the training loop; I would like to know whether everything is right and whether the gradients are computed correctly.

def _module_device(net):
    """Return the device of ``net``'s first parameter, or CPU if it has none."""
    parameters = getattr(net, "parameters", None)
    if callable(parameters):
        first = next(iter(parameters()), None)
        if first is not None:
            return first.device
    return torch.device("cpu")


def _discounted_returns(rewards, gamma):
    """Return the discounted reward-to-go ``G_t`` for every step of a trajectory."""
    returns = np.zeros(len(rewards))
    running = 0.0
    # Walk backwards so each G_t reuses G_{t+1}: G_t = r_t + gamma * G_{t+1}.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def train_policy_gradient_net(env, net, optimizer, num_episodes, gamma, max_trajectory_len, standardize):
    """Train ``net`` with the vanilla policy-gradient (REINFORCE) update.

    One trajectory is sampled per episode from the network's stochastic
    policy, and a single optimizer step is taken on the loss
    ``-sum_t log pi(a_t | s_t) * G_t``.

    Args:
        env: Environment exposing ``reset() -> state``,
            ``step(action) -> (next_state, reward, done)`` and
            ``game.get_available_buttons_size()`` (the number of actions).
        net: Callable mapping a state tensor with two leading singleton
            dimensions added to per-action log-probabilities; the output is
            expected to have shape ``[1, num_actions]``.
        optimizer: Optimizer over ``net``'s parameters (``zero_grad``/``step``).
        num_episodes: Number of episodes (and gradient updates) to run.
        gamma: Discount factor used for the returns.
        max_trajectory_len: Maximum number of steps sampled per episode.
        standardize: If true, returns of trajectories longer than one step
            are shifted to zero mean and scaled to unit standard deviation.

    Returns:
        A tuple ``(rewards_per_episode, steps_per_episode, loss_per_epoch)``
        of lists with one entry per episode.

    Raises:
        ValueError: If ``max_trajectory_len`` is smaller than 1, since an
            empty trajectory has no returns to train on.
    """
    if max_trajectory_len < 1:
        raise ValueError("max_trajectory_len must be at least 1")

    rewards_per_episode = []
    steps_per_episode = []
    loss_per_epoch = []

    # Feed states to whatever device the network lives on instead of relying
    # on a module-level ``device`` variable.
    device = _module_device(net)

    for _ in range(num_episodes):
        state = env.reset()
        done = False
        log_probabilities = []
        rewards = []

        while len(rewards) < max_trajectory_len and not done:
            # Add two leading singleton dimensions to the state, then drop the
            # leading dimension from the output: [1, num_actions] -> [num_actions].
            batch = torch.tensor(np.asarray(state)[np.newaxis, np.newaxis]).to(device)
            log_prob = net(batch).squeeze(0)

            # Sample an action from the policy. The probabilities are
            # renormalised in float64 because exp(log_prob) may not sum to
            # exactly 1, which np.random.choice would reject.
            prob = torch.exp(log_prob).detach().cpu().numpy().astype(np.float64)
            action = np.random.choice(
                range(env.game.get_available_buttons_size()), p=prob / prob.sum()
            )

            # Keep the (differentiable) log-probability of the chosen action.
            log_probabilities.append(log_prob[action])

            state, reward, done = env.step(action)
            rewards.append(reward)

        rewards_per_episode.append(sum(rewards))
        steps_per_episode.append(len(rewards))

        # Compute the return G_t for every action of the trajectory.
        g_vals = _discounted_returns(rewards, gamma)

        if standardize and len(g_vals) > 1:
            g_vals = g_vals - g_vals.mean()
            spread = g_vals.std()
            # When all returns are equal the spread is 0; dividing would turn
            # the loss (and then the weights) into NaN, so only centre them.
            if spread > np.finfo(g_vals.dtype).eps:
                g_vals = g_vals / spread

        # Gradient descent minimises, so negate the objective we want to
        # maximise. The returns are created with the log-probabilities' dtype
        # and device so the product needs no implicit type promotion.
        log_probabilities_tensor = torch.stack(log_probabilities)
        returns = torch.tensor(
            g_vals,
            dtype=log_probabilities_tensor.dtype,
            device=log_probabilities_tensor.device,
        )
        loss = -(log_probabilities_tensor * returns).sum()
        loss_per_epoch.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return rewards_per_episode, steps_per_episode, loss_per_epoch