I am trying to implement the A2C (Advantage Actor-Critic) algorithm, but my model converges to a low score (around 10) per episode.
I am implementing it from this article: https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f
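As I understand it, the update I am trying to implement is the usual one-step advantage actor-critic update: advantage A = r + gamma*V(s') - V(s), policy loss -log(pi(a|s)) * A, and value loss A^2, which is what the loss calculation at the end of the code computes.

Here is my code: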
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
env = gym.make('CartPole-v0')
state = env.reset()
total_rewards = []
num_episodes=1000
batch_size=32
gamma=0.98
epsilon = 1
epsilon_min = 0.005
epsilon_decay = 0.998
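# NOTE: the three epsilon values above are defined but never used anywhere below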
class ACNetwork(nn.Module):
    def __init__(self):
        super(ACNetwork, self).__init__()
        self.n_inputs = env.observation_space.shape[0]
        self.n_outputs = env.action_space.n
        # shared body
        self.l1 = nn.Linear(self.n_inputs, 32)
        self.l2 = nn.Linear(32, 64)
        # critic head: state value V(s)
        self.l3 = nn.Linear(64, 1)
        # actor head: action probabilities
        self.l4 = nn.Linear(64, self.n_outputs)

    def forward(self, x):
        x = self.l1(x)
        x = F.relu(x)
        x = self.l2(x)
        x = F.relu(x)
        values = self.l3(x)
        probs = F.softmax(self.l4(x), dim=-1)
        return values, probs
agent = ACNetwork()
optimizer = optim.Adam(agent.parameters(),lr=0.01)
done = False
def select_action(state):
    # NOTE: this always picks the greedy (argmax) action; nothing is sampled
    _, action_probs = agent(torch.from_numpy(state).type(torch.FloatTensor))
    action_probs = action_probs.detach().numpy()
    return np.argmax(action_probs)
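
# For comparison, a minimal sketch of the stochastic action selection that actor-critic
# methods normally use: sampling an action from the policy distribution instead of taking
# the argmax. The helper name select_action_stochastic is mine and it is NOT called
# anywhere below; it is only here to make the contrast with select_action explicit.
def select_action_stochastic(state):
    _, action_probs = agent(torch.from_numpy(state).type(torch.FloatTensor))
    dist = torch.distributions.Categorical(action_probs)
    return dist.sample().item()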
for ep in range(num_episodes):
    state = env.reset()
    ep_reward = 0
    # per-episode transition buffers
    states = []
    next_states = []
    actions = []
    rewards = []
    dones = []
    done = False
    while not done:
        action = select_action(state)
        next_state, reward, done, _ = env.step(action)
        if done:
            reward = -1
        ep_reward += 1
        states.append(state)
        next_states.append(next_state)
        rewards.append(reward)
        actions.append(action)
        state = next_state
        dones.append(done)
        if done:
            print(f"EPISODE : {ep} REWARD : {ep_reward}")
            # CONVERTING TO TENSORS
            states = torch.FloatTensor(states)
            next_states = torch.FloatTensor(next_states)
            actions = torch.LongTensor(actions)
            rewards = torch.FloatTensor(rewards)
            dones = torch.FloatTensor(dones)   # collected but not used in the loss below
            # PREDICTING VALUES AND PROBABILITIES
            optimizer.zero_grad()
            values, probs = agent(states)
            values = torch.squeeze(values)
            next_values, _ = agent(next_states)
            next_values = torch.squeeze(next_values)
            # LOSS CALCULATION
            # adv is the one-step TD error r + gamma*V(s') - V(s), used as the advantage
            adv = rewards + gamma * next_values - values
            # log-probability of the action actually taken at each step
            log_probs = torch.log(probs[np.arange(len(actions)), actions])
            policy_loss = -(log_probs * adv)
            policy_loss = torch.mean(policy_loss)
            value_loss = torch.pow(adv, 2)
            value_loss = torch.mean(value_loss)
            loss = value_loss + policy_loss
            loss.backward()
            optimizer.step()
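
For reference, my understanding of the more standard A2C loss calculation looks like the sketch below (this reuses the tensor names from the loop above and is not what my code currently does): the bootstrap term is dropped at terminal states via (1 - dones), the target is treated as a constant, and the policy loss only gets gradients through the log-probabilities.

with torch.no_grad():
    # critic target: no bootstrapping past terminal states, no gradient through the target
    targets = rewards + gamma * next_values * (1 - dones)
adv = targets - values                            # advantage / TD error
policy_loss = -(log_probs * adv.detach()).mean()  # actor loss: gradient flows only through log_probs
value_loss = adv.pow(2).mean()                    # critic loss: regress V(s) toward the target
loss = policy_loss + value_loss

Is there something in my code (action selection, advantage calculation, hyperparameters) that explains why the score stays around 10 per episode?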