Need Help: Super Mario Bros PPO Implementation

Hi, I need help implementing a Super Mario Bros agent with PPO. I tried to run this code on the CPU in Google Colab, but I received this error:

RuntimeError                              Traceback (most recent call last)
<ipython-input-7-7383d21bcea4> in <module>()
    187 
    188             #action = random_choice_prob_index(dist)
--> 189             action=dist.sample()#.view(-1, 1)
    190 
    191             next_state, reward, done, _ = env.step(action)

/usr/local/lib/python3.6/dist-packages/torch/distributions/categorical.py in sample(self, sample_shape)
    105         probs = self.probs.expand(param_shape)
    106         probs_2d = probs.reshape(-1, self._num_events)
--> 107         sample_2d = torch.multinomial(probs_2d, 1, True)
    108         return sample_2d.reshape(sample_shape)
    109 

RuntimeError: invalid multinomial distribution (encountering probability entry < 0)

I’m not sure what my error is. Can someone who has experience in Reinforcement Learning help me with my code?
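If it helps, I think the same error can be reproduced in isolation whenever Categorical is handed raw scores that contain a negative entry. This is just a minimal sketch with made-up numbers, not my actual code:

import torch
from torch.distributions import Categorical

# Raw network outputs (logits) can be negative.
scores = torch.tensor([0.5, -0.1, 0.6])

# Passed positionally, Categorical treats these values as probabilities.
try:
    Categorical(scores).sample()
except (RuntimeError, ValueError) as e:
    # Older PyTorch (like the Colab version in my traceback) fails inside
    # torch.multinomial at sample(); newer versions may reject the tensor
    # already in the constructor.
    print(e)

# Passing the same tensor as logits makes Categorical apply softmax internally.
print(Categorical(logits=scores).sample())

Passing logits does sample without an error, but I’m not sure whether that is the right fix inside the PPO loop or whether something else in my rollout is producing bad values.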

This is my agent code:

    # Prepare environments
    envs = [make_env() for i in range(NUM_ENVS)]
    envs = SubprocVecEnv(envs)
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    state = envs.reset()
    print(state.shape)
    #Size 240
    state_size=envs.observation_space.shape[0]
    #Number of actions 7 (according to NES Controller)
    action_size=envs.action_space.n


    model = BaseActorCriticNetwork(state_size,action_size).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    frame_idx  = 0
    train_epoch = 0
    best_reward = None

    early_stop = False
    while not early_stop:

        log_probs = []
        values    = []
        states    = []
        actions   = []
        rewards   = []
        masks     = []

        for _ in range(PPO_STEPS):
            state = torch.FloatTensor(state).to(device)
            state = state.float()
            #Change format to NCHW
            state = state.permute(0,3,2,1)
            #feed the state into the model
            dist, value = model(state)
            #Change the output to distribution
            dist=Categorical(dist)

            #action = random_choice_prob_index(dist)
            action=dist.sample()#.view(-1, 1)
        
            next_state, reward, done, _ = env.step(action)
            log_prob = dist.log_prob(action)
            
            log_probs.append(log_prob)

This is my model code (implemented as an Actor-Critic network):

class BaseActorCriticNetwork(nn.Module):
    def __init__(self, input_size, output_size, use_noisy_net=False):
        super(BaseActorCriticNetwork, self).__init__()
        if use_noisy_net:
            linear = NoisyLinear
        else:
            linear = nn.Linear

        self.feature = nn.Sequential(
            linear(input_size, 128),
            nn.ReLU(),
            linear(128, 128),
            nn.ReLU()
        )
        self.actor = linear(128, output_size)
        self.critic = linear(128, 1)
        #stabilize the weights
        for p in self.modules():
            if isinstance(p, nn.Conv2d):
                init.kaiming_uniform_(p.weight)
                p.bias.data.zero_()

            if isinstance(p, nn.Linear):
                init.kaiming_uniform_(p.weight, a=1.0)
                p.bias.data.zero_()

    def forward(self, state):
        x = self.feature(state)
        policy = self.actor(x)
        value = self.critic(x)
        return policy, value

Thanks,
Aqil

Where is your PPO training function? You haven’t even shown the surrogate loss clipping.
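For reference, the heart of that update is the clipped surrogate objective. A minimal sketch (the function and argument names here are placeholders, not taken from your code):

import torch

def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, clip_eps=0.2):
    # Probability ratio r(theta) = pi_new(a|s) / pi_old(a|s), computed from log-probs.
    ratio = (new_log_probs - old_log_probs).exp()
    # Unclipped surrogate and its clipped variant.
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    # PPO maximizes the elementwise minimum; return the negative mean to minimize.
    return -torch.min(surr1, surr2).mean()

The full update typically adds a value-function loss and an entropy bonus on top of this term, so please post that part of your code as well.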