Hi, I need help implementing a Super Mario Bros agent with PPO. I tried to run this code on CPU in Google Colab, but I received the following error:
RuntimeError                              Traceback (most recent call last)
<ipython-input-7-7383d21bcea4> in <module>()
    187
    188 #action = random_choice_prob_index(dist)
--> 189 action=dist.sample()#.view(-1, 1)
    190
    191 next_state, reward, done, _ = env.step(action)

/usr/local/lib/python3.6/dist-packages/torch/distributions/categorical.py in sample(self, sample_shape)
    105         probs = self.probs.expand(param_shape)
    106         probs_2d = probs.reshape(-1, self._num_events)
--> 107         sample_2d = torch.multinomial(probs_2d, 1, True)
    108         return sample_2d.reshape(sample_shape)
    109

RuntimeError: invalid multinomial distribution (encountering probability entry < 0)
I'm not sure what my error is. Can someone with experience in reinforcement learning help me with my code?
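For reference, I believe this minimal snippet reproduces the same error (this is my own guess at the cause, not something from my training code): the traceback shows Categorical.sample() calling torch.multinomial internally, and torch.multinomial refuses any negative entry.

import torch

# Raw values straight out of a linear layer can be negative.
probs = torch.tensor([[1.2, -0.7, 0.3]])

# Treating them as probabilities fails, just like in my traceback:
torch.multinomial(probs, 1, True)
# RuntimeError: invalid multinomial distribution (encountering probability entry < 0)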
This is my agent code:
# Prepare environments
envs = [make_env() for i in range(NUM_ENVS)]
envs = SubprocVecEnv(envs)
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

state = envs.reset()
print(state.shape)

# observation_space.shape is (240, 256, 3), so this is the frame height, 240
state_size = envs.observation_space.shape[0]
# Number of actions: 7 (the SIMPLE_MOVEMENT set for the NES controller)
action_size = envs.action_space.n

model = BaseActorCriticNetwork(state_size, action_size).to(device)
print(model)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

frame_idx = 0
train_epoch = 0
best_reward = None
early_stop = False

while not early_stop:
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []

    for _ in range(PPO_STEPS):
        state = torch.FloatTensor(state).to(device)
        state = state.float()
        # Change format from channels-last to channels-first
        state = state.permute(0, 3, 2, 1)
        # Feed the state into the model
        dist, value = model(state)
        # Wrap the raw output in a distribution
        dist = Categorical(dist)
        #action = random_choice_prob_index(dist)
        action = dist.sample()  #.view(-1, 1)
        next_state, reward, done, _ = env.step(action)
        log_prob = dist.log_prob(action)
        log_probs.append(log_prob)
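To narrow this down, I'm planning to add a sanity check on the raw model output right before wrapping it in Categorical (a sketch using the same model and state as above; raw_policy is just a name I made up here):

raw_policy, value = model(state)
print(raw_policy.min().item(), raw_policy.max().item())  # any negative entries?
print(raw_policy.sum(dim=-1))                            # do rows sum to 1?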
This is my model code (an actor-critic network):
class BaseActorCriticNetwork(nn.Module):
    def __init__(self, input_size, output_size, use_noisy_net=False):
        super(BaseActorCriticNetwork, self).__init__()
        if use_noisy_net:
            linear = NoisyLinear
        else:
            linear = nn.Linear
        self.feature = nn.Sequential(
            linear(input_size, 128),
            nn.ReLU(),
            linear(128, 128),
            nn.ReLU()
        )
        self.actor = linear(128, output_size)
        self.critic = linear(128, 1)
        # Stabilize the weights
        for p in self.modules():
            if isinstance(p, nn.Conv2d):
                init.kaiming_uniform_(p.weight)
                p.bias.data.zero_()
            if isinstance(p, nn.Linear):
                init.kaiming_uniform_(p.weight, a=1.0)
                p.bias.data.zero_()

    def forward(self, state):
        x = self.feature(state)
        policy = self.actor(x)
        value = self.critic(x)
        return policy, value
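One thing I noticed while re-reading this: self.actor is a plain linear layer with no softmax, so policy can contain negative values. I'm not sure which of these two ways I should be using to construct the distribution (a sketch, not code from my notebook):

import torch
from torch.distributions import Categorical

policy = torch.tensor([[1.2, -0.7, 0.3]])  # raw actor output, can be negative

dist_from_logits = Categorical(logits=policy)                 # softmax applied internally
dist_from_probs  = Categorical(probs=policy.softmax(dim=-1))  # normalize explicitly first

Is the logits= form the right one for my code?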
Thanks,
Aqil