Hello,
I added autocast to this A2C model. It is about 30% faster, and it seems to train well, but I can't be sure whether I am doing something wrong here. I would be grateful if you could point out any mistakes. Thanks.
'''
def train_agent():
    """Run the A2C training loop with mixed-precision (AMP) rollouts and updates.

    Relies on module-level globals: env, model, optimizer, scaler, device,
    num_steps, episode_start, compute_returns, autocast.
    No parameters; runs (near-)indefinitely over episodes.
    """
    state = env.reset()
    state = torch.from_numpy(state).float().to(device)
    for i_episode in range(episode_start, 4000000):  # 1000000
        rewards = []
        # NOTE(review): assumes each observation is (720, env.shape[1]) — confirm
        states = torch.zeros([num_steps, 720, env.shape[1]]).to(device)
        actions = []
        dones = []
        final_reward = []

        # ---- Rollout: collect num_steps transitions; no gradients needed ----
        for step in range(num_steps):
            states[step] = state
            with autocast():
                with torch.no_grad():
                    value, action, action_log_prob = model.act(state.unsqueeze(1))
            action = action.item()
            next_state, reward, done, info = env.step(action)
            rewards.append(reward)
            # Mask convention: 0 on terminal steps cuts the bootstrapped return.
            dones.append(0 if done else 1)
            actions.append(action)
            if done:
                final_reward.append(info)
            state = torch.from_numpy(next_state).float().to(device)

        # Bootstrap value for the state following the final rollout step.
        with autocast():
            with torch.no_grad():
                next_value, _ = model(state.unsqueeze(1))
        returns = torch.cat(compute_returns(next_value, rewards, dones))

        actions = torch.tensor(actions).unsqueeze(1).to(device)
        states = states.permute(1, 0, 2)

        # ---- Update ----
        # FIX: the gradient-producing forward pass and the loss must ALSO run
        # under autocast. In the original only the no-grad rollout passes were
        # autocast, so the training pass stayed in fp32 while GradScaler was
        # scaling a loss that never saw fp16 — no AMP benefit where it matters.
        with autocast():
            values, action_log_probs, dist_entropy = model.evaluate_actions(states, actions)
            advantages = returns - values
            value_loss = advantages.pow(2).mean()
            # detach() keeps the policy-gradient term from updating the critic.
            action_loss = -(advantages.detach() * action_log_probs.unsqueeze(1)).mean()
            value_loss_coef = 0.5
            entropy_coef = 0.01
            loss = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef

        # backward / step stay OUTSIDE autocast, per the standard AMP recipe.
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to true gradients.
        scaler.unscale_(optimizer)
        max_grad_norm = 0.5
        nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
'''