Hi, I am trying to write an asynchronous actor-critic (A3C) agent in PyTorch. While calculating the generalized advantage estimate (GAE) I ran into an unusual error.
The error is raised by the line I have highlighted in the code below with ** . The error is: "Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead." If I call detach() on the values, the code runs fine, but as I understand it detach() removes the value tensors from the computational graph, so no gradients will be computed for them and the gradients that are calculated will be wrong. Please correct me if I am wrong. Thanks.
The line that generates this error is:
delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
Replacing it with the following works fine:
delta_t = rewards[i] + args.gamma * values[i + 1].detach() - values[i].detach()
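For context, here is a minimal sketch of what I think might be going on (that the reward comes back from env.step() as a NumPy scalar is an assumption on my part, and every name in this snippet is a placeholder): if the reward is a NumPy value, adding it to a tensor that requires grad can make NumPy try to convert that tensor with numpy(), while casting the reward instead of detaching the values avoids the error and keeps the values in the graph.

import numpy as np
import torch

# Toy stand-ins for values[i + 1] and values[i]; all names here are placeholders.
value_next = torch.tensor([[0.5]], requires_grad=True)
value_curr = torch.tensor([[0.4]], requires_grad=True)
reward = np.float64(1.0)   # assumption: env.step() hands the reward back as a NumPy scalar
gamma = 0.99

# Mirrors the failing line; with a NumPy scalar on the left, NumPy may try to
# convert the grad-requiring tensor via numpy(), which is the reported error:
# delta_t = reward + gamma * value_next - value_curr

# Casting the reward to a plain float (or a tensor) keeps the values in the
# graph without calling detach() on them:
delta_t = float(reward) + gamma * value_next - value_curr
delta_t.sum().backward()
print(value_next.grad, value_curr.grad)   # both populated, so gradients still flow into the values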
Code
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    cell_size = 1.0
    view_size = args.view_size
    max_vel = args.max_speed_agent
    time_period = 0.2
    DENSITIES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

    env = TrafficSim(render=False, render_grid=True, time_period=time_period,
                     fps=20, cell_size=cell_size, view_size=view_size,
                     comm_mode=False, frac_cells=0.6, regions_width=3,
                     max_vel=max_vel, lane_lambda=0.0, query_lambda=0.0,
                     trajec_file='tsim/micro.pkl')
    model = ActorCritic(args.observation_size, args.num_actions)
    writer = SummaryWriter(args.logdir + args.exp_name + "_process_" + str(rank))

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    sampled_prob = random.sample(DENSITIES, k=1)[0]
    state = env.reset(sampled_prob)
    state = torch.from_numpy(state).float()
    done = True
    episode_length = 0

    while True:
        # Sync with the shared model at the start of an episode or after tmax steps
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, model.memsize)
            hx = torch.zeros(1, model.memsize)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # Roll out up to num_steps transitions with the local copy of the model
        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            reward, state, done = env.step(action.numpy()[0, 0], "NULL")
            done = done or episode_length >= args.max_epsiode_length
            # reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1
                if (counter.value % args.save_interval_steps) == 0:
                    print("Saved Weights")
                    torch.save(shared_model.state_dict(), "weights_" + args.exp_name)

            if done:
                episode_length = 0
                sampled_prob = random.sample(DENSITIES, k=1)[0]
                state = np.array(env.reset(sampled_prob)).flatten()

            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the last state if the rollout did not end the episode
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()
        values.append(R)

        policy_loss = 0.0
        value_loss = 0.0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # GAE
            **delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]**
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        # Backprop through the local model and push the gradients to the shared model
        optimizer.zero_grad()
        print("Worker : %d, Policy Loss : %.2f, Value Loss : %.2f, Entropy : %.2f"
              % (rank, policy_loss.item(), value_loss.item(), 0.0))
        writer.add_scalar("Policy Loss", policy_loss.item(), counter.value)
        writer.add_scalar("Value Loss", value_loss.item(), counter.value)

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
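And for completeness, a stripped-down version of the GAE loop with a toy linear critic (the critic and every name in this snippet are placeholders, not my actual ActorCritic), just to check which gradients the critic still receives when the values inside delta_t are detached:

import torch

# Placeholder check: a toy linear critic standing in for the value head.
critic = torch.nn.Linear(4, 1)
states = torch.randn(3, 4)
values = [critic(states[i].unsqueeze(0)) for i in range(3)]
values.append(torch.zeros(1, 1))          # bootstrap value R
rewards = [1.0, 0.0, 1.0]
gamma, tau = 0.99, 1.0

R = torch.zeros(1, 1)
gae = torch.zeros(1, 1)
value_loss = 0.0
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]
    value_loss = value_loss + 0.5 * (R - values[i]).pow(2)
    delta_t = rewards[i] + gamma * values[i + 1].detach() - values[i].detach()
    gae = gae * gamma * tau + delta_t

value_loss.sum().backward()
print(critic.weight.grad)   # non-zero: the critic is still trained through value_loss

Since advantage = R - values[i] is not detached in my loop, the critic still gets gradients through value_loss either way; the detach() only affects the delta_t / gae path.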