I am trying to implement a policy gradient method (using reward-to-go and a state value baseline). I’m given a template and my job is to fill in the gaps in the template.
When I try to run the code after filling in the gaps, I get an inplace error (shown at end of post). The template should not have errors, so the problems I encounter should be due to the code I’ve written.
Essentially, I am asking whether anybody can spot the in-place operation (and provide a workaround) in the code that I show below.
The parts I have filled in are shown below.
1. A function to calculate the return-to-go, given a sequence of per-timestep rewards spanning multiple episodes
def _calc_return(self, r, done):
'''
TODO 2.1: Given a tensor of per-timestep rewards (r), and a tensor (done)
indicating if a timestep is the last timestep of an episode, Output a
tensor (return_t) containing the return (i.e. reward-to-go) at each timestep.
'''
gamma = self._discount
done_indices = torch.nonzero(done, as_tuple=False)
indices = torch.cat((torch.zeros(1), torch.squeeze(torch.transpose(done_indices, 0, 1)), torch.Tensor([len(r) - 1])))
return_t_list = []
for i in range(len(indices) - 1):
if i == 0:
episode = r[int(indices[i]):int(indices[i + 1] + 1)]
episode_indices = torch.arange(0, len(episode))
for j in range(len(episode_indices)):
episode_to_go = episode[j:]
indices_to_go = episode_indices[j:]
gammas = torch.full([len(episode_to_go)], gamma)
exponents = episode_indices[:len(episode_indices) - j]
discounts = torch.pow(gammas, exponents)
discounted_sum = torch.dot(episode_to_go, discounts)
return_t_list.append(discounted_sum)
if i != 0:
episode = r[int(indices[i] + 1):int(indices[i + 1] + 1)]
episode_indices = torch.arange(0, len(episode))
for j in range(len(episode_indices)):
episode_to_go = episode[j:]
indices_to_go = episode_indices[j:]
gammas = torch.full([len(episode_to_go)], gamma)
exponents = episode_indices[:len(episode_indices) - j]
discounts = torch.pow(gammas, exponents)
discounted_sum = torch.dot(episode_to_go, discounts)
return_t_list.append(discounted_sum)
return_t = torch.stack(return_t_list)
return return_t
2. A function to calculate the “advantage” (reward-to-go minus state value)
def _calc_adv(self, norm_obs, ret):
'''
TODO 2.2: Given the normalized observations (norm_obs) and the return at
every timestep (ret), output the advantage at each timestep (adv).
'''
value = self._model.eval_critic(norm_obs)
value = torch.squeeze(torch.transpose(value, 0, 1))
adv = ret - value
return adv
3. A function to calculate the critic loss, i.e. the loss used to train the value function
def _calc_critic_loss(self, norm_obs, tar_val):
'''
TODO 2.3: Given the normalized observations (norm_obs) and the returns at
every timestep (tar_val), compute a loss for updating the value
function (critic).
'''
value = self._model.eval_critic(norm_obs)
value = torch.squeeze(torch.transpose(value, 0, 1))
squared_diff = (tar_val - value)**2
loss = squared_diff.mean()
return loss
4. A function to calculate the actor loss, i.e the loss used to train the policy
def _calc_actor_loss(self, norm_obs, norm_a, adv):
'''
TODO 2.4: Given the normalized observations (norm_obs), normalized
actions (norm_a), and the advantage at every timestep (adv), compute
a loss for updating the policy (actor).
'''
policy = self._model.eval_actor(norm_obs)
policy_a = policy.log_prob(norm_a)
loss = -(adv * policy_a).mean()
return loss
Now, when I run all the code, I get the following error:
Traceback (most recent call last):
File "run.py", line 103, in <module>
main(sys.argv)
File "run.py", line 93, in main
train(agent=agent, max_samples=max_samples, out_model_file=out_model_file,
File "run.py", line 50, in train
agent.train_model(max_samples=max_samples, out_model_file=out_model_file,
File "/Users/jesse/rl_assignments/learning/base_agent.py", line 57, in train_model
train_info = self._train_iter()
File "/Users/jesse/rl_assignments/learning/base_agent.py", line 226, in _train_iter
train_info = self._update_model()
File "/Users/jesse/rl_assignments/a2/pg_agent.py", line 144, in _update_model
actor_info = self._update_actor(actor_batch)
File "/Users/jesse/rl_assignments/a2/pg_agent.py", line 187, in _update_actor
loss.backward()
File "/Users/jesse/anaconda3/envs/rl/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/Users/jesse/anaconda3/envs/rl/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 1]], which is output 0 of AsStridedBackward0, is at version 41; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
Essentially, I am guessing that I’ve done an inplace operation somewhere that is messing with the gradient computation. However, I do not know what to look for, and torch.autograd.set_detect_anomaly(True)
doesn’t seem helpful.
I appreciate any help.