I am training an RL agent on the “CartPole-v0” environment from Gym. This is my network:
class DiscreteNet(nn.Module):
    """For a discrete action space"""

    def __init__(self, obs_size, n_actions):
        super(DiscreteNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, n_actions),
            # Use a sigmoid output so that the values for every action are between 0 and 1
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)
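Side note on the Sigmoid: I know Softmax is the more usual output layer before Categorical. As far as I understand, Categorical normalizes whatever probabilities it is given, so the sampling itself should still work; this is the quick check I based that on:

import torch
from torch.distributions import Categorical

# Categorical accepts unnormalized probabilities, e.g. raw sigmoid outputs:
probs = torch.tensor([[0.7, 0.6]])  # does not sum to 1
d = Categorical(probs)
print(d.probs)  # tensor([[0.5385, 0.4615]]) -- normalized internally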
Now I create the network and the environment:
env = gym.make("CartPole-v0")
n_obs = env.observation_space.shape[0]
n_actions = env.action_space.n
print(f"n_obs: {n_obs}, n_actions: {n_actions}")
# The neural network
net = DiscreteNet(n_obs, n_actions).to(device)
# The optimizer to optimize the network parameters
optimizer = optim.Adam(params=net.parameters(), lr=0.01)
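This prints n_obs: 4, n_actions: 2, which matches CartPole's four-dimensional observation and two discrete actions.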
And this is how I train it:
Episode = namedtuple("Episode", field_names=["reward", "steps"])
EpisodeStep = namedtuple("EpisodeStep", field_names=["obs_v", "action_v", "log_prob_v"])

for batch_iter_no in range(N_EPISODES * BATCH_SIZE):
    # Start a new batch of episodes
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    # Obtain a batch of episodes
    while True:
        obs_v = torch.FloatTensor([obs]).to(device)
        act_v = net(obs_v).to(device)
        distribution = Categorical(act_v)
        action_v = distribution.sample().to(device)
        log_prob_v = distribution.log_prob(action_v).to(device)
        next_obs, reward, is_done, _ = env.step(action_v.item())
        episode_reward += reward
        episode_steps.append(EpisodeStep(obs_v=obs_v, action_v=action_v, log_prob_v=log_prob_v))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == BATCH_SIZE:
                break
        obs = next_obs

    # Filter the batch for elite episodes
    rewards = [e.reward for e in batch]
    reward_bound = np.percentile(rewards, PERCENTILE)
    reward_mean = float(np.mean(rewards))
    for episode_iter_no, e in enumerate(batch):
        if e.reward < reward_bound:
            continue
        # Train the NN on the elite episode
        cumulated_loss = 0
        episode_len = 0
        for s in e.steps:
            # Loss is: -log_prob
            loss_v = (-s.log_prob_v).to(device)
            # Logging variables
            cumulated_loss += loss_v.item()
            episode_len += 1
            # For some reason this next line is needed
            # loss_v.requires_grad = True
            optimizer.zero_grad()
            loss_v.backward()
            optimizer.step()
        # Print the logging info after each episode
        print(f"{batch_iter_no}, {episode_iter_no}: avg_loss={cumulated_loss/episode_len:.3f}, "
              f"episode_reward={e.reward}, episode_boundary={reward_bound}")
However, running this throws the error below. I have looked at other questions with this error, but I do not quite understand the error message itself. How do I know which variables are needed for computing the gradients in my code, and which one is causing this error?
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-216-320c323f8526> in <module>
55
56 optimizer.zero_grad()
---> 57 loss_v.backward()
58 optimizer.step()
59
~/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
219 retain_graph=retain_graph,
220 create_graph=create_graph)
--> 221 torch.autograd.backward(self, gradient, retain_graph, create_graph)
222
223 def register_hook(self, hook):
~/anaconda3/envs/sumo_rl/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
128 retain_graph = create_graph
129
--> 130 Variable._execution_engine.run_backward(
131 tensors, grad_tensors_, retain_graph, create_graph,
132 allow_unreachable=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 2]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I did enable torch.autograd.set_detect_anomaly(True) before producing the error message above, but it does not change the traceback. I am running this in a Jupyter notebook, in case that is important.
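In case it helps, here is my attempt at a minimal reproduction (made-up layer sizes, not my actual net). As far as I can tell it triggers the same RuntimeError: two graphs are built up front, and the optimizer step after the first backward() modifies, in place, the weights that the second graph still needs:

import torch
import torch.nn as nn

tiny = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
opt = torch.optim.Adam(tiny.parameters(), lr=0.01)

# Two forward passes up front, like the ones collected during the episode rollout
loss1 = tiny(torch.randn(1, 4)).sum()
loss2 = tiny(torch.randn(1, 4)).sum()

opt.zero_grad()
loss1.backward()
opt.step()        # updates the weights in place

opt.zero_grad()
loss2.backward()  # RuntimeError: the graph of loss2 saved the pre-update weights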