Hi there! I have run into a problem while building a policy gradient example. My value estimator is a simple DNN. However, after I made a small change to that DNN, I hit the error below. I don't think I have done any in-place operation myself. Here is my code. Thank you in advance!
import itertools
from collections import namedtuple
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as f
from tqdm import tqdm
class PolicyEstimator():
    def __init__(self, state: torch.tensor, env: gym.Env, learning_rate: float = 0.01, device=torch.device('cpu'), name: str = "PolicyEstimator"):
        self.state = state
        self.env = env
        self.learning_rate = learning_rate
        self.trans_mu = nn.Linear(state.shape[0], 16, bias=True, device=device)
        self.trans_sigma = nn.Linear(state.shape[0], 16, bias=True, device=device)
        self.model = nn.Sequential(self.trans_mu, self.trans_sigma)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def predict(self, state: torch.tensor):
        if type(state) is np.ndarray: state = torch.tensor(state.astype(np.float32))
        self.state = state
        self.mu = torch.sigmoid(torch.sum(self.trans_mu(self.state)))
        self.sigma = f.softplus(torch.sum(self.trans_sigma(self.state))) + 1e-5
        self.normal_dist = torch.distributions.Normal(loc=self.mu, scale=self.sigma)
        self.action = self.normal_dist.sample(sample_shape=(1, ))
        self.action = torch.clip(self.action, min=self.env.action_space.low[0], max=self.env.action_space.high[0])
        return self.action

    def loss(self, state: torch.tensor, target: torch.tensor, action: torch.tensor):
        if type(state) is np.ndarray: state = torch.tensor(state.astype(np.float32))
        self.state = state
        self.mu = torch.sigmoid(torch.sum(self.trans_mu(self.state)))
        self.sigma = f.softplus(torch.sum(self.trans_sigma(self.state))) + 1e-5
        self.normal_dist = torch.distributions.Normal(loc=self.mu, scale=self.sigma)
        loss = -self.normal_dist.log_prob(action) * target - 1e-1 * self.normal_dist.entropy()
        return loss

    def update(self, state: torch.tensor, target: torch.tensor, action: torch.tensor):
        self.optimizer.zero_grad()
        loss = self.loss(state, target, action)
        loss.backward()
        self.optimizer.step()
        return loss
class ValueEstimator():
    def __init__(self, state: torch.tensor, env: gym.Env, learning_rate: float = 0.01, device=torch.device('cpu'), name: str = "value_estimator"):
        self.state = state
        self.env = env
        self.learning_rate = learning_rate
        self.classifier = nn.Sequential(nn.Linear(state.shape[0], 32, bias=True, device=device), nn.Sigmoid(), nn.Linear(32, 1, bias=True, device=device), nn.ReLU(inplace=False))
        # when I replace the line above with the one below, everything runs fine and there is no error:
        # self.classifier = nn.Sequential(nn.Linear(state.shape[0], 32, bias=True, device=device), nn.ReLU())
        self.optimizer = torch.optim.Adam(self.classifier.parameters(), lr=self.learning_rate)

    def predict(self, state: torch.tensor):
        if type(state) is np.ndarray: state = torch.tensor(state.astype(np.float32))
        value_estimate = torch.sum(self.classifier(state))
        return value_estimate

    def loss(self, state: torch.tensor, target: torch.tensor):
        value_estimate = self.predict(state)
        loss_fn = nn.MSELoss()
        loss = loss_fn(value_estimate, target)
        return loss

    def update(self, state: torch.tensor, target: torch.tensor):
        self.optimizer.zero_grad()
        loss = self.loss(state, target)
        loss.backward(retain_graph=True)
        self.optimizer.step()
        return loss
EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
def actor_critic(env: gym.Env, estimator_policy: PolicyEstimator, estimator_value: ValueEstimator, num_episodes: int, discount_factor: float = 1.0):
    Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state', 'done'])
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))
    for i_episode in range(num_episodes):
        state = env.reset()
        episode = []
        for t in tqdm(itertools.count()):
            env.render()
            action = estimator_policy.predict(state)
            next_state, reward, done, _ = env.step(action)
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            episode.append(Transition(state=state, action=action, reward=reward, next_state=next_state, done=done))
            # calculate TD target
            value_next = estimator_value.predict(next_state)
            td_target = reward + value_next * discount_factor
            td_error = td_target - estimator_value.predict(state)
            # update the value estimator
            estimator_value.update(state, td_target)
            # update the policy estimator,
            # using the td_error as our advantage estimate
            estimator_policy.update(state, td_error, action)
            # print out which step we are on
            print(f"\rstep{t} @ Episode {i_episode+1}/{num_episodes} ({stats.episode_rewards[i_episode - 1]})", end='')
            if done: break
            state = next_state
    return stats
if __name__ == "__main__":
    torch.autograd.set_detect_anomaly(True)
    env = gym.envs.make("MountainCarContinuous-v0")
    state = torch.tensor(env.observation_space.sample())
    pe = PolicyEstimator(state, env, learning_rate=0.001)
    ve = ValueEstimator(state, env, learning_rate=0.1)
    stats = actor_critic(env, pe, ve, 50, discount_factor=0.99)
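In case it helps, here is my guess at what the pattern might be, as a small standalone snippet (hypothetical, not taken from the project above): a loss whose graph goes through a network only gets backward() called on it after optimizer.step() has already changed that network's weights in place.

import torch
import torch.nn as nn

# Hypothetical minimal snippet (my guess, not from my project): keep a graph that
# goes through a network, step its optimizer (an in-place weight update), and only
# then call backward() on the kept graph.
net = nn.Sequential(nn.Linear(2, 32), nn.Sigmoid(), nn.Linear(32, 1))
opt = torch.optim.Adam(net.parameters(), lr=0.1)
x = torch.randn(2)

kept_output = net(x).sum()                # graph through the current weights

value_loss = (net(x).sum() - 1.0) ** 2    # separate forward pass, its own graph
opt.zero_grad()
value_loss.backward()
opt.step()                                # in-place update of net's weights

later_loss = kept_output * 2.0
later_loss.backward()                     # I expect the same "modified by an inplace operation" error here

If that is really what is going on, then in my actor_critic loop the td_target/td_error coming out of estimator_value.predict(...) would still carry a graph through the value network when estimator_policy.update(...) finally calls backward(), i.e. after estimator_value.update(...) has already stepped the value optimizer. But I am not sure, because I thought I had not done any in-place operation myself.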
The error output is:
(hw) ➜ rllearning python -u "/Users/hehaoyuan/rllearning/policyGradient.py"
0it [00:00, ?it/s]/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/autograd/__init__.py:147: UserWarning: Error detected in MmBackward. Traceback of forward call that caused the error:
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 218, in <module>
    stats = actor_critic(env, pe, ve, 50, discount_factor=0.99)
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 191, in actor_critic
    td_error = td_target - estimator_value.predict(state)
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 152, in predict
    value_estimate = torch.sum(self.classifier(state))
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 96, in forward
    return F.linear(input, self.weight, self.bias)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/nn/functional.py", line 1847, in linear
    return torch._C._nn.linear(input, weight, bias)
 (Triggered internally at …/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
  Variable._execution_engine.run_backward(
0it [00:00, ?it/s]
Traceback (most recent call last):
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 218, in <module>
    stats = actor_critic(env, pe, ve, 50, discount_factor=0.99)
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 200, in actor_critic
    estimator_policy.update(state, td_error, action)
  File "/Users/hehaoyuan/rllearning/policyGradient.py", line 135, in update
    loss.backward()
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/Users/hehaoyuan/miniforge3/envs/hw/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [32, 1]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
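The only workaround I have found so far (I am not sure it is the proper fix) is to detach the TD quantities inside actor_critic before handing them to the two update() calls, roughly like this:

            # Possible workaround I am considering (not sure it is correct): detach the TD
            # quantities so that neither update() backpropagates through the value network
            # after its optimizer has already modified the weights in place.
            value_next = estimator_value.predict(next_state).detach()
            td_target = reward + value_next * discount_factor
            td_error = (td_target - estimator_value.predict(state)).detach()

            estimator_value.update(state, td_target)           # MSE target is now a constant
            estimator_policy.update(state, td_error, action)   # advantage is now a constant

I think retain_graph=True in ValueEstimator.update would then no longer be needed either. Is that the right way to handle it, or is the real problem somewhere in how I build the networks (e.g. the extra Sigmoid/ReLU layers in the classifier)?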