Hi, I am a reinforcement learning learner, but I think this problem is basically a autograd problem for my bad understanding on torch.
My structure is mlp + lstm + mlp, the version information is : ubuntu OS, python3.8, torch1.11.0+cu115
Here is my case code:
I hope you can help me fix it, thanks!
class ActorPPO(nn.Module):
def __init__(self, mid_dim, hidden_dim, state_dim, action_dim): super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.lstm_hidden_hx = torch.randn([1, 1, hidden_dim], dtype=torch.float, device=self.device) self.lstm_hidden_cx = torch.randn([1, 1, hidden_dim], dtype=torch.float, device=self.device) self.net_mlp1_1 = nn.Linear(state_dim, mid_dim) self.net_mlp1_2 = nn.Linear(mid_dim, hidden_dim) self.net_lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True) self.net_mlp2_1 = nn.Linear(hidden_dim, hidden_dim) self.net_mlp2_2 = nn.Linear(hidden_dim, action_dim) self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True) self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi)) def forward(self, state): torch.autograd.set_detect_anomaly(True) self.net_lstm.flatten_parameters() s = F.relu(self.net_mlp1_1(state)) s = F.relu(self.net_mlp1_2(s)) lstm_out, (self.lstm_hidden_hx, self.lstm_hidden_cx) = self.net_lstm(s, (self.lstm_hidden_hx, self.lstm_hidden_cx)) output = F.relu(self.net_mlp2_1(lstm_out)) output = F.relu(self.net_mlp2_2(lstm_out)) return output.tanh() # action.tanh() def get_logprob_entropy(self, state, action): torch.autograd.set_detect_anomaly(True) a_avg = torch.stack([self.forward(state[i,:,:].unsqueeze(0)) for i in range(0, state.size(0))], dim=0) a_avg = torch.squeeze(a_avg) # shape = [32,128,3] stands for [batch_size, sequence_length, action_dim] a_std = self.a_logstd.exp() delta = ((a_avg - action) / a_std).pow(2) * 0.5 logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(2) # new_logprob return logprob # should be [batch_size, sequence_length] def get_old_logprob(self, _action, noise): # noise = action - a_noise delta = noise.pow(2) * 0.5 return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(2) # old_logprob
class CriticAdv(nn.Module):
def __init__(self, mid_dim, hidden_dim, state_dim): super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.lstm_hidden_hx = torch.randn([1, 1, hidden_dim], dtype=torch.float, device=self.device) self.lstm_hidden_cx = torch.randn([1, 1, hidden_dim], dtype=torch.float, device=self.device) self.net_mlp1 = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(), nn.Linear(mid_dim, hidden_dim), nn.ReLU(), ) self.net_lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True) self.net_mlp2 = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.Hardswish(), nn.Linear(hidden_dim, 1), ) def forward(self, state): torch.autograd.set_detect_anomaly(True) self.net_lstm.flatten_parameters() s = self.net_mlp1(state) lstm_out, (self.lstm_hidden_hx, self.lstm_hidden_cx) = self.net_lstm(s, (self.lstm_hidden_hx, self.lstm_hidden_cx)) output = self.net_mlp2(lstm_out) return output.tanh() # Q value
class Agent:
def __init__(self): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.act = ActorPPO(2**5, 2**4, 10, 3).to(self.device) self.cri = CriticAdv(2**5, 2**4, 10).to(self.device) self.criterion = torch.nn.SmoothL1Loss() learning_rate = 1e-4 self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=learning_rate) self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=learning_rate) def update_net(self): obj_critic = obj_actor = logprob = obj_critic_c = None state = torch.randn(32, 64, 10).to(torch.device('cuda')) action = torch.randn(32, 64, 3).to(torch.device('cuda')) r_sum = torch.randn(32, 64).to(torch.device('cuda')) logprob = torch.randn(32, 64).to(torch.device('cuda')) advantage = torch.randn(32, 64).to(torch.device('cuda')) new_logprob = self.act.get_logprob_entropy(state, action) ratio = (new_logprob - logprob.detach()).exp() obj_surrogate = advantage * ratio obj_actor = obj_surrogate.mean() self.optim_update(self.act_optimizer, obj_actor) value = torch.stack([self.cri(state[i,:,:].unsqueeze(0)) for i in range(0, state.size(0))], dim=0) value = torch.squeeze(value) obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6) self.optim_update(self.cri_optimizer, obj_critic) @staticmethod def optim_update(optimizer, objective): optimizer.zero_grad() torch.autograd.set_detect_anomaly(True) objective.backward(retain_graph=True) optimizer.step()
When I call update_net, it failed and report the error
[W …\torch\csrc\autograd\python_anomaly_mode.cpp:104] Warning: Error detected in CudnnRnnBackward0. Traceback of forward call that caused the error:
File “test_case.py”, line 114, in
agent.update_net()
File “test_case.py”, line 92, in update_net
new_logprob = self.act.get_logprob_entropy(state, action)
File “test_case.py”, line 42, in get_logprob_entropy
a_avg = torch.stack([self.forward(state[i,:,:].unsqueeze(0)) for i in range(0, state.size(0))], dim=0)
File “test_case.py”, line 42, in
a_avg = torch.stack([self.forward(state[i,:,:].unsqueeze(0)) for i in range(0, state.size(0))], dim=0)
File “test_case.py”, line 34, in forward
lstm_out, (self.lstm_hidden_hx, self.lstm_hidden_cx) = self.net_lstm(s, (self.lstm_hidden_hx, self.lstm_hidden_cx))
File “D:\Anaconda3\envs\tfenv\lib\site-packages\torch\nn\modules\module.py”, line 1102, in _call_impl
return forward_call(*input, **kwargs)
File “D:\Anaconda3\envs\tfenv\lib\site-packages\torch\nn\modules\rnn.py”, line 692, in forward
self.dropout, self.training, self.bidirectional, self.batch_first)
(function print_stack)
Traceback (most recent call last):
File “test_case.py”, line 114, in
agent.update_net()
File “test_case.py”, line 96, in update_net
self.optim_update(self.act_optimizer, obj_actor)
File “test_case.py”, line 107, in optim_update
objective.backward(retain_graph=True)
File “D:\Anaconda3\envs\tfenv\lib\site-packages\torch_tensor.py”, line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "D:\Anaconda3\envs\tfenv\lib\site-packages\torch\autograd_init.py", line 156, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 16]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I hope you can help me ~