backward() fails because of an in-place operation, but I can't find it

Hi, I am learning reinforcement learning, but I think this is basically an autograd problem caused by my poor understanding of torch.
My network structure is MLP + LSTM + MLP. Version information: Ubuntu, Python 3.8, torch 1.11.0+cu115.
Here is my test case code; I hope you can help me fix it, thanks!

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class ActorPPO(nn.Module):

    def __init__(self, mid_dim, hidden_dim, state_dim, action_dim):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.lstm_hidden_hx = torch.randn([1, 1, hidden_dim], dtype=torch.float, device=self.device)
        self.lstm_hidden_cx = torch.randn([1, 1, hidden_dim], dtype=torch.float, device=self.device)
        self.net_mlp1_1 = nn.Linear(state_dim, mid_dim)
        self.net_mlp1_2 = nn.Linear(mid_dim, hidden_dim)
        self.net_lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.net_mlp2_1 = nn.Linear(hidden_dim, hidden_dim)
        self.net_mlp2_2 = nn.Linear(hidden_dim, action_dim)

        self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
        self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))

    def forward(self, state):
        torch.autograd.set_detect_anomaly(True)
        self.net_lstm.flatten_parameters()
        s = F.relu(self.net_mlp1_1(state))
        s = F.relu(self.net_mlp1_2(s))
        lstm_out, (self.lstm_hidden_hx, self.lstm_hidden_cx) = self.net_lstm(s, (self.lstm_hidden_hx, self.lstm_hidden_cx))
        output = F.relu(self.net_mlp2_1(lstm_out))
        output = F.relu(self.net_mlp2_2(output))

        return output.tanh()  # action.tanh()

    def get_logprob_entropy(self, state, action):
        torch.autograd.set_detect_anomaly(True)
        a_avg = torch.stack([self.forward(state[i, :, :].unsqueeze(0)) for i in range(0, state.size(0))], dim=0)
        a_avg = torch.squeeze(a_avg)  # shape = [batch_size, sequence_length, action_dim]
        a_std = self.a_logstd.exp()
        delta = ((a_avg - action) / a_std).pow(2) * 0.5
        logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(2)  # new_logprob

        return logprob  # shape = [batch_size, sequence_length]

    def get_old_logprob(self, _action, noise):  # noise = action - a_noise
        delta = noise.pow(2) * 0.5
        return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(2)  # old_logprob

class CriticAdv(nn.Module):

    def __init__(self, mid_dim, hidden_dim, state_dim):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.lstm_hidden_hx = torch.randn([1, 1, hidden_dim], dtype=torch.float, device=self.device)
        self.lstm_hidden_cx = torch.randn([1, 1, hidden_dim], dtype=torch.float, device=self.device)
        self.net_mlp1 = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                      nn.Linear(mid_dim, hidden_dim), nn.ReLU())
        self.net_lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.net_mlp2 = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.Hardswish(),
                                      nn.Linear(hidden_dim, 1))

    def forward(self, state):
        torch.autograd.set_detect_anomaly(True)
        self.net_lstm.flatten_parameters()
        s = self.net_mlp1(state)
        lstm_out, (self.lstm_hidden_hx, self.lstm_hidden_cx) = self.net_lstm(s, (self.lstm_hidden_hx, self.lstm_hidden_cx))
        output = self.net_mlp2(lstm_out)

        return output.tanh()  # Q value

class Agent:

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.act = ActorPPO(2**5, 2**4, 10, 3).to(self.device)
        self.cri = CriticAdv(2**5, 2**4, 10).to(self.device)
        self.criterion = torch.nn.SmoothL1Loss()
        learning_rate = 1e-4
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=learning_rate)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=learning_rate)

    def update_net(self):
        obj_critic = obj_actor = logprob = obj_critic_c = None
        state = torch.randn(32, 64, 10).to(torch.device('cuda'))
        action = torch.randn(32, 64, 3).to(torch.device('cuda'))
        r_sum = torch.randn(32, 64).to(torch.device('cuda'))
        logprob = torch.randn(32, 64).to(torch.device('cuda'))
        advantage = torch.randn(32, 64).to(torch.device('cuda'))
        new_logprob = self.act.get_logprob_entropy(state, action)
        ratio = (new_logprob - logprob.detach()).exp()
        obj_surrogate = advantage * ratio
        obj_actor = obj_surrogate.mean()
        self.optim_update(self.act_optimizer, obj_actor)

        value = torch.stack([self.cri(state[i, :, :].unsqueeze(0)) for i in range(0, state.size(0))], dim=0)
        value = torch.squeeze(value)
        obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
        self.optim_update(self.cri_optimizer, obj_critic)

    @staticmethod
    def optim_update(optimizer, objective):
        optimizer.zero_grad()
        torch.autograd.set_detect_anomaly(True)
        objective.backward(retain_graph=True)
        optimizer.step()

When I call update_net, it fails and reports the following error:

[W …\torch\csrc\autograd\python_anomaly_mode.cpp:104] Warning: Error detected in CudnnRnnBackward0. Traceback of forward call that caused the error:
  File "test_case.py", line 114, in <module>
    agent.update_net()
  File "test_case.py", line 92, in update_net
    new_logprob = self.act.get_logprob_entropy(state, action)
  File "test_case.py", line 42, in get_logprob_entropy
    a_avg = torch.stack([self.forward(state[i,:,:].unsqueeze(0)) for i in range(0, state.size(0))], dim=0)
  File "test_case.py", line 42, in <listcomp>
    a_avg = torch.stack([self.forward(state[i,:,:].unsqueeze(0)) for i in range(0, state.size(0))], dim=0)
  File "test_case.py", line 34, in forward
    lstm_out, (self.lstm_hidden_hx, self.lstm_hidden_cx) = self.net_lstm(s, (self.lstm_hidden_hx, self.lstm_hidden_cx))
  File "D:\Anaconda3\envs\tfenv\lib\site-packages\torch\nn\modules\module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\Anaconda3\envs\tfenv\lib\site-packages\torch\nn\modules\rnn.py", line 692, in forward
    self.dropout, self.training, self.bidirectional, self.batch_first)
 (function print_stack)
Traceback (most recent call last):
  File "test_case.py", line 114, in <module>
    agent.update_net()
  File "test_case.py", line 96, in update_net
    self.optim_update(self.act_optimizer, obj_actor)
  File "test_case.py", line 107, in optim_update
    objective.backward(retain_graph=True)
  File "D:\Anaconda3\envs\tfenv\lib\site-packages\torch\_tensor.py", line 307, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "D:\Anaconda3\envs\tfenv\lib\site-packages\torch\autograd\__init__.py", line 156, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 16]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

I hope you can help me ~

Well, I think I found the reason.
The hidden state should be initialized as:
self.lstm_hidden = None
and then in the forward pass:
lstm_out, self.lstm_hidden = self.net_lstm(s, self.lstm_hidden)
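
In case it helps anyone else: my understanding (I have not verified this in detail) is that the hidden state stored on the module carries the autograd graph of earlier forward calls into later ones, so backward() ends up revisiting tensors that have since been overwritten. A minimal sketch of how ActorPPO.forward could avoid that, reusing the attribute names from the code above (the detach() placement is my own choice, not a confirmed fix):

def forward(self, state):
    self.net_lstm.flatten_parameters()
    s = F.relu(self.net_mlp1_1(state))
    s = F.relu(self.net_mlp1_2(s))

    # Pass hx=None to start from zeros; otherwise detach the stored state so
    # this graph cannot reach back into a previous forward/backward pass.
    if self.lstm_hidden_hx is None:
        hidden = None
    else:
        hidden = (self.lstm_hidden_hx.detach(), self.lstm_hidden_cx.detach())

    lstm_out, (hx, cx) = self.net_lstm(s, hidden)
    # Keep the state for the next step, but without its autograd history.
    self.lstm_hidden_hx, self.lstm_hidden_cx = hx.detach(), cx.detach()

    output = F.relu(self.net_mlp2_1(lstm_out))
    output = F.relu(self.net_mlp2_2(output))
    return output.tanh()

With the state detached (or simply reset to None before each update), the backward() in optim_update only sees the graph of the current forward pass, which, as far as I can tell, is what the "is at version 2; expected version 1" message is complaining about.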