Actor-critic / inplace operation error

I'm implementing an actor-critic framework, specifically counterfactual multi-agent policy gradients (COMA), and I can't fix the following error:

Traceback (most recent call last):
  File "C:/Users/sueng/OneDrive - SNU/FJSP_MARL/COMA.py", line 330, in <module>
    agents.train()
  File "C:\Users\sueng\OneDrive - SNU\FJSP_MARL\COMA.py", line 216, in train
    actor_loss.backward()
  File "C:\Users\sueng\anaconda3\envs\FJSP\lib\site-packages\torch\tensor.py", line 166, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "C:\Users\sueng\anaconda3\envs\FJSP\lib\site-packages\torch\autograd\__init__.py", line 99, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 36]], which is output 0 of SoftmaxBackward, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
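The error message suggests enabling anomaly detection, so here is a minimal sketch of how I wrap the training call (the same agents.train() entry point as in the code below) to get autograd to point at the offending operation:

import torch

# Re-run the training step with anomaly detection enabled so that autograd
# reports the forward-pass operation whose output was later modified in place.
with torch.autograd.set_detect_anomaly(True):
    agents.train()

With this enabled the traceback should also point to the forward-pass line that produced the tensor, but I still can't figure out what I need to change. My full code (COMA.py) is below.
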
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from FJSP import RL_ENV

from torch.distributions import Categorical
from copy import deepcopy


class Memory:
    def __init__(self, agent_num, action_dim):
        self.agent_num = agent_num
        self.action_dim = action_dim
        self.states = []
        self.actions = []
        self.actions_last = []
        self.observations = []
        self.pi = [[] for _ in range(agent_num)]
        self.reward = []
        self.done = [[] for _ in range(agent_num)]
    def get(self):
        actions = torch.tensor(self.actions)
        observations = self.observations
        states = self.states
        actions_last = self.actions_last
        pi = []
        for i in range(self.agent_num):
            pi.append(torch.cat(self.pi[i]).view(len(self.pi[i]), self.action_dim))
        reward = torch.tensor(self.reward)
        done = self.done
        return states, actions, observations, pi, reward, actions_last, done
    def clear(self):
        self.actions = []
        self.observations = []
        self.states = []
        self.actions_last = []
        self.pi = [[] for _ in range(self.agent_num)]
        self.reward = []
        self.done = [[] for _ in range(self.agent_num)]


class Actor(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(obs_dim+action_dim, 64)
        self.GRU_layer = nn.GRUCell(64, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)
        #self.fc3 = nn.Linear(45, action_dim)
        self.ReLU = nn.ReLU()
        self.Softmax = nn.Softmax(dim = 1)
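    # obs_a_cat: observation concatenated with the last one-hot action; hidden_last: previous GRU hidden state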
    def forward(self, obs_a_cat, hidden_last):
        x = self.ReLU(self.fc1(obs_a_cat))
        gru_out = self.GRU_layer(x, hidden_last)
        utility = self.fc2(gru_out)
        action_prob = self.Softmax(utility)
        #utility = x.detach()
        return action_prob, gru_out, utility

class Critic(nn.Module):
    def __init__(self, agent_num, state_dim, obs_dim, action_dim):
        super(Critic, self).__init__()

        input_dim = (agent_num - 1) * action_dim + state_dim + obs_dim + agent_num + action_dim
        # components: other agents' actions, state, obs, one-hot agent id, agent's last action

        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        output = self.fc3(x)
        return output


class COMA:
    def __init__(self, agent_num, state_dim, obs_dim, action_dim, hidden_dim, lr_c, lr_a, gamma, target_update_steps):
        self.agent_num = agent_num
        self.state_dim = state_dim
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        self.gamma = gamma

        self.target_update_steps = target_update_steps

        self.memory = Memory(agent_num, action_dim)

        self.actors = [Actor(obs_dim, action_dim, hidden_dim) for _ in range(agent_num)]
        self.critic = Critic(agent_num, state_dim, obs_dim, action_dim)

        self.critic_target = Critic(agent_num, state_dim, obs_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actors_optimizer = [torch.optim.Adam(self.actors[i].parameters(), lr=lr_a) for i in range(agent_num)]
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr_c)

        self.encoded_action = np.eye(action_dim)
        self.encoded_id = np.eye(agent_num)
        self.count = 0

    def get_actions(self, observations, avail_actions, action_last, hidden_last):
        #print(observations.shape)
        observations = np.array(observations).reshape(-1, self.obs_dim)
        action_last = np.array(action_last).reshape(-1, self.action_dim)
        obs_u_cat = np.concatenate([observations, action_last], axis = 1)
        #print(obs_u_cat.shape)


        actions = []
        hiddens = []
        utilities = []


        for i in range(self.agent_num):

            obs_u_i = torch.Tensor(obs_u_cat[i]).reshape(1, -1)
            hidden_i = torch.Tensor(hidden_last[i]).reshape(1, -1)

            #print(obs_u_i.shape, hidden_i.shape)

            #print(obs_u_i.shape, hidden_i.shape, self.obs_dim+self.action_dim)
            #hidden_i = hidden_i.view(1, -1)
            #
            #with torch.autograd.set_detect_anomaly(True):
            dist, hidden, utility = self.actors[i](obs_u_i, hidden_i)
            self.memory.pi[i].append(dist)
            avail_action = avail_actions[i]
            mask = torch.from_numpy(np.array(avail_action)).reshape(1,-1)
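            # zero out the probabilities of unavailable actions (this writes into dist in place)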
            dist[mask==0] = 0

            action = Categorical(dist).sample()
            actions.append(action.item())
            hiddens.append(hidden)

            utilities.append(utility.detach().numpy().tolist())

        return actions, hiddens, utilities

    def train(self):
        actor_optimizer = self.actors_optimizer
        critic_optimizer = self.critic_optimizer
        states, actions, observations, pi, reward, actions_last, done = self.memory.get()
        states = torch.Tensor(states).view(-1, self.state_dim)

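        # one-hot encode every agent's action at each timestep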
        encoded_actions = [self.encoded_action[act].tolist() for act in actions]


        actions_last = np.reshape(actions_last, (self.agent_num, -1, self.action_dim))
        observations = np.reshape(observations, (self.agent_num, -1, self.obs_dim))



        print("????????????????")

        for i in range(self.agent_num):
            # per-agent update: build the centralised critic input, then train the critic and this agent's actor
            other_agent_actions = deepcopy(encoded_actions)
            other_agent_actions = [delete_index(actions, i) for actions in other_agent_actions]
            agent_last_action = actions_last[i]
            obs = observations[i]
            agent_id = [self.encoded_id[i] for _ in range(len(reward))]
            input_critic = self.build_input_critic(other_agent_actions, states, obs, agent_id, agent_last_action)

            Q_target = self.critic_target(input_critic).detach()  # the target network is not trained directly, so detach() it from the graph

            action_taken = actions.type(torch.long)[:, i].reshape(-1, 1)

            """ pi((u_a_1,..... u_a-) * q(s, (u_a_1,..... u_a-)) + 
                pi((u_a_2,..... u_a-) * q(s, (u_a_2,..... u_a-)) +...
                pi((u_a_|U|,.....u_a_) * q(s, (u_a_|U|,....u_a_))"""

            Q_taken_target = torch.gather(Q_target, dim=1, index=action_taken).squeeze()

            # train critic

            Q = self.critic(input_critic)
            action_taken = actions.type(torch.long)[:, i].reshape(-1, 1)
            Q_taken = torch.gather(Q, dim=1, index=action_taken).squeeze()



            y_i = []
            for t in range(len(reward)):
                if done[t]:
                    y_i.append(reward[t] + self.gamma * Q_taken_target[t + 1].item())
                else:
                    y_i.append(reward[t])

            #y_i = torch.Tensor([ if done[t] else reward[t] for t in range(len(reward))])
            y_i = torch.Tensor(y_i)
            #print(y_i.shape, Q_taken.shape)
            critic_loss = torch.mean((y_i.detach() - Q_taken) ** 2)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 5)
            critic_optimizer.step()



            Q_taken_target_clone = Q_taken_target.clone()
            Q_target_clone = Q_target.clone()
            baseline = torch.sum(pi[i] * Q_target_clone, dim=1)

            advantage = Q_taken_target_clone.detach() - baseline  # compute the counterfactual advantage
            log_pi = torch.log(torch.gather(pi[i], dim=1, index=action_taken).squeeze())
            actor_loss = - torch.mean(advantage.detach() * log_pi)
            print(actor_loss)

            actor_optimizer[i].zero_grad()
            #with torch.autograd.set_detect_anomaly(True):

            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.actors[i].parameters(), 5)


            actor_optimizer[i].step()


        if self.count == self.target_update_steps:
            self.critic_target.load_state_dict(self.critic.state_dict())
            self.count = 0
        else:
            self.count += 1

        self.memory.clear()


    def build_input_critic(self, other_agent_actions, state, obs, agent_id, agent_last_action):
        other_agent_actions = torch.Tensor(other_agent_actions).view(-1, (self.agent_num-1)*self.action_dim)
        state = torch.Tensor(state).view(-1, self.state_dim)
        obs = torch.Tensor(obs).view(-1, self.obs_dim)
        agent_id = torch.Tensor(agent_id).view(-1, self.agent_num)
        agent_last_action = torch.Tensor(agent_last_action).view(-1, self.action_dim)
        input_critic = torch.cat([other_agent_actions, state, obs, agent_id, agent_last_action], axis = 1)
        return input_critic

def delete_index(input, idx):
    del input[idx]
    return input

import matplotlib.pyplot as plt
import numpy as np

from COMA import COMA


def moving_average(x, N):
    return np.convolve(x, np.ones((N,)) / N, mode='valid')


if __name__ == "__main__":
    # Hyperparameters
    #with torch.autograd.set_detect_anomaly(True):
    env = RL_ENV()
    env_info = env.get_env_info()

    agent_num = env_info["n_agents"]
    obs_dim = env_info['obs_shape'] #+ agent_num
    state_dim = env_info['state_shape']
    action_dim = env_info["n_actions"]
    hidden_dim = 32

    gamma = 0.99
    lr_a = 0.0001
    lr_c = 0.005

    target_update_steps = 10

    # agent initialisation

    agents = COMA(agent_num, state_dim, obs_dim, action_dim, hidden_dim, lr_c, lr_a, gamma, target_update_steps)


    env.render()

    episode_reward = 0
    episodes_reward = []

    # training loop

    n_episodes = 10000
    episode = 0

    for episode in range(n_episodes):
        env = RL_ENV()
        env.render()
        episode_reward = 0
        makespan = 0
        actions_last = env.last_action
        hidden_last = torch.zeros((agent_num, hidden_dim))
        done = False
        step = 0
        while not done:

            obs = env.get_obs()
            state = env.get_state()


            agents.memory.actions_last.append(actions_last)
            avail_actions = np.array(env.get_avail_actions())
            actions, hidden_last, utility = agents.get_actions(obs, avail_actions, actions_last, hidden_last)


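            # pick out the utility of the action each agent actually took and pass it to the environment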
            indices = torch.LongTensor(actions).view(-1, 1)

            utility = torch.Tensor(utility).view(agent_num, -1)

            utility = torch.gather(utility, 1, indices)

            reward, done, actions = env.step(actions, utility)
            reward = reward / 100
            agents.memory.states.append(state)
            agents.memory.observations.append(obs)
            agents.memory.actions.append(actions)
            agents.memory.done.append(done)
            agents.memory.reward.append(reward)
            episode_reward += reward
            step = step +1
            print(step)


            if done:
                episodes_reward.append(episode_reward)
                episode_reward = 0
                episode = episode +1
                agents.train()

                score = env.env.now
                print(f"episode: {episode}, average reward: {score}")