Runtime error in an RL agent's backward function

I'm trying to test an RL agent and this error keeps coming up. I've tried everything in the book and still get the same error, so I'd be grateful if someone could help me.
Thanks in advance :innocent:

import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class DQN(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super(DQN, self).__init__()
        
        self.lr= lr
        self.input_dims= input_dims
        self.fc1_dims= fc1_dims
        self.fc2_dims= fc2_dims
        self.n_actions= n_actions
        
        self.fc1= nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2= nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3= nn.Linear(self.fc2_dims, self.n_actions)
        #self.fc3= nn.Linear(self.fc2_dims, self.fc3_dims)
        #self.fc4= nn.Linear(self.fc3_dims, self.n_actions)
        
        self.optimizer= optim.Adam(self.parameters(), lr=lr)
        self.loss= nn.MSELoss()
        self.device= T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        
        self.to(self.device)
        
    def forward(self, state):
        '''
        Returns the Q-value of each action for the given state.
        '''
        #state= obs.clone().detach().to(self.device)
        x= F.relu(self.fc1(state))
        x= F.relu(self.fc2(x))        
        actions= F.relu(self.fc3(x))
        return actions
        
class ShiftableAgent:
    def __init__(self, lr, gamma, epsilon,input_dims,batch_size, n_actions,
                 max_mem_size=100000, eps_min=0.1, eps_dec=0.01,momentum=0.90):
        
        self.lr= lr
        self.gamma= gamma
        self.epsilon= epsilon
        self.epsilon_min= eps_min
        self.epsilon_decrement= eps_dec
        self.batch_size= batch_size
        self.action_space= [action for action in range(n_actions)]
        self.mem_size= max_mem_size
        self.mem_ctr= 0
        self.iter_ctr= 0
        self.exploit_times= 0
        self.replace_target= 100
        
        
        self.Q_eval= DQN(self.lr, input_dims, fc1_dims= 256, fc2_dims=256, n_actions= n_actions)
        
        self.state_memory= np.zeros((self.mem_size, *input_dims), dtype= np.float32)
        self.action_memory= np.zeros(self.mem_size, dtype= np.int32)
        self.reward_memory= np.zeros(self.mem_size, dtype=np.float32)
        self.new_state_memory= np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.terminal_memory= np.zeros(self.mem_size, dtype= np.uint8)
    
    
    def store_transition(self, state, action,reward, state_, done):
        index= self.mem_ctr % self.mem_size
        
        self.state_memory[index]= state
        self.action_memory[index]= action
        self.reward_memory[index]= reward
        self.new_state_memory[index]= state_
        self.terminal_memory[index]= done
        self.mem_ctr+=1
    
    def choose_action(self, observation):
        if np.random.uniform(0,1) > self.epsilon:
            state= T.tensor([observation], dtype=T.float32).to(self.Q_eval.device)
            actions= self.Q_eval.forward(state)
            action= T.argmax(actions).item()
        else:
            
            action= np.random.choice(self.action_space)
        return action
    
    def learn(self):
        
        if self.mem_ctr < self.batch_size:
            return
        
        self.Q_eval.optimizer.zero_grad()
        
        max_mem= min(self.mem_ctr, self.mem_size)
        batch= np.random.choice(max_mem, self.batch_size, replace= False)
        batch_index= np.arange(self.batch_size, dtype= np.int32)
        
        state_batch= T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        next_state_batch= T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        reward_batch= T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch= T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)
        action_batch= self.action_memory[batch]
        
        q_eval= self.Q_eval.forward(state_batch)[batch_index, action_batch]
        q_next= self.Q_eval.forward(next_state_batch)
        q_next[terminal_batch.bool()]= 0
        
        q_target= reward_batch + self.gamma * T.max(q_next, dim= 1)[0]
        
        loss= self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        
        self.Q_eval.optimizer.step()
        self.epsilon= self.epsilon - self.epsilon_decrement if self.epsilon > self.epsilon_min else self.epsilon_min

env= shift_env(num_hours=24, appliance_hourly_power_consumption=0.083, hourly_total_power_consumption= 25,
               max_on_times=3, appliance_runtime=1.5)

agent= ShiftableAgent(lr= 0.7, gamma=0.8, epsilon=1.0, input_dims=[1], batch_size=128, n_actions=2)

scores= []
eps_history= []
action_0= {}

#time= 12960 # one year and a half of training
time= 48
for t in range(time):
    
    state= env.reset()
    score= 0
    done= False
    print(f'at time t {t}')
    
    while not done:
        action= agent.choose_action(state)
        next_state, reward, done, info= env.step(action)
        score+=reward
        #print(f'state: {state}, action: {action}, next_state: {next_state}, reward: {reward}, done: {done}')
        agent.store_transition(state, action, reward, next_state, done)
        agent.learn()
        state= next_state
        if action==0:
            action_0.update({t: action})
        
        
    scores.append(score)
    eps_history.append(agent.epsilon)
  
    print(f'time {t+1} score {score}\n')
    print(info)
    print('_'*20)

Here is the full traceback:

RuntimeError                              Traceback (most recent call last)
Cell In[12], line 16
     14 #print(f'state: {state}, action: {action}, next_state: {next_state}, reward: {reward}, done: {done}')
     15 agent.store_transition(state, action, next_state, reward, done)
---> 16 agent.learn()
     17 state= next_state
     18 if action==0:

Cell In[7], line 72, in ShiftableAgent.learn(self)
     69 q_target= reward_batch + self.gamma * T.max(q_next, dim= 1)[0]
     71 loss= self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
---> 72 loss.backward()
     74 self.Q_eval.optimizer.step()
     75 self.epsilon= self.epsilon - self.epsilon_decrement if self.epsilon > self.epsilon_min else self.epsilon_min

File D:\PRO\anaconda3\Lib\site-packages\torch\_tensor.py:522, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    512 if has_torch_function_unary(self):
    513     return handle_torch_function(
    514         Tensor.backward,
    515         (self,),
   (...)
    520         inputs=inputs,
    521     )
--> 522 torch.autograd.backward(
    523     self, gradient, retain_graph, create_graph, inputs=inputs
    524 )

File D:\PRO\anaconda3\Lib\site-packages\torch\autograd\__init__.py:266, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    261     retain_graph = create_graph
    263 # The reason we repeat the same comment below is that
    264 # some Python versions print out the first line of a multi-line function
    265 # calls in the traceback and some print out the last line
--> 266 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    267     tensors,
    268     grad_tensors_,
    269     retain_graph,
    270     create_graph,
    271     inputs,
    272     allow_unreachable=True,
    273     accumulate_grad=True,
    274 )

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 2]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck! 
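
For reference, this tiny standalone snippet (not my agent code, just my understanding of the pattern the error describes, with made-up names x and y) reproduces the same failure mode:

import torch as T

x= T.randn(4, 2, requires_grad=True)
y= T.relu(x)            # y is the output of ReluBackward0
y[0]= 0                 # in-place write bumps y's version from 0 to 1
y.sum().backward()      # RuntimeError: ... is at version 1; expected version 0 instead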

I can't track down where the in-place change happens in my code or how to fix it.
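
In case it helps, here is a minimal sketch of how I'm thinking of narrowing down where the in-place write happens, using torch.autograd.set_detect_anomaly (this assumes the same agent and env objects defined above; I haven't confirmed it pinpoints the exact line):

import torch as T

# Anomaly detection makes the backward error include a second traceback that
# points at the forward-pass op whose output was later modified in place.
T.autograd.set_detect_anomaly(True)

state= env.reset()
done= False
while not done:
    action= agent.choose_action(state)
    next_state, reward, done, info= env.step(action)
    agent.store_transition(state, action, reward, next_state, done)
    agent.learn()        # the error here should now name the offending forward op
    state= next_state

T.autograd.set_detect_anomaly(False)   # turn it off again, it slows training down

My current suspicion is that the q_next[terminal_batch.bool()]= 0 line is the in-place write, since q_next is the direct output of forward (the F.relu(self.fc3(x)) output of shape [128, 2]), and that computing q_next inside T.no_grad() or calling .detach() on it before that assignment would avoid the error, but I'd appreciate confirmation that that's the right fix for DQN.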