Deep Q-Learning: grad is None even though is_leaf=True and requires_grad=True

Hi

I am trying to solve a Capacitated Vehicle Routing Problem (CVRP) with Deep Q-Learning. However, my algorithm is not learning at all. I printed the gradients of the network parameters after the backward pass and they are all None, even though every parameter has requires_grad=True and is_leaf=True. I have tried many things, but I cannot get it to work.

Could anybody please help me with what the problem is?
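
For reference, this is roughly the check I run right after loss_tensor.backward() in the training loop below (using the q_network defined there):

for name, param in q_network.named_parameters():
    print(name,
          "is_leaf:", param.is_leaf,
          "requires_grad:", param.requires_grad,
          "grad:", param.grad)
# Every parameter prints is_leaf: True, requires_grad: True, grad: None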

My implementation is below:

import random
import time
from collections import deque, namedtuple

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

# (num_customer, num_vehicles, num_instances, device and the enviroment class are defined elsewhere in my code)

class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        h1 = int(2*(state_size - action_size - 1) / 3) + action_size + 1
        h2 = int((state_size - action_size - 1) / 3) + action_size + 1
        self.main = nn.Sequential(
            nn.Linear(in_features=state_size, out_features=h1),
            nn.LeakyReLU(),
            nn.Linear(in_features=h1, out_features=h2),
            nn.LeakyReLU(),
            nn.Linear(in_features=h2, out_features=action_size),
            nn.Identity()
        )

    def forward(self, state):
        return self.main(state)

def get_q_values(q_network, state):
  q_values = q_network(state)
  customer_availability_length = num_customer
  if len(state.shape) == 1:  # Check if it's a single state
      customer_availability_tensor = state[:customer_availability_length]
  else:  
      customer_availability_tensor = state[:, :customer_availability_length]

  customer_availability_mask = 1 - customer_availability_tensor  # Invert the availability flags (1 = unavailable)
  customer_availability_mask = customer_availability_mask * (10e9)  # Large penalty so argmin/min never selects unavailable customers

  q_values = q_values + customer_availability_mask  # Add the mask to q_values

  return q_values


def compute_target_q_values(rewards, next_states, q_network, is_done_mask, discount_factor=1):
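    # Bellman target for a cost-minimising problem: reward + gamma * min_a' Q(s', a'); terminal next states contribute 0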
    next_state_values = get_q_values(q_network, next_states)
    min_next_state_values, _ = torch.min(next_state_values, dim=1)
    min_next_state_values[is_done_mask] = 0
    target_q_values = rewards + discount_factor * min_next_state_values
    return target_q_values


max_trials = 10000 # Total training epochs for each instance
batch_size = 32 # Number of transitions used in each training update
experience_replay_capacity = 10000 # Maximum number of past experiences (transitions) to store
learning_rate = 0.0001 # Learning rate for the optimizer
target_update_frequency = 1000 # Trials between target network updates

## Agent setup and training
state_size = num_customer + num_vehicles * 3 + 1
action_size = num_customer

q_network = QNetwork(state_size, action_size).to(device)
print("Q-network: ", q_network)
q_network.train()
target_network = QNetwork(state_size, action_size).to(device)
print("Target network: ", target_network)
target_network.load_state_dict(q_network.state_dict())
target_network.eval()

optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)
loss = nn.MSELoss()

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'is_done'))

decay = 0.999 # Epsilon decay factor applied once per trial
min_epsilon = 0.1

def train(config=None):
    start_time = time.time()
    epsilon = 1.0  # Initialize epsilon
    # Oldest transitions are automatically removed when the buffer is full
    experience_replay = deque(maxlen=experience_replay_capacity)
    losses = []  # List to store losses for plotting
    for trial in range(max_trials):
      for instance in range(num_instances):
        environment = enviroment(instance)  # Create a new environment for each instance and each trial

        iterator = 0
        while not environment.is_terminal() and iterator < 50:
          iterator += 1
          vehicles_to_take_action = environment.get_vehicles_to_take_action()

          for vehicle in vehicles_to_take_action:
            state = environment.get_current_state_as_1D_tensor()
            possible_actions = environment.get_possible_actions()
            if not possible_actions:
              break

            if environment.get_vehicle_return_to_depot(vehicle):
              action = 0
            else:
              n_possible_actions = len(possible_actions)
              if random.random() <= epsilon:
                  # Explore: choose a random feasible customer
                  action_index = random.randint(0, n_possible_actions - 1)
                  action = possible_actions[action_index]
              else:
                  # Exploit: choose the customer with the lowest predicted cost
                  q_values = get_q_values(q_network, state)
                  action = torch.argmin(q_values).item()

            reward = environment.get_immediate_reward(action=action, vehicle=vehicle)

            environment.update_current_state_to_post_decision_state(action=action, vehicle=vehicle)
            environment.update_post_decision_state_to_next_state(action=action, vehicle=vehicle, instance_number = instance)
            next_state = environment.get_current_state_as_1D_tensor()

            experience_replay.append(Transition(state, action, next_state, reward, environment.is_terminal()))

            if len(experience_replay) >= batch_size:
                batch = random.sample(experience_replay, batch_size)
                batch = Transition(*zip(*batch))
                states_list = [state.float() for state in batch.state]
                states = torch.stack(states_list).to(device).requires_grad_(True)
                actions = torch.tensor(batch.action, dtype=torch.int64, device=device)
                rewards = torch.tensor(batch.reward, dtype=torch.float32, device=device)
                next_states_list = [state.float() for state in batch.next_state]
                next_states = torch.stack(next_states_list).to(device).requires_grad_(True)
                is_done_list = torch.tensor(batch.is_done, dtype=torch.bool, device=device)

        
                with torch.enable_grad():
                  target_q_values = compute_target_q_values(rewards, next_states, target_network, is_done_list, discount_factor=0.999)
                  q_values = get_q_values(q_network, states)
                  q_values = q_values.clone().detach().requires_grad_(True)
                  q_values = q_values.gather(1, actions.unsqueeze(1))  # Get the Q-value for the chosen action
                  optimizer.zero_grad()
                  loss_tensor = loss(q_values, target_q_values.unsqueeze(1))
                  loss_tensor.backward()
                  for param in q_network.parameters():
                    if param.grad is not None:
                        print("Gradients found")
                  optimizer.step()
   
                losses.append(loss_tensor.item())

                if trial % target_update_frequency == 0:
                    print("Target networks updated")
                    target_network.load_state_dict(q_network.state_dict())

      epsilon = max(min_epsilon, epsilon*decay)

      print("Trial: ", trial)

    plt.figure(figsize=(8, 5))
    plt.plot(losses, label='Training Loss', color='blue', linestyle='-')
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    plt.title('Training Loss Over Iterations')
    plt.legend()
    plt.grid()
    plt.show()

    print(f"Time taken: {time.time() - start_time:.2f}s")
    return q_network

Edit: I can't seem to find out how to edit my original post, so here is the solution.

I solved the issue. The problem is this line:

q_values = q_values.clone().detach().requires_grad_(True)

which should simply be deleted. detach() returns a new tensor that is cut off from the autograd graph, so the loss computed from it has no path back to q_network's parameters, and backward() never fills in their .grad (the detached tensor is itself a leaf with requires_grad=True, which is why those flags looked fine).
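
For anyone hitting the same thing, here is a minimal standalone sketch (a toy linear layer and made-up data, not the CVRP code) that reproduces both the broken and the fixed behaviour:

import torch
import torch.nn as nn

layer = nn.Linear(4, 2)
x = torch.randn(8, 4)
target = torch.randn(8, 2)
criterion = nn.MSELoss()

# Broken: detach() creates a new leaf tensor that is cut off from the graph,
# so the loss has no path back to layer's parameters.
out = layer(x).clone().detach().requires_grad_(True)
criterion(out, target).backward()
print(layer.weight.grad)  # prints None

# Fixed: keep the original output so gradients can reach the parameters.
out = layer(x)
criterion(out, target).backward()
print(layer.weight.grad)  # prints a gradient tensor

As a side note, the requires_grad_(True) calls on the state batches are not needed either; gradients for the network parameters are tracked automatically as long as the forward pass is not detached.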