Loss keeps growing instead of decreasing in DQN

Hi, I’m developing an agent for a reinforcement learning problem. It uses a custom Gym environment with a continuous state space and a continuous action space. The resulting action is a vector [u, v], which is a command for the bot to execute. Since the network input is coordinates (which can be negative), I couldn’t use ReLU as the activation function, so I used LeakyReLU with a negative slope of 1, which effectively makes it a linear (identity) activation.
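
A quick check confirms that a negative slope of 1 really reduces LeakyReLU to the identity (it returns x for x >= 0 and negative_slope * x otherwise):

import torch
import torch.nn.functional as F

x = torch.randn(4)
# x for x >= 0, negative_slope * x for x < 0 -> the identity when negative_slope=1
print(torch.allclose(F.leaky_relu(x, negative_slope=1), x))  # prints True
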
Here is my model:

import torch
import torch.nn as nn
import torch.nn.functional as F


class DQN(nn.Module):
    def __init__(self, in_features=6, hidden_layers=[15, 15, 15, 15, 15, 10, 10, 8, 4], outputs=2):
        super().__init__()
        self.first_layer = nn.Linear(in_features, hidden_layers[0])
        self.hidden1 = nn.Linear(hidden_layers[0], hidden_layers[1])
        self.hidden2 = nn.Linear(hidden_layers[1], hidden_layers[2])
        self.hidden3 = nn.Linear(hidden_layers[2], hidden_layers[3])
        self.hidden4 = nn.Linear(hidden_layers[3], hidden_layers[4])
        self.hidden5 = nn.Linear(hidden_layers[4], hidden_layers[5])
        self.hidden6 = nn.Linear(hidden_layers[5], hidden_layers[6])
        self.hidden7 = nn.Linear(hidden_layers[6], hidden_layers[7])
        self.hidden8 = nn.Linear(hidden_layers[7], hidden_layers[8])
        self.output  = nn.Linear(hidden_layers[8], outputs)

    def forward(self, x):
        x = x.to(device)  # device is the global torch.device set up elsewhere
        # LeakyReLU with negative_slope=1 is the identity, so these layers stay purely linear
        x = F.leaky_relu(self.first_layer(x), negative_slope=1)
        x = F.leaky_relu(self.hidden1(x), negative_slope=1)
        x = F.leaky_relu(self.hidden2(x), negative_slope=1)
        # sigmoid activations in the middle of the stack (torch.sigmoid; F.sigmoid is deprecated)
        x = torch.sigmoid(self.hidden3(x))
        x = torch.sigmoid(self.hidden4(x))
        x = torch.sigmoid(self.hidden5(x))
        x = torch.sigmoid(self.hidden6(x))
        x = torch.sigmoid(self.hidden7(x))
        x = F.leaky_relu(self.hidden8(x), negative_slope=1)
        x = F.leaky_relu(self.output(x), negative_slope=1)
        return x
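
For context, device and the two networks are set up in the usual way; a dummy forward pass just confirms the output is one [u, v] pair per state in the batch:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

policy_net = DQN().to(device)
target_net = DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

dummy_states = torch.randn(5, 6)       # a batch of 5 states, 6 features each
print(policy_net(dummy_states).shape)  # torch.Size([5, 2])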

And this is my optimization function:

def optimize_model():
    print(len(memory))  # debug: current replay-memory size
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Convert a batch of Transitions into a Transition of batches
    batch = Transition(*zip(*transitions))

    # Mask of transitions whose next state is not terminal
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)  # (currently unused below)
    reward_batch = torch.cat(batch.reward)
    # Duplicate the reward into two columns so it matches the [u, v] output shape
    reshaped_reward_batch = torch.cat(
        (
            reward_batch.reshape(reward_batch.shape[0], 1),
            reward_batch.reshape(reward_batch.shape[0], 1)
        ), 1
    )

    # Predicted values: the raw [u, v] output of the policy net for each state
    state_action_values = policy_net(state_batch)

    # Target values: zero for terminal states, target-net output otherwise
    next_state_values = torch.zeros((BATCH_SIZE, 2), device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).detach()

    expected_state_action_values = (next_state_values * GAMMA) + reshaped_reward_batch

    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values)
    print('Loss', loss.item())

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)  # clip gradients element-wise to [-1, 1]
    optimizer.step()
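
The remaining pieces these snippets reference but don't show (Transition, the replay memory, hyperparameters, and the optimizer) follow the standard PyTorch DQN tutorial pattern; the sketch below uses illustrative values, and select_action plus the environment are custom and omitted:

from collections import namedtuple, deque
from itertools import count
import random

import torch.optim as optim

# A single transition in the environment
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    """Fixed-size buffer of past transitions, sampled uniformly at random."""
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

# Illustrative hyperparameters
BATCH_SIZE = 128
GAMMA = 0.99
STEPS_TO_UPDATE_TARGET_NET = 10

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)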

And this is my training loop:

for i_episode in range(num_episodes):
    state = env.reset()
    state = torch.tensor([state], device=device, dtype=torch.float)
    for t in count():
        action = select_action(state)
        action_cpu = action.cpu()
        state, reward, done, info = env.step(action_cpu[0])
        state = torch.tensor([state], device=device, dtype=torch.float)
        reward = torch.tensor([reward], device=device, dtype=torch.float)

        if not done:
            next_state = state
        else:
            next_state = None
        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move on to the next state
        state = next_state

        # (min_distance and steps are initialized before this loop; not shown)
        min_distance = min(info["distance"], min_distance)
        print(f'Episode: {i_episode}, Iteration: {t}, Reward: {reward[0]}')
        print(f'Action: {action_cpu[0]}')
        print(f'Min Distance from target: {min_distance}\n')
        optimize_model()
        steps += 1
        if done:
            episode_durations.append(t + 1)
            break
    # Update the target network, copying all weights and biases in DQN
    if steps % STEPS_TO_UPDATE_TARGET_NET == 0:
        target_net.load_state_dict(policy_net.state_dict())
The problem is that the loss keeps growing instead of decreasing as training goes on. Any idea what might be causing this, and how to fix it?