Loss decreases during training but the model's predictions don't change

I am currently trying to train a many-to-one LSTM model to predict the quality of an episode. During training the loss decreases nicely, but the predictions from the model don't change. I was thinking this could be a result of only using a single dataset, or that my model isn't calculating gradients properly, since the grad attributes after the forward pass are None. I've been trying to fix this for a while now, so I'd be grateful if anyone could help.
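
For what it's worth, this is roughly how I've been inspecting the gradients right after the backward pass (reward_model is the model defined in the training code below):

for name, param in reward_model.named_parameters():
    print(name, param.grad)  # these all print None for me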

Here are all the relevant code snippets:

Also, for reference, the shapes of the data are described in the architecture file (the last code snippet); if you have any more questions, please let me know!
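
In short, the tensor that finally goes into the model should satisfy the following (a quick sanity sketch, using the names from main() below):

assert X.dim() == 3      # (num_preferred_episodes, max_episode_len, num_metrics)
assert X.shape[-1] == 2  # base position and base linear velocity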

Here is the main file that does the training:

import torch
import copy
import torch.optim as optim
from tqdm import tqdm
import lstm
import numpy as np
from torch.nn.utils.rnn import pad_sequence


DEVICE = torch.device(
    'cuda' if torch.cuda.is_available() else 'cpu')


def load_dataset(velocity, gap_size):
    """
    Loads the rollouts for each parameter setting (where velocity and gap size are the variable parameters)
    """
    rollout_data = np.load(
        f"exp_data/exp_vel_{velocity}_gapSize_{gap_size}.npy", allow_pickle=True)

    # Iterate over the field names and copy values from the original structured array to the new array
    X = []
    for episode_idx, x in enumerate(rollout_data):
        # stores all the states for each timestep in the episode
        chosen_metrics = []
        for timestep_idx, full_state_arr in enumerate(x):
            # convert each tuple in the structured array
            full_state_dict = {}
            for field_name in full_state_arr.dtype.names:
                full_state_dict[field_name] = torch.squeeze(
                    torch.from_numpy(full_state_arr[field_name]))

            # parse the desired metrics (to be fed into the model) from the full state dictionary
            torch_tensor_x = torch.empty((1, 2), device=DEVICE)
            torch_tensor_x[0][0] = full_state_dict['base_positions'][0]
            torch_tensor_x[0][1] = full_state_dict['base_linear_velocities'][0]
            chosen_metrics.append(torch_tensor_x)
        X.append(copy.deepcopy(chosen_metrics))

    # Pad each episode to the length of the longest episode and stack everything
    # into a single tensor (pad_sequence zero-pads the time dimension)
    padded_tensor = pad_sequence(
        [torch.stack(t_list) for t_list in X], batch_first=True)
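    # shape of padded_tensor at this point: (num_episodes, max_episode_len, 1, 2);
    # the singleton dim comes from the per-timestep (1, 2) tensors and gets
    # squeezed out later in main()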

    return padded_tensor, len(X)


def main():
    velocity, gap_size = 0.25, 3
    num_metrics, num_episodes = 2, 5

    # dictionary keyed by gap size; each value is a list with one entry per
    # velocity setting (12 in total), holding the indices of the preferred episodes

    # Picking 2 bad, 2 good, and 2 mediocre episodes to fine tune on
    handpicked_episodes = {3: [[], [], [0], [4], [], [], [1], [], [], [1], [], []],
                           4: [[], [], [], [], [], [4], [], [], [], [], [], []],
                           5: [[], [], [], [], [], [], [], [], [], [], [], []],
                           6: [[], [], [], [], [], [], [], [1], [], [], [], []]}
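    # e.g. handpicked_episodes[3][2] == [0] means: for gap size 3 and the 3rd
    # velocity setting (velocity = 0.75), episode 0 was the preferred one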
    # Gather all the tensor data for each parameter setting into one dataset
    X, num_episodes = [], 0
    max_timestep_len = 0
    while gap_size <= 6:
        while velocity <= 3.00:
            # TODO change the data organization to be cleaner later if you have time
            temp_tensor, temp_num_episodes = load_dataset(
                velocity=velocity, gap_size=gap_size)
            # After squeezing, temp_tensor is a 3-D tensor of shape
            # (num_episodes, max_episode_len (padded as necessary), num_metrics)
            temp_tensor.squeeze_(2)
            max_timestep_len = max(max_timestep_len, temp_tensor.shape[1])
            X.append(temp_tensor)
            num_episodes += temp_num_episodes
            velocity += 0.25
        gap_size += 1
        velocity = 0.25

    # pad the tensors in X then concatenate
    padded_tensors = []
    for tensor in X:
        pad_size = max_timestep_len - tensor.size(1)
        padded_tensor = torch.nn.functional.pad(tensor, (0, 0, 0, pad_size))
        padded_tensors.append(padded_tensor)

    # Concatenate the padded tensors along the first dimension
    concatenated_tensors = torch.cat(padded_tensors, dim=0)

    # parse the preferred episodes from the dataset
    num_parameter_settings = len(padded_tensors)
    gap_size_key, parameter_settings_count = 3, 0
    X = []

    save_plots, handpick = True, True

    for i in range(concatenated_tensors.shape[0]):
        # check multiple of temp_num_episodes * 12 b/c there are temp_num_episodes in each parameter setting and there are 12
        # parameter settings for each gap size
        if i % (temp_num_episodes * 12) == 0 and i != 0:
            gap_size_key += 1

        # for every parameter setting we want to parse the preferred episode
        if i % temp_num_episodes == 0:
            # check if we ran into an empty list
            if handpick:
                if handpicked_episodes[gap_size_key][parameter_settings_count % 12]:
                    preferred_episode_idx = handpicked_episodes[gap_size_key][parameter_settings_count % 12][0] + (
                        parameter_settings_count * temp_num_episodes)
                    X.append(concatenated_tensors[preferred_episode_idx])
                parameter_settings_count += 1
            else:
                if preferred_episodes[gap_size_key][parameter_settings_count % 12]:
                    preferred_episode_idx = preferred_episodes[gap_size_key][parameter_settings_count % 12][0] + (
                        parameter_settings_count * temp_num_episodes)
                    X.append(concatenated_tensors[preferred_episode_idx])
                parameter_settings_count += 1

    # convert X into tensor
    X = torch.stack(X, dim=0)
    X.requires_grad_(True)

    batch_size = X.shape[0]
    reward_model = lstm.Reward_LSTM(num_metrics, 50, 2, batch_size).to(DEVICE)
    optimizer = optim.Adam(reward_model.parameters())
    P = np.random.rand(X.shape[0])

    # Get the reward for each episode in X
    num_epochs = 1000
    intermediate_vals = torch.ones(num_epochs, device=DEVICE, requires_grad=True)
    k = 0

    for epoch_idx in tqdm(range(num_epochs)):
        r = reward_model(X)
        r.requires_grad_(True)

        with torch.no_grad():
            for i in range(batch_size):
                for j in range(batch_size):
                    # compute the loss
                    if i != j:
                        k += 1
                        if P[i] > P[j]:
                            intermediate_vals[epoch_idx] = -torch.log(
                                torch.sigmoid((r[i] - r[j])))
                        else:
                            intermediate_vals[epoch_idx] = -torch.log(
                                torch.sigmoid((r[j] - r[i])))

        # calculate gradient and update parameters of reward model
        optimizer.zero_grad()
        loss = torch.mean(intermediate_vals)
        loss.requires_grad_(True)
        loss.backward(retain_graph=True)
        optimizer.step()

if __name__ == "__main__":
    main()
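
For context, the loss inside the epoch loop is meant to be the standard pairwise preference loss, loss(i, j) = -log(sigmoid(r_i - r_j)), where episode i is preferred over episode j. Here is a minimal self-contained sketch of that computation (r and P are stand-ins for the model rewards and preference scores above); run in isolation like this, r.grad does get populated:

import torch

r = torch.randn(5, requires_grad=True)  # stand-in for the per-episode rewards
P = torch.rand(5)                       # stand-in for the preference scores

pair_losses = []
for i in range(len(r)):
    for j in range(len(r)):
        if i != j:
            # -log(sigmoid(r_winner - r_loser))
            winner, loser = (i, j) if P[i] > P[j] else (j, i)
            pair_losses.append(-torch.log(torch.sigmoid(r[winner] - r[loser])))

loss = torch.stack(pair_losses).mean()
loss.backward()
print(r.grad)  # populated, not None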

Here is the NN architecture file (imported as lstm.py in the training code):

import torch
import torch.nn as nn


class Reward_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes) -> None:
        super(Reward_LSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size,
                            num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # initialize the hidden and cell states on the same device as the input
        h0 = torch.zeros(self.num_layers, x.size(0),
                         self.hidden_size, device=x.device)
        c0 = torch.zeros(self.num_layers, x.size(0),
                         self.hidden_size, device=x.device)

        out, _ = self.lstm(x, (h0, c0))
        # out: (batch_size, seq_length, hidden_size); keep only the last timestep
        out = out[:, -1, :]
        out = self.fc(out)
        # out is now (batch_size, num_classes); keep only its first row
        out = out[0]
        return out
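
For reference, this is how the shapes work out when running the model on dummy data (the numbers here are made up to mirror my setup; in the training code num_classes is set to the batch size):

model = Reward_LSTM(input_size=2, hidden_size=50, num_layers=2, num_classes=5)
x = torch.randn(5, 120, 2)  # (batch_size, seq_len, num_metrics)
r = model(x)
print(r.shape)  # torch.Size([5]) -- the first row of the (5, 5) fc output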

Thank you in advance!