Problem with basic LSTM for multivariate time series prediction

Hi,
I’m trying to implement a basic LSTM model for a multivariate time series problem. The issue is that the model always predicts the same number, and during training the loss stays almost constant across all epochs, so I’m starting to question whether my model definition and training loop are correct.
I would appreciate any hints in case there is something wrong with my code.

To provide a bit more context: the input to the network has shape [batch_size, seq_length, num_features] and the target has shape [batch_size, 1]. The target values are in descending order, so given an input sequence I want to predict the smallest number in the corresponding target array.
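
For concreteness, here is a minimal sketch of the tensor shapes I’m working with (the concrete sizes below are placeholders, not my real data):

import torch

batch_size, seq_length, num_features = 32, 30, 14

# A dummy batch in the same format as my data
inputs = torch.randn(batch_size, seq_length, num_features)   # [32, 30, 14]
targets = torch.randn(batch_size, 1)                         # [32, 1]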

Here is my model definition:

import torch.nn as nn


class RUL_Estimator(nn.Module):
    def __init__(self, n_features, hidden_dim, seq_length, num_layers=2, output_dim=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.seq_length = seq_length
        self.num_layers = num_layers

        # Stacked LSTM; batch_first=True matches the [batch, seq, features] input
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=0.2,
        )
        self.linear = nn.Linear(in_features=self.hidden_dim, out_features=output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)

        # I read that I should only use the output of the last time step
        # of the sequence, which is why I take lstm_out[:, -1, :]
        pred = self.linear(lstm_out[:, -1, :])
        return pred
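
As a quick sanity check of the output shape, I run a dummy batch through the model (hidden_dim=64 and the other sizes are placeholder values):

import torch

model = RUL_Estimator(n_features=14, hidden_dim=64, seq_length=30)
dummy_batch = torch.randn(32, 30, 14)    # [batch, seq, features]
print(model(dummy_batch).shape)          # torch.Size([32, 1])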

Here is my training function:

# Training function
# Assumes `dataloaders` (a dict with 'train' and 'val' DataLoaders) and
# `device` are defined in the enclosing scope.
import copy
import time

import torch
from tqdm import tqdm


def train_model(model, loss_function, optimizer, num_epochs=25):
    since = time.time()
    best_loss = float('inf')
    best_model_wts = copy.deepcopy(model.state_dict())
    loss_history = {'train': [], 'val': []}

    for epoch in range(1, num_epochs + 1):
        print('\nEpoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)

        # Each epoch has a training phase followed by a validation phase
        for phase in ['train', 'val']:
            # Set model to training or evaluation mode
            if phase == 'train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0

            # Iterate over data.
            for inputs, labels in tqdm(dataloaders[phase]):

                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # track gradient history only in the training phase
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs.float())
                    loss = loss_function(outputs, labels.float())

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item()

            epoch_loss = running_loss / len(dataloaders[phase])
            loss_history[phase].append(epoch_loss)
            # Report both losses every 5 epochs
            if epoch % 5 == 0:
                if phase == 'train':
                    train_stats = '{} ==> Loss: {:.4f}'.format(phase.upper(), epoch_loss)
                else:
                    print('\n' + train_stats + ' -- {} ==> Loss: {:.4f}'.format(phase.upper(), epoch_loss))

            # deep copy the model
            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'models/LSTM_v0.pth')

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, loss_history
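
For completeness, the call site looks roughly like this (the loss, optimizer, and learning rate shown here stand in for my actual settings):

import torch.nn as nn
import torch.optim as optim

model = RUL_Estimator(n_features=14, hidden_dim=64, seq_length=30).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
model, loss_history = train_model(model, criterion, optimizer, num_epochs=25)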

Thank you very much for your help.