Model Not Optimizing

Hello, I am trying to train a stacked GRU, but my training and validation losses stay the same from epoch to epoch. I have tried two different loss functions and different learning rates, and the loss still does not change.

import atexit

import numpy as np
import pandas as pd
import pytorch_lightning as p1
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset
from torchmetrics import Accuracy, Precision, Recall

class NeuralNetwork(nn.Module):
    """Stacked GRU followed by a linear read-out of the final hidden state.

    Args:
        input_size: Number of features per time step of the input.
        hidden_size: Width of every GRU layer.
        output_size: Number of values predicted per sample.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        # Loss used by the training/validation steps through ``model.loss``.
        # (CrossEntropyLoss and L1Loss were tried as alternatives.)
        self.loss = torch.nn.MSELoss()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        # batch_first=True: the GRU consumes (batch, seq, feature) tensors.
        self.hiddenlayers = nn.GRU(
            input_size, hidden_size, num_layers=num_layers, batch_first=True
        )
        # The read-out is fed the GRU hidden state, so its input width must
        # be hidden_size (not input_size), otherwise any model with
        # hidden_size != input_size fails with a shape mismatch.
        self.outputlayer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one softmax-normalised row of ``output_size`` values per sample."""
        # ``hidden`` stacks the final hidden state of every layer; index -1
        # selects the last (top) layer.
        _, hidden = self.hiddenlayers(x)
        top_state = F.relu(hidden[-1])
        # NOTE(review): softmax forces every output row to be non-negative
        # and to sum to 1. Combined with MSELoss this can only fit targets
        # that are themselves distributions; kept unchanged so existing
        # callers see the same outputs.
        return F.softmax(self.outputlayer(top_state), dim=1)
# Creates the neural network class
class CustomDataset(IterableDataset):
    """Iterable dataset that yields ``(X[i], y[i])`` pairs in storage order.

    Args:
        data: A pair ``(X, y)``; both are indexed along their first axis and
            ``X`` determines the number of samples.
    """

    def __init__(self, data):
        super().__init__()
        self.X, self.y = data

    def __iter__(self):
        # Index only the sample axis so the dataset works for any trailing
        # shape, including sequence or feature axes of size 1.
        for index in range(len(self)):
            yield self.X[index], self.y[index]

    def __len__(self):
        # len() of the container is the size of its first (sample) axis; the
        # previous X[:, 1, 1] lookup raised IndexError for size-1 axes.
        return len(self.X)
# Creates a custom dataset class to train the neural network.
class DataModule:
    """Loads the input/output arrays and serves train/val/test data loaders.

    Args:
        input_path: ``.npy`` file holding the model inputs.
        output_path: ``.npy`` file holding the targets.
        max_samples: Keep only the first ``max_samples`` samples
            (``None`` keeps everything).
        batch_size: Number of samples per batch.
        num_workers: Worker processes handed to every ``DataLoader``.
    """

    def __init__(self, input_path='input_data.npy', output_path='output_data.npy',
                 max_samples=10000, batch_size=1, num_workers=1):
        inputs = np.load(input_path)[:max_samples]
        outputs = np.load(output_path)[:max_samples]

        # Size of the trailing (feature) axis of inputs and targets; used to
        # size the network.
        self.inputsize = inputs.shape[-1]
        self.outputsize = outputs.shape[-1]

        self.batch_size = batch_size
        self.num_workers = num_workers

        self.input = torch.from_numpy(inputs).float()
        self.output = torch.from_numpy(outputs).float()

    def setup(self, stage):
        """Create the tensors for ``stage``.

        ``'fit'`` splits the data in half (first half for training, second
        half for validation); ``'test'`` or ``None`` exposes the full data
        set for testing.
        """
        if stage == 'fit':
            # Integer division of the sample count; len() reads the first
            # axis and does not depend on the size of the other axes.
            split = len(self.input) // 2
            self.trainX, self.trainy = self.input[:split], self.output[:split]
            self.valX, self.valy = self.input[split:], self.output[split:]
        if stage == 'test' or stage is None:
            self.testX, self.testy = self.input, self.output

    def _make_loader(self, features, targets):
        # Shared construction for the three public loaders.
        return DataLoader(
            CustomDataset((features, targets)),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        """Return the loader over the training half (requires ``setup('fit')``)."""
        return self._make_loader(self.trainX, self.trainy)

    def val_dataloader(self):
        """Return the loader over the validation half (requires ``setup('fit')``)."""
        return self._make_loader(self.valX, self.valy)

    def test_dataloader(self):
        """Return the loader over the full data set (requires ``setup('test')``)."""
        return self._make_loader(self.testX, self.testy)

# Pass the num of nodes and gpus as arguments to program

dm = DataModule()
# The loaders read trainX/valX, which only exist after setup('fit') has run.
dm.setup('fit')
train_loader = dm.train_dataloader()
val_loader = dm.val_dataloader()
# Prepares the data

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Hidden width equals the input width; two stacked GRU layers. The model must
# live on the same device the batches are moved to in the training loop.
model = NeuralNetwork(dm.inputsize, dm.inputsize, dm.outputsize, 2).to(device)

# NOTE(review): the learning rate was missing from this call (it was a syntax
# error as written); 0.01 is a placeholder -- confirm the intended value.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.8)

def TrainStep(model, x, y):
    """Run one optimisation step on a batch and return its loss.

    Uses the module-level ``optimizer``.

    Args:
        model: Network exposing a ``loss`` callable.
        x: Input batch.
        y: Target batch.

    Returns:
        The batch loss as a Python float.
    """
    optimizer.zero_grad()   # zero the gradient buffers
    logits = model(x)
    loss = model.loss(logits, y)
    # Without these two calls the gradients are never computed or applied,
    # so the parameters (and therefore the loss) never change.
    loss.backward()
    optimizer.step()
    return loss.item()

def ValidationStep(model, x, y):
    """Evaluate one batch without tracking gradients.

    Args:
        model: Network exposing a ``loss`` callable.
        x: Input batch.
        y: Target batch.

    Returns:
        A ``(loss, similarity)`` pair of Python floats, where ``similarity``
        is the cosine similarity between prediction and target averaged over
        the batch.
    """
    # Evaluation needs no autograd graph; skipping it saves memory and time.
    with torch.no_grad():
        logits = model(x)
        loss = model.loss(logits, y)
        per_sample = torch.nn.CosineSimilarity()(logits, y)
    # CosineSimilarity returns one value per sample; averaging keeps the
    # batch-size-1 result unchanged and makes larger batches work, where a
    # bare .item() would raise.
    return loss.item(), per_sample.mean().item()

# Metric logs, opened in append mode so repeated runs extend earlier logs.
trainlossfile = open("TrainLoss2.txt", "a", encoding="utf-8")
validationlossfile = open("ValLoss2.txt", "a", encoding="utf-8")
similarityfile = open("Similarity2.txt", "a", encoding="utf-8")

# The handles stay open for the whole run; close them explicitly at
# interpreter exit instead of relying on garbage collection.
for _log_file in (trainlossfile, validationlossfile, similarityfile):
    atexit.register(_log_file.close)

def Train(epochs):
    """Train and validate ``model`` for ``epochs`` epochs.

    Every batch loss (and validation similarity) is appended to the
    module-level log files, and a checkpoint ``model_<epoch>.pt`` is written
    after each epoch.

    Args:
        epochs: Number of passes over the training loader.

    Returns:
        ``(TrainLoss, ValidationLoss, Similarity)``: three lists holding one
        per-epoch mean each (``nan`` for an epoch whose loader was empty).
    """
    TrainLoss, ValidationLoss, Similarity = [], [], []

    def _mean(values):
        # Mean of a list of floats; nan when the loader yielded no batch.
        return sum(values) / len(values) if values else float("nan")

    # The epoch counter has its own name: the batch loops previously reused
    # ``i``, so the checkpoint was labelled with the last batch index.
    for epoch in range(epochs):
        print("EPOCH " + str(epoch))

        train_losses = []
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            loss = TrainStep(model, x, y)
            train_losses.append(loss)
            trainlossfile.write(str(loss) + ", ")
        TrainLoss.append(_mean(train_losses))

        val_losses, val_similarities = [], []
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            loss, similarity = ValidationStep(model, x, y)
            val_losses.append(loss)
            val_similarities.append(similarity)
            validationlossfile.write(str(loss) + ", ")
            similarityfile.write(str(similarity) + ", ")
        ValidationLoss.append(_mean(val_losses))
        Similarity.append(_mean(val_similarities))

        # Push this epoch's log entries to disk so progress is visible while
        # the run is still going.
        for log_file in (trainlossfile, validationlossfile, similarityfile):
            log_file.flush()

        path = "model_" + str(epoch) + ".pt"
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, path)
    return TrainLoss, ValidationLoss, Similarity
TrainLoss, ValidationLoss, Similarity = Train(1000)

I don't know whether you have already tried this, but could you enable shuffling in train_dataloader, like this?

def train_dataloader(self):
    """Return the training DataLoader with shuffling requested."""
    # NOTE: CustomDataset is an IterableDataset, and DataLoader does not
    # accept shuffle=True for iterable-style datasets.
    train_set = CustomDataset((self.trainX, self.trainy))
    return DataLoader(
        train_set,
        batch_size=self.batch_size,
        num_workers=self.num_workers,
        shuffle=True,
    )

I just tried to shuffle the data loader, but because I am using an iterable dataset (IterableDataset), the DataLoader does not allow shuffling the data.

I will rewrite my data-loader and dataset class and let you know my results.

Based on your code, it seems you are using self.loss = torch.nn.MSELoss() together with F.softmax in a classification setup, which is uncommon, to say the least.
If you are indeed working on a multi-class classification problem, could you remove the F.softmax and use nn.CrossEntropyLoss instead? nn.CrossEntropyLoss applies log-softmax internally, so it expects the raw logits.