Training loss not decreasing for bland GRU network

Hi,

For my seemingly bland GRU network, the training loss never seems to want to decrease. I suspect vanishing gradients, but I stuggle to see why it would be the case. I’ve yet to figure out why. The loss may change in the 8th to 10th digit, but that’s pretty much it. The input data is tabular MinMax-Scaled, and the output is a binary 0-1. I’d be thankful for any help, because this is not the first time this issue has occured to me and me not being able to solve it!

class Network(nn.Module):

def __init__(self, input_size, sequence_length, hidden_size, num_layers, device, datatype):
    super(Network, self).__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.gru = nn.GRU(input_size, hidden_size, num_layers=1, batch_first=True)
    self.dropout_layer = nn.Dropout(p=0.5)
    self.fc1 = nn.Linear(sequence_length* hidden_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, int(hidden_size / 2))
    self.out = nn.Linear(int(hidden_size / 2) + 1, 1)
    
    self.device = device
    self.datatype = datatype
    self.to(device)
    
    

def forward(self, x, weight):
    weight = weight.unsqueeze(1)
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_().to(device)
    x, hn = self.gru(x, h0)
    #x = torch.flatten(x, 1)
    x = x.squeeze(1)
    x = self.dropout_layer(x)
    x = F.relu(self.fc1(x))
    x = self.dropout_layer(x)
    x = F.relu(self.fc2(x))
    x = self.dropout_layer(x)
    x = torch.cat((x, weight), dim=1)
    x = torch.sigmoid(self.out(x))
    x = x.squeeze(1)
    return x

hidden_size = 128
num_layers = 1
input_size = 130 # I think
sequence_length = 1
num_classes = 2 # To be changed when the autoencoder comes into play
batch_size = 256 * 2
epochs = 100

datatype = torch.float32
device = torch.device(‘cuda’ if torch.cuda.is_available() else ‘cpu’)

datasets = {
‘train’: JaneDataset(train_c, x_cols, ‘resp’, ‘weight’, sequence_length, datatype),
‘val’: JaneDataset(val_c, x_cols, ‘resp’, ‘weight’, sequence_length, datatype)
}

dataloaders = {
‘train’: DataLoader(datasets[‘train’], batch_size=batch_size, shuffle=False, num_workers=6),
‘val’: DataLoader(datasets[‘val’], batch_size=batch_size, shuffle=False, num_workers=6)
}

Enable CUDA: use GPUs for model computation

model = Network(input_size, sequence_length, hidden_size, num_layers, device, datatype)

Instantiate loss function

loss_function = nn.BCELoss()

Instantiate optimization algorithm

learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
print(device)

Iterate over number of epochs

from time import time
stats = {‘train_acc’: [0.0] * epochs,‘val_acc’: [0.0] * epochs,‘train_loss’: [0.0] * epochs,‘val_loss’: [0.0] * epochs}
y_pred = []

for e in range(epochs):
t0 = time()
train_loss = 0
train_samples = 0
val_loss = 0
val_samples = 0
train_acc = 0
val_acc = 0

# Iterate over the entire training dataset
# one batch per iteration 
for inputs, labels, weights in dataloaders['train']: 
    # Enable CUDA: use GPUs for model computation
    inputs, labels, weights = inputs.to(device), labels.to(device), weights.to(device)   
    # Clear the gradients of all optimized tensors
    optimizer.zero_grad()
    # Forward pass
    outputs = model.forward(inputs, weights)
    loss = loss_function(outputs, labels)
       
    # Backward pass
    loss.backward()
    optimizer.step()
    
    # Calculate and print running training loss
    train_loss += loss.detach()       
    pred = torch.round(outputs)
    train_acc += torch.sum(pred == labels).detach()
    train_samples += batch_size

train_acc = train_acc.cpu().item() / train_samples
train_loss = train_loss.cpu().item() / train_samples

for inputs, labels, weights in dataloaders['val']:
    
    with torch.set_grad_enabled(False):
        # Enable CUDA: use GPUs for model computation
        inputs, labels, weights = inputs.to(device), labels.to(device), weights.to(device)   

        # Forward pass
        outputs = model.forward(inputs, weights)
        loss = loss_function(outputs, labels)

        # Calculate and print running validarion loss
        val_loss += loss.detach()
        pred = torch.round(outputs)
        val_acc += torch.sum(pred == labels).detach()
        val_samples += batch_size
        if e == epochs - 1:
            y_pred.extend(pred.cpu().tolist())
val_acc = val_acc.cpu().item() / val_samples
val_loss = val_loss.cpu().item() / val_samples
add_stats(stats, e, train_acc, val_acc, train_loss, val_loss)    

if ((e + 1) % 1 == 0) or (e == epochs):
    print(f'Epoch: {e + 1}, time: {time() - t0} seconds.')
    print('Training loss: %.20f , Validation loss: %.20f' % (train_loss * 1000, val_loss * 1000))
    print('Training acc:  %.4f , Validation acc:  %.4f' % (train_acc * 100, val_acc * 100))

print(‘Training Finished.’)

I seem to have seen what is causing the problem.
Can you give me something I can execute? To clarify my ideas.

What you have given as code is not well structured.

Let me try again, and see if I’m more lucky this time.

class Network(nn.Module):
    
    def __init__(self, input_size, sequence_length, hidden_size, num_layers, device, datatype):
        super(Network, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers=1, batch_first=True)
        self.dropout_layer = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(sequence_length* hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, int(hidden_size / 2))
        self.out = nn.Linear(int(hidden_size / 2) + 1, 1)
        self.LeakyReLU = nn.LeakyReLU(0.1)
        self.device = device
        self.datatype = datatype
        self.to(device)
        
        
    
    def forward(self, x, weight):
        weight = weight.unsqueeze(1)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_().to(device)
        x, hn = self.gru(x, h0)
        x = torch.flatten(x, 1)
        x = x.squeeze(1)
        x = self.dropout_layer(x)
        x = self.LeakyReLU(self.fc1(x))
        x = self.dropout_layer(x)
        x = self.fc2(x)
        x = self.dropout_layer(x)
        x = torch.cat((x, weight), dim=1)
        x = torch.sigmoid(self.out(x))
        
        return x.view(-1)



hidden_size = 128
num_layers = 1
input_size = 130 # I think
sequence_length = 10
num_classes = 2 # To be changed when the autoencoder comes into play
batch_size = 512
epochs = 100

datatype = torch.float32
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

datasets = {
    'train': JaneDataset(train_c, x_cols, 'resp', 'weight', sequence_length, datatype),
    'val': JaneDataset(val_c, x_cols, 'resp', 'weight', sequence_length, datatype)
}

dataloaders = {
    'train': DataLoader(datasets['train'], batch_size=batch_size, shuffle=True, num_workers=6),
    'val': DataLoader(datasets['val'], batch_size=batch_size, shuffle=True, num_workers=6)
}

# Enable CUDA: use GPUs for model computation

model = Network(input_size, sequence_length, hidden_size, num_layers, device, datatype)
# Instantiate loss function
loss_function = nn.BCELoss()

# Instantiate optimization algorithm 
learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
print(device)




# Iterate over number of epochs
from time import time
stats = {'train_acc': [0.0] * epochs,'val_acc': [0.0] * epochs,'train_loss': [0.0] * epochs,'val_loss': [0.0] * epochs}
y_pred = []

for e in range(epochs):
    t0 = time()
    train_loss = 0
    train_samples = 0
    val_loss = 0
    val_samples = 0
    train_acc = 0
    val_acc = 0
    
    # Iterate over the entire training dataset
    # one batch per iteration 
    for inputs, labels, weights in dataloaders['train']: 
        # Enable CUDA: use GPUs for model computation
        
        inputs, labels, weights = inputs.to(device), labels.to(device), weights.to(device)   
        
        # Clear the gradients of all optimized tensors
        optimizer.zero_grad()
        # Forward pass
        outputs = model.forward(inputs, weights)
        print(outputs)
        loss = loss_function(outputs, labels)
           
        # Backward pass
        loss.backward()
        optimizer.step()
        # Calculate and print running training loss
        train_loss += loss.detach()       
        pred = torch.round(outputs)
        train_acc += torch.sum(pred == labels).detach()
        train_samples += batch_size
    
    train_acc = train_acc.cpu().item() / train_samples
    train_loss = train_loss.cpu().item() / train_samples

    for inputs, labels, weights in dataloaders['val']:
        
        with torch.set_grad_enabled(False):
            # Enable CUDA: use GPUs for model computation
            inputs, labels, weights = inputs.to(device), labels.to(device), weights.to(device)   
            outputs = model.forward(inputs, weights)
            loss = loss_function(outputs, labels)
            
            # Calculate and print running validarion loss
            val_loss += loss.detach()
            pred = torch.round(outputs)
            val_acc += torch.sum(pred == labels).detach()
            val_samples += batch_size
            if e == epochs - 1:
                y_pred.extend(pred.cpu().tolist())
    val_acc = val_acc.cpu().item() / val_samples
    val_loss = val_loss.cpu().item() / val_samples
    add_stats(stats, e, train_acc, val_acc, train_loss, val_loss)    
    
    if ((e + 1) % 1 == 0) or (e == epochs):
        #print(f'Epoch: {e + 1}, time: {time() - t0} seconds.')
        print()
        print('Training loss: %.20f , Validation loss: %.20f' % (train_loss * 1000, val_loss * 1000))
        print('Training acc:  %.4f , Validation acc:  %.4f' % (train_acc * 100, val_acc * 100))
        
print('Training Finished.')

Imports as well please.
I want something that I will execute directly.

Sure, do you want the data too?

import warnings
import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import torch 
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
torch.set_default_dtype(torch.float32)
torch.manual_seed(0)

Yes.
It will make debugging easier for me.
I can always do it differently if your dataset is private.