Hi,
For my seemingly bland GRU network, the training loss never seems to want to decrease. I suspect vanishing gradients, but I struggle to see why that would be the case, and I’ve yet to figure it out. The loss may change in the 8th to 10th digit, but that’s pretty much it. The input data is tabular and MinMax-scaled, and the output is a binary 0-1 label. I’d be thankful for any help, because this is not the first time this issue has occurred to me and I have not been able to solve it!
class Network(nn.Module):
    """GRU encoder followed by a small MLP head that emits one probability per sample.

    The GRU output for every time step is flattened and passed through two
    ReLU-activated linear layers (with dropout in between).  The per-sample
    ``weight`` scalar is concatenated to the last hidden representation before
    the final sigmoid-activated output layer.

    Args:
        input_size: Number of features per time step.
        sequence_length: Number of time steps in every input sequence.
        hidden_size: GRU hidden size; also the width of the first linear layer.
        num_layers: Number of stacked GRU layers.
        device: Device the module's parameters are moved to.
        datatype: Stored on the instance as ``self.datatype``; not otherwise
            used by this class.
    """

    def __init__(self, input_size, sequence_length, hidden_size, num_layers, device, datatype):
        super().__init__()
        self.input_size = input_size
        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # The GRU depth must agree with the leading dimension of the initial
        # hidden state built in forward(); previously it was hard-coded to 1,
        # so any num_layers > 1 failed with a hidden-size mismatch.
        self.gru = nn.GRU(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.dropout_layer = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(sequence_length * hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, int(hidden_size / 2))
        # +1 input feature: the per-sample weight concatenated in forward().
        self.out = nn.Linear(int(hidden_size / 2) + 1, 1)
        self.device = device
        self.datatype = datatype
        self.to(device)

    def forward(self, x, weight):
        """Return one sigmoid probability per sample.

        Args:
            x: Tensor of shape ``(batch, sequence_length, input_size)``.
            weight: Tensor of shape ``(batch,)`` with one scalar per sample.

        Returns:
            Tensor of shape ``(batch,)`` with values in ``[0, 1]``.
        """
        weight = weight.unsqueeze(1)  # (batch,) -> (batch, 1) so it can be concatenated
        # Zero initial state created directly on the input's device/dtype.
        # It is a constant, so it does not need to track gradients.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype
        )
        x, _ = self.gru(x, h0)
        # (batch, seq, hidden) -> (batch, seq * hidden), matching fc1's input
        # width for any sequence_length (squeeze(1) only worked for length 1).
        x = torch.flatten(x, 1)
        x = self.dropout_layer(x)
        x = F.relu(self.fc1(x))
        x = self.dropout_layer(x)
        x = F.relu(self.fc2(x))
        x = self.dropout_layer(x)
        x = torch.cat((x, weight), dim=1)
        x = torch.sigmoid(self.out(x))
        return x.squeeze(1)
# ---- Hyper-parameters -------------------------------------------------------
hidden_size = 128
num_layers = 1
input_size = 130  # I think
sequence_length = 1
num_classes = 2  # To be changed when the autoencoder comes into play
batch_size = 256 * 2
epochs = 100
datatype = torch.float32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ---- Data -------------------------------------------------------------------
# Plain ASCII quotes: the typographic quotes from the original paste are not
# valid Python string delimiters.
datasets = {
    'train': JaneDataset(train_c, x_cols, 'resp', 'weight', sequence_length, datatype),
    'val': JaneDataset(val_c, x_cols, 'resp', 'weight', sequence_length, datatype),
}
dataloaders = {
    # Shuffle the training set each epoch so consecutive mini-batches are not
    # drawn from adjacent rows.  NOTE(review): assumes JaneDataset items are
    # independent of the order in which they are requested - confirm.
    'train': DataLoader(datasets['train'], batch_size=batch_size, shuffle=True, num_workers=6),
    # Validation order is kept fixed so collected predictions line up with the data.
    'val': DataLoader(datasets['val'], batch_size=batch_size, shuffle=False, num_workers=6),
}
# Enable CUDA: use GPUs for model computation
# (Network.__init__ moves the module onto `device`.)
model = Network(input_size, sequence_length, hidden_size, num_layers, device, datatype)

# Instantiate loss function
# BCELoss expects probabilities, which Network.forward produces via sigmoid.
loss_function = nn.BCELoss()

# Instantiate optimization algorithm
learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
print(device)
# Iterate over number of epochs
from time import time

# Per-epoch metric history, filled in by add_stats().
stats = {
    'train_acc': [0.0] * epochs,
    'val_acc': [0.0] * epochs,
    'train_loss': [0.0] * epochs,
    'val_loss': [0.0] * epochs,
}
y_pred = []  # rounded validation predictions collected during the final epoch
print_every = 1  # report metrics every `print_every` epochs (and on the last one)

for e in range(epochs):
    t0 = time()
    train_loss = 0.0
    train_samples = 0
    val_loss = 0.0
    val_samples = 0
    train_acc = 0
    val_acc = 0

    # ---- Training: one batch per iteration over the entire training set ----
    model.train()  # enable dropout
    for inputs, labels, weights in dataloaders['train']:
        # Enable CUDA: use GPUs for model computation
        inputs, labels, weights = inputs.to(device), labels.to(device), weights.to(device)
        # Clear the gradients of all optimized tensors
        optimizer.zero_grad()
        # Forward pass (call the module, not .forward, so hooks run)
        outputs = model(inputs, weights)
        loss = loss_function(outputs, labels)
        # Backward pass
        loss.backward()
        optimizer.step()
        # The loss is a per-batch mean, so weight it by the real batch size;
        # the last batch may be smaller than `batch_size`.
        n_batch = labels.size(0)
        train_loss += loss.detach() * n_batch
        # NOTE(review): assumes labels has shape (batch,) like outputs - confirm.
        pred = torch.round(outputs.detach())
        train_acc += torch.sum(pred == labels)
        train_samples += n_batch
    # max(..., 1) guards against an empty loader.
    train_acc = float(train_acc) / max(train_samples, 1)
    train_loss = float(train_loss) / max(train_samples, 1)

    # ---- Validation ----
    model.eval()  # disable dropout so validation metrics are deterministic
    with torch.no_grad():
        for inputs, labels, weights in dataloaders['val']:
            # Enable CUDA: use GPUs for model computation
            inputs, labels, weights = inputs.to(device), labels.to(device), weights.to(device)
            # Forward pass
            outputs = model(inputs, weights)
            loss = loss_function(outputs, labels)
            n_batch = labels.size(0)
            val_loss += loss * n_batch
            pred = torch.round(outputs)
            val_acc += torch.sum(pred == labels)
            val_samples += n_batch
            # Keep the predictions of the final epoch only.
            if e == epochs - 1:
                y_pred.extend(pred.cpu().tolist())
    val_acc = float(val_acc) / max(val_samples, 1)
    val_loss = float(val_loss) / max(val_samples, 1)

    add_stats(stats, e, train_acc, val_acc, train_loss, val_loss)
    if ((e + 1) % print_every == 0) or (e == epochs - 1):
        print(f'Epoch: {e + 1}, time: {time() - t0} seconds.')
        # Losses are now true per-sample means, so they are printed unscaled.
        print('Training loss: %.20f , Validation loss: %.20f' % (train_loss, val_loss))
        print('Training acc: %.4f , Validation acc: %.4f' % (train_acc * 100, val_acc * 100))
print('Training Finished.')