Why is my training error going up?

Why is my training error going up? I have made the learning rate very low, but this has not helped.

(I was having difficulty getting the hidden state to work. Maybe there is a mistake there that is causing the training error to grow?)

import torch
import torch.nn as nn

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
# function to create input sequences and scalar labels

def create_inout_sequences(input_data, tw):
    inout_seq = []
    L = len(input_data)
    for i in range(L-tw):
        train_seq = input_data[i:i+tw]
        train_label = input_data[i+tw:i+tw+1]
        inout_seq.append((train_seq, train_label))
    return inout_seq
# create input sequences and scalar labels from data

# create random data that mimics actual data I am using
sampl = np.random.uniform(low=-3, high=3, size=(50,))
sampl = sampl.astype(np.float64)  # astype returns a new array, so the result must be assigned

# data
train_data_normalized = sampl.copy()

# convert data into tensors
train_data_normalized = torch.FloatTensor(train_data_normalized).view(-1)

# seq length
train_window = 10

# generate input sequences and scalar labels
train_inout_seq = create_inout_sequences(train_data_normalized, train_window)
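
As a quick sanity check (illustrative lines, not part of the script above): each item in train_inout_seq is a (sequence, label) pair, where the sequence holds train_window values and the label is the single next value.

seq0, label0 = train_inout_seq[0]
print(seq0.shape, label0.shape)   # torch.Size([10]) torch.Size([1])
print(len(train_inout_seq))       # 40 pairs: 50 points minus the 10-step window
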
class GRU(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=32, output_size=1, n_layers=2):
        super().__init__()
        
        self.hidden_layer_size = hidden_layer_size

        self.gru = nn.GRU(input_size, hidden_layer_size, n_layers)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = torch.stack((torch.zeros(1,1,self.hidden_layer_size),
                                        torch.zeros(1,1,self.hidden_layer_size)))

    def forward(self, input_seq):
        gru_out, self.hidden_cell = self.gru(input_seq.view(len(input_seq), 1, -1), self.hidden_cell)
        predictions = self.linear(gru_out.view(len(input_seq), -1))
        return predictions[-1]
model = GRU()

print(model)

# loss and optimization functions
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)
epochs = 30

for i in range(epochs):
    for seq, labels in train_inout_seq:
        
        optimizer.zero_grad()
        
        # reset hidden state
        model.hidden_cell = torch.stack((torch.zeros(1, 1, model.hidden_layer_size),
                                         torch.zeros(1, 1, model.hidden_layer_size)))
        
        # squeeze out the size-1 dimension at index 2 to reduce the hidden state from 4-d to 3-d,
        # i.e. from (2, 1, 1, 32) to (2, 1, 32)
        model.hidden_cell = model.hidden_cell.squeeze(2)

        y_pred = model(seq)

        single_loss = loss_function(y_pred, labels)
        
        single_loss.backward()
        
        # clip gradient to prevent exploding gradient
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        
        optimizer.step()

    if i%2 == 0:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')
print("Finished training")

How do I prevent training error from going up after epoch 4?

epoch:   0 loss: 0.00021652
epoch:   2 loss: 0.00008253
epoch:   4 loss: 0.00004190
epoch:   6 loss: 0.00006440
epoch:   8 loss: 0.00010042
epoch:  10 loss: 0.00012543
epoch:  12 loss: 0.00014364
epoch:  14 loss: 0.00015832
epoch:  16 loss: 0.00017104
epoch:  18 loss: 0.00018215
epoch:  20 loss: 0.00019215
epoch:  22 loss: 0.00020187
epoch:  24 loss: 0.00021184
epoch:  26 loss: 0.00022200
epoch:  28 loss: 0.00023253
epoch:  29 loss: 0.0002379434

Why do you only return the last index in your forward?

The model predicts the next value (a scalar), not an entire sequence, so only the output at the last time step is needed.
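
For reference, here is a minimal sketch (a standalone illustration, not the model above) of why only the last index is used: the GRU produces one output per time step, and the output at the final step is the one mapped to the prediction for the next value.

import torch
import torch.nn as nn

seq_len, hidden_size = 10, 32
gru = nn.GRU(1, hidden_size)                 # single-layer GRU, input_size=1
linear = nn.Linear(hidden_size, 1)

seq = torch.randn(seq_len)                   # one input window
gru_out, _ = gru(seq.view(seq_len, 1, 1))    # gru_out: (seq_len, 1, hidden_size)
preds = linear(gru_out.view(seq_len, -1))    # (seq_len, 1): one output per time step
next_value = preds[-1]                       # prediction for the step after the window
print(next_value.shape)                      # torch.Size([1])
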

UPDATE:

I figured out what was wrong with the hidden state: I had two hidden state tensors, but a GRU has only one (unlike an LSTM, which has both a hidden state and a cell state).

The training error still increases during training, so I am still looking for an answer to that.

# in __init__:
self.hidden_cell = torch.zeros(1, 1, self.hidden_layer_size)

# in the training loop:
model.hidden_cell = torch.zeros(1, 1, model.hidden_layer_size)
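
One caveat (my understanding of PyTorch, not something from the original script): nn.GRU expects the initial hidden state to have shape (num_layers * num_directions, batch, hidden_size), so a single (1, 1, 32) zero tensor only matches a one-layer GRU. With n_layers=2 the leading dimension would need to be 2, for example:

gru = nn.GRU(input_size=1, hidden_size=32, num_layers=2)
h0 = torch.zeros(2, 1, 32)          # (num_layers, batch, hidden_size)
x = torch.randn(10, 1, 1)           # (seq_len, batch, input_size)
out, h_n = gru(x, h0)
print(out.shape, h_n.shape)         # torch.Size([10, 1, 32]) torch.Size([2, 1, 32])
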

UPDATE 2:

By reducing the number of GRU layers from 2 to 1, the model now trains well. I don't understand why the training error would go up with a more complex model; shouldn't it at worst level off? Any advice on why this might have happened would be appreciated.
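
For completeness, a minimal sketch of the single-layer setup described above (written to match the earlier code in this post, not necessarily the exact script I ran): with n_layers=1 the zero hidden state has shape (1, 1, hidden_layer_size), and no stacking or squeezing is needed before each sequence.

model = GRU(n_layers=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)

for seq, labels in train_inout_seq:
    optimizer.zero_grad()
    # a single layer means the hidden state is simply (1, 1, hidden_size)
    model.hidden_cell = torch.zeros(1, 1, model.hidden_layer_size)
    y_pred = model(seq)
    loss = loss_function(y_pred, labels)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 5)
    optimizer.step()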