LSTM model giving different error while training and running on a pretrained model

Hi All,
I am new to Machine Learning and Pytorch and I was wondering if I could get some inputs from the community about the first LSTM modeI I built in Pytorch.

I am trying to model a loundspeaker response. I have measured data of the voltage applied and the excursion of the diaphragm. The input to the model will be the excursion and the output predicted will be the voltage.

Below is the sample of how the data looks like:

Here is my LSTM model:

class ExcursionRNN(nn.Module):
    def __init__(self, input_dim, n_hidden=256, n_layers=2,drop_prob=0.5):

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden

        ## Define the LSTM
        self.lstm = nn.LSTM(input_dim, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        ## Define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        ## Define the final, linear output layer with output size 1
        self.linear = nn.Linear(n_hidden, 1)

    def forward(self, x, hc):

        ''' Forward pass through the network. These inputs are x, and the hidden/cell state `hc`. '''
         ## Get x, and the new hidden state (h, c) from the lstm  
        x, (h, c) = self.lstm(x, hc)
        ## pass x through the dropout layer
        x = self.dropout(x)
        # Stack up LSTM outputs using view
        x = x.contiguous().view(x.size()[0]*x.size()[1], self.n_hidden)
        ## Put x through the linear regression layer
        x = self.linear(x)
        # Return x and the hidden state (h, c)
        return x, (h, c)  

    def init_hidden(self, n_seqs):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # Randomize hidden state and cell state of LSTM
        hidden = (torch.randn(self.n_layers, n_seqs, self.n_hidden).double(),torch.randn(self.n_layers, n_seqs, self.n_hidden).double())

        return hidden
def train(net, input_data, target_data, epochs=10, n_seqs=10, n_steps=50, lr=0.001, clip=5, val_frac=0.2, cuda=False, print_every=10):
    ''' Training a network 
        net: excursionRNN network
        input_data: excursion data to train the network
        target_data: voltage data to train the network
        epochs: Number of epochs to train
        n_seqs: Number of mini-sequences per mini-batch, aka batch size
        n_steps: Number of excursion steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        cuda: Train with CUDA on a GPU
        print_every: Number of steps for printing training and validation loss
    net = net.double()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.MSELoss()
    # create training and validation data
    val_idx = int(len(input_data)*(1-val_frac))
    input_data, val_input_data = input_data[:val_idx], input_data[val_idx:]
    target_data, val_target_data = target_data[:val_idx], target_data[val_idx:]
    counter = 0
    for e in range(epochs):
        # initialize the hidden and cell state with random numbers
        h = net.init_hidden(n_seqs) 
        for x, y in get_batches(input_data, target_data, n_seqs, n_steps):
            counter += 1
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            inputs = inputs.type(torch.DoubleTensor)
            targets = targets.type(torch.DoubleTensor)

            h = tuple([ for each in h])
            # reshape the data such that it's shape is (n_seqs, n_steps, n_features=1)
            inputs = inputs.expand(n_seqs,n_steps,1)
            output, h = net.forward(inputs, h)
            output = output.view(output.size()[0]*output.size()[1])
            output = output.type(torch.DoubleTensor)
            loss = criterion(output, targets.contiguous().view(n_seqs*n_steps))

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(n_seqs)
                val_losses = []
                for x, y in get_batches(val_input_data, val_target_data, n_seqs, n_steps):
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    val_h = tuple([ for each in val_h])
                    inputs, targets = x, y

                    inputs = inputs.type(torch.DoubleTensor)
                    targets = targets.type(torch.DoubleTensor)
                    inputs = inputs.expand(n_seqs,n_steps,1)
                    with torch.no_grad():
                        output, val_h = net.forward(inputs, val_h)
                        output = output.view(output.size()[0]*output.size()[1])
                        val_loss = criterion(output, targets.contiguous().view(n_seqs*n_steps))
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
net = ExcursionRNN(input_dim = 1, n_hidden=64, n_layers=2, drop_prob=0.5)
n_seqs, n_steps = 10, 28
train(net, excursion_measured_mm, voltage_measured, epochs=10000, n_seqs=n_seqs, n_steps=n_steps, lr=0.001, cuda=False, print_every=10)  

I have 11000 samples of data to train my model on. Out of 11k, 80% are used for training and 20% for validation.

Since I am still learning to understand LSTM and implementing them I am not sure if my implementation is right or not. It would be really helpful if someone could look at the code and provide some feedback whether the implementation is correct or not.

Also, after running the model for 10000 epochs, I get the Mean Squared Error of 0.0046.
I save the model and open it and pass the training data through it and I get the MSE of 0.025, which doesn’t seek like it’s correct.

Below is the script for running the final model on the training dataset:

def get_output(net, data):
    net = net.double()
    n_steps = 28
    n_seqs = int(len(data)/n_steps)
    data = data.reshape(-1,len(data))
    data = data[:, :n_seqs*n_steps]
    h = net.init_hidden(1)
    output = []

    for n in range(0, data.shape[1], n_steps):
        h = tuple([ for each in h])

        inputs = data[:,n:n+n_steps]
        inputs = np.asarray(inputs)

        inputs = torch.from_numpy(inputs)
        inputs = inputs.type(torch.DoubleTensor)
        inputs = inputs.expand(1,n_steps,1)
        out, h = net.forward(inputs, h)
        out = out.detach().numpy()
        output = np.append(output, out)
    return output

I am not able to figure out what’s wrong in my script that runs the final model on the training data that the RME I am getting is an order magnitude higher than the loss I get when I am actually training the model.

I would really appreciate if someone to point out the error in my script/model.