LSTM for time-series prediction failing to learn

I’m building an LSTM network to forecast time-series data with PyTorch. I’ve shared all the code pieces I thought would be helpful, but please let me know if there’s anything more I can provide. I’ve also added some notes at the end of the post about what the underlying issue might be.

From the univariate time-series data indexed by date, I created three date features and split the data into training and validation sets, as shown below.

# X_train
                     weekday  monthday  hour
timestamp
2015-01-08 17:00:00        3         8    17
2015-01-12 19:30:00        0        12    19
2014-12-01 15:30:00        0         1    15
2014-07-26 09:00:00        5        26     9
2014-10-17 20:30:00        4        17    20
...                      ...       ...   ...
2014-08-29 06:30:00        4        29     6
2014-10-13 14:30:00        0        13    14
2015-01-03 02:00:00        5         3     2
2014-12-06 16:00:00        5         6    16
2015-01-06 20:30:00        1         6    20

8256 rows × 3 columns

# y_train
                     value
timestamp
2015-01-08 17:00:00  17871
2015-01-12 19:30:00  20321
2014-12-01 15:30:00  16870
2014-07-26 09:00:00  11209
2014-10-17 20:30:00  26144
...                    ...
2014-08-29 06:30:00   9008
2014-10-13 14:30:00  17698
2015-01-03 02:00:00  12850
2014-12-06 16:00:00  18277
2015-01-06 20:30:00  19640

8256 rows × 1 columns

# X_val
                     weekday  monthday  hour
timestamp
2015-01-08 07:00:00        3         8     7
2014-10-13 22:00:00        0        13    22
2014-12-07 01:30:00        6         7     1
2014-10-14 17:30:00        1        14    17
2014-10-25 09:30:00        5        25     9
...                      ...       ...   ...
2014-09-26 12:30:00        4        26    12
2014-10-08 16:00:00        2         8    16
2014-12-03 01:30:00        2         3     1
2014-09-11 08:00:00        3        11     8
2015-01-15 10:00:00        3        15    10

2064 rows × 3 columns

# y_val
                     value
timestamp
2014-09-13 13:00:00  21345
2014-10-28 20:30:00  23210
2015-01-21 17:00:00  17001
2014-07-20 10:30:00  13936
2015-01-29 02:00:00   3604
...                    ...
2014-11-17 11:00:00  15247
2015-01-14 00:00:00  10584
2014-09-02 13:00:00  17698
2014-08-31 13:00:00  16652
2014-08-30 12:30:00  15775

2064 rows × 1 columns

Then, I scaled the values in the datasets using MinMaxScaler from scikit-learn, fitting each scaler on the training data only.

from sklearn.preprocessing import MinMaxScaler

# Separate scalers for features and targets, so each keeps its own fitted min/max
feature_scaler = MinMaxScaler()
X_train_arr = feature_scaler.fit_transform(X_train)
X_val_arr = feature_scaler.transform(X_val)

target_scaler = MinMaxScaler()
y_train_arr = target_scaler.fit_transform(y_train)
y_val_arr = target_scaler.transform(y_val)
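
As a side note, since the targets are scaled, predictions can later be mapped back to the original units with the target scaler. A minimal sketch, assuming a hypothetical NumPy array yhat_arr of scaled predictions with shape (n, 1):

# yhat_arr is a hypothetical array of scaled model predictions, shape (n, 1)
yhat_original = target_scaler.inverse_transform(yhat_arr)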

After converting these NumPy arrays into PyTorch tensors, I created iterable datasets using the TensorDataset and DataLoader classes provided by PyTorch.

import torch
from torch.utils.data import TensorDataset, DataLoader

train_features = torch.Tensor(X_train_arr)
train_targets = torch.Tensor(y_train_arr)

val_features = torch.Tensor(X_val_arr)
val_targets = torch.Tensor(y_val_arr)

train = TensorDataset(train_features, train_targets)
train_loader = DataLoader(train, batch_size=64, shuffle=False)

val = TensorDataset(val_features, val_targets)
val_loader = DataLoader(val, batch_size=64, shuffle=False)
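
For reference, inspecting one batch shows the shapes involved: TensorDataset yields 2-D feature batches, while nn.LSTM with batch_first=True expects 3-D input. A quick check (shapes shown as comments):

x_sample, y_sample = next(iter(train_loader))
print(x_sample.shape)  # torch.Size([64, 3]) -> (batch, features); no time axis yet
print(y_sample.shape)  # torch.Size([64, 1])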

Then, I defined my LSTM model and a make_train_step factory as follows:

import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of stacked LSTM layers
        self.layer_dim = layer_dim

        # batch_first=True means input/output tensors have shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros, on the same device as the input
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device).requires_grad_()

        # We detach h0/c0 as we are doing truncated backpropagation through time (BPTT);
        # if we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Take the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out
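
As a quick sanity check of the expected input shape, a dummy forward pass (a minimal sketch using the same sizes as below; shapes shown as comments):

# With batch_first=True the model expects input of shape
# (batch_size, timesteps, n_features).
m = LSTMModel(input_dim=3, hidden_dim=64, layer_dim=3, output_dim=1)
dummy = torch.randn(64, 1, 3)  # 64 samples, 1 timestep, 3 features
print(m(dummy).shape)          # torch.Size([64, 1])

Next, the train_step factory:
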
def make_train_step(model, loss_fn, optimizer):
    # Builds function that performs a step in the train loop
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss
        loss = loss_fn(y, yhat)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()
    
    # Returns the function that will be called inside the train loop
    return train_step

Finally, I train my LSTM model in mini-batches with the Adam optimizer for 20 epochs, which is already long enough to see that the model is not learning.

import numpy as np
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'

n_features = 3  # weekday, monthday, hour
input_dim = n_features
hidden_dim = 64
layer_dim = 3
output_dim = 1

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim).to(device)

criterion = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=1e-2)

train_losses = []
val_losses = []
train_step = make_train_step(model, criterion, optimizer)
n_epochs = 20

for epoch in range(n_epochs):
    batch_losses = []
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.unsqueeze(dim=0).to(device)
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        batch_losses.append(loss)
    training_loss = np.mean(batch_losses)
    train_losses.append(training_loss)    
    with torch.no_grad():
        batch_val_losses = []
        for x_val, y_val in val_loader:
            x_val = x_val.unsqueeze(dim=0).to(device)
            y_val = y_val.to(device)        
            model.eval()
            yhat = model(x_val)
            val_loss = criterion(y_val, yhat).item()
            batch_val_losses.append(val_loss)
        validation_loss = np.mean(batch_val_losses)
        val_losses.append(validation_loss)
    
    print(f"[{epoch+1}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}")

And this is the output:

C:\Users\VS32XI\Anaconda3\lib\site-packages\torch\nn\modules\loss.py:446: UserWarning: Using a target size (torch.Size([1, 1])) that is different to the input size (torch.Size([64, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
  return F.mse_loss(input, target, reduction=self.reduction)
[1] Training loss: 0.0505	 Validation loss: 0.0315
[2] Training loss: 0.0317	 Validation loss: 0.0315
[3] Training loss: 0.0317	 Validation loss: 0.0315
[4] Training loss: 0.0317	 Validation loss: 0.0315
[5] Training loss: 0.0317	 Validation loss: 0.0315
[6] Training loss: 0.0317	 Validation loss: 0.0315
[7] Training loss: 0.0317	 Validation loss: 0.0315
[8] Training loss: 0.0317	 Validation loss: 0.0315
[9] Training loss: 0.0317	 Validation loss: 0.0315
[10] Training loss: 0.0317	 Validation loss: 0.0315
[11] Training loss: 0.0317	 Validation loss: 0.0315
[12] Training loss: 0.0317	 Validation loss: 0.0315
[13] Training loss: 0.0317	 Validation loss: 0.0315
[14] Training loss: 0.0317	 Validation loss: 0.0315
[15] Training loss: 0.0317	 Validation loss: 0.0315
[16] Training loss: 0.0317	 Validation loss: 0.0315
[17] Training loss: 0.0317	 Validation loss: 0.0315
[18] Training loss: 0.0317	 Validation loss: 0.0315
[19] Training loss: 0.0317	 Validation loss: 0.0315
[20] Training loss: 0.0317	 Validation loss: 0.0315

Note 1: Looking at the warning, I’m not sure that it’s the real reason the model is not learning. After all, I’m trying to predict future values of the time series, so 1 seems like a plausible output dimension.

Note 2: To train the model in mini-batches, I relied on the DataLoader class. When iterating over the X and y batches in both the train and validation DataLoaders, the x_batch tensors were 2-dimensional, while the model expects 3-dimensional input. So I used PyTorch’s unsqueeze function to match the expected number of dimensions, as in x_batch.unsqueeze(dim=0). I’m not sure if this is how I should have gone about it, which could also be the issue.


The issue was resolved once I used Tensor.view() to reshape the mini-batches of features in the training and validation sets. As a side note, view() enables fast and memory-efficient reshaping, slicing, and element-wise operations by avoiding an explicit data copy.

It turned out that in the earlier implementation, torch.unsqueeze() did not reshape the batches into tensors with dimensions (batch size, timesteps, number of features). Instead, unsqueeze(dim=0) returns a new tensor with a singleton dimension inserted at the 0th index.
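
For example, with a batch of 64 rows and 3 features (shapes shown as comments):

x = torch.randn(64, 3)
print(x.unsqueeze(dim=0).shape)   # torch.Size([1, 64, 3]) -> one "sample" with 64 timesteps
print(x.view([64, -1, 3]).shape)  # torch.Size([64, 1, 3]) -> 64 samples with 1 timestep each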

So, the mini-batches of features are reshaped as follows: x_batch = x_batch.view([batch_size, -1, n_features]).to(device)

Then, the new training loop becomes:

for epoch in range(n_epochs):
    batch_losses = []
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.view([batch_size, -1, n_features]).to(device) # <---
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        batch_losses.append(loss)
    training_loss = np.mean(batch_losses)
    train_losses.append(training_loss)    
    with torch.no_grad():
        batch_val_losses = []
        for x_val, y_val in val_loader:
            x_val = x_val.view([batch_size, -1, n_features]).to(device) # <---
            y_val = y_val.to(device)        
            model.eval()
            yhat = model(x_val)
            val_loss = criterion(y_val, yhat).item()
            batch_val_losses.append(val_loss)
        validation_loss = np.mean(batch_val_losses)
        val_losses.append(validation_loss)
    
    print(f"[{epoch+1}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}")

Here’s the output:

[1] Training loss: 0.0235	 Validation loss: 0.0173
[2] Training loss: 0.0149	 Validation loss: 0.0086
[3] Training loss: 0.0083	 Validation loss: 0.0074
[4] Training loss: 0.0079	 Validation loss: 0.0069
[5] Training loss: 0.0076	 Validation loss: 0.0069

                          ...

[96] Training loss: 0.0025	 Validation loss: 0.0028
[97] Training loss: 0.0024	 Validation loss: 0.0027
[98] Training loss: 0.0027	 Validation loss: 0.0033
[99] Training loss: 0.0027	 Validation loss: 0.0030
[100] Training loss: 0.0023	 Validation loss: 0.0028

When I use x_batch = x_batch.view([batch_size, -1, n_features]), it throws the following error: shape ‘[128, -1, 10]’ is invalid for input of size 840. How can this be solved?
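
A likely cause, assuming default DataLoader settings: 840 = 84 × 10, i.e. the final batch holds only 84 rows (of 10 features each), so it cannot be viewed with a hard-coded batch size of 128. Two possible workarounds, sketched under that assumption:

# Workaround 1: derive the batch size from the tensor itself, so the
# final (smaller) batch reshapes correctly.
x_batch = x_batch.view([x_batch.size(0), -1, n_features]).to(device)

# Workaround 2: drop the final incomplete batch when constructing the loader.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=False, drop_last=True)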