Runtime error using Dataloader [please help]

Hello everyone :wave:

I am working on a simple LSTM demo but I keep running into the following error during training:

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 90 and 89 in dimension 1

Any help would be greatly appreciated! :pray:

Full stack trace:


RuntimeError: Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py", line 68, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py", line 68, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py", line 43, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 90 and 89 in dimension 1 at /pytorch/aten/src/TH/generic/THTensor.cpp:711

Hyperparameters:

SEQ_LENGTH = 90 # 90 day average
BATCH_SIZE = 2
EPOCHS = 100
NUM_FEATURES = 4
HIDDEN_SIZE = 32
NUM_LAYERS = 2
DROPOUT = 0.2
NUM_DIR = 1
LEARNING_RATE = 0.002

Dataset:

import torch
from torch.utils import data


class TimeSeriesDataset(data.Dataset):
    def __init__(self, samples, targets, seq_length):
        'Initialization'
        self.samples = samples
        self.targets = targets
        self.seq_length = seq_length

    def __getitem__(self, index):
        # Slice a window of seq_length rows starting at index
        x = torch.tensor(self.samples.iloc[index:index + self.seq_length].values).float()
        y = torch.tensor(self.targets.iloc[index:index + self.seq_length].values).float()
        return x, y

    def __len__(self):
        return len(self.samples)

Dataloader:

training_dataset = TimeSeriesDataset(x_train, y_train, SEQ_LENGTH)
test_dataset = TimeSeriesDataset(x_test, y_test, SEQ_LENGTH)

training_generator = torch.utils.data.DataLoader(
            training_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            num_workers=4,
            drop_last=True)
test_generator = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            num_workers=4,
            drop_last=True)

Training code:

def training(model, epochs, state_dim):

    for epoch in range(epochs):

        # Initialize LSTM states:
        # (num_layers * num_directions, batch, hidden_size)
        states = (torch.zeros(state_dim).to(device),
                  torch.zeros(state_dim).to(device))

        # Training
        for step, (x_batch, y_batch) in enumerate(training_generator):
            # Reshape to (seq_len, batch, input_size)
            x_batch = x_batch.permute(1, 0, 2)
            y_batch = y_batch.permute(1, 0).unsqueeze(dim=2)

            # Move to GPU
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            # Detach the states so gradients don't flow across batches
            states = [state.detach() for state in states]
            prediction, states = model(x_batch, states)

            model.zero_grad()
            loss = criterion(prediction, y_batch)
            loss.backward()
            optimizer.step()

            print('Epoch [{}/{}], Step[{}], Loss: {:.4f}'
                  .format(epoch + 1, epochs, step, loss.item()))


training(model, EPOCHS, state_dim=(NUM_LAYERS * NUM_DIR, BATCH_SIZE, HIDDEN_SIZE))

Another perplexing detail: with a batch size of 1 the data loads fine and the model trains for several epochs, but with any batch size larger than 1 I get the error above :thinking:
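
If I read the trace correctly, the failure comes from the torch.stack call inside default_collate, and I can reproduce the same behaviour in isolation (this is just my attempt to narrow it down; the tensor shapes here are made up to match the error message):

import torch

full = torch.randn(90, 4)   # a full 90-step window with 4 features
short = torch.randn(89, 4)  # a window that is one step too short

# batch_size = 1: stacking a single tensor always works
print(torch.stack([full], 0).shape)    # torch.Size([1, 90, 4])

# batch_size = 2: stacking tensors with mismatched lengths fails,
# raising the same kind of "Sizes of tensors must match" RuntimeError
torch.stack([full, short], 0)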

Based on the error message it looks like the samples are being stacked in dim0 and apparently have different lengths (90 vs. 89) in dimension 1.
Are you loading the samples as [seq_len, batch_size, features]?
Note that the default collate_fn will try to stack the tensors in dim0, which would increase your seq_len if you are using that format.
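
To verify, you could check the shapes returned by the Dataset directly, bypassing the DataLoader; the last few indices are the interesting ones, since your __getitem__ slices index:index + seq_length while __len__ returns len(self.samples). A quick sketch (assuming training_dataset is constructed as in your post):

for i in range(len(training_dataset) - 3, len(training_dataset)):
    x, y = training_dataset[i]
    print(i, x.shape, y.shape)

If the last windows come back with fewer than SEQ_LENGTH steps, the default collate_fn cannot stack them for batch_size > 1 (while batch_size = 1 never hits the size check). In that case you would either make every window the same length (e.g. by having __len__ account for the window size) or pass a custom collate_fn, e.g. one based on torch.nn.utils.rnn.pad_sequence.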