RuntimeError: stack expects each tensor to be equal size, but got [31435, 2] at entry 0 and [39327, 2] [Tabular data]]

venkat.r · November 20, 2021, 7:10pm

I am trying to generate batches from the Dataloader from a tabular csv data. The Dataloader doesn’t seem to render the last batch and shows unequal size error.

My Dataset is Tabular with [m x 23] where at the moment i am only processing 2 features i.e [‘account_status’, ‘has_paid’].

from src.utils import load_config
from torch.utils.data import DataLoader
import pandas as pd
import torch
from src.dataset import one_hot_encode, zero_pad

def one_hot_encode(data, feature):
    # one hot encode one feature
    # based on the n unique it has

    n = len(set(data[feature]))
    feat = torch.tensor((data[feature])).to(torch.int64)
    feat = FN.one_hot(feat, num_classes=n)

    return feat

class FeatureDataset(torch.utils.data.Dataset):

    def __init__(self, data, feat_config):
        self.data = data
        self.feat_config = feat_config
        self.len = data.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        df = self.data
        df = df[-df['default'].isna()]
        defaults = ['uuid', 'default']
        features_to_use = ['account_status', 'has_paid']
        cat_feats = [f for f in self.feat_config.keys() if self.feat_config[f]["type"] == "cat"]
        cur_df = df[[*defaults, *features_to_use]]

        cur_df = cur_df.iloc[:index, :]

        for feat in cat_feats:
            cur_df[feat].fillna(0, inplace=True)

        # get one hots
        for feat in features_to_use:
            onehots = one_hot_encode(cur_df, feat)

        return onehots

file = pd.read_csv('data/dataset.csv', sep=';')
config = load_config('features.yml')
my_dataset = FeatureDataset(data=file, feat_config=config)
my_loader = DataLoader(my_dataset, batch_size=10, num_workers=1, shuffle=True, drop_last=True)

for i, sample in enumerate(my_loader, 0):
    print(i)

Stack error shows like this.

---------------------------------------------------------------------------

RuntimeError                              Traceback (most recent call last)

/tmp/ipykernel_28183/2352520513.py in <module>
     42 my_loader = DataLoader(my_dataset, batch_size=10, num_workers=1, shuffle=True, drop_last=True)
     43 
---> 44 for i, sample in enumerate(my_loader, 0):
     45     print(i)
     46     # print(i, print(type(sample), sample.shape))

~/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self)
    519             if self._sampler_iter is None:
    520                 self._reset()
--> 521             data = self._next_data()
    522             self._num_yielded += 1
    523             if self._dataset_kind == _DatasetKind.Iterable and \

~/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self)
   1201             else:
   1202                 del self._task_info[idx]
-> 1203                 return self._process_data(data)
   1204 
   1205     def _try_put_index(self):

~/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _process_data(self, data)
   1227         self._try_put_index()
   1228         if isinstance(data, ExceptionWrapper):
-> 1229             data.reraise()
   1230         return data
   1231 

~/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/_utils.py in reraise(self)
    432             # instantiate since we don't know how to
    433             raise RuntimeError(msg) from None
--> 434         raise exception
    435 
    436 

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/venkat/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/venkat/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/home/venkat/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [37952, 2] at entry 0 and [47833, 2] at entry 1

Any idea what is going wrong here and how can i fix it?

ptrblck · November 21, 2021, 12:27am

The error is raised as it seems your custom Dataset is returning samples with a variable shape, which will fail in the default collate_fn in the DataLoader when the batch is created via torch.stack.
You would have to make sure to return samples with an equal length (e.g. by padding, slicing etc.) or you could use a custom collate_fn, which would create a list of samples instead of a tensor.
In the latter case you would then have to check if your model is able to process a list of inputs or if other steps are necessary.

aldrinjenson · September 25, 2022, 9:02am

I was stuck with this error for quite some time and the fix actually turned out to be easy.
You just need to add a trasnform function to the dataloaders, eg: Resize(128).
If anyone in future got this same error due to your images being of different sizes, try adding a transform function(item_tfms).

eg:

my_loader = DataLoader(my_dataset, batch_size=10, num_workers=1, shuffle=True, drop_last=True, 
item_tfms=Resize(128)
)