I am trying to generate batches from the Dataloader from a tabular csv data. The Dataloader doesn’t seem to render the last batch and shows unequal size error.
My Dataset is Tabular with [m x 23] where at the moment i am only processing 2 features i.e [‘account_status’, ‘has_paid’].
from src.utils import load_config
from torch.utils.data import DataLoader
import pandas as pd
import torch
from src.dataset import one_hot_encode, zero_pad
def one_hot_encode(data, feature):
# one hot encode one feature
# based on the n unique it has
n = len(set(data[feature]))
feat = torch.tensor((data[feature])).to(torch.int64)
feat = FN.one_hot(feat, num_classes=n)
return feat
class FeatureDataset(torch.utils.data.Dataset):
def __init__(self, data, feat_config):
self.data = data
self.feat_config = feat_config
self.len = data.shape[0]
def __len__(self):
return self.len
def __getitem__(self, index):
df = self.data
df = df[-df['default'].isna()]
defaults = ['uuid', 'default']
features_to_use = ['account_status', 'has_paid']
cat_feats = [f for f in self.feat_config.keys() if self.feat_config[f]["type"] == "cat"]
cur_df = df[[*defaults, *features_to_use]]
cur_df = cur_df.iloc[:index, :]
for feat in cat_feats:
cur_df[feat].fillna(0, inplace=True)
# get one hots
for feat in features_to_use:
onehots = one_hot_encode(cur_df, feat)
return onehots
file = pd.read_csv('data/dataset.csv', sep=';')
config = load_config('features.yml')
my_dataset = FeatureDataset(data=file, feat_config=config)
my_loader = DataLoader(my_dataset, batch_size=10, num_workers=1, shuffle=True, drop_last=True)
for i, sample in enumerate(my_loader, 0):
print(i)
Stack error shows like this.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_28183/2352520513.py in <module>
42 my_loader = DataLoader(my_dataset, batch_size=10, num_workers=1, shuffle=True, drop_last=True)
43
---> 44 for i, sample in enumerate(my_loader, 0):
45 print(i)
46 # print(i, print(type(sample), sample.shape))
~/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
~/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self)
1201 else:
1202 del self._task_info[idx]
-> 1203 return self._process_data(data)
1204
1205 def _try_put_index(self):
~/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _process_data(self, data)
1227 self._try_put_index()
1228 if isinstance(data, ExceptionWrapper):
-> 1229 data.reraise()
1230 return data
1231
~/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/_utils.py in reraise(self)
432 # instantiate since we don't know how to
433 raise RuntimeError(msg) from None
--> 434 raise exception
435
436
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/venkat/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/home/venkat/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
return self.collate_fn(data)
File "/home/venkat/.cache/pypoetry/virtualenvs/ponzi-Y6EpCXnx-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [37952, 2] at entry 0 and [47833, 2] at entry 1
Any idea what is going wrong here and how can i fix it?