DataLoader() for aggregated/clustered/panel data

Say I have a data set with multiple observations per individual (also known as panel data). I want to keep each individual's observations together when sampling; that is, I want to sample my dataset at the level of the INDIVIDUALS, not at the level of the observations (rows).

Concretely, imagine we have the following data, where the column id_ind identifies my INDIVIDUALS: the first two rows have a 1 in id_ind since these two observations belong to the first individual, the 3rd and 4th rows belong to the second individual (id_ind == 2), and so forth…

import pandas as pd
X =  pd.DataFrame.from_dict({'x1_1': {0: -0.1766214634108258, 1: 1.645852185286492, 2: -0.13348860101031038, 3: 1.9681043689968933, 4: -1.7004428240831382, 5: 1.4580091413853749, 6: 0.06504113741068565, 7: -1.2168493676768384, 8: -0.3071304478616376, 9: 0.07121332925591593}, 'x1_2': {0: -2.4207773498298844, 1: -1.0828751040719462, 2: 2.73533787008624, 3: 1.5979611987152071, 4: 0.08835542172064115, 5: 1.2209786277076156, 6: -0.44205979195950784, 7: -0.692872860268244, 8: 0.0375521181289943, 9: 0.4656030062266639}, 'x1_3': {0: -1.548320898226322, 1: 0.8457342014424675, 2: -0.21250514722879738, 3: 0.5292389938329516, 4: -2.593946520223666, 5: -0.6188958526077123, 6: 1.6949245117526974, 7: -1.0271341091035742, 8: 0.637561891142571, 9: -0.7717170035055559}, 'x2_1': {0: 0.3797245517345564, 1: -2.2364391598508835, 2: 0.6205947900678905, 3: 0.6623865847688559, 4: 1.562036259999875, 5: -0.13081282910947759, 6: 0.03914373833251773, 7: -0.995761652421108, 8: 1.0649494418154162, 9: 1.3744782478849122}, 'x2_2': {0: -0.5052556836786106, 1: 1.1464291788297152, 2: -0.5662380273138174, 3: 0.6875729143723538, 4: 0.04653136473130827, 5: -0.012885303852347407, 6: 1.5893672346098884, 7: 0.5464286050059511, 8: -0.10430829457707284, 9: -0.5441755265313813}, 'x2_3': {0: -0.9762973303149007, 1: -0.983731467806563, 2: 1.465827578266328, 3: 0.5325950414202745, 4: -1.4452121324204903, 5: 0.8148816373643869, 6: 0.470791989780882, 7: -0.17951636294180473, 8: 0.7351814781280054, 9: -0.28776723200679066}, 'x3_1': {0: 0.12751822396637064, 1: -0.21926633684030983, 2: 0.15758799357206943, 3: 0.5885412224632464, 4: 0.11916562911189271, 5: -1.6436210334529249, 6: -0.12444368631987467, 7: 1.4618564171802453, 8: 0.6847234328916137, 9: -0.23177118858569187}, 'x3_2': {0: -0.6452955690715819, 1: 1.052094761527654, 2: 0.20190339195326157, 3: 0.6839430295237913, 4: -0.2607691613858866, 5: 0.3315513026670213, 6: 0.015901139336566113, 7: 0.15243420084881903, 8: -0.7604225072161022, 9: -0.4387652927008854}, 'x3_3': {0: -1.067058994377549, 1: 0.8026914180717286, 2: -1.9868531745912268, 3: -0.5057770735303253, 4: -1.6589569342151713, 5: 0.358172252880764, 6: 1.9238983803281329, 7: 2.2518318810978246, 8: -1.2781475121874357, 9: -0.7103081175166167}})
Y = pd.DataFrame.from_dict({'CHOICE': {0: 1.0, 1: 1.0, 2: 2.0, 3: 2.0, 4: 3.0, 5: 2.0, 6: 1.0, 7: 1.0, 8: 2.0, 9: 2.0}})
Z = pd.DataFrame.from_dict({'z1': {0: 2.4196730570917233, 1: 2.4196730570917233, 2: 2.822802255159467, 3: 2.822802255159467, 4: 2.073171091633643, 5: 2.073171091633643, 6: 2.044165101485163, 7: 2.044165101485163, 8: 2.4001241292606275, 9: 2.4001241292606275}, 'z2': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 0.0, 9: 0.0}, 'z3': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 2.0, 5: 2.0, 6: 2.0, 7: 2.0, 8: 3.0, 9: 3.0}})
id = pd.DataFrame.from_dict({'id_choice': {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0, 5: 6.0, 6: 7.0, 7: 8.0, 8: 9.0, 9: 10.0}, 'id_ind': {0: 1.0, 1: 1.0, 2: 2.0, 3: 2.0, 4: 3.0, 5: 3.0, 6: 4.0, 7: 4.0, 8: 5.0, 9: 5.0}} )

# Create a dataframe with all the data 
data = pd.concat([id, X, Z, Y], axis=1)

print(data.head(4))
   id_choice  id_ind      x1_1      x1_2      x1_3      x2_1      x2_2  \
0        1.0     1.0 -0.176621 -2.420777 -1.548321  0.379725 -0.505256   
1        2.0     1.0  1.645852 -1.082875  0.845734 -2.236439  1.146429   
2        3.0     2.0 -0.133489  2.735338 -0.212505  0.620595 -0.566238   
3        4.0     2.0  1.968104  1.597961  0.529239  0.662387  0.687573   

       x2_3      x3_1      x3_2      x3_3        z1   z2   z3  CHOICE  
0 -0.976297  0.127518 -0.645296 -1.067059  2.419673  0.0  1.0     1.0  
1 -0.983731 -0.219266  1.052095  0.802691  2.419673  0.0  1.0     1.0  
2  1.465828  0.157588  0.201903 -1.986853  2.822802  0.0  1.0     2.0  
3  0.532595  0.588541  0.683943 -0.505777  2.822802  0.0  1.0     2.0  

Now, I have written a ChoiceDataset class on top of the torch.utils.data.Dataset primitive. Unfortunately, it samples at the level of the observations.

# Create a dictionary with the data
data_dict = {'idx': id,
            'X':   X,
            'Z':   Z,
            'Y':   Y}
# Create a pytorch.Dataset class
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
class ChoiceDataset(Dataset):
    def __init__(self, data):
        # Choices recoded to start at 0, shape (N_t, 1)
        self.Y = torch.LongTensor(data['Y'].values - 1).reshape(len(data['Y'].index), 1)
        self.J = torch.unique(self.Y).shape[0]            # number of alternatives
        # Individual identifier of each observation, shape (N_t, 1)
        self.id = torch.LongTensor(data['idx']['id_ind'].values).reshape(len(data['idx']['id_ind'].index), 1)
        self.N = torch.unique(self.id).shape[0]           # total number of individuals
        _, self.t_n = self.id.unique(return_counts=True)  # observations per individual
        self.N_t = self.t_n.sum(axis=0).item()            # total number of observations
        self.X_wide = torch.DoubleTensor(data['X'].values)
        self.K = int(self.X_wide.shape[1] / self.J)       # number of attributes
        self.Z = torch.DoubleTensor(data['Z'].values)
        self.X = self.X_wide.reshape(self.N_t, self.K, self.J)

    def __len__(self):
        # __len__ equals the number of individuals (not observations),
        # since this is the level at which I want to sample
        return self.N

    def __getitem__(self, idx):
        # return a dictionary with the data
        return {'Y': self.Y[idx], 'X': self.X[idx], 'id': self.id[idx], 'Z': self.Z[idx]}

As you can see below, it is sampling at the level of the observations. Could you please suggest some changes to make it sample at the level of individuals?

df_train = ChoiceDataset(data_dict)
data_train = DataLoader(df_train, batch_size=3, shuffle=False, num_workers=0)
for batch_idx, data in enumerate(data_train):
    print('batch_idx:',batch_idx) 
    print(data['Y'].shape)

#batch_idx: 0
#torch.Size([3, 1]) # takes first 3 observations
#batch_idx: 1
#torch.Size([2, 1]) # takes the last 2 observations


Update

I was expecting to solve the problem with the following __getitem__() function, which selects the observations that belong to each individual.

    def __getitem__(self, idx):
        # return a dictionary with the data
        # Get the position of individual idx in the dataset
        ind_position = torch.where(self.id == idx)[0]
        return {'Y': self.Y[ind_position], 'X': self.X[ind_position], 'id': self.id[ind_position], 'Z': self.Z[ind_position]}  

However, I am getting the following error, which, if I am reading it correctly, tells me that torch.stack(), which is used internally when putting the batches together, expects tensors of equal size (probably one [1, :, …, :] row per sample?). Unfortunately, I am still stuck with this; a minimal reproduction follows the traceback below.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
c:\Users\u0133260\Documents\_local_git_repos\MixTasteNet_project\MixTasteNet_local\CODE\SO_Q\dataloader.py in line 53
     52 df_train =ChoiceDataset(data_dict)
     53 data_train = DataLoader(df_train, batch_size=2, shuffle=False, num_workers=0)
---> 54 for batch_idx, data in enumerate(data_train):
     55    print('batch_idx:',batch_idx) 

File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\dataloader.py:681, in _BaseDataLoaderIter.__next__(self)
    678 if self._sampler_iter is None:
    679     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    680     self._reset()  # type: ignore[call-arg]
--> 681 data = self._next_data()
    682 self._num_yielded += 1
    683 if self._dataset_kind == _DatasetKind.Iterable and \
    684         self._IterableDataset_len_called is not None and \
    685         self._num_yielded > self._IterableDataset_len_called:

File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\dataloader.py:721, in _SingleProcessDataLoaderIter._next_data(self)
    719 def _next_data(self):
    720     index = self._next_index()  # may raise StopIteration
--> 721     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    722     if self._pin_memory:
    723         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\_utils\fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     50 else:
     51     data = self.dataset[possibly_batched_index]
---> 52 return self.collate_fn(data)

File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\_utils\collate.py:160, in default_collate(batch)
    158 elif isinstance(elem, collections.abc.Mapping):
    159     try:
--> 160         return elem_type({key: default_collate([d[key] for d in batch]) for key in elem})
    161     except TypeError:
    162         # The mapping type may not support `__init__(iterable)`.
    163         return {key: default_collate([d[key] for d in batch]) for key in elem}

File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\_utils\collate.py:160, in <dictcomp>(.0)
    158 elif isinstance(elem, collections.abc.Mapping):
    159     try:
--> 160         return elem_type({key: default_collate([d[key] for d in batch]) for key in elem})
    161     except TypeError:
    162         # The mapping type may not support `__init__(iterable)`.
    163         return {key: default_collate([d[key] for d in batch]) for key in elem}

File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\_utils\collate.py:141, in default_collate(batch)
    139         storage = elem.storage()._new_shared(numel, device=elem.device)
    140         out = elem.new(storage).resize_(len(batch), *list(elem.size()))
--> 141     return torch.stack(batch, 0, out=out)
    142 elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
    143         and elem_type.__name__ != 'string_':
    144     if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
    145         # array of string classes and object

RuntimeError: stack expects each tensor to be equal size, but got [0, 1] at entry 0 and [2, 1] at entry 1
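
If it helps, the mismatch can be reproduced in isolation with nothing but torch.stack(), which seems to confirm the reading above; note that the first entry in the failing batch is an empty [0, 1] tensor:

import torch
# Minimal reproduction: the default collate stacks the per-sample tensors,
# and torch.stack() requires equal shapes, so an empty selection
# (shape [0, 1]) next to a full one (shape [2, 1]) raises the same error.
torch.stack([torch.zeros(0, 1), torch.zeros(2, 1)], 0)
# RuntimeError: stack expects each tensor to be equal size, ...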

PS: crossposted on Stack Overflow: "PyTorch: `DataLoader()` for aggregated/clustered/panel data".

The error message points towards an empty sample in your batch, so check whether you are properly selecting the individuals in your __getitem__ method.
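
For example, indexing the dataset directly (bypassing the DataLoader) should make it visible; a quick check along these lines:

# Quick sanity check: fetch the first sample by hand.
sample = df_train[0]
print(sample['Y'].shape)  # torch.Size([0, 1]) would mean idx=0 matched no individual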


You have a good eye there. Indeed, the sampler index starts from 0, while the individual index starts from 1. Hence, the code below solves the issue. Thank you very much!

def __getitem__(self, idx):
    # return a dictionary with the data
    # Get the position of individual idx in the dataset
    ind_position = torch.where(self.id == idx + 1)[0]
    return {'Y': self.Y[ind_position], 'X': self.X[ind_position], 'id': self.id[ind_position], 'Z': self.Z[ind_position]}  
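
With this fix, the loader now batches individuals rather than observations:

df_train = ChoiceDataset(data_dict)
data_train = DataLoader(df_train, batch_size=2, shuffle=False, num_workers=0)
for batch_idx, data in enumerate(data_train):
    print('batch_idx:', batch_idx)
    print(data['Y'].shape)

# batch_idx: 0
# torch.Size([2, 2, 1])  # 2 individuals with 2 observations each
# batch_idx: 1
# torch.Size([2, 2, 1])
# batch_idx: 2
# torch.Size([1, 2, 1])  # the remaining individual

One caveat: the default collate only stacks these samples cleanly because every individual in this toy data happens to have the same number of observations (t_n == 2 for all). For an unbalanced panel, torch.stack() would fail again, and a custom collate_fn would be needed. A minimal sketch (panel_collate is just an illustrative name, not a library function) that concatenates the per-individual tensors along the observation axis instead of stacking them:

def panel_collate(batch):
    # Concatenate each field along dim 0 (the observation axis), so
    # individuals with different numbers of rows can share a batch.
    return {key: torch.cat([sample[key] for sample in batch], dim=0)
            for key in batch[0]}

data_train = DataLoader(df_train, batch_size=2, shuffle=True,
                        num_workers=0, collate_fn=panel_collate)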


