Say I have a data set with multiple observations per individual (also known as panel data). Hence, I want to sample them together; that is to say I want to sample my dataset at the level of the INDIVIDUALS, not at the level of the observations (or rows).
That being said, imagine we have the following data, where I identify my INDIVIDUALS in column id_ind
, so the first two rows have a 1
in id_ind
since these two observations belong to the first individual
. Then, the 3rd and the 4th rows belong to the second individual
(id_ind == 2
), and so forth…
import pandas as pd
X = pd.DataFrame.from_dict({'x1_1': {0: -0.1766214634108258, 1: 1.645852185286492, 2: -0.13348860101031038, 3: 1.9681043689968933, 4: -1.7004428240831382, 5: 1.4580091413853749, 6: 0.06504113741068565, 7: -1.2168493676768384, 8: -0.3071304478616376, 9: 0.07121332925591593}, 'x1_2': {0: -2.4207773498298844, 1: -1.0828751040719462, 2: 2.73533787008624, 3: 1.5979611987152071, 4: 0.08835542172064115, 5: 1.2209786277076156, 6: -0.44205979195950784, 7: -0.692872860268244, 8: 0.0375521181289943, 9: 0.4656030062266639}, 'x1_3': {0: -1.548320898226322, 1: 0.8457342014424675, 2: -0.21250514722879738, 3: 0.5292389938329516, 4: -2.593946520223666, 5: -0.6188958526077123, 6: 1.6949245117526974, 7: -1.0271341091035742, 8: 0.637561891142571, 9: -0.7717170035055559}, 'x2_1': {0: 0.3797245517345564, 1: -2.2364391598508835, 2: 0.6205947900678905, 3: 0.6623865847688559, 4: 1.562036259999875, 5: -0.13081282910947759, 6: 0.03914373833251773, 7: -0.995761652421108, 8: 1.0649494418154162, 9: 1.3744782478849122}, 'x2_2': {0: -0.5052556836786106, 1: 1.1464291788297152, 2: -0.5662380273138174, 3: 0.6875729143723538, 4: 0.04653136473130827, 5: -0.012885303852347407, 6: 1.5893672346098884, 7: 0.5464286050059511, 8: -0.10430829457707284, 9: -0.5441755265313813}, 'x2_3': {0: -0.9762973303149007, 1: -0.983731467806563, 2: 1.465827578266328, 3: 0.5325950414202745, 4: -1.4452121324204903, 5: 0.8148816373643869, 6: 0.470791989780882, 7: -0.17951636294180473, 8: 0.7351814781280054, 9: -0.28776723200679066}, 'x3_1': {0: 0.12751822396637064, 1: -0.21926633684030983, 2: 0.15758799357206943, 3: 0.5885412224632464, 4: 0.11916562911189271, 5: -1.6436210334529249, 6: -0.12444368631987467, 7: 1.4618564171802453, 8: 0.6847234328916137, 9: -0.23177118858569187}, 'x3_2': {0: -0.6452955690715819, 1: 1.052094761527654, 2: 0.20190339195326157, 3: 0.6839430295237913, 4: -0.2607691613858866, 5: 0.3315513026670213, 6: 0.015901139336566113, 7: 0.15243420084881903, 8: -0.7604225072161022, 9: -0.4387652927008854}, 'x3_3': {0: -1.067058994377549, 1: 0.8026914180717286, 2: -1.9868531745912268, 3: -0.5057770735303253, 4: -1.6589569342151713, 5: 0.358172252880764, 6: 1.9238983803281329, 7: 2.2518318810978246, 8: -1.2781475121874357, 9: -0.7103081175166167}})
Y = pd.DataFrame.from_dict({'CHOICE': {0: 1.0, 1: 1.0, 2: 2.0, 3: 2.0, 4: 3.0, 5: 2.0, 6: 1.0, 7: 1.0, 8: 2.0, 9: 2.0}})
Z = pd.DataFrame.from_dict({'z1': {0: 2.4196730570917233, 1: 2.4196730570917233, 2: 2.822802255159467, 3: 2.822802255159467, 4: 2.073171091633643, 5: 2.073171091633643, 6: 2.044165101485163, 7: 2.044165101485163, 8: 2.4001241292606275, 9: 2.4001241292606275}, 'z2': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 0.0, 9: 0.0}, 'z3': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 2.0, 5: 2.0, 6: 2.0, 7: 2.0, 8: 3.0, 9: 3.0}})
id = pd.DataFrame.from_dict({'id_choice': {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0, 5: 6.0, 6: 7.0, 7: 8.0, 8: 9.0, 9: 10.0}, 'id_ind': {0: 1.0, 1: 1.0, 2: 2.0, 3: 2.0, 4: 3.0, 5: 3.0, 6: 4.0, 7: 4.0, 8: 5.0, 9: 5.0}} )
# Create a dataframe with all the data
data = pd.concat([id, X, Z, Y], axis=1)
print(data.head(4))
id_choice id_ind x1_1 x1_2 x1_3 x2_1 x2_2 \
0 1.0 1.0 -0.176621 -2.420777 -1.548321 0.379725 -0.505256
1 2.0 1.0 1.645852 -1.082875 0.845734 -2.236439 1.146429
2 3.0 2.0 -0.133489 2.735338 -0.212505 0.620595 -0.566238
3 4.0 2.0 1.968104 1.597961 0.529239 0.662387 0.687573
x2_3 x3_1 x3_2 x3_3 z1 z2 z3 CHOICE
0 -0.976297 0.127518 -0.645296 -1.067059 2.419673 0.0 1.0 1.0
1 -0.983731 -0.219266 1.052095 0.802691 2.419673 0.0 1.0 1.0
2 1.465828 0.157588 0.201903 -1.986853 2.822802 0.0 1.0 2.0
3 0.532595 0.588541 0.683943 -0.505777 2.822802 0.0 1.0 2.0
Now, I have written the ChoiceDataset
class around the primitive torch.utils.data.Dataset
. Unfortunately, it is sampling at the level of the observations.
# Create a dictionary with the data
data_dict = {'idx': id,
'X': X,
'Z': Z,
'Y': Y}
# Create a pytorch.Dataset class
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
class ChoiceDataset(Dataset):
def __init__(self, data):
self.Y = torch.LongTensor(data['Y'].values -1).reshape(len(data['Y'].index),1)
self.J = torch.unique(self.Y).shape[0]
self.id = torch.LongTensor(data['idx']['id_ind'].values).reshape(len(data['idx']['id_ind'].index),1)
self.N = torch.unique(self.id).shape[0] # Total number of individuals
_,self.t_n = self.id.unique(return_counts=True)
self.N_t = self.t_n.sum(axis=0).item() #total number of observations
self.X_wide = torch.DoubleTensor(data['X'].values)
self.K = int(self.X_wide.shape[1] / self.J)
self.Z = torch.DoubleTensor(data['Z'].values)
self.X = self.X_wide.reshape(self.N_t ,self.K, self.J)
def __len__(self):
# return a dictionary with the data dimensions
# __len__ is equal to the number of individual here (not the observations)
# since this is the level at which I want to sample from
return self.N
def __getitem__(self, idx):
# return a dictionary with the data
return {'Y': self.Y[idx], 'X': self.X[idx], 'id': self.id[idx], 'Z': self.Z[idx]}
As you can see below, it is sampling at the level of the observations. Could you please suggest some changes to make it sample at the level of individuals?
df_train = ChoiceDataset(data_dict)
data_train = DataLoader(df_train, batch_size=3, shuffle=False, num_workers=0)
for batch_idx, data in enumerate(data_train):
print('batch_idx:',batch_idx)
print(data['Y'].shape)
#batch_idx: 0
#torch.Size([3, 1]) # takes first 3 observations
#batch_idx: 1
#torch.Size([2, 1]) # takes the last 2 observations
Update
When selecting the observations that belong to each individual, by using the following __getitem__()
function, I was expecting to solve the problem.
def __getitem__(self, idx):
# return a dictionary with the data
# Get the position of individual idx in the dataset
ind_position = torch.where(self.id == idx)[0]
return {'Y': self.Y[ind_position], 'X': self.X[ind_position], 'id': self.id[ind_position], 'Z': self.Z[ind_position]}
However, I am getting the following error, which, if I am reading it correctly, is telling me that the, internally, torch.stack()
is meant to receive only tensors of the same size (probably [1,:,…,:] row?) when putting the batches together. Unfortunately, I am still stuck with this.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
c:\Users\u0133260\Documents\_local_git_repos\MixTasteNet_project\MixTasteNet_local\CODE\SO_Q\dataloader.py in line 53
52 df_train =ChoiceDataset(data_dict)
53 data_train = DataLoader(df_train, batch_size=2, shuffle=False, num_workers=0)
---> 54 for batch_idx, data in enumerate(data_train):
55 print('batch_idx:',batch_idx)
File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\dataloader.py:681, in _BaseDataLoaderIter.__next__(self)
678 if self._sampler_iter is None:
679 # TODO(https://github.com/pytorch/pytorch/issues/76750)
680 self._reset() # type: ignore[call-arg]
--> 681 data = self._next_data()
682 self._num_yielded += 1
683 if self._dataset_kind == _DatasetKind.Iterable and \
684 self._IterableDataset_len_called is not None and \
685 self._num_yielded > self._IterableDataset_len_called:
File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\dataloader.py:721, in _SingleProcessDataLoaderIter._next_data(self)
719 def _next_data(self):
720 index = self._next_index() # may raise StopIteration
--> 721 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
722 if self._pin_memory:
723 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\_utils\fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
50 else:
51 data = self.dataset[possibly_batched_index]
---> 52 return self.collate_fn(data)
File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\_utils\collate.py:160, in default_collate(batch)
158 elif isinstance(elem, collections.abc.Mapping):
159 try:
--> 160 return elem_type({key: default_collate([d[key] for d in batch]) for key in elem})
161 except TypeError:
162 # The mapping type may not support `__init__(iterable)`.
163 return {key: default_collate([d[key] for d in batch]) for key in elem}
File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\_utils\collate.py:160, in <dictcomp>(.0)
158 elif isinstance(elem, collections.abc.Mapping):
159 try:
--> 160 return elem_type({key: default_collate([d[key] for d in batch]) for key in elem})
161 except TypeError:
162 # The mapping type may not support `__init__(iterable)`.
163 return {key: default_collate([d[key] for d in batch]) for key in elem}
File c:\Users\u0133260\Anaconda3\envs\pyt\lib\site-packages\torch\utils\data\_utils\collate.py:141, in default_collate(batch)
139 storage = elem.storage()._new_shared(numel, device=elem.device)
140 out = elem.new(storage).resize_(len(batch), *list(elem.size()))
--> 141 return torch.stack(batch, 0, out=out)
142 elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
143 and elem_type.__name__ != 'string_':
144 if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
145 # array of string classes and object
RuntimeError: stack expects each tensor to be equal size, but got [0, 1] at entry 0 and [2, 1] at entry 1
PS: crossposted on Stackoverflow: python - PyTorch: `DataLoader()` for aggregated/clustered/panel data - Stack Overflow