Dataloader custom collate for different input sizes

Brenda95X · April 1, 2020, 12:48am

Hello,

I’m a fairly new Pytorch user and wondering if anyone could help me with this problem associated with Dataloader.

Here’s a screenshot of my dataframe, inputs are values from ‘y+, index, Re_tau, DU_DY, Y’ column.

Every point in this dataframe, DU_DY & Y always have the same size. However, for different Re_tau values, the size for DU_DY are different (hence, so is the size for Y). And this is where the problem starts in network training. I get a feeling the training data for DU_DY & Y wasn’t split correctly, then checked a few threads and see many CNNs have encountered the same problem and they were solved by calling a custom collate_fn in Dataloader. I attempted to implement my own collate_fn but failed after many tried…

This dataframe ‘df_train’ is first converted to datasets ‘ds_train’, then convert to dataloader.
trainloader = DataLoader(ds_train, batch_size=5, shuffle=True)

Could anyone point me into the right direction on how to write a collate_fn for this different input size problem? I heard there was a Pytorch preview version solve this problem automatically, sadly not the latest stable version.

ptrblck · April 1, 2020, 6:17am

Have a look at this approach posted by @vfdev-5 and let us know, if that helps or not.

Brenda95X · April 1, 2020, 4:14pm

I’ve tried a similar collate_fn before, but doesn’t seem to work if my ds_train is a list that contains tensors.

This is the error meassage I got:
TypeError: expected np.ndarray (got list)

class ChannelDataset(Dataset):

    def __init__(self, df, input_labels, target_label): # input df is a dataframe
        toArray = lambda x: np.atleast_1d(x)  # convert scalar to array
        toTensor = lambda x: torch.from_numpy(x)  # convert array to tensor
        toFloat = lambda x: x.float() if x.dtype == torch.float64 else x  # convert double to float

        self.df = df.applymap(toArray).applymap(toTensor).applymap(toFloat)

        self.inputs = self.df[input_labels].values
        self.target = self.df[target_label].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        inputs = list(self.inputs[index]) # have problem with using torch.cat()
        target = list(self.target[index])[0]

        return inputs, target


class Trainer:
    def _step(self, data_batch, optimizer=None):

        inputs_batch, target_batch = data_batch
        pred_batch = self.model(*inputs_batch)
        loss = self._criterion(pred_batch, target_batch)

        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        return loss.item(), len(target_batch)   
   
 
    def fit(self, df_train, df_val, df_test, batch_size=1, print_freq=1, max_epochs=1000, min_epochs=0, earlystopping=False, patience=0):

        self.df_train = df_train
        self.df_val = df_val

        # Convert dataframes to datasets
        ds_train = ChannelDataset(df_train, self.input_labels, self.target_label)
        ds_val = ChannelDataset(df_val, self.input_labels, self.target_label)
        ds_test = ChannelDataset(df_test, self.input_labels, self.target_label)

        # Convert datasets to dataloaders
        trainloader = DataLoader(ds_train, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: [(torch.from_numpy(dp_train[0]), torch.tensor(dp_train[1])) for dp_train in batch])
        valloader = DataLoader(ds_val, batch_size=batch_size, shuffle=False, collate_fn=lambda batch: [(torch.from_numpy(dp[0]), torch.tensor(dp[1])) for dp in batch])
        testloader = DataLoader(ds_test, batch_size=batch_size, shuffle=False, collate_fn=lambda batch: [(torch.from_numpy(dp[0]), torch.tensor(dp[1])) for dp in batch])#collate_fn = my_collate,

        # Initialize loss_history
        if self.loss_history is None:
            self.loss_history = {'train': [], 'val': [], 'test': []}

        while keep_going:  # In each epoch
            # Set model to train mode
            self.model.train()

            # Loop over trainloader
            losses, nums = zip(*[self._step(data_batch, self._optimizer) for data_batch in trainloader])
            train_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)

            # Set model to eval mode
            self.model.eval()

            # Loop over valloader
            with torch.no_grad():
                losses, nums = zip(*[self._step(data_batch) for data_batch in valloader])
            val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)

            # Loop over testloader
            with torch.no_grad():
                losses, nums = zip(*[self._step(data_batch) for data_batch in testloader])
            test_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)

            # Append losses
            self.loss_history['train'].append(train_loss)
            self.loss_history['val'].append(val_loss)
            self.loss_history['test'].append(test_loss)

ptrblck · April 2, 2020, 2:22am

Since ChannelDataset returns lists, torch.from_numpy won’t work and you could try to use torch.tensor(dp[0]) instead.

Brenda95X · April 2, 2020, 2:56am

I tried that earlier, the error for torch.tensor(dp[0]) is:
only one element tensors can be converted to Python scalars

Does this has anything to do with DU_DY and Y has a bracket grouping all data?

ptrblck · April 2, 2020, 5:41am

I’m not sure what shape and data type self.inputs and self.targets is, but would it work if you remove the list call and pass the arrays directly to the collate function, which will then create the list?

Brenda95X · April 2, 2020, 5:52pm

print(self.inputs) has a shape of (665, 5) and the following structure:

[[tensor([292.934])  tensor([134], dtype=torch.int32)
  tensor([1.1, 1.2, 1.3, 1.4, ..., 1.153]) # DU_DY, size of 153
  tensor([2.1, 2.2, 2.3, 2.4, ..., 2.153]) # Y, size of 153
  tensor([550.123])] # this Re_tau input determines size for DU_DY & Y
 [tensor([50.433])  tensor([97], dtype=torch.int32)
  tensor([1.1, 1.2, 1.3, 1.4, ..., 1.256]) # DU_DY, size of 256
  tensor([2.1, 2.2, 2.3, 2.4, ..., 2.256]) # Y, size of 256
  tensor([1000.23])] # Re_tau input determines size for DU_DY & Y
 [another training data with different DU_DY & Y size]
 ...
 [another training data with different DU_DY & Y size]]

print(self.target) has a shape of (665, 1) this structure:

[[tensor([-1.3254])]
 [tensor([-1.9446])]
 ...
 [tensor( [-1.4099])]
 [tensor( [-1.4342])]
 [tensor( [0.])]]

Before getting rid of the list() in class ChannelDataset: element in dataset ds_train has the same list format as ds_train[0]:

([tensor([292.934]), tensor([134], dtype=torch.int32), tensor([1.1, 1.2, 1.3, 1.4, ..., 1.153]), tensor([2.1, 2.2, 2.3, 2.4, ..., 2.153]), tensor([550.123])], tensor([-1.3254]))

If I get rid of the list() in ChannelDataset(), the error message upon execution is

TypeError                                 Traceback (most recent call last)
<ipython-input-4-4e3f094c5224> in <module>
     35 
     36 '''Fit model'''
---> 37 trainer.fit(df_train, df_val, df_test, batch_size=3, print_freq=1, max_epochs=1, earlystopping=False, patience=30)
     38 
     39 scores = {'Train': evaluate(trainer, df_train),

~\Desktop\IACS\Code\Demo\utils_train.py in fit(self, df_train, df_val, df_test, batch_size, print_freq, max_epochs, min_epochs, earlystopping, patience)
    246 
    247             # Loop over trainloader
--> 248             losses, nums = zip(*[self._step(data_batch, self._optimizer) for data_batch in trainloader])
    249             train_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
    250 

~\Desktop\IACS\Code\Demo\utils_train.py in <listcomp>(.0)
    246 
    247             # Loop over trainloader
--> 248             losses, nums = zip(*[self._step(data_batch, self._optimizer) for data_batch in trainloader])
    249             train_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
    250 

~\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
    343 
    344     def __next__(self):
--> 345         data = self._next_data()
    346         self._num_yielded += 1
    347         if self._dataset_kind == _DatasetKind.Iterable and \

~\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
    383     def _next_data(self):
    384         index = self._next_index()  # may raise StopIteration
--> 385         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    386         if self._pin_memory:
    387             data = _utils.pin_memory.pin_memory(data)

~\Anaconda3\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
     45         else:
     46             data = self.dataset[possibly_batched_index]
---> 47         return self.collate_fn(data)

~\Desktop\IACS\Code\Demo\utils_train.py in my_collate(batch)
    198 
    199         def my_collate(batch):
--> 200             output = [(torch.tensor(dp[0]), torch.tensor(dp[1])) for dp in batch]
    201 
    202             return output

~\Desktop\IACS\Code\Demo\utils_train.py in <listcomp>(.0)
    198 
    199         def my_collate(batch):
--> 200             output = [(torch.tensor(dp[0]), torch.tensor(dp[1])) for dp in batch]
    201 
    202             return output

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

I guess now the ds_train is something like

[(array([tensor([292.934]), tensor([134], dtype=torch.int32), tensor([1.1, 1.2, 1.3, 1.4, ..., 1.153]), tensor([2.1, 2.2, 2.3, 2.4, ..., 2.153]), tensor([550.123])], dtype=object), tensor([-1.3254])),
(array(...)), ...]

I’m not too sure how to fix the dtype=object issue.