Got error in custom dataloader

bajibabu · June 1, 2017, 1:03pm

I wrote a custom data loader by following the PyTorch examples. This is the code I have written:

import torch
import torch.utils.data
import numpy as np


# reads the binary file and return the data in ascii format
def _read_binary_file(fname, dim):
    with open(fname, 'rb') as fid:
        data = np.fromfile(fid, dtype=np.float32)
    assert data.shape[0] % dim == 0.0
    data = data.reshape(-1, dim)
    return data, data.shape[0]

class LoadDataset(torch.utils.data.Dataset):
    """
    Custom dataset compatible with torch.utils.data.DataLoader.

    Each item is one (input file, target file) pair, read from disk on
    access. The two tensors of an item have the same number of rows
    (frames), but that number depends on the file that was read.
    """
    def __init__(self, x_files_list, y_files_list, in_dim, out_dim):
        """Set the path for data

        Args:
            x_files_list: list of input files with full path
            y_files_list: list of target files with full path
            in_dim: input dimension (float32 values per input frame)
            out_dim: output dimension (float32 values per target frame)
        """
        self.x_files_list = x_files_list
        self.y_files_list = y_files_list
        self.in_dim = in_dim
        self.out_dim = out_dim

    def __getitem__(self, index):
        """Returns one data pair (x_data, y_data).

        ``x_data`` has shape ``(n_frames, in_dim)`` and ``y_data`` has shape
        ``(n_frames, out_dim)``.

        Raises:
            ValueError: if the two files hold a different number of frames.
        """
        x_file = self.x_files_list[index]
        y_file = self.y_files_list[index]
        x_data, no_frames_x = _read_binary_file(x_file, self.in_dim)
        y_data, no_frames_y = _read_binary_file(y_file, self.out_dim)

        # Explicit check rather than ``assert`` so it survives ``python -O``
        # and the message points at the offending pair of files.
        if no_frames_x != no_frames_y:
            raise ValueError(
                "frame count mismatch: {} has {} frames, {} has {}".format(
                    x_file, no_frames_x, y_file, no_frames_y))

        return (torch.Tensor(x_data), torch.Tensor(y_data))

    def __len__(self):
        """Number of complete (input, target) pairs.

        Only indices present in both lists can be served, so the shorter
        list bounds the dataset; this avoids an IndexError at the end of an
        epoch when the two lists differ in length.
        """
        return min(len(self.x_files_list), len(self.y_files_list))

def get_loader(x_files_list, y_files_list, in_dim, out_dim, batch_size,
               shuffle, num_workers, collate_fn=None):
    """Build a DataLoader over paired input/target feature files.

    Args:
        x_files_list: list of input files with full path
        y_files_list: list of target files with full path
        in_dim: input dimension
        out_dim: output dimension
        batch_size: number of files (not frames) per mini-batch
        shuffle: whether to reshuffle the files every epoch
        num_workers: number of worker processes used for loading
        collate_fn: optional callable that merges a list of
            ``(x_data, y_data)`` samples into a mini-batch. When omitted,
            the DataLoader's default collation is used.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(x_data, y_data)``.

        Every sample is a pair of tensors shaped ``(n_frames, in_dim)`` and
        ``(n_frames, out_dim)``. The default collation stacks the samples
        along a new first axis, giving ``(batch_size, n_frames, in_dim)``
        and ``(batch_size, n_frames, out_dim)``. That only works when every
        file in the batch has the same ``n_frames``; otherwise stacking
        fails with "inconsistent tensor sizes". For files of varying
        length, pass a ``collate_fn`` that pads or concatenates the
        samples, or use ``batch_size=1``.
    """
    # Custom dataset
    dataset = LoadDataset(x_files_list=x_files_list,
                          y_files_list=y_files_list,
                          in_dim=in_dim,
                          out_dim=out_dim)

    loader_kwargs = {
        'dataset': dataset,
        'batch_size': batch_size,
        'shuffle': shuffle,
        'num_workers': num_workers,
    }
    # Forward collate_fn only when the caller supplied one, so the
    # DataLoader keeps its own default otherwise (older releases do not
    # accept None for this argument).
    if collate_fn is not None:
        loader_kwargs['collate_fn'] = collate_fn

    return torch.utils.data.DataLoader(**loader_kwargs)


if __name__ == "__main__":
    # Text files listing one feature-file path per line.
    x_files_list_file = '/work/shared/vocomp/x_files_list'
    y_files_list_file = '/work/shared/vocomp/y_files_list'
    in_dim = 335
    out_dim = 262

    with open(x_files_list_file, 'r') as fid:
        x_files_list = [line.strip() for line in fid]

    with open(y_files_list_file, 'r') as fid:
        y_files_list = [line.strip() for line in fid]

    # Keep only as many entries as both lists provide, so that every input
    # file has a target file (and vice versa), whichever list is longer.
    n_pairs = min(len(x_files_list), len(y_files_list))
    x_files_list = x_files_list[:n_pairs]
    y_files_list = y_files_list[:n_pairs]

    # NOTE: with the default collation, all files that land in one batch
    # must contain the same number of frames.
    data_loader = get_loader(x_files_list, y_files_list, in_dim, out_dim, 64, False, 1)
    for i, (x_data, y_data) in enumerate(data_loader):
        # Formatted string instead of the Python 2 print statement, so the
        # script runs unchanged on both Python 2.7 and Python 3.
        print("{} {} {}".format(i, x_data.size(), y_data.size()))

The output is:

  File "dataprovider_pytorch.py", line 80, in <module>
    for i, (x_data, y_data) in enumerate(data_loader):
  File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 174, in __next__
    return self._process_next_batch(batch)
  File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 198, in _process_next_batch
    raise batch.exc_type(batch.exc_msg)
RuntimeError: Traceback (most recent call last):
  File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 32, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 81, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 68, in default_collate
    return torch.stack(batch, 0)
  File "/l/SRC/python/lib/python2.7/site-packages/torch/functional.py", line 56, in stack
    return torch.cat(list(t.unsqueeze(dim) for t in sequence), dim)
RuntimeError: inconsistent tensor sizes at /data/users/soumith/builder/wheel/pytorch-src/torch/lib/TH/generic/THTensorMath.c:2548

smth · June 16, 2017, 12:59pm

As the error says:

RuntimeError: inconsistent tensor sizes at /data/users/soumith/builder/wheel/pytorch-src/torch/lib/TH/generic/THTensorMath.c:2548

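The tensors returned by your dataset have different sizes: each file yields a tensor of shape (number of frames, dim), and the number of frames differs from file to file. The default collate function therefore cannot stack them into a mini-batch. Either use batch_size=1, or pass the DataLoader a custom collate_fn that pads the samples to a common length (or concatenates them along the frame axis) before batching.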