I wrote a custom dataloader by following the examples in pytorch. This is the code I have written
import torch
import torch.utils.data
import numpy as np
# reads the binary file and return the data in ascii format
def _read_binary_file(fname, dim):
with open(fname, 'rb') as fid:
data = np.fromfile(fid, dtype=np.float32)
assert data.shape[0] % dim == 0.0
data = data.reshape(-1, dim)
return data, data.shape[0]
class LoadDataset(torch.utils.data.Dataset):
"""
Custom dataset compatible with torch.utils.data.DataLoader
"""
def __init__(self, x_files_list, y_files_list, in_dim, out_dim):
"""Set the path for data
Args:
x_files_list: list of input files with full path
y_files_list: list of target files with full path
x_dim: input dimension
y_dim: output dimension
"""
self.x_files_list = x_files_list
self.y_files_list = y_files_list
self.in_dim = in_dim
self.out_dim = out_dim
def __getitem__(self, index):
"""Returns one data pair (x_data, y_data)."""
x_file = self.x_files_list[index]
y_file = self.y_files_list[index]
x_data, no_frames_x = _read_binary_file(x_file, self.in_dim)
y_data, no_frames_y = _read_binary_file(y_file, self.out_dim)
assert (no_frames_x == no_frames_y)
return (torch.Tensor(x_data), torch.Tensor(y_data))
def __len__(self):
return len(self.x_files_list)
def get_loader(x_files_list, y_files_list, in_dim, out_dim, batch_size,
shuffle, num_workers):
# Custom dataset
data = LoadDataset(x_files_list=x_files_list,
y_files_list=y_files_list,
in_dim=in_dim,
out_dim=out_dim)
# Data loader
# This will return (x_data, y_data) for every iteration
# x_data: tensor of shape (batch_size, in_dim)
# y_data: tensor of shape (batch_size, out_dim)
data_loader = torch.utils.data.DataLoader(dataset=data,
batch_size=batch_size,
shuffle=shuffle,
num_workers=num_workers)
return data_loader
if __name__ == "__main__":
x_files_list_file = '/work/shared/vocomp/x_files_list'
y_files_list_file = '/work/shared/vocomp/y_files_list'
in_dim = 335
out_dim = 262
with open(x_files_list_file, 'r') as fid:
x_files_list = [l.strip() for l in fid.readlines()]
with open(y_files_list_file, 'r') as fid:
y_files_list = [l.strip() for l in fid.readlines()]
x_files_list = x_files_list[0:len(y_files_list)]
data_loader = get_loader(x_files_list, y_files_list, in_dim, out_dim, 64, False, 1)
for i, (x_data, y_data) in enumerate(data_loader):
print i, x_data.size(), y_data.size()
Output is
File "dataprovider_pytorch.py", line 80, in <module>
for i, (x_data, y_data) in enumerate(data_loader):
File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 174, in __next__
return self._process_next_batch(batch)
File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 198, in _process_next_batch
raise batch.exc_type(batch.exc_msg)
RuntimeError: Traceback (most recent call last):
File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 32, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 81, in default_collate
return [default_collate(samples) for samples in transposed]
File "/l/SRC/python/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 68, in default_collate
return torch.stack(batch, 0)
File "/l/SRC/python/lib/python2.7/site-packages/torch/functional.py", line 56, in stack
return torch.cat(list(t.unsqueeze(dim) for t in sequence), dim)
RuntimeError: inconsistent tensor sizes at /data/users/soumith/builder/wheel/pytorch-src/torch/lib/TH/generic/THTensorMath.c:2548