My data loader is very slow. I have around 6700 .pt files of around 80 KB each, and when I use my custom dataset the data loader takes a long time: around 30 minutes per epoch. It is very quick if I use a limited dataset, e.g. 1000 .pt files. Increasing the number of workers allows for faster iteration, but it takes a long time to initialise each worker after each batch. As I work with graphs, I use the PyTorch Geometric data loader, but it should not be much different from the PyTorch loader. Should I save my graphs in another format instead of .pt files? I am not sure I understand why this is happening. Here is the code:
class CDataset(Dataset):
    """Dataset of graphs stored one per ``.pt`` file under ``root``.

    Every file found in ``<root>/raw`` is loaded with ``torch.load``,
    optionally passed through ``pre_filter`` and ``pre_transform``, and saved
    to the processed directory as ``data_<i>.pt``, where ``i`` is the position
    of the raw file in natural-sort order (``g2.pt`` before ``g10.pt``).

    The raw directory is listed and sorted only once per instance and the
    result is cached. ``len()`` is expected to be called by the base class on
    every length query and every indexing operation (this is how
    torch_geometric's ``Dataset`` behaves -- verify for your version), so
    re-listing and re-sorting thousands of files there makes each sample
    access O(N log N) and a full epoch roughly quadratic in the dataset size.
    The cache assumes the contents of ``<root>/raw`` do not change during the
    lifetime of the instance.

    Args:
        root: Directory containing the ``raw`` sub-directory.
        pre_filter: Optional callable applied to each loaded object before
            it is saved; its return value replaces the object.
        pre_transform: Optional callable applied after ``pre_filter``; its
            return value is what gets saved.
    """

    def __init__(self, root, pre_filter=None, pre_transform=None):
        # Pass the hooks by keyword. Assuming ``Dataset`` is
        # torch_geometric.data.Dataset, its positional order is
        # (root, transform, pre_transform, pre_filter), so passing
        # ``pre_filter`` second would silently register it as ``transform``
        # and leave ``self.pre_filter`` unset.
        super(CDataset, self).__init__(
            root, pre_transform=pre_transform, pre_filter=pre_filter)

    def atoi(self, text):
        """Return ``text`` as an int if it consists only of digits, else unchanged."""
        return int(text) if text.isdigit() else text

    def natural_keys(self, text):
        """Return a sort key that orders embedded digit runs numerically.

        ``text`` is split on runs of digits; digit runs become ints and the
        pieces in between stay strings, e.g. ``"g10.pt" -> ["g", 10, ".pt"]``.
        """
        return [self.atoi(piece) for piece in re.split(r'(\d+)', text)]

    def _sorted_raw_names(self):
        """Return the cached, naturally sorted listing of ``<root>/raw``.

        The directory is scanned on the first call only. The cache lives in
        the instance ``__dict__`` and is filled lazily, so it also works when
        this is reached from inside the base-class constructor.
        """
        names = self.__dict__.get('_raw_names_cache')
        if names is None:
            names = sorted(os.listdir(self.root + "/raw"), key=self.natural_keys)
            self.__dict__['_raw_names_cache'] = names
        return names

    @property
    def raw_file_names(self):
        """File names in ``<root>/raw`` in natural-sort order."""
        # Hand out a copy so callers cannot mutate the cached listing.
        return list(self._sorted_raw_names())

    @property
    def processed_file_names(self):
        """Names ``data_0.pt`` .. ``data_<n-1>.pt``, one per raw file."""
        # Generated in increasing index order, so no extra sort is needed.
        return ['data_{}.pt'.format(i) for i in range(self.len())]

    def download(self):
        """No-op: the raw files are expected to be present already."""
        pass

    def process(self):
        """Convert every raw file into ``data_<i>.pt`` in the processed directory."""
        for i, raw_path in enumerate(self.raw_paths):
            data = torch.load(raw_path)
            # NOTE(review): torch_geometric documents ``pre_filter`` as a
            # predicate returning a bool. This class has always applied it as
            # a Data -> Data mapping; that behavior is kept -- confirm the
            # intended contract before passing a boolean predicate.
            if self.pre_filter is not None:
                data = self.pre_filter(data)
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            torch.save(data, osp.join(self.processed_dir, 'data_{}.pt'.format(i)))

    def len(self):
        """Return the number of samples (one per raw file).

        Reads the cached listing instead of rebuilding the list of file
        names, so the call is O(1) after the first directory scan.
        """
        return len(self._sorted_raw_names())

    def get(self, idx):
        """Load and return the processed object stored as ``data_<idx>.pt``."""
        return torch.load(osp.join(self.processed_dir, 'data_{}.pt'.format(idx)))