PyTorch DataLoader very slow with HDF5 data

I have a very large dataset in HDF5 format which I cannot load into memory all at once. I'm using a custom torch Dataset; here's the code:

import os
import time

import h5py
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from utils import get_vocab_and_skipgrams

class CustomSkipGramDataset(Dataset):
    def __init__(self, filename, window_size, data_dir="training_data", data_exists=True):
        self.window_size = window_size
        self.filename = filename
        self.data_exists = data_exists
        self.vocab_path = os.path.join(data_dir, "vocab.npy")
        self.hdf5_path = os.path.join(data_dir, "skipgram.h5")
        
        if not data_exists:
            get_vocab_and_skipgrams(filename, data_dir)
        
        self.vocab = np.load(self.vocab_path, allow_pickle=True).tolist()
        self.vocab_size = len(self.vocab)
        # the open file handle is stored on the instance
        self.hf = h5py.File(self.hdf5_path, "r")
        self.dataset = self.hf["positive_skips"]
        
    def __len__(self):
        return self.dataset.shape[0]
    
    def __getitem__(self, index):
        # one single-row HDF5 read per sample
        x, y = self.dataset[index]
        return x, y

Now, when I load the data directly like this:

with h5py.File("./training_data/skipgram.h5") as hf:
    dataset = hf["positive_skips"]
    for a in range(1, 100):
        # one contiguous slice read per call instead of one read per sample
        print(torch.tensor(dataset[a:100 * a]))

it is indeed very fast compared to the custom torch Dataset, almost 100x faster. I know I'm doing something wrong.
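
For context, this is roughly how I'm consuming the dataset; the filename, batch size, and num_workers here are placeholders, not my exact settings:

dataset = CustomSkipGramDataset("corpus.txt", window_size=5)
loader = DataLoader(dataset, batch_size=256, num_workers=2)
for x, y in loader:
    pass  # every sample goes through __getitem__, i.e. one tiny HDF5 read each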

Random suggestion: have you tried disabling multiprocessing (i.e., num_workers=0 in the DataLoader)? I'm not sure how well the two interact.
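
For example, something like this (the constructor arguments and batch size are placeholders):

loader = DataLoader(
    CustomSkipGramDataset("corpus.txt", window_size=5),
    batch_size=256,
    num_workers=0,  # load in the main process; nothing is pickled to workers
)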

There are certainly some troubles with pickling the CustomSkipGramDataset instances under multiprocessing, likely because the open h5py.File handle stored in __init__ can't be pickled to worker processes.
To work with HDF5 in the spirit of the Dataset API, do the following:

  • Clear the __init__ method of any heavy computation, and don't store large objects (like the open file handle) in instance attributes.
  • Remove __getitem__ and implement __iter__ instead (i.e., switch to an IterableDataset).
  • Code your __iter__ method along these lines (a fuller, self-contained sketch follows the snippet):
def __iter__(self):
    # open the file lazily, inside the process that actually iterates
    with h5py.File(self.hdf5_path, "r") as hf:
        data = hf["positive_skips"]
        for x, y in data:
            yield x, y
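
Putting it together, here's a minimal IterableDataset sketch. The class name, chunk size, and worker-sharding logic are my own assumptions, not tested against your data:

import h5py
from torch.utils.data import DataLoader, IterableDataset, get_worker_info

class SkipGramIterableDataset(IterableDataset):
    def __init__(self, hdf5_path, chunk_size=4096):
        # keep __init__ cheap: only paths and small config, so the
        # instance pickles cleanly to worker processes
        self.hdf5_path = hdf5_path
        self.chunk_size = chunk_size

    def __iter__(self):
        # open the file inside the worker, after the fork/spawn
        with h5py.File(self.hdf5_path, "r") as hf:
            data = hf["positive_skips"]
            n = data.shape[0]
            # shard chunks across workers so each row is yielded exactly once
            info = get_worker_info()
            worker_id = info.id if info is not None else 0
            num_workers = info.num_workers if info is not None else 1
            step = num_workers * self.chunk_size
            for start in range(worker_id * self.chunk_size, n, step):
                chunk = data[start:start + self.chunk_size]  # one contiguous read
                for x, y in chunk:
                    yield x, y

loader = DataLoader(SkipGramIterableDataset("./training_data/skipgram.h5"),
                    batch_size=256, num_workers=2)

The key point is that each worker opens its own file handle and reads contiguous chunks, rather than issuing one single-row read per index, which is the same access pattern as your fast h5py loop.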