H5py getitem dataloader is too slow

Hello,

Reading from .h5 files through a DataLoader is too slow. Below is the code I use to store data in HDF5 and read it back.

I have 4 folders, each containing 10 .h5 files. For a given dataset index I want to fetch the corresponding array from each folder's file: if pt-1.h5 is in folder 1 and another pt-1.h5 is in folder 2 and so on, then one record consists of the first index of pt-1.h5 from folder 1, the first index of pt-1.h5 from folder 2, and so on.

The size of each pt-{}.h5 dataset is [40000, 224, 224, 3].

The code runs without errors, but it is extremely slow: iterating over the DataLoader takes about 50 s/it with num_workers=4 and batch_size=64.

Can someone help me make this faster, either by refactoring the code or by storing the data in a different way?

PLEASE HELP!!!

import glob
from pathlib import Path

import h5py
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data.sampler import SubsetRandomSampler


def store_many_hdf5(dir_name, images, counter=1):
    """ Stores an array of images to HDF5.
        Parameters:
        ---------------
        dir_name    directory to store the h5 file in
        images      images array to be stored
    """

    # Create a new HDF5 file
    file = h5py.File(Path(dir_name) / f"pt-{counter}.h5", "w")

    # Create a dataset in the file
    dataset = file.create_dataset(
        "images", np.shape(images), h5py.h5t.STD_U8BE, data=images, compression='gzip'
    )

    file.close()
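
One thing I suspect is the combination of gzip compression and the default chunk layout: every per-image read has to decompress a much larger chunk. As a sketch (untested; the helper name store_many_hdf5_chunked and the per-image chunking are my assumptions), storing one image per chunk with lighter compression should make the random reads in __getitem__ much cheaper:

def store_many_hdf5_chunked(dir_name, images, counter=1):
    """ Variant of store_many_hdf5 with per-image chunking.
        Assumes images is a uint8 array of shape [N, 224, 224, 3].
    """
    with h5py.File(Path(dir_name) / f"pt-{counter}.h5", "w") as file:
        file.create_dataset(
            "images",
            data=images,
            dtype="uint8",
            chunks=(1, 224, 224, 3),   # one image per chunk -> cheap random reads
            compression="lzf",         # lighter than gzip; drop it entirely if disk space allows
        )
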
def read_many_hdf5(hdf5_file):
    """
    Reads images from HDF5.
    :param hdf5_file: h5 file name
    :returns: lazy h5py dataset handle (data is read on indexing)
    """
    # Open the HDF5 file; the handle is intentionally left open so the
    # returned dataset can be indexed lazily later on.
    file = h5py.File(hdf5_file, "r", swmr=True, libver='latest')

    images = file["/images"]

    return images
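
For what it's worth, read_many_hdf5 returns a lazy h5py dataset handle (the file stays open), so only the indexed slice is actually read from disk. A quick way to time a single read in isolation (the path below is just an example from my layout, not a real requirement):

import time

ds = read_many_hdf5("data/bg_fg_1/pt-1.h5")   # hypothetical path
start = time.time()
img = ds[0]                                   # reads one [224, 224, 3] slice
print(img.shape, f"{time.time() - start:.4f}s per read")
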
class H5Dataset(Dataset):
    def __init__(self, data_path="./data/", n=40000, transforms=None):
        """
        The files are stored in h5 format.
        Total Length of dataset is fixed to : 400k
        Records per h5 file except background : 40k
        Records in h5 file for background : 100
        """
        
        self.data_path = data_path
        self.transforms = transforms
            
        self.length = 400000
        self.batch_size = 40000
        self.bg_batch = 4000

    def _open_data(self):
        # Lazily open every h5 file, grouped by folder type and sorted by the
        # numeric counter in "pt-{counter}.h5".
        self.files = {'bg': [], 'bg_fg': [], 'bg_fg_mask': [], 'depth': []}

        for folder in glob.glob(self.data_path + "*"):
            a = glob.glob(folder + "/*")                                         # file paths
            b = [int(i.rsplit("/")[-1].split("-")[1].split(".")[0]) for i in a]  # numeric counters
            c = list(zip(a, b))
            d = sorted(c, key=lambda x: x[1])                                    # sort by counter
            e = [f for f, j in d]

            fol = '_'.join(folder.rsplit("/")[-1].split("_")[:-1])               # folder type, e.g. "bg_fg"
            self.files[fol] = [read_many_hdf5(f) for f in e]
    
    def __getitem__(self, index):

        # Open the h5 files on first access so that each DataLoader worker
        # gets its own file handles.
        if not hasattr(self, 'files'):
            self._open_data()

        # q: which h5 file, r: index within that file
        q, r = index // self.batch_size, index % self.batch_size
        bg_r = index // self.bg_batch

        bg_record = self.files["bg"][0][bg_r] / 255.0
        bg_fg_record = self.files["bg_fg"][q][r] / 255.0
        bg_fg_mask_record = self.files["bg_fg_mask"][q][r] / 255.0
        depth_record = self.files["depth"][q][r] / 255.0

        record = {
            'bg' : bg_record,
            'bg_fg' : bg_fg_record,
            'mask' : bg_fg_mask_record,
            'depth' : depth_record,
        }
        
        return record
        
    def __len__(self):
        return self.length
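
Something else I noticed while writing this up: dividing the uint8 arrays by 255.0 promotes them to float64, so each record is 8x the stored size before it even reaches collate. Here is a sketch of a __getitem__ variant (same index math; it assumes all four datasets are uint8 of shape [N, 224, 224, 3]) that returns float32 CHW tensors instead:

# Hypothetical drop-in replacement for H5Dataset.__getitem__
def __getitem__(self, index):
    if not hasattr(self, 'files'):
        self._open_data()

    q, r = index // self.batch_size, index % self.batch_size
    bg_r = index // self.bg_batch

    def to_tensor(arr):
        # uint8 HWC -> float32 CHW in [0, 1]; float32 is 4x smaller than
        # the float64 produced by arr / 255.0
        return torch.from_numpy(np.asarray(arr)).permute(2, 0, 1).float() / 255.0

    return {
        'bg': to_tensor(self.files["bg"][0][bg_r]),
        'bg_fg': to_tensor(self.files["bg_fg"][q][r]),
        'mask': to_tensor(self.files["bg_fg_mask"][q][r]),
        'depth': to_tensor(self.files["depth"][q][r]),
    }
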
class DataLoader(object):
    def __init__(self, config):
        self.config = config
        self.num_workers = self.config["num_workers"]
        self.pin_memory = self.config["pin_memory"]
        self.batch_size = self.config["batch_size"]
        
        self.kwargs = {'num_workers':self.num_workers, 'pin_memory':self.pin_memory} if self.config['use_cuda'] else {}

        ddata = DownloadData(self.config)
        validation_split = 0.2

        dataset_size = len(ddata.dataobj)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_split * dataset_size))
        
        # np.random.seed(42)
        # np.random.shuffle(indices)

        train_indices, val_indices = indices[split:], indices[:split]

        train_sampler = SubsetRandomSampler(train_indices)
        valid_sampler = SubsetRandomSampler(val_indices)

        self.train_loader = torch.utils.data.DataLoader(ddata.dataobj, batch_size=self.batch_size, 
                                                        sampler=train_sampler, **self.kwargs)

        self.valid_loader = torch.utils.data.DataLoader(ddata.dataobj, batch_size=self.batch_size,
                                                        sampler=valid_sampler, **self.kwargs)
    
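
For completeness, this is roughly how the wrapper gets used (a sketch: the config only shows the keys used in this class plus the values from my run, and it assumes DownloadData exposes the H5Dataset instance as dataobj, as in the code above):

config = {
    "num_workers": 4,
    "pin_memory": True,
    "batch_size": 64,
    "use_cuda": True,
}

loaders = DataLoader(config)
for batch in loaders.train_loader:
    bg, depth = batch["bg"], batch["depth"]   # each is a [64, 224, 224, 3] tensor
    break
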

Could you find a solution?