Hello,

Reading from h5 files through a DataLoader is far too slow. Below I am pasting the code I use to store files to HDF5 and read them back.

I have 4 folders containing 10 h5 files each, and I want to fetch arrays from each folder's h5 file in parallel: if pt-1.h5 is in my folder 1 and another pt-1.h5 is in my folder 2, and so on, then my first record is the first index of pt-1.h5 from folder 1 together with the first index of pt-1.h5 from folder 2, and so on for the remaining folders.

Each pt-{}.h5 has shape [40000, 224, 224, 3].

The code runs without errors, but my concern is that it is painfully slow: about 50 s/it when reading from the DataLoader with num_workers=4 and batch_size=64.

Can someone help me make this faster, suggest a refactor, or tell me whether I should store the data in some other way entirely? Please help!
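To make the indexing concrete, here is the mapping I have in mind, as a small sketch based on the 40000-records-per-file figure above:

# Illustration of the flat-index -> (file, record) mapping used below
records_per_file = 40000
for index in (0, 1, 40000, 80001):
    file_no, record_no = index // records_per_file, index % records_per_file
    print(f"index {index} -> pt-{file_no + 1}.h5, record {record_no} (in every folder)")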
import glob
from pathlib import Path

import h5py
import numpy as np
import torch
from torch.utils.data import Dataset, SubsetRandomSampler


def store_many_hdf5(dir_name, images, counter=1):
    """ Stores an array of images to HDF5.
    Parameters:
    ---------------
    dir_name   directory to store the file in
    images     images array, to be stored
    counter    index used in the output file name
    """
    # Create a new HDF5 file in the target directory
    file = h5py.File(Path(dir_name) / f"pt-{counter}.h5", "w")
    # Create a dataset in the file (unsigned 8-bit big-endian, gzip-compressed)
    dataset = file.create_dataset(
        "images", np.shape(images), h5py.h5t.STD_U8BE, data=images, compression='gzip'
    )
    file.close()
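For reference, this is how the writer gets called; a minimal sketch with a tiny array (the real files hold 40000 images each), and the folder name is illustrative only:

# Minimal sketch: "./data/bg_fg_1" and the tiny array are illustrative only
imgs = np.random.randint(0, 256, size=(8, 224, 224, 3), dtype=np.uint8)
store_many_hdf5("./data/bg_fg_1", imgs, counter=1)  # writes ./data/bg_fg_1/pt-1.h5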
def read_many_hdf5(hdf5_file):
    """
    Opens an HDF5 file and returns its "images" dataset.
    :param hdf5_file: h5 file name
    :returns: h5py Dataset handle (data is read lazily on indexing)
    """
    # Open the HDF5 file; it is deliberately left open so the returned
    # dataset handle can be sliced lazily later
    file = h5py.File(hdf5_file, "r", swmr=True, libver='latest')
    images = file["/images"]
    return images
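The handle returned here is lazy: only slicing it touches the disk. A quick sketch (the file path is hypothetical):

# Hypothetical path; ds behaves like an array but reads from disk on slicing
ds = read_many_hdf5("./data/bg_fg_1/pt-1.h5")
first = ds[0]                 # decompresses and reads just this record
print(ds.shape, first.dtype)  # e.g. (40000, 224, 224, 3) uint8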
class H5Dataset(Dataset):
    def __init__(self, data_path="./data/", n=40000, transforms=None):
        """
        The files are stored in h5 format.
        Total length of the dataset is fixed at 400k.
        Records per h5 file (except background): 40k.
        Records in the h5 file for background: 100.
        """
        self.data_path = data_path
        self.transforms = transforms
        self.length = 400000
        self.batch_size = 40000  # records per (non-background) h5 file
        self.bg_batch = 4000     # consecutive indices sharing one of the 100 background records
    def _open_data(self):
        # Open every h5 file once, sorted numerically by the counter in "pt-{n}.h5"
        self.files = {'bg': [], 'bg_fg': [], 'bg_fg_mask': [], 'depth': []}
        for folder in glob.glob(self.data_path + "*"):
            paths = glob.glob(folder + "/*")
            counters = [int(p.rsplit("/")[-1].split("-")[1].split(".")[0]) for p in paths]
            ordered = [p for p, _ in sorted(zip(paths, counters), key=lambda x: x[1])]
            # Folder names carry a trailing "_<suffix>"; strip it to get the dict key
            key = '_'.join(folder.rsplit("/")[-1].split("_")[:-1])
            self.files[key] = [read_many_hdf5(p) for p in ordered]
    def __getitem__(self, index):
        # Open file handles lazily, once per DataLoader worker process
        if not hasattr(self, 'files'):
            self._open_data()
        # Map the flat index to (file number q, record r within that file)
        q, r = index // self.batch_size, index % self.batch_size
        bg_r = index // self.bg_batch
        bg_record = self.files["bg"][0][bg_r] / 255.0
        bg_fg_record = self.files["bg_fg"][q][r] / 255.0
        bg_fg_mask_record = self.files["bg_fg_mask"][q][r] / 255.0
        depth_record = self.files["depth"][q][r] / 255.0
        record = {
            'bg': bg_record,
            'bg_fg': bg_fg_record,
            'mask': bg_fg_mask_record,
            'depth': depth_record,
        }
        return record

    def __len__(self):
        return self.length
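To rule out the DataLoader machinery itself, this is how I can time a single raw fetch (a sketch using the default data path):

# Smoke test: fetch one record directly, bypassing the DataLoader
import time
dataset = H5Dataset(data_path="./data/")
start = time.time()
sample = dataset[0]
print({k: v.shape for k, v in sample.items()}, f"{time.time() - start:.2f}s")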
class DataLoader(object):
    # Note: this class shadows torch.utils.data.DataLoader, which is why the
    # torch one is referenced by its full path below
    def __init__(self, config):
        self.config = config
        self.num_workers = self.config["num_workers"]
        self.pin_memory = self.config["pin_memory"]
        self.batch_size = self.config["batch_size"]
        self.kwargs = {'num_workers': self.num_workers, 'pin_memory': self.pin_memory} if self.config['use_cuda'] else {}
        # DownloadData is defined elsewhere in my project and exposes .dataobj
        ddata = DownloadData(self.config)
        # 80/20 train/validation split over sequential (unshuffled) indices
        validation_split = 0.2
        dataset_size = len(ddata.dataobj)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_split * dataset_size))
        # np.random.seed(42)
        # np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]
        train_sampler = SubsetRandomSampler(train_indices)
        valid_sampler = SubsetRandomSampler(val_indices)
        self.train_loader = torch.utils.data.DataLoader(ddata.dataobj, batch_size=self.batch_size,
                                                        sampler=train_sampler, **self.kwargs)
        self.valid_loader = torch.utils.data.DataLoader(ddata.dataobj, batch_size=self.batch_size,
                                                        sampler=valid_sampler, **self.kwargs)
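And this is the config I drive the wrapper with; a sketch showing only the keys the wrapper reads above, with values matching the run I described:

# Sketch of the config dict; values match the slow run described above
config = {
    "num_workers": 4,
    "pin_memory": True,
    "batch_size": 64,
    "use_cuda": True,
}
loaders = DataLoader(config)              # my wrapper, not torch's
batch = next(iter(loaders.train_loader))  # one batch currently takes ~50 s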