I have a very large dataset (>2 TB raw) consisting of over 20 million images (44 x 44 with 10 channels). At the moment I convert my raw data to HDF5 with LZF compression. I then use a PyTorch DataLoader when training my model, but after profiling I see that the data loader takes 99.8% of the time during one epoch of training (one epoch currently takes about 15 hours on an A100).
Is there something I am doing wrong? Can I construct my DataLoader/Dataset in a better way?
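For reference, a minimal way to reproduce the data-vs-compute split I'm describing looks like this (a simplified sketch using the train_loader defined below; the forward/backward pass is elided):

import time

data_time, compute_time = 0.0, 0.0
t0 = time.perf_counter()
for images, is_edge, category in train_loader:
    t1 = time.perf_counter()
    data_time += t1 - t0                     # time spent waiting on the DataLoader
    # ... forward/backward pass and optimizer step would go here ...
    compute_time += time.perf_counter() - t1
    t0 = time.perf_counter()
print(f"data: {data_time:.1f} s, compute: {compute_time:.1f} s")

With this kind of measurement, essentially all of the wall-clock time ends up in data_time.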
import h5py
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import v2


class PionDataset(Dataset):
    def __init__(self, hdf5_file, transform=None):
        self.hdf5_file = hdf5_file
        self.file = h5py.File(self.hdf5_file, 'r')
        self.dataset_length = self.file['EnergyLKr'].shape[0]
        self.labels = self.file['labels'][:]
        # Optional: apply transforms
        self.transform = transform

    def __len__(self):
        return self.dataset_length

    def get_target_frequencies(self):
        # Use the labels preloaded in __init__ rather than re-reading them from disk
        return torch.bincount(torch.tensor(self.labels, dtype=torch.long))

    def __getitem__(self, idx):
        momentum = np.float32(self.file['momentum'][idx])  # MeV by default
        is_edge = torch.tensor(self.file['IsEdgeCluster'][idx], dtype=torch.float32)
        category = torch.tensor(self.file['labels'][idx], dtype=torch.long)
        # Read the image from the branch (44 x 44 x 10)
        image = torch.tensor(self.file['EnergyLKr'][idx], dtype=torch.float32) / momentum
        if self.transform:
            image = self.transform(image)
        # Return the image and labels as a tuple
        return image, is_edge, category

    def close(self):
        self.file.close()
transform = transforms.Compose([
    v2.RandomVerticalFlip(p=0.5)
])

# Create the dataset
hdf5_file = "/path/to/my/train.hdf5"
dataset = PionDataset(hdf5_file, transform=transform)

train_loader = DataLoader(
    dataset,
    batch_size=1024,
    shuffle=True,
    num_workers=2,
    prefetch_factor=1,
    pin_memory=True,
    persistent_workers=True
)

# ...do model training
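For context, the HDF5 file contains one dataset per key read above. The conversion script writes it roughly like this (a simplified sketch; n_images stands in for the ~20 million image count, and the dtypes are assumptions based on the reader code):

with h5py.File("/path/to/my/train.hdf5", "w") as f:
    # Images are stored as one (44, 44, 10) block per event, LZF-compressed
    f.create_dataset("EnergyLKr", shape=(n_images, 44, 44, 10),
                     dtype="float32", compression="lzf")
    # Per-event scalars
    f.create_dataset("momentum", shape=(n_images,), dtype="float32")
    f.create_dataset("IsEdgeCluster", shape=(n_images,), dtype="float32")
    f.create_dataset("labels", shape=(n_images,), dtype="int64")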
I noticed that using num_workers > 0 slows the data loader down even more! Even doing something as simple as this:
batch = next(iter(train_loader))
takes more than 20 seconds.
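(Timed with a plain wall-clock measurement around the call, along these lines:)

import time

t0 = time.perf_counter()
batch = next(iter(train_loader))
print(f"first batch: {time.perf_counter() - t0:.1f} s")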