How to modify my dataset code

I am trying to use a PyTorch `DataLoader` for training. The dataset is 150 GB and consists entirely of .npz files. Because memory is limited, only one sample is read from disk at a time. Part of the code is shown below.

    class VimeoDataset(Dataset):
        """Dataset over .npz files that are grouped into fixed-size chunks.

        The file list for the selected ``mode`` is split into consecutive chunks
        of ``batch_size * num_workers * num_gpus`` paths.  ``load_data(index)``
        reads one chunk into memory and ``getimg`` / ``__getitem__`` serve
        samples from the in-memory lists.

        NOTE(review): ``__len__`` counts chunks, while ``getimg`` indexes the
        samples of the currently loaded chunk.  A sampler built from this
        dataset therefore hands chunk indices to ``__getitem__``; confirm which
        of the two index spaces callers are meant to use.
        """

        def __init__(self, mode, batch_size=32, num_workers=8, num_gpus=4):
            """Build the chunked file list for ``mode``.

            Args:
                mode: ``'train'`` selects the training directory; any other
                    value selects the validation directory.
                batch_size: per-loader batch size used to size a chunk.
                num_workers: loader worker count used to size a chunk.
                num_gpus: GPU count used to size a chunk.
            """
            self.batch_size = batch_size
            self.num_workers = num_workers
            self.num_gpus = num_gpus
            self.mode = mode
            self.h = 256
            self.w = 448
            # Pixel-coordinate grid of shape (h, w, 2): [..., 0] is the column
            # index, [..., 1] is the row index.
            xx = np.arange(0, self.w).reshape(1, -1).repeat(self.h, 0)
            yy = np.arange(0, self.h).reshape(-1, 1).repeat(self.w, 1)
            self.grid = np.stack((xx, yy), 2).copy()

            # Number of files held in memory at once.
            count = self.batch_size * self.num_workers * self.num_gpus
            # The original had no ``else``: the validation list overwrote the
            # training list, and ``self.npzs`` was never set outside train mode.
            if self.mode == 'train':
                pattern = '/data/vimeoFlow2/dataset/train/*.npz'
            else:
                pattern = '/data/vimeoFlow2/dataset/val/*.npz'
            # glob() order is arbitrary; sort so chunk contents are reproducible.
            filelist = sorted(glob(pattern))
            self.npzs = [filelist[i:i + count] for i in range(0, len(filelist), count)]

            # In-memory buffers for the currently loaded chunk.
            self.meta_data = []
            self.flow_data = []

        def __len__(self):
            """Return the number of chunks (not the number of samples)."""
            return len(self.npzs)

        def load_data(self, index):
            """Replace the in-memory buffers with the contents of chunk ``index``.

            Args:
                index: position of the chunk in ``self.npzs``.
            """
            self.meta_data = []
            self.flow_data = []

            for path in self.npzs[index]:
                # NpzFile keeps the archive open until closed; use it as a
                # context manager so file handles are not leaked per sample.
                with np.load(path) as f:  # noqa: F841 - see note below
                    # NOTE(review): the posted snippet never appends anything
                    # to ``self.meta_data``; the array names stored in the
                    # .npz files are not shown, so the image array (indexed by
                    # ``getimg`` as 9 leading channels) must be appended here
                    # from ``f`` - TODO confirm the key names.
                    if self.mode == 'train':
                        self.flow_data.append(np.zeros((256, 448, 4)))

        def getimg(self, index):
            """Return ``(img0, gt, img1, flow_gt)`` for in-memory sample ``index``.

            The stored sample is sliced along its first axis into three
            3-channel groups, each moved to channels-last with
            ``transpose(1, 2, 0)``; the flow entry is transposed the same way.
            """
            data = self.meta_data[index]
            img0 = data[0:3].transpose(1, 2, 0)
            img1 = data[3:6].transpose(1, 2, 0)
            gt = data[6:9].transpose(1, 2, 0)
            flow_gt = (self.flow_data[index]).transpose(1, 2, 0)
            return img0, gt, img1, flow_gt

        def __getitem__(self, index):
            """Return the sample tuple produced by ``getimg(index)``.

            The original computed the arrays but returned ``None``.
            NOTE(review): any augmentation or tensor conversion that followed
            in the full code is not shown in the snippet.
            """
            img0, gt, img1, flow_gt = self.getimg(index)
            return img0, gt, img1, flow_gt
    # Training pipeline: chunked dataset -> distributed sampler -> loader.
    dataset = VimeoDataset(
        mode='train',
        batch_size=32,
        num_workers=8,
        num_gpus=4,
    )
    sampler = DistributedSampler(dataset)
    train_data = DataLoader(
        dataset,
        batch_size=args.batch_size,
        pin_memory=True,
        num_workers=args.num_workers,
        drop_last=True,
        sampler=sampler,
    )

    # Validation pipeline: same dataset settings, no sampler and no drop_last.
    dataset_val = VimeoDataset(
        mode='val',
        batch_size=32,
        num_workers=8,
        num_gpus=4,
    )
    val_data = DataLoader(
        dataset_val,
        batch_size=args.batch_size,
        pin_memory=True,
        num_workers=args.num_workers,
    )

However, reading samples from disk one at a time makes the dataloader very slow. I want to improve this: first load num_gpus × num_workers × batch_size samples into memory, then serve samples from memory in `__getitem__`, and finally replace the data in memory after each iteration. I still don't know how to achieve this. The code above is my attempt at the idea, but I don't know what arguments to pass to the `load_data` function.

