I am trying to use a DataLoader for training. The dataset is 150 GB and consists entirely of .npz files. Due to memory limits, only one sample is read from disk at a time. The following is part of the code.
class VimeoDataset(Dataset):
    """Vimeo flow dataset that streams .npz samples through an in-memory chunk cache.

    The file list is split into chunks of ``batch_size * num_workers * num_gpus``
    paths.  When ``__getitem__`` asks for a sample whose chunk is not cached,
    the whole chunk is loaded into memory (replacing the previous one), so disk
    reads happen once per chunk instead of once per sample.

    Parameters
    ----------
    mode : str
        ``'train'`` or ``'val'``; selects the directory and whether ground-truth
        flow is loaded.
    batch_size, num_workers, num_gpus : int
        Together they define the chunk size kept in memory.  They should match
        the DataLoader configuration, otherwise chunk boundaries and batch
        boundaries drift apart.
    """

    def __init__(self, mode, batch_size=32, num_workers=8, num_gpus=4):
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.num_gpus = num_gpus
        self.mode = mode
        self.h = 256
        self.w = 448
        # Pixel-coordinate grid (w then h), kept for flow warping downstream.
        xx = np.arange(0, self.w).reshape(1, -1).repeat(self.h, 0)
        yy = np.arange(0, self.h).reshape(-1, 1).repeat(self.w, 1)
        self.grid = np.stack((xx, yy), 2).copy()
        # One chunk = everything the GPUs/workers consume per iteration.
        self.chunk_size = self.batch_size * self.num_workers * self.num_gpus
        subdir = 'train' if self.mode == 'train' else 'val'
        # sorted() makes chunk membership deterministic across processes.
        self.filelist = sorted(glob('/data/vimeoFlow2/dataset/%s/*.npz' % subdir))
        self.npzs = [self.filelist[i:i + self.chunk_size]
                     for i in range(0, len(self.filelist), self.chunk_size)]
        # Chunk cache state; -1 means "nothing loaded yet".  The original code
        # called load_data() here without its index argument, which raised a
        # TypeError and ran before self.npzs existed — loading is now lazy.
        self.cached_chunk = -1
        self.data = []
        self.flow_data = []

    def __len__(self):
        # Must be the number of SAMPLES (what the DataLoader indexes),
        # not the number of chunks.
        return len(self.filelist)

    def load_data(self, chunk_index):
        """Replace the in-memory cache with chunk ``chunk_index``."""
        self.data = []
        self.flow_data = []
        for path in self.npzs[chunk_index]:
            f = np.load(path)
            self.data.append(f['i0i1gt'])
            if self.mode == 'train':
                # assumes ft0ft1 is channel-first (4, h, w) like i0i1gt — TODO confirm
                self.flow_data.append(f['ft0ft1'])
            else:
                # Channel-first placeholder so the (1, 2, 0) transpose in
                # getimg yields (h, w, 4), matching the train branch.  The
                # original used (256, 448, 4), which transposed to (448, 4, 256).
                self.flow_data.append(np.zeros((4, self.h, self.w)))
        self.cached_chunk = chunk_index

    def getimg(self, index):
        """Return (img0, gt, img1, flow_gt) for global sample ``index``, loading its chunk if needed."""
        chunk_index, offset = divmod(index, self.chunk_size)
        if chunk_index != self.cached_chunk:
            self.load_data(chunk_index)
        # Original read self.meta_data (never assigned) and indexed with the
        # global index; the cache only holds one chunk, so use the offset.
        data = self.data[offset]
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = self.flow_data[offset].transpose(1, 2, 0)
        return img0, gt, img1, flow_gt

    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        ...
        ...
# Build the datasets and loaders.  The dataset's chunk size is derived from
# batch_size/num_workers, so both MUST come from the same args the DataLoader
# uses — the original hard-coded 32/8 here while the loaders used args.*,
# letting the in-memory chunk drift out of sync with the loader configuration.
# NOTE(review): num_gpus=4 is still hard-coded; take it from the launcher
# (e.g. world size) if the GPU count varies.
dataset = VimeoDataset(mode='train', batch_size=args.batch_size,
                       num_workers=args.num_workers, num_gpus=4)
sampler = DistributedSampler(dataset)
train_data = DataLoader(dataset, batch_size=args.batch_size, pin_memory=True,
                        num_workers=args.num_workers, drop_last=True,
                        sampler=sampler)
dataset_val = VimeoDataset(mode='val', batch_size=args.batch_size,
                           num_workers=args.num_workers, num_gpus=4)
val_data = DataLoader(dataset_val, batch_size=args.batch_size, pin_memory=True,
                      num_workers=args.num_workers)
However, reading the data from disk one file at a time makes the DataLoader very slow. So I want to improve this program: first load num_gpus × num_workers × batch_size samples into memory, then serve samples from memory in __getitem__, and finally replace the cached data after each iteration. But I still don't know how to achieve this. I have sketched my idea in the code above, but I don't know how to supply the arguments to the load_data function.