I have around a million images I want to train on, spread across around 50 folders. Do I have to load in all the filenames of every image? Right now, my Dataset is set up so that it does an os.walk across all the directories and retrieves a list of all filenames in its __init__ function. This takes several hours. Is there a way to start with one folder and begin collecting image paths for training while simultaneously loading a batch of images and starting training? Below is my code.
def __init__(self, transform=None, config=None):
    """Index every image file under the configured crop directories.

    Args:
        transform: optional callable applied to each loaded image in
            __getitem__; when set, the raw resized image is not produced.
        config: object providing ``dataset_crops_path_list`` (iterable of
            root directories to walk) and ``input_sz`` (output side length).

    NOTE(review): walking ~1M files up front is inherently slow. To start
    training before the full index exists, consider caching this file list
    to a manifest on disk, or switching to a torch IterableDataset that
    yields paths lazily per folder.
    """
    # Fixed: the mangled original lost the list initializers ("self.samples =").
    self.samples = []
    for crop_path in config.dataset_crops_path_list:
        for root, _subdirs, filenames in os.walk(crop_path):
            # os.walk already yields a plain list of names; the old
            # "[f for f in filenames]" copy (the flagged time sink)
            # added nothing but an extra O(n) pass per directory.
            self.samples.extend(os.path.join(root, f) for f in filenames)
    # The label is simply the bare filename of each sample.
    self.targets = [os.path.basename(p) for p in tqdm(self.samples)]
    self.transform = transform
    self.output_shape = config.input_sz

def __len__(self):
    """Return the number of indexed samples."""
    return len(self.samples)

def __getitem__(self, idx):
    """Load sample *idx* and return an ``(image, label)`` pair.

    When ``self.transform`` is set, returns the transformed image;
    otherwise returns the image resized to
    ``(output_shape, output_shape)`` with value range preserved.
    """
    if torch.is_tensor(idx):
        idx = idx.tolist()
    image_path = self.samples[idx]
    image = io.imread(image_path)
    image_label = self.targets[idx]
    if self.transform:
        # Fixed: skip the resize entirely when a transform is supplied —
        # the original computed (and discarded) the resized copy anyway.
        return self.transform(image), image_label
    # skimage.transform.resize returns a new array and does not mutate its
    # input, so the original image.copy() was an unnecessary allocation.
    non_transformed_image = transform.resize(
        image, (self.output_shape, self.output_shape), preserve_range=True
    )
    return non_transformed_image, image_label