If I have images saved in many folders, is there a way to begin training without loading all the file paths first?


I have around a million images, spread across about 50 folders, that I want to train on. Do I have to load the filenames of every image first? Right now, my Dataset does an os.walk across all the directories and builds a list of all filenames in its init function, which takes several hours. Is there a way to start on one folder and keep loading image paths for training while simultaneously loading batches of images and training? My code is below.
Thank you!

import os

import torch
from skimage import io, transform
from torch.utils.data import Dataset
from tqdm import tqdm


class Custom_Dataset(Dataset):

    def __init__(self, transform=None, config=None):
        self.samples = []
        for crop_path in config.dataset_crops_path_list:
            for root, subdirs, filenames in os.walk(crop_path):
                file_name_list = [f for f in filenames]    #MAJOR TIME SINK
                for filename in file_name_list:
                    self.samples.append(os.path.join(root, filename))
        self.targets = []
        for ii in tqdm(self.samples):
            self.targets.append(os.path.basename(ii))
        self.transform = transform
        self.output_shape = config.input_sz

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        image_path = self.samples[idx]
        image = io.imread(image_path)
        image_label = self.targets[idx]

        # skimage.transform here, not the self.transform callable
        non_transformed_image = transform.resize(image.copy(), (self.output_shape, self.output_shape), preserve_range=True)

        if self.transform:
            image = self.transform(image)
            return image, image_label
        return non_transformed_image, image_label
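One way to avoid the up-front walk entirely is to stream paths lazily: a generator that yields each path as os.walk discovers it can be wrapped in a `torch.utils.data.IterableDataset` (whose `__iter__` simply returns the generator), so the DataLoader can start producing batches while later folders are still being scanned. A minimal, stdlib-only sketch of the streaming part (the demo folder and file names are made up for illustration):

```python
import os
import tempfile

def iter_image_paths(folder_list):
    """Lazily yield image paths one at a time, so a consumer
    (e.g. an IterableDataset's __iter__) can start consuming
    immediately instead of waiting for a full directory walk."""
    for crop_path in folder_list:
        for root, _subdirs, filenames in os.walk(crop_path):
            for filename in filenames:
                yield os.path.join(root, filename)

# Demo on a throwaway directory tree.
with tempfile.TemporaryDirectory() as tmp:
    sub = os.path.join(tmp, "folder_00")
    os.makedirs(sub)
    for name in ("a.jpg", "b.jpg"):
        open(os.path.join(sub, name), "w").close()

    paths = iter_image_paths([tmp])
    first = next(paths)  # available immediately; no full scan has happened yet
    print(os.path.basename(first))
```

Note that an IterableDataset built this way gives up random shuffling across the whole dataset (you only see paths in walk order), and with `num_workers > 0` each worker would need its own shard of the folders to avoid duplicates.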

This seems quite unexpected. I’ve just tested os.walk on a full ImageNet (~1000 folders) and it finishes in ~2s.

Wow. ImageNet is 14 million images, it seems. Perhaps it is the list comprehension that is slowing it down. Did you use a list comprehension for each folder and append it to a list like I did?

I used these exact lines, since your comment indicated they are the bottleneck:

for root, subdirs, filenames in os.walk(path):
    file_name_list = [f for f in filenames]    #MAJOR TIME SINK
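For what it's worth, `os.walk` already returns `filenames` as a plain list, so `[f for f in filenames]` is only a shallow copy; a quick timing sketch suggests that copy is milliseconds even at large scale, so the hours are more likely spent elsewhere (slow network storage, or per-file work in the other loops):

```python
import timeit

# os.walk already hands back filenames as a list, so the comprehension
# [f for f in filenames] just makes a shallow copy. Timing 10 copies of
# a 100k-element list shows this is negligible.
filenames = ["img_%06d.jpg" % i for i in range(100_000)]

copy_time = timeit.timeit(lambda: [f for f in filenames], number=10)
print(f"10 copies of 100k names: {copy_time:.3f} s")  # typically well under a second
```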