Hello,
I have this issue where i want to load images from a path that i have labeled in a CSV. What i want to do is to draw the 315 first entries in the CSV for the first batch, the next 315 entries for the second and so on (or a way to specify which entries i want in each batch). The first block of code gives an example of this where the data loaded is randomly for some reason even though i have shuffle=True
The second block has another implementation of this which gives me an implementation error and also takes a really long time since it’s loading everything into ram (> 600 000 images) How do I load data in the right way in a time efficient manner, can I use use cache to store the images in ram to imporve speed? The folder is 76 megabytes on disk so it shouldnt be a problem.
class MyDataset(Dataset):
def __init__(self, csv_file, root_dir, transform = None):
self.annotations = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, index):
img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 1])
im = Image.open(img_path, mode="r")
im.draft("F", (16, 19))
image = np.asarray(im)
y_label = torch.tensor(self.annotations.iloc[index, 3])
if self.transform:
image = self.transform(image)
return (image, y_label)
dataset = MyDataset(csv_file = csv_path,
root_dir=folder_path,
transform = transforms.ToTensor())
train_set, test_set = torch.utils.data.random_split(dataset, [int(0.8*len(dataset)), int(0.2*len(dataset)+1)])
train_loader = DataLoader(dataset=train_set, batch_size=315, shuffle=False, **kwargs)
test_loader = DataLoader(dataset=test_set, batch_size=315, shuffle=False, **kwargs)
class MyDataset(Dataset):
def __init__(self, csv_file, root_dir, transform = None):
self.annotations = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
data = []
for i in range(len(self.annotations)):
img_path = os.path.join(self.root_dir, self.annotations.iloc[i, 1])
im = Image.open(img_path, mode="r")
im.draft("F", (16, 19))
image = np.asarray(im)
y_label = torch.tensor(self.annotations.iloc[i, 3])
if self.transform:
image = self.transform(image)
data.append((image, y_label))
def __len__(self):
return len(self.annotations)
def __getitem__(self, index):
return self.data[index]
dataset = MyDataset(csv_file = csv_path,
root_dir=folder_path,
transform = transforms.ToTensor())
train_set, test_set = torch.utils.data.random_split(dataset, [int(0.8*len(dataset)), int(0.2*len(dataset)+1)])
sampler = torch.utils.data.sampler.BatchSampler(torch.utils.data.sampler.SequentialSampler(dataset), batch_size=315, drop_last=False)
train_loader2 = DataLoader(dataset=train_set, sampler=sampler, shuffle=False, **kwargs)
test_loader2 = DataLoader(dataset=test_set, sampler=sampler, shuffle=False, **kwargs)