Dataset dataloader

I have a dataset class as follows:

import pickle

from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms


class SomeDataset(Dataset):
    def __init__(self, usage='val', dataset_pickle_file='./some.pkl', skip_every_n_image=1):
        super(SomeDataset, self).__init__()

        self.to_tensor = transforms.ToTensor()

        # self.data_info[0] holds the image paths for the chosen split
        with open(dataset_pickle_file, 'rb') as file:
            self.data_info = pickle.load(file)[usage]
        # keep only every n-th position; the dataset length is the length of this index list
        self.idx = [i for i in range(0, len(self.data_info[0]), skip_every_n_image)]
        self.data_len = len(self.idx)

    def __getitem__(self, index):
        ### TODO

    def __len__(self):
        return self.data_len

I am confused about which of the options below would be correct for the __getitem__ function:

    def __getitem__(self, index):
        color_img = self.data_info[0][self.idx[index]]
        color_img = Image.open(color_img)
        color_tensor = self.to_tensor(color_img)
        output = {'image': color_tensor}
        return output

Or

    def __getitem__(self, index):
        color_img = self.data_info[0][index]
        color_img = Image.open(color_img)
        color_tensor = self.to_tensor(color_img)
        output = {'image': color_tensor}
        return output

Can you please also mention the logic behind it? Thanks!

Yes, the version using self.data_info[0][self.idx[index]] is the correct way to go.

The logic is that self.data_len equals len(self.idx), so the DataLoader will only ever pass indices in the range 0 .. self.data_len - 1. If you use self.data_info[0][index] directly, skip_every_n_image becomes useless: instead of taking every n-th image, you simply end up with the first self.data_len images in the list. When you use self.data_info[0][self.idx[index]], the lookup goes through self.idx, which holds the positions of every n-th image, so the dataset is subsampled the way the argument intends.
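
Here is a minimal sketch with made-up file names (not from your pickle) that shows the difference for skip_every_n_image=2:

    paths = ['img0.png', 'img1.png', 'img2.png', 'img3.png', 'img4.png', 'img5.png']
    skip_every_n_image = 2

    idx = [i for i in range(0, len(paths), skip_every_n_image)]  # [0, 2, 4]
    data_len = len(idx)                                          # 3

    # A DataLoader only ever requests indices 0 .. data_len - 1
    for index in range(data_len):
        print(paths[idx[index]], paths[index])
    # paths[idx[index]] -> img0.png, img2.png, img4.png  (every 2nd image, as intended)
    # paths[index]      -> img0.png, img1.png, img2.png  (just the first 3 images)

So with the direct indexing, the skip argument only truncates the dataset to its first data_len entries instead of subsampling it.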
