Reducing the size of Pytorch datasets created with IamgeFolder in a Balanced fashion

samin_hamidi · February 1, 2022, 8:10am

I have a data_set module created from TinyImageNet and I want to reduce the size of the training set in a balanced fashion so that each class has 100 data points instead of 500. I tried using

torch.utils.data.Subset()

but it does not change the dataset size in my setting. This is my code:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import os.path
import torch.utils.data as data
from torchvision import transforms
from torchvision import datasets
from sklearn.model_selection import train_test_split


meta = {'rgb_mean':[0.485, 0.456, 0.406], 'rgb_std': [0.229, 0.224, 0.225]}



class TinyImageNet99199Base(data.Dataset):
    def __init__(self, inet):
        super(TinyImageNet99199Base, self).__init__()
        self._inet = inet

    def __getitem__(self, index):
        data, target = self._inet[index]

        example_dict = {
            "input1": data,
            "target1": target,
            "index": index,
        }

        return example_dict

    def __len__(self):
        return len(self._inet)


class TinyImageNet99199Train(TinyImageNet99199Base):
    def __init__(self, args, root):
        d = os.path.dirname(root)


        inet_0 = datasets.ImageFolder(
                        root,
                        transforms.Compose([

                            transforms.RandomHorizontalFlip(),
                            transforms.ToTensor(),
                            transforms.Normalize(mean=meta['rgb_mean'], std=meta['rgb_std']),
                        ])
                     )
        super(TinyImageNet99199Train, self).__init__(inet_0)

        
        X_train_indices, X_valid_indices, Y_train_indices, _ = train_test_split(range(len(inet_0)), inet_0.targets, stratify = inet_0.targets, 
        test_size=0.2, random_state=100, shuffle=True)
       
        inet = data.Subset(inet_0,  X_valid_indices)
      

class TinyImageNet99199Valid(TinyImageNet99199Base):
    def __init__(self, args, root):
        d = os.path.dirname(root)


        inet = datasets.ImageFolder(
        root,
        transforms.Compose([

            transforms.ToTensor(),
            transforms.Normalize(mean=meta['rgb_mean'], std=meta['rgb_std']),
        ]))
        super(TinyImageNet99199Valid, self).__init__(inet)

Any alternative solutions that I can try? intuitively this should work but maybe there is a bug here that I haven’t been able to find. Any help is very much appreciated!

ptrblck · February 1, 2022, 8:21am

In your example you are initializing TinyImageNet99199Base with uper(TinyImageNet99199Train, self).__init__(inet_0) before wrapping inet_0 into Subset so inet = data.Subset(inet_0, X_valid_indices) won’t be used at all.
Apply the Subset first and pass it to the base class.

samin_hamidi · February 1, 2022, 8:35am

thank you. It is solved!