I have a data_set module created from TinyImageNet, and I want to reduce the size of the training set in a balanced fashion, so that each class has 100 data points instead of 500. I tried using
torch.utils.data.Subset()
but it does not change the dataset size in my setting (a standalone sanity check of Subset, shown right after the code, behaves as I would expect). This is my code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import os.path
import torch.utils.data as data
from torchvision import transforms
from torchvision import datasets
from sklearn.model_selection import train_test_split
meta = {'rgb_mean':[0.485, 0.456, 0.406], 'rgb_std': [0.229, 0.224, 0.225]}
class TinyImageNet99199Base(data.Dataset):
    def __init__(self, inet):
        super(TinyImageNet99199Base, self).__init__()
        self._inet = inet

    def __getitem__(self, index):
        # wrap each example in a dict so the index travels with the sample
        data, target = self._inet[index]
        example_dict = {
            "input1": data,
            "target1": target,
            "index": index,
        }
        return example_dict

    def __len__(self):
        return len(self._inet)
class TinyImageNet99199Train(TinyImageNet99199Base):
    def __init__(self, args, root):
        d = os.path.dirname(root)
        inet_0 = datasets.ImageFolder(
            root,
            transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean=meta['rgb_mean'], std=meta['rgb_std']),
            ])
        )
        super(TinyImageNet99199Train, self).__init__(inet_0)
        # stratified 80/20 split: with 500 images per class, the 20% side
        # should contain exactly 100 indices per class
        X_train_indices, X_valid_indices, Y_train_indices, _ = train_test_split(
            range(len(inet_0)), inet_0.targets, stratify=inet_0.targets,
            test_size=0.2, random_state=100, shuffle=True)
        # keep only the 100-per-class validation indices
        inet = data.Subset(inet_0, X_valid_indices)
class TinyImageNet99199Valid(TinyImageNet99199Base):
    def __init__(self, args, root):
        d = os.path.dirname(root)
        inet = datasets.ImageFolder(
            root,
            transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=meta['rgb_mean'], std=meta['rgb_std']),
            ]))
        super(TinyImageNet99199Valid, self).__init__(inet)
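
Here is that standalone sanity check (the TensorDataset shapes are just placeholders I made up): Subset does reduce the reported length in isolation, so I suspect the problem is in how I wire it into the classes above.

import torch
import torch.utils.data as data

dummy = data.TensorDataset(torch.randn(500, 3, 64, 64),
                           torch.zeros(500, dtype=torch.long))
subset = data.Subset(dummy, list(range(100)))  # keep the first 100 indices
print(len(dummy), len(subset))  # prints: 500 100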
Intuitively this should work, but maybe there is a bug here that I haven't been able to find. Are there any alternative solutions that I can try? One idea I have been considering is to build the per-class indices by hand instead of going through sklearn (rough sketch below). Any help is very much appreciated!
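
This is only a sketch under my own assumptions: balanced_indices is a helper name I made up, and it relies on inet_0.targets being a plain list of integer class labels (which is what ImageFolder exposes, as far as I understand).

import random
from collections import defaultdict

def balanced_indices(targets, per_class=100, seed=100):
    # group dataset indices by class label
    by_class = defaultdict(list)
    for idx, label in enumerate(targets):
        by_class[label].append(idx)
    # draw `per_class` indices from every class
    rng = random.Random(seed)
    picked = []
    for idxs in by_class.values():
        rng.shuffle(idxs)
        picked.extend(idxs[:per_class])
    return picked

# intended usage inside __init__:
# inet = data.Subset(inet_0, balanced_indices(inet_0.targets))

Since this still ends in a data.Subset(...) call, I suspect it would run into the same problem in my class, so pointers on the wiring would help just as much as an alternative API.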