This is my first post and I desperately tried to solve this problem before posting. Since I couldn’t figure it out I decided to ask here. I have been coding in Python for about 1 year now and am new to PyTorch.
I have an unbalanced dataset of 15 classes that I need to split into train / validation / test datasets. Right now I have a working program where I create random indices and then pass this to the DataLoader
when I do the splitting.
The only problem is that when I want to incorporate sampling the minority classes more, I can’t pass my indices to WeightedRandomSampler
. So I got stuck here.
I then tried to use random_split
to create 3 new datasets, which can be passed into the DataLoader
, but when I do that (see below) it doesn’t seem to work either.
So is there a better way to split data into train / validation / test sets when I only have one main dataset? Then, how do I weight the minority classes in my training set so that they are used more often when training?
Below is my code so far.
Thanks!
class CustomDataset(Dataset):
    """Image-classification dataset described by a CSV file.

    The CSV is expected to contain a ``label`` column and an ``image_path``
    column. Items are read positionally: column 0 holds the label and
    column 1 holds the path of the image to open.

    Attributes:
        data: DataFrame loaded from the CSV file.
        transform: Optional callable applied to each opened image.
        labels: Unique label values, in order of first appearance.
        num_classes: Number of distinct labels.
        label_count: Number of distinct image paths per label.
        class_to_idx: Mapping from label value to its integer class index
            (the position of the label in ``labels``).
    """

    def __init__(self, csv_file, transform=None):
        """Load the CSV and precompute label bookkeeping.

        Args:
            csv_file: Path (or file-like object) readable by ``pd.read_csv``.
            transform: Optional callable applied to each image in
                ``__getitem__``.
        """
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.labels = self.data.label.unique()
        self.num_classes = len(self.labels)
        self.label_count = self.data.groupby('label')['image_path'].nunique()
        # Built once so __getitem__ does not rescan the label array per item.
        self.class_to_idx = {label: i for i, label in enumerate(self.labels)}

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, class_index)`` for the row at position ``idx``.

        Args:
            idx: Row position. Plain ints, numpy integers and 0-d tensors are
                accepted; wrappers such as ``Subset`` or samplers may pass
                any of these.

        Returns:
            A tuple of the (optionally transformed) image and the integer
            class index of its label.
        """
        # Normalise the index so pandas positional indexing always gets an int.
        idx = int(idx)
        img_path = self.data.iloc[idx, 1]
        image = Image.open(img_path)
        # Without a transform, hand back the opened image instead of failing
        # on an unbound variable.
        sample = self.transform(image) if self.transform is not None else image
        label = self.data.iloc[idx, 0]
        return (sample, self.class_to_idx[label])
## DATASET
file = 'data.csv'
image_size = 224

# Preprocessing applied to every image: resize to a square, replicate the
# grayscale channel three times, convert to a float tensor, then normalise
# each channel with the given mean / std.
data_transform = transforms.Compose([
    transforms.Resize([image_size, image_size]),
    transforms.Grayscale(num_output_channels=3),
    # PIL image / ndarray (H x W x C, 0-255) -> FloatTensor (C x H x W, 0.0-1.0)
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

dataset_custom = CustomDataset(file, transform=data_transform)

# Total number of examples available before splitting.
dataset_size = len(dataset_custom)
## SPLIT DATASET
# Fraction of the dataset assigned to each subset.
train_split = 0.7
validate_split = 0.25
test_split = 0.05

# Train and validation sizes are truncated to whole samples; the test set
# takes whatever is left so the three sizes always sum to dataset_size.
train_size = int(dataset_size * train_split)
validation_size = int(dataset_size * validate_split)
test_size = dataset_size - (train_size + validation_size)
########### CURRENTLY DOING THIS, WHICH WORKS ###########
# Shuffle all row positions once, then carve the shuffled list into three
# disjoint slices: [0, train_size), [train_size, temp), [temp, end).
indices = list(range(dataset_size))
np.random.shuffle(indices)
train_indices = indices[:train_size]
temp = int(train_size + validation_size)
val_indices = indices[train_size:temp]
test_indices = indices[temp:]

## DATA LOADER ##
batch_size = 16

# Each sampler draws (in random order, without replacement) only from its own
# slice of indices, so all three loaders can share the same dataset object.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
test_sampler = SubsetRandomSampler(test_indices)

# The loaders wrap `dataset_custom`, the dataset built above (the previous
# name `dataset_ifcb` is not defined in this file). `shuffle` is left at its
# default because ordering is delegated to the sampler.
train_loader = torch.utils.data.DataLoader(dataset=dataset_custom,
                                           batch_size=batch_size,
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset=dataset_custom,
                                                batch_size=batch_size,
                                                sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(dataset=dataset_custom,
                                          batch_size=batch_size,
                                          sampler=test_sampler)
########### END OF WHAT I'M CURRENTLY DOING ###########
########### WHAT I'M TRYING TO ACCOMPLISH ###########
# random_split returns three Subset objects; each keeps the positions it
# selected from the parent dataset in its `.indices` attribute.
dataset_train, dataset_valid, dataset_test = random_split(
    dataset_custom, [train_size, validation_size, test_size])

# WeightedRandomSampler expects one weight per sample (not a list of indices)
# and yields positions in range(len(weights)). Those positions index into
# `dataset_train`, so the weights are built in the same order as
# `dataset_train.indices`.
train_positions = [int(i) for i in dataset_train.indices]
# Column 0 is the label column that CustomDataset.__getitem__ reads.
train_labels = dataset_custom.data.iloc[train_positions, 0]
# Weight each sample by the inverse frequency of its class *within the train
# split*, so every class is drawn with roughly equal probability.
class_counts = train_labels.value_counts()
sample_weights = torch.as_tensor(
    (1.0 / train_labels.map(class_counts)).to_numpy(), dtype=torch.double)
# replacement=True lets minority-class samples be drawn more than once per
# epoch; num_samples keeps the epoch length equal to the train-split size.
train_sampler = torch.utils.data.WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True)

# NOTE(review): if iterating these loaders fails inside __getitem__, check the
# index type Subset passes down (possibly a tensor on older PyTorch releases)
# -- confirm against the installed version.
train_loader = torch.utils.data.DataLoader(dataset=dataset_train,
                                           batch_size=batch_size,
                                           sampler=train_sampler)
# Validation and test keep the natural class distribution: no resampling.
validation_loader = torch.utils.data.DataLoader(dataset=dataset_valid,
                                                batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(dataset=dataset_test,
                                          batch_size=batch_size)
########### END OF WHAT I'M TRYING TO ACCOMPLISH ###########
########### CONTINUE REGULAR CODE ###########
# Phase-keyed lookup tables: one for the loaders, one for the number of
# samples behind each loader.
dataloaders = dict(
    train=train_loader,
    val=validation_loader,
    test=test_loader,
)
dataset_sizes = dict(
    train=train_size,
    val=validation_size,
    test=test_size,
)

## MODEL
# ResNet-18 from torchvision, requested with pretrained weights.
model = torchvision.models.resnet18(pretrained=True)