Hello all,
I have a very unbalanced dataset, which I tried to balance using the following code (first my Dataset class, then the script that uses it):
class myDataset(Dataset):
    """Image dataset driven by a CSV annotation table.

    The first ``length`` rows of ``csv_file`` are kept. Each row must have an
    ``image_id`` column (file name relative to ``root_dir``) and a column named
    by ``target`` holding the label.

    Args:
        csv_file: Path (or buffer) passed straight to ``pd.read_csv``.
        root_dir: Directory that the ``image_id`` values are joined onto.
        target: Name of the label column.
        length: Maximum number of annotation rows to keep.
        transform: Optional callable invoked as ``transform(image=array)``
            and expected to return a mapping with an ``"image"`` entry.
    """

    def __init__(self, csv_file, root_dir, target, length, transform=None):
        # Positional slice: keeps at most `length` rows, index stays 0..n-1.
        self.annotations = pd.read_csv(csv_file).iloc[:length, :]
        self.root_dir = root_dir
        self.transform = transform
        self.target = target
        self.length = length

    def __len__(self):
        """Return the number of annotation rows actually loaded."""
        return len(self.annotations)

    def __alltargets__(self):
        """Return the full label column (used by callers for stratification)."""
        return self.annotations.loc[:, self.target]

    def __getitem__(self, index):
        """Load one sample and return ``(image, y_label)`` as tensors.

        ``image`` is a float32 tensor with the last array axis moved to the
        front; ``y_label`` is an integer tensor built from the target column.
        """
        img_path = os.path.join(self.root_dir, self.annotations.loc[index, 'image_id'])

        # Decode inside a context manager so the underlying file handle is
        # released immediately instead of lingering until garbage collection
        # (matters with many images and several DataLoader workers).
        with Image.open(img_path) as img:
            image = np.array(img)

        if self.transform:
            image = self.transform(image=image)["image"]

        # The transpose needs a 3-D array; presumably the decoded layout is
        # (H, W, C) so this yields (C, H, W) — TODO confirm for grayscale files.
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)
        image = torch.tensor(image)

        y_label = torch.tensor(int(self.annotations.loc[index, str(self.target)]))
        return image, y_label
And then my code:
# Augmentation pipeline; Normalize is always applied, the rest are stochastic.
aug = al.Compose(
    [
        al.RandomResizedCrop(H, W, p=0.2),
        al.Resize(H, W),
        al.Transpose(p=0.2),
        al.HorizontalFlip(p=0.5),
        al.VerticalFlip(p=0.2),
        al.augmentations.Normalize(max_pixel_value=255.0, always_apply=True, p=1.0),
    ]
)

# A single dataset object backs both loaders, so both see the `aug` transform.
dataset = myDataset(
    csv_file=LABEL_PATH,
    root_dir=IMAGE_PATH,
    target='gender',
    length=LENGTH,
    transform=aug,
)

l = len(dataset)
y = dataset.__alltargets__()

# 80/20 split of the row indices, stratified on the target column.
train_idx, valid_idx = train_test_split(
    np.arange(l),
    test_size=0.2,
    shuffle=True,
    stratify=y,
)

# NOTE(review): these samplers only restrict each loader to its index subset;
# nothing here re-weights classes within a batch. Presumably
# torch.utils.data.WeightedRandomSampler is the tool for that — confirm.
train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
test_sampler = torch.utils.data.SubsetRandomSampler(valid_idx)

# Options common to both loaders; shuffle stays False because a sampler is given.
loader_options = dict(batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)
train_loader = DataLoader(dataset=dataset, sampler=train_sampler, **loader_options)
test_loader = DataLoader(dataset=dataset, sampler=test_sampler, **loader_options)
My question: my current code only stratifies the split, so the train and test sets each keep the same class proportions as the full dataset. How can I also make the mini-batches balanced?
Thank you very much,
Habib