I have a very imbalanced dataset that contains 10k samples for the minority class and 1 million samples for the majority class (binary classification). What I want to do is to divide all minority samples into mini-batches for one epoch equally, without over-sampling them (I have already obtained results with 10k oversampled minority samples).
I tried DataLoader with WeightedRandomSampler, but it creates mini-batches with an equal number of samples for each class (e.g. 500 minority and 500 majority samples for a batch size of 1000). However, I do not want to oversample the minority class; I want to create batches with, say, 100 minority samples and 900 majority samples, without oversampling.
As nobody replied to my question, I wrote my own sampler class from scratch. I hope it helps anyone who needs it.
class ProportionalTwoClassesBatchSampler:
    """Yield mini-batches with a fixed minority/majority class ratio.

    Each batch contains exactly ``minority_size_in_batch`` minority-class
    samples and ``batch_size - minority_size_in_batch`` majority-class
    samples, drawn without replacement from each class pool. The minority
    class is never oversampled within one sweep of its pool; if
    ``majority_priority`` is True the minority pool is refilled once
    exhausted so that all majority samples are eventually used.

    Args:
        dataset: Object exposing torch tensors ``x`` (features) and ``y``
            (labels); assumed to contain exactly two classes.
        batch_size: Total number of samples per mini-batch.
        minority_size_in_batch: Number of minority-class samples per batch.
        majority_priority: If True, iteration covers all majority samples,
            reloading the minority pool as needed. If False, iteration
            stops once the minority pool is exhausted.

    Yields:
        Tuples ``(x_batch, y_batch)`` of shuffled tensors of length
        ``batch_size``.

    Raises:
        ValueError: If ``minority_size_in_batch`` exceeds ``batch_size``,
            or exceeds the total number of minority samples.
    """

    def __init__(self, dataset, batch_size, minority_size_in_batch, majority_priority=True):
        self.labels = list(set(dataset.y.numpy()))
        self.minority_size = minority_size_in_batch
        self.batch = batch_size
        self.data = dataset
        self.priority = majority_priority

    def __iter__(self):
        if self.minority_size > self.batch:
            raise ValueError("Number of minority samples in a batch must be lower than batch size!")
        # Per-class index arrays, sorted by count so indices[0] is the minority class.
        y_indices = [np.where(self.data.y.numpy() == label)[0] for label in self.labels]
        self.indices = sorted(y_indices, key=lambda idx: idx.shape[0])
        if self.indices[0].shape[0] < self.minority_size:
            # Fail fast with a clear message; otherwise np.random.choice with
            # replace=False would raise an opaque error mid-iteration.
            raise ValueError("Number of minority samples in a batch must not exceed the total number of minority samples!")
        minority_pool_copy = self.indices[0].copy()
        # Stop when the majority pool can no longer fill a whole batch.
        while len(self.indices[1]) > self.batch - self.minority_size:
            if len(self.indices[0]) < self.minority_size:
                if self.priority:
                    # Reload the minority pool so remaining majority samples are used.
                    self.indices[0] = minority_pool_copy.copy()
                else:
                    break
            minority = np.random.choice(self.indices[0],
                                        size=self.minority_size,
                                        replace=False)
            majority = np.random.choice(self.indices[1],
                                        size=(self.batch - self.minority_size),
                                        replace=False)
            # Shuffle so minority samples are not grouped at the batch front.
            batch_inds = np.random.permutation(np.concatenate((minority, majority), axis=0))
            # Remove the drawn samples so they are not reused this pass.
            self.indices[0] = np.setdiff1d(self.indices[0], minority)
            self.indices[1] = np.setdiff1d(self.indices[1], majority)
            yield self.data.x[batch_inds], self.data.y[batch_inds]
# Example usage. NOTE(review): `Data`, `tr_in`, and `tr_out` are defined
# elsewhere — presumably a Dataset wrapper over training inputs/outputs
# exposing `.x` and `.y` torch tensors, as the sampler requires; confirm.
data_tr = Data(tr_in, tr_out)
# Batches of 512 samples, 50 of which come from the minority class.
sampler = ProportionalTwoClassesBatchSampler(data_tr, 512, 50)
ind = 0
# Commented-out sanity check: counts batches and prints per-batch label
# frequencies (requires pandas as `pd`).
# for x,y in sampler:
# #dfy = pd.DataFrame(data=y.numpy())
# #print(dfy.value_counts())
# ind += 1
# print(ind)