How to handle imbalanced classes

I guess you’re wrong!
Defining:

sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
train_dataset = torch.utils.data.TensorDataset(data, target)
train_loader = DataLoader(
    train_dataset, batch_size=bs, num_workers=1, sampler=sampler)

without replacement=True (i.e. replacement=False), it acts the same as:

train_loader = DataLoader(
    train_dataset, batch_size=bs, num_workers=1, shuffle=True)

and I cannot figure out how you actually handled the unbalanced dataset?!

No, a weighted sampler without replacement will not act as a random sampler which just shuffles.
It will still use the weights to draw the samples and will thus select samples with a higher weight earlier. However, it will not be able to “oversample” since it cannot re-draw the same sample if replacement=False is used.

I have posted several code snippets in this thread and have linked to other threads which give you minimal, executable code. E.g. you can run this one and you will see that each batch will be balanced when replacement=True. I don’t know where I claimed otherwise.

I also see that you’ve actually responded to the post sharing the code snippet, so did you actually run it?

(I’m @dieas93 with a different account.)
Here is the output of your code with replacement=False:

target train 0/1: 900/100
batch index 0, 0/1: 56/44
batch index 1, 0/1: 68/32
batch index 2, 0/1: 83/17
batch index 3, 0/1: 95/5
batch index 4, 0/1: 98/2
batch index 5, 0/1: 100/0
batch index 6, 0/1: 100/0
batch index 7, 0/1: 100/0
batch index 8, 0/1: 100/0
batch index 9, 0/1: 100/0

and with replacement=True:

target train 0/1: 900/100
batch index 0, 0/1: 47/53
batch index 1, 0/1: 47/53
batch index 2, 0/1: 53/47
batch index 3, 0/1: 51/49
batch index 4, 0/1: 44/56
batch index 5, 0/1: 59/41
batch index 6, 0/1: 50/50
batch index 7, 0/1: 49/51
batch index 8, 0/1: 45/55
batch index 9, 0/1: 47/53

With no replacement, as you said, the network sees the samples with higher weight earlier, but does it really affect accuracy? The model sees all samples anyway, which means the samples with lower weight will have more impact on accuracy (as we can see from the outputs).
Even with replacement there is no guarantee that all samples come into play! From a mathematical viewpoint, more epochs are needed to make sure the model sees all samples at least once.
I think to be 100% sure that the data is balanced in each batch one should define a custom BatchSampler, and of course there are no good docs on how to define one!
Correct me if I’m wrong.

I would guess the order of samples will have an effect on the accuracy, but it’s not creating balanced batches via oversampling and I haven’t seen results on using this approach to tackle an imbalanced dataset. The standard approach to create balanced batches is to use a weighted sampler with replacement=True, as used in my code snippets.

That is correct and is a shortcoming of the replacement strategy, or of oversampling minority classes in general. However, in practice it can still be useful to counter overfitting to the majority class(es), even if more epochs might be needed.

Sure, manually specifying the (balanced) batch indices is also a valid approach. I don’t know if any advantage would be expected, so please share your results in case you compare the custom sampler approach to the standard WeightedRandomSampler balancing approach.


If I find good docs on how to implement such a sampler I will definitely try that as well :crossed_fingers:. I have a strong feeling that the custom sampler approach yields better results, since you can freely define batches containing samples from all categories with the same frequency and even use all samples in an epoch.
By the way, thanks for the immediate response :+1:

That sounds like a plan and I would be interested in the results to see if the “randomness” of a WeightedRandomSampler would help or if a defined balancing via a custom sampler would yield better results.

I think the best resources are the already implemented samplers from here.
E.g. take a look at the RandomSampler. You would derive your custom sampler from the Sampler base class and implement the __init__, __iter__ and __len__ methods.
In the __init__ you could already create the indices using a custom strategy or just store some arguments (e.g. the generator to seed your code etc.). The __iter__ should yield the indices and the __len__ would return the number of samples.

Also, check BatchSampler which can yield a batch of indices to the Dataset.__getitem__ and might fit your use case better.
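
For illustration, here is a minimal sketch (my own example, not taken from the linked implementations) of a batch-level sampler that derives from Sampler, implements __init__, __iter__ and __len__, and yields complete balanced batches of indices; the names BalancedBatchSampler, targets, num_classes and samples_per_class are assumptions:

import torch
from torch.utils.data import Sampler

class BalancedBatchSampler(Sampler):
    def __init__(self, targets, num_classes, samples_per_class):
        # targets: 1D tensor of class labels for the whole dataset
        self.class_indices = [torch.where(targets == c)[0] for c in range(num_classes)]
        self.samples_per_class = samples_per_class
        # the number of batches per epoch is limited by the smallest class
        self.num_batches = min(len(idx) for idx in self.class_indices) // samples_per_class

    def __iter__(self):
        # reshuffle the per-class indices at the start of every epoch
        shuffled = [idx[torch.randperm(len(idx))] for idx in self.class_indices]
        for b in range(self.num_batches):
            batch = []
            start = b * self.samples_per_class
            for idx in shuffled:
                batch.extend(idx[start:start + self.samples_per_class].tolist())
            yield batch  # one balanced batch of indices

    def __len__(self):
        return self.num_batches

# hypothetical usage: the batch_sampler defines the complete batches,
# so batch_size/shuffle are not passed to the DataLoader
# loader = DataLoader(train_dataset, batch_sampler=BalancedBatchSampler(target, 2, 50))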


Check this out:

from itertools import cycle, zip_longest
from random import shuffle

from torch.utils.data import Sampler

class BalancedSampler(Sampler):

    def __init__(self, dataset):
        # collect the dataset indices for each class label
        class_idxs = {}
        for idx, item in enumerate(dataset):
            label = int(item[1])
            if label not in class_idxs:
                class_idxs[label] = [idx]
            else:
                class_idxs[label] += [idx]

        # shuffle the indices within each class
        for key in class_idxs.keys():
            shuffle(class_idxs[key])

        # interleave the per-class index lists so that consecutive
        # indices cycle through all classes
        self.seq = []
        for i in self.zip_cycle(*class_idxs.values()):
            self.seq += list(i)

    def __iter__(self):
        for i in self.seq:
            yield i

    @staticmethod
    def zip_cycle(*iterables, empty_default=None):
        # zip the iterables while cycling the shorter ones
        # until the longest one is exhausted
        cycles = [cycle(i) for i in iterables]
        for _ in zip_longest(*iterables):
            yield tuple(next(i, empty_default) for i in cycles)

    def __len__(self):
        return len(self.seq)
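
A hypothetical usage, assuming the train_dataset and bs from the earlier snippets:

sampler = BalancedSampler(train_dataset)
train_loader = DataLoader(train_dataset, batch_size=bs, sampler=sampler, num_workers=1)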

@ptrblck does the WeightedRandomSampler select all samples from the given dataset or a subset of the dataset?
Say I have this target list: (1,1,0,0,0,0,0,0,0,0,0)
11 data points, with 1/2 and 1/9 as the weights for each sample.
What would be the number of returned samples?
Will it try to select only as many samples as needed to maintain the above ratio?

You are defining the weight for each sample and are specifying the number of drawn samples yourself via the weights and num_samples arguments of the WeightedRandomSampler.
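
For your example above, a quick sketch (the target list and the 1/2, 1/9 weights are from your post, everything else is just a minimal illustration):

import torch
from torch.utils.data import WeightedRandomSampler

target = torch.tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
class_weights = torch.tensor([1. / 9, 1. / 2])   # 9 samples of class 0, 2 samples of class 1
samples_weight = class_weights[target]           # weight per sample

# num_samples defines how many indices are drawn per epoch, independently of the class ratio
sampler = WeightedRandomSampler(samples_weight, num_samples=len(samples_weight), replacement=True)
print(list(sampler))   # 11 indices; indices 0 and 1 will be drawn more often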

What if I can’t hold all my data in memory?
Like, I have millions of JSON files with {“x”: vector, “y”: label} fields and I want to create sample weights. How do I create them without holding everything in memory?

thanks

Note that you don’t need to load the actual data, only the targets.
If the target values cannot be loaded at once, you won’t be able to create the weights directly and would need to process the dataset in chunks.
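
A minimal sketch of what streaming only the labels could look like (the “y” field follows the layout you described; the directory path and variable names are assumptions):

import glob, json
from collections import Counter

import torch

targets = []
for path in glob.glob("jsons/*.json"):            # hypothetical directory
    with open(path) as f:
        targets.append(int(json.load(f)["y"]))    # keep only the label, drop the vector

counts = Counter(targets)                          # class frequencies
samples_weight = torch.tensor([1.0 / counts[t] for t in targets], dtype=torch.double)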

Hi,

This is how I am calculating the weight per sample for the WeightedRandomSampler.

import glob
import json
import os

import numpy as np
import torch
from tqdm.notebook import tqdm

def calculate_sample_weights(json_dir, su2id_json):
    # only the labels are read from the json files, not the feature vectors
    json_files = sorted(glob.glob(os.path.join(json_dir, "*.json")))[:10000]
    sub2id = json.load(open(su2id_json, 'r'))

    target = []
    for idx in tqdm(json_files):
        data = json.load(open(idx, 'r'))
        label = data['subject']
        y = int(sub2id[label])
        target.append(y)
    target = np.array(target)

    # count the samples per class (45 classes in this use case)
    class_sample_count = np.array(
        [len(np.where(target == t)[0]) for t in np.arange(45)])

    # inverse class frequency as the per-sample weight
    weight = 1. / (class_sample_count + 1e-6)
    samples_weight = np.array([weight[t] for t in target])
    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()

    return samples_weight
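
For context, a hypothetical continuation showing how these weights would then feed the sampler (json_dir, su2id_json, train_dataset and bs are placeholders):

samples_weight = calculate_sample_weights(json_dir, su2id_json)
sampler = torch.utils.data.WeightedRandomSampler(samples_weight, len(samples_weight))
train_loader = DataLoader(train_dataset, batch_size=bs, sampler=sampler, num_workers=1)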

batch index 0

OrderedDict([(0, 6), (1, 11), (2, 7), (3, 9), (4, 8), (5, 7), (6, 5), (7, 3), (8, 5), (9, 6), (10, 6), (11, 8), (12, 7), (13, 7), (14, 5), (15, 4), (16, 4), (17, 7), (18, 2), (19, 6), (20, 4), (21, 5), (22, 8), (23, 8), (24, 7), (25, 3), (26, 4), (27, 9), (28, 4), (29, 2), (30, 5), (31, 3), (32, 3), (33, 3), (34, 5), (35, 1), (36, 7), (37, 4), (38, 8), (39, 4), (40, 7), (41, 8), (42, 8), (43, 7), (44, 6)])

batch index 1

OrderedDict([(0, 12), (1, 4), (2, 5), (3, 3), (4, 6), (5, 5), (6, 4), (7, 5), (8, 7), (9, 5), (10, 6), (11, 8), (12, 7), (13, 6), (14, 3), (15, 4), (16, 6), (17, 4), (18, 5), (19, 7), (20, 6), (21, 5), (22, 4), (23, 4), (24, 3), (25, 5), (26, 9), (27, 9), (28, 7), (29, 1), (30, 5), (31, 3), (32, 4), (33, 9), (34, 6), (35, 5), (36, 8), (37, 11), (38, 6), (39, 6), (40, 7), (41, 6), (42, 3), (43, 5), (44, 7)])

batch index 2

OrderedDict([(0, 4), (1, 5), (2, 3), (3, 5), (4, 4), (5, 6), (6, 7), (7, 4), (8, 4), (9, 7), (10, 8), (11, 2), (12, 4), (13, 3), (14, 7), (15, 6), (16, 6), (17, 4), (18, 7), (19, 5), (20, 6), (21, 5), (22, 4), (23, 4), (24, 2), (25, 6), (26, 10), (27, 8), (28, 5), (29, 7), (30, 8), (31, 10), (32, 2), (33, 7), (34, 10), (35, 8), (36, 5), (37, 5), (38, 7), (39, 6), (40, 6), (41, 5), (42, 10), (43, 6), (44, 3)])

batch index 3

OrderedDict([(0, 8), (1, 6), (2, 4), (3, 6), (4, 3), (5, 5), (6, 3), (7, 9), (8, 4), (9, 7), (10, 5), (11, 8), (12, 5), (13, 6), (14, 6), (15, 3), (16, 5), (17, 5), (18, 5), (19, 5), (20, 6), (21, 8), (22, 4), (23, 11), (24, 8), (25, 5), (26, 6), (27, 1), (28, 2), (29, 4), (30, 3), (31, 5), (32, 6), (33, 4), (34, 6), (35, 7), (36, 8), (37, 10), (38, 9), (39, 6), (40, 12), (41, 4), (42, 3), (43, 4), (44, 6)])

batch index 4

OrderedDict([(0, 3), (1, 7), (2, 4), (3, 5), (4, 3), (5, 3), (6, 8), (7, 9), (8, 6), (9, 4), (10, 8), (11, 8), (12, 2), (13, 4), (14, 7), (15, 4), (16, 5), (17, 6), (18, 1), (19, 7), (20, 10), (21, 10), (22, 5), (23, 4), (24, 7), (25, 6), (26, 3), (27, 4), (28, 8), (29, 7), (30, 11), (31, 5), (32, 8), (33, 5), (34, 4), (35, 7), (36, 8), (37, 3), (38, 6), (39, 2), (40, 5), (41, 5), (42, 7), (43, 5), (44, 7)])

batch index 5

OrderedDict([(0, 5), (1, 3), (2, 7), (3, 3), (4, 4), (5, 12), (6, 7), (7, 2), (8, 6), (9, 8), (10, 2), (11, 7), (12, 11), (13, 6), (14, 5), (15, 2), (16, 5), (17, 7), (18, 6), (19, 5), (20, 5), (21, 8), (22, 1), (23, 7), (24, 6), (25, 5), (26, 6), (27, 6), (28, 8), (30, 5), (31, 9), (32, 5), (33, 4), (34, 4), (35, 8), (36, 6), (37, 5), (38, 4), (39, 9), (40, 4), (41, 5), (42, 8), (43, 9), (44, 6)])

Here I am not getting equal or close-to-equal samples per class in each batch.
Could you tell me what is wrong in the flow?
Here the first element in each dict entry is the class label and the second is the number of samples for that class.

Do I need to pass the sampler to the train, val and test DataLoaders?

I don’t know, but you could use this code as a small example to see how a WeightedRandomSampler is used to create balanced batches for a binary use case.

You should pass the sampler to the DataLoader using the corresponding targets.
I.e. if the sampler used the training targets to calculate its weights, it should be used together with the training dataset in the training DataLoader.
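
A minimal sketch of that setup (train_samples_weight, train_dataset, val_dataset and bs are placeholders):

# the weights were computed from the training targets only
train_sampler = WeightedRandomSampler(train_samples_weight, len(train_samples_weight))

# the sampler goes into the training DataLoader; val/test loaders stay unshuffled
train_loader = DataLoader(train_dataset, batch_size=bs, sampler=train_sampler)
val_loader = DataLoader(val_dataset, batch_size=bs, shuffle=False)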

I am referring to your code, only for multiclass.
For binary it gives approximately equal samples, but for multiclass it is not giving equal samples.
I guess for multiclass it gives more samples for the classes with lower weights.

That’s not the case and you can easily extend the example to a multiclass use case, which still yields balanced examples:

import numpy as np
import torch
from torch.utils.data import DataLoader

numDataPoints = 10000
data_dim = 5
bs = 1000

# Create dummy data with class imbalance:
# class 0 gets 50% of the samples, classes 1-5 get 10% each
data = torch.randn(numDataPoints, data_dim)
target = np.hstack((np.zeros(int(numDataPoints * 0.5), dtype=np.int32),
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32),
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 2,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 3,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 4,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 5))

class_sample_count = np.array(
    [len(np.where(target == t)[0]) for t in np.unique(target)])
print(class_sample_count)
# [5000 1000 1000 1000 1000 1000]

weight = 1. / class_sample_count
print(weight)
# [0.0002 0.001  0.001  0.001  0.001  0.001 ]
samples_weight = np.array([weight[t] for t in target])

samples_weight = torch.from_numpy(samples_weight)
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))

target = torch.from_numpy(target).long()
train_dataset = torch.utils.data.TensorDataset(data, target)

train_loader = DataLoader(
    train_dataset, batch_size=bs, num_workers=1, sampler=sampler)

for i, (data, target) in enumerate(train_loader):
    print("batch index: {}, class count: {}".format(
        i, [len((target == i).nonzero()) for i in range(len(target.unique()))]))
batch index: 0, class count: [170, 170, 159, 176, 144]
batch index: 1, class count: [163, 166, 177, 155, 177]
batch index: 2, class count: [187, 171, 158, 153, 175]
batch index: 3, class count: [157, 153, 188, 162, 187]
batch index: 4, class count: [158, 166, 161, 167, 182]
batch index: 5, class count: [176, 168, 158, 169, 158]
batch index: 6, class count: [160, 159, 159, 169, 182]
batch index: 7, class count: [165, 158, 180, 154, 169]
batch index: 8, class count: [164, 160, 174, 168, 151]
batch index: 9, class count: [157, 194, 157, 169, 174]

Could you check with a smaller batch size, because I can’t fit a larger batch size?
That’s the only difference I can see between your code and mine.

With a larger batch size (8000) I can get approximately equal samples.

OrderedDict([(0, 172), (1, 170), (2, 183), (3, 165), (4, 168), (5, 192), (6, 176), (7, 168), (8, 187), (9, 174), (10, 172), (11, 188), (12, 186), (13, 176), (14, 175), (15, 139), (16, 178), (17, 159), (18, 162), (19, 168), (20, 177), (21, 176), (22, 160), (23, 184), (24, 196), (25, 189), (26, 183), (27, 184), (28, 178), (29, 201), (30, 184), (31, 160), (32, 196), (33, 182), (34, 197), (35, 179), (36, 175), (37, 176), (38, 175), (39, 179), (40, 194), (41, 178), (42, 177), (43, 194), (44, 168)])

Smaller batch sizes will create more noise, since the weighted sampling is a random process.
If you collect the batches and check the stats over the whole epoch, it will still show a balanced usage:

numDataPoints = 1000
data_dim = 5
bs = 5

# Create dummy data with class imbalance:
# class 0 gets 50% of the samples, classes 1-5 get 10% each
data = torch.randn(numDataPoints, data_dim)
target = np.hstack((np.zeros(int(numDataPoints * 0.5), dtype=np.int32),
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32),
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 2,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 3,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 4,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 5))

class_sample_count = np.array(
    [len(np.where(target == t)[0]) for t in np.unique(target)])
print(class_sample_count)
# [500 100 100 100 100 100]

weight = 1. / class_sample_count
print(weight)
# [0.002 0.01 0.01 0.01 0.01 0.01]
samples_weight = np.array([weight[t] for t in target])

samples_weight = torch.from_numpy(samples_weight)
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))

target = torch.from_numpy(target).long()
train_dataset = torch.utils.data.TensorDataset(data, target)

train_loader = DataLoader(
    train_dataset, batch_size=bs, num_workers=1, sampler=sampler)

freqs = np.zeros(len(target.unique()))
for i, (data, t) in enumerate(train_loader):
    f = [len((t == i).nonzero()) for i in range(len(target.unique()))]
    print("batch index: {}, class count: {}".format(i, f))
    freqs += np.array(f)

print(freqs)
# [164. 185. 185. 139. 159. 168.]