Smaller batch sizes will create more noise in the per-batch class distribution, since the weighted sampling is a random process.
If you collect the batches and check the statistics over the whole epoch, you should still see an approximately balanced class usage:
import numpy as np
import torch
from torch.utils.data import DataLoader

numDataPoints = 1000
data_dim = 5
bs = 5
# Create dummy, imbalanced data: class 0 holds 50% of the samples, classes 1-5 hold 10% each
data = torch.randn(numDataPoints, data_dim)
target = np.hstack((np.zeros(int(numDataPoints * 0.5), dtype=np.int32),
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32),
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 2,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 3,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 4,
                    np.ones(int(numDataPoints * 0.1), dtype=np.int32) * 5))
class_sample_count = np.array(
    [len(np.where(target == t)[0]) for t in np.unique(target)])
print(class_sample_count)
# [500 100 100 100 100 100]
weight = 1. / class_sample_count
print(weight)
# [0.002 0.01  0.01  0.01  0.01  0.01 ]
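# Map each sample to the weight of its class, so that the sampler draws
# every class with roughly equal probability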
samples_weight = np.array([weight[t] for t in target])
samples_weight = torch.from_numpy(samples_weight)
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))
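# Note: replacement defaults to True, so minority-class samples can be drawn
# multiple times per epoch, which is what allows the batches to be balanced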
target = torch.from_numpy(target).long()
train_dataset = torch.utils.data.TensorDataset(data, target)
train_loader = DataLoader(
    train_dataset, batch_size=bs, num_workers=1, sampler=sampler)
freqs = np.zeros(len(target.unique()))
for batch_idx, (x, t) in enumerate(train_loader):
    f = [(t == c).sum().item() for c in range(len(target.unique()))]
    print("batch index: {}, class count: {}".format(batch_idx, f))
    freqs += np.array(f)
print(freqs)
# [164. 185. 185. 139. 159. 168.]
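To see the effect of the batch size directly, a quick sketch like the one below could compare per-batch class proportions for a small and a large batch size, reusing train_dataset and sampler from above (the batch sizes 5 and 100 are arbitrary choices for illustration). The per-class standard deviation should shrink as the batch size grows, while the mean stays close to 1/6:

for batch_size in [5, 100]:
    loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
    proportions = []
    for _, t in loader:
        counts = np.array([(t == c).sum().item() for c in range(6)])
        proportions.append(counts / len(t))  # per-batch class proportions
    proportions = np.stack(proportions)
    print("bs={}: mean {}, std {}".format(
        batch_size, proportions.mean(0).round(3), proportions.std(0).round(3)))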