After reading various posts about WeightedRandomSampler (some links are left as code comments), I'm unsure what to expect from the example below (PyTorch 1.3.1).
import numpy as np
import torch
from torch.utils.data import TensorDataset as dset
torch.manual_seed(42)
data_size = 15
num_classes = 3
batch_size = 4
inputs = torch.tensor(range(data_size))
print("inputs", inputs.shape, inputs)
if 0:
    targets = torch.floor(num_classes * torch.rand(data_size)).int()
else:
    targets = torch.tensor([1, 0, 1, 1, 0, 1, 0, 1, 1, 2, 2, 1, 0, 0, 1], dtype=torch.int32)
print("targets", targets.shape, targets)
trainDataset = dset(inputs, targets)
# https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/10
class_sample_count = np.array([len(np.where(targets==t)[0]) for t in np.unique(targets)])
print("class_sample_count", class_sample_count.shape, class_sample_count)
weights = 1. / class_sample_count
print("weights", weights.shape, weights)
# https://discuss.pytorch.org/t/some-problems-with-weightedrandomsampler/23242/2
samples_weights = weights[targets]  # per-sample weight = inverse frequency of that sample's class
assert len(samples_weights) == len(targets)
if 0:
    print("samples_weights", samples_weights.shape, samples_weights)
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weights, len(samples_weights), replacement=True)
trainLoader = torch.utils.data.DataLoader(dataset=trainDataset, batch_size=batch_size, sampler=sampler)
inputs_new = []
targets_new = []
for batch, (data, target) in enumerate(trainLoader):
    counts = [len(np.where(target.numpy()==class_sample)[0]) for class_sample in range(len(class_sample_count))]
    inputs_new.extend(data.data.numpy())
    targets_new.extend(target.data.numpy())
    print("batch {}, size {}, data {}, counts: {}".format(batch, data.shape[0], target.data, counts))
print("inputs_new", inputs_new)
print("targets_new", targets_new)
print("class_sample_count_new", np.array([len(np.where(targets_new==t)[0]) for t in np.unique(targets_new)]))
Output:
inputs torch.Size([15]) tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
targets torch.Size([15]) tensor([1, 0, 1, 1, 0, 1, 0, 1, 1, 2, 2, 1, 0, 0, 1], dtype=torch.int32)
class_sample_count (3,) [5 8 2]
weights (3,) [0.2 0.125 0.5 ]
batch 0, size 4, data tensor([0, 1, 0, 2], dtype=torch.int32), counts: [2, 1, 1]
batch 1, size 4, data tensor([2, 0, 0, 0], dtype=torch.int32), counts: [3, 0, 1]
batch 2, size 4, data tensor([1, 1, 1, 0], dtype=torch.int32), counts: [1, 3, 0]
batch 3, size 3, data tensor([0, 1, 2], dtype=torch.int32), counts: [1, 1, 1]
inputs_new [1, 2, 1, 9, 9, 13, 13, 1, 2, 3, 7, 12, 12, 7, 9]
targets_new [0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 1, 0, 0, 1, 2]
class_sample_count_new [7 5 3]
I would expect class_sample_count_new to be "more" balanced. Is this a correct assumption?
I've also tried larger values of data_size and batch_size, and removed manual_seed, but the imbalance was still surprisingly large.
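For reference, here is a minimal sketch of the kind of check I have in mind (it reuses trainDataset, samples_weights, class_sample_count and batch_size from the code above): instead of looking at a single 15-sample pass, it aggregates the sampled class counts over many epochs to see what the long-run class distribution looks like.

# Minimal sketch: aggregate sampled class counts over many epochs
# (reuses trainDataset, samples_weights, class_sample_count, batch_size from above).
num_epochs = 1000
sampled_counts = np.zeros(len(class_sample_count), dtype=np.int64)
for _ in range(num_epochs):
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        samples_weights, len(samples_weights), replacement=True)
    loader = torch.utils.data.DataLoader(dataset=trainDataset, batch_size=batch_size, sampler=sampler)
    for _, target in loader:
        for c in range(len(class_sample_count)):
            sampled_counts[c] += int((target == c).sum())
print("sampled_counts over {} epochs:".format(num_epochs), sampled_counts)
print("fractions:", sampled_counts / sampled_counts.sum())

If each draw really hits every class with probability ~1/num_classes, I'd expect the fractions printed above to be close to uniform over many epochs, even if any single 15-sample epoch shows large variance.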