How to use WeightedRandomSampler for imbalanced data

Hi, I have a highly imbalanced image dataset and would like to use the WeightedRandomSampler to sample from it so that the model sees each class approximately the same number of times. My classes are sicknesses that may be present in an image: there are three classes for three sicknesses, plus the case of no sickness at all. My labels look like this: [[1,1,0], [0,0,0], [1, 0, 0], …].

How can I use the weighted sampler in this case? If I understood correctly, I would need to provide three weights for the three classes, but that would not take into account the images with label [0,0,0], which form the majority.

Below is a minimal working example using the weighted sampler; however, with this setup none of the three classes ever gets selected:

import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.utils import shuffle
from torch.utils.data import DataLoader

len_ds = 60461
counts = {'A': 6775, 'B': 3609, 'C': 906}


def create_data(which_class: str):
    # build one label column: counts[which_class] positives, rest zeros
    arr = np.zeros((1, len_ds))
    arr[:, :counts[which_class]] = 1
    return arr


data = {k: create_data(k) for k in counts}
data = np.concatenate((data['A'], data['B'], data['C']), axis=0)

label_df = pd.DataFrame(data).transpose()
label_df.columns = [*counts.keys()]
label_df = shuffle(label_df)
label_df.reset_index(inplace=True, drop=True)
print(label_df[label_df == 1].count())


class toy_dataset(Dataset):
    def __getitem__(self, index):
        return torch.tensor(label_df.iloc[index].values)

    def __len__(self):
        return len(label_df)


dataset = toy_dataset()


def calculateWeights(label_dict, d_set):
    # NOTE: this produces one weight per *class* (a list of length 3),
    # not one weight per sample
    arr = []
    for label, count in label_dict.items():
        weight = len(d_set) / count
        arr.append(weight)
    return arr


weights = calculateWeights(counts, dataset)
weights = torch.DoubleTensor(weights)
print('weights: ', weights)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(dataset), replacement=True)
trainloader = DataLoader(dataset, batch_size=50, sampler=sampler,
                         shuffle=False,
                         num_workers=0, pin_memory=False)

all_labels = torch.Tensor()
for labels in trainloader:
    all_labels = torch.cat((all_labels, labels.cpu()), 0)

print(all_labels.shape)

print(
    {label: all_labels[:, i].sum().item() for i, label in
     enumerate([*counts.keys()])})

which returns:

A    6775
B    3609
C     906
dtype: int64
weights:  tensor([ 8.9241, 16.7528, 66.7340], dtype=torch.float64)
torch.Size([60461, 3])
{'A': 0.0, 'B': 0.0, 'C': 0.0}

The WeightedRandomSampler expects a weight tensor which assigns a weight to each sample, not one weight per class.
Since your weights tensor has only three entries, the sampler will only ever draw the indices 0, 1, and 2 from your dataset, which explains the all-zero counts you are seeing.
Here is an example of its usage.
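A minimal sketch of the pattern for a single-label setup, where target is a made-up tensor of integer class indices:

import torch
from torch.utils.data import WeightedRandomSampler

# hypothetical single-label targets: one integer class index per sample
target = torch.tensor([0, 0, 0, 0, 1, 1, 2])

class_count = torch.bincount(target)        # samples per class
class_weight = 1.0 / class_count.double()   # inverse class frequency
sample_weight = class_weight[target]        # one weight per *sample*

sampler = WeightedRandomSampler(sample_weight,
                                num_samples=len(sample_weight),
                                replacement=True)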

Based on your description it also seems that you are working on a multi-label classification, where each sample might belong to zero, one, or more classes.
If that’s the case, note that over-/undersampling might not be trivial, since drawing a specific sample can increase the count of multiple classes at once. Here is a post which discusses this use case a bit more.
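One possible heuristic (a sketch, not a canonical recipe) would be to treat each unique label combination, including the all-zero one, as its own group and weight every sample by the inverse frequency of its group:

import numpy as np
import torch

# hypothetical (N, 3) multi-hot label array
labels = np.array([[1, 1, 0], [0, 0, 0], [1, 0, 0], [0, 0, 0]])

# group the samples by their exact label combination and weight each
# sample by the inverse frequency of its combination
combos, inverse, combo_counts = np.unique(
    labels, axis=0, return_inverse=True, return_counts=True)
sample_weight = torch.from_numpy(1.0 / combo_counts[inverse]).double()

Note that this balances the label combinations, not the per-class marginal counts, so the individual classes can still end up unequal.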


Thanks a lot for the reply! I should have read the docs more carefully; assigning a weight to each sample makes sense.
I applied this to my MWE. However, this trivial way of setting the weights gives a much higher weight to samples that have more than one class. Still, iterating through the dataset with this sampler gives counts of {'A': 44137.0, 'B': 37309.0, 'C': 20473.0}, which is much less imbalanced than before.
There is probably a better way of setting the weights; one idea is sketched after the code below.

import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.utils import shuffle
from torch.utils.data import DataLoader

len_ds = 60461
counts = {'A': 6775, 'B': 3609, 'C': 906}


def create_data(which_class: str):
    # build one label column: counts[which_class] positives, rest zeros
    arr = np.zeros((1, len_ds))
    arr[:, :counts[which_class]] = 1
    return arr


data = {k: create_data(k) for k in counts}
data = np.concatenate((data['A'], data['B'], data['C']), axis=0)

label_df = pd.DataFrame(data).transpose()
label_df.columns = [*counts.keys()]
label_df = shuffle(label_df)
label_df.reset_index(inplace=True, drop=True)

# Per-sample weight: sum of the inverse class frequencies of the positive
# labels; all-zero samples get the inverse frequency of the "no sickness" group.
n_none = len(label_df) - counts['A'] - counts['B'] - counts['C']

for idx, row in label_df.iterrows():
    label_df.at[idx, 'weights'] = (
        row['A'] / counts['A']
        + row['B'] / counts['B']
        + row['C'] / counts['C']
        + (row['A'] == row['B'] == row['C'] == 0) / n_none
    )

print(np.unique(label_df.weights, return_counts=True))
print(label_df[label_df == 1].count())


class toy_dataset(Dataset):
    def __getitem__(self, index):
        # return only the label columns, not the 'weights' column added above
        return torch.tensor(label_df.iloc[index][[*counts.keys()]].values)

    def __len__(self):
        return len(label_df)


dataset = toy_dataset()

weights = torch.DoubleTensor(label_df.weights.values)
print('weights: ', weights)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(dataset), replacement=True)
trainloader = DataLoader(dataset, batch_size=50, sampler=sampler,
                         shuffle=False,
                         num_workers=0, pin_memory=False)

all_labels = torch.Tensor()
for labels in trainloader:
    all_labels = torch.cat((all_labels, labels.cpu()), 0)

print(all_labels.shape)

print(
    {label: all_labels[:, i].sum().item() for i, label in
     enumerate([*counts.keys()])})
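
For reference, applying the label-combination idea from the reply above to this MWE would replace the iterrows loop with something like this (sketch only):

# weight by the inverse frequency of each unique label row;
# the all-zero rows form their own group automatically
combos, inverse, combo_counts = np.unique(
    label_df[[*counts.keys()]].values, axis=0,
    return_inverse=True, return_counts=True)
label_df['weights'] = 1.0 / combo_counts[inverse]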