Torch Sampler ImbalancedDataSampler

import numpy as np
import pandas as pd
import os
import pickle
from glob import glob
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
import torch
import torchvision
import seaborn as sns
from tqdm import tqdm
from PIL import Image
from itertools import chain
import torch.nn.functional as F
import torch.optim as optim
from sklearn.utils import resample
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, \
    multilabel_confusion_matrix, roc_curve, auc, classification_report
from torchsampler import ImbalancedDatasetSampler

# Device configuration GPU support for MAC
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
else:
    print("MPS device not found.")

# Paths to Images and DataEntry file
all_xray_df = pd.read_csv('NihXrayData/Data_Entry_2017_v2020.csv')
allImagesGlob = glob('NihXrayData/images*/images/*.png')
# eof

all_image_paths = {os.path.basename(x): x for x in
                   allImagesGlob}
# print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)
all_xray_df.sample(3)

# # Data Pre Processing ####
# # Simplifying to 15 primary classes (adding No Finding as the 15th class)
condition_labels = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis',
                    'Effusion', 'Pneumonia', 'Pleural_Thickening',
                    'Cardiomegaly', 'Nodule', 'Mass', 'Hernia', 'No Finding']
for label in condition_labels:
    all_xray_df[label] = all_xray_df['Finding Labels'].map(lambda result: 1.0 if label in result else 0)
all_xray_df.head(20)

all_xray_df['disease_vec'] = all_xray_df.apply(lambda target: [target[condition_labels].values], 1).map(
    lambda target: target[0])

all_xray_df.head()

print(all_xray_df[condition_labels].sum())

train_df, test_df = train_test_split(all_xray_df, test_size=0.30, random_state=2020)


class XrayDataset(torch.utils.data.Dataset):
    def __init__(self, data_frame, transform=None):
        self.data_frame = data_frame
        self.transform = transform

    def __getitem__(self, idx):
        row = self.data_frame.iloc[idx]
        address = row['path']
        data = Image.open(address).convert('RGB')
        label = np.array(row['disease_vec'], dtype=np.float32)

        if self.transform:
            data = self.transform(data)

        return data, torch.FloatTensor(label)

    def __len__(self):
        return len(self.data_frame)


# Define data augmentation for training
train_transform = transforms.Compose([
    # transforms.RandomResizedCrop(224),
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(10),
    transforms.RandomGrayscale(p=0.4),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Data Sets
train_dataset = XrayDataset(train_df, transform=train_transform)
test_dataset = XrayDataset(test_df, transform=transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]))

trainSampler = ImbalancedDatasetSampler(train_dataset)
# Data Loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    num_workers=0,
    sampler=trainSampler,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=64,
    num_workers=0,
    shuffle=False,
)
Traceback (most recent call last):
  File "/Users/dougietownsell/PycharmProjects/Independent-Study/Xray.py", line 101, in <module>
    trainSampler = ImbalancedDatasetSampler(train_dataset)
  File "/Users/dougietownsell/PycharmProjects/Independent-Study/venv/lib/python3.9/site-packages/torchsampler/imbalanced.py", line 37, in __init__
    df["label"] = self._get_labels(dataset) if labels is None else labels
  File "/Users/dougietownsell/PycharmProjects/Independent-Study/venv/lib/python3.9/site-packages/torchsampler/imbalanced.py", line 61, in _get_labels
    return dataset.get_labels()
AttributeError: 'XrayDataset' object has no attribute 'get_labels'

Hello I’m trying to use the ImbalancedDataSampler but I keep getting this error not sure how to fix it

Judging by the example on the TorchSampler project webpage, you need to instantiate the sampler directly inside the DataLoader instantiation ( DataLoader(train_dataset, ..., sampler=ImbalancedDatasetSampler(train_dataset)) ), not outside. Not easy to verify whether this is the case though, as the link to the project’s homepage returns a 404 message. In any case, check how the _get_labels function is implemented in imbalanced.py.

It seems like the imbalanced sampler assumes the dataset object you supply has a get_labels() function, which the XrayDataset you use doesn’t have. You’ll have to check the torchsampler docs to see what the API is, then add an appropriate implementation to XRayDataset’.

1 Like