Why is my torch.utils.data.Dataset generating data slowly?

localh · October 4, 2020, 2:04pm

I am experimenting/learning with using the torch.utils.data.Dataset feature with a common data set, MNIST, in its CSV format. When I use this same code, involving more complex operations on an NLP data set, it works wonderfully fast and as expected. When I tried to port my code over to another data set, MNIST in CSV format, I am stunned at how slow it emits data from the DataLoader when I compare it to PyTorch’s datasets.MNIST.

Consider the following:

import time
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import Dataset, Subset
from torch.utils.data import random_split, DataLoader, RandomSampler


# Create Dataset
class CSVDataset(Dataset):
    """MNIST dataset."""

    def __init__(self, csv_file, target, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            features (string): column(s) containing feature data.
            target (string): column containing the dependent variable.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        # initialize
        self.data_frame = pd.read_csv(csv_file)
        self.features = [x for x in self.data_frame.columns if x != str(target)]
        self.target = target
        self.transform = transform
        return None

        # get length of df
    def __len__(self):
        return len(self.data_frame)

        # get target
    def __get_target__(self):
        return self.data_frame[self.target]

        # get df filtered by indices
    def __get_values__(self, indices):
        return self.data_frame.iloc[indices]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # pull a sample of data
        features = self.data_frame.iloc[idx][self.features]
        target = self.data_frame.iloc[idx][self.target]

        # hold sample in a dict
        sample = {'features': torch.tensor(features),
                  'target': torch.tensor(target),
                  'idx': torch.tensor(idx)}

        if self.transform:
            sample = self.transform(sample)

        return sample

# instantiate the lazy data set
csv_dataset = CSVDataset(csv_file='https://datahub.io/machine-learning/mnist_784/r/mnist_784.csv',
                         target='class',
                         transform=None)

# set train, valid, and test size
train_size = int(0.8 * len(csv_dataset))
valid_size = int(0.1 * len(csv_dataset))

# use random split to create three data sets; 
train_ds, valid_ds, test_ds = torch.utils.data.random_split(csv_dataset, [train_size, valid_size, valid_size])


train_csv_loader = DataLoader(dataset=train_ds,
                          batch_size=100,
                          shuffle=True)
# check the output
start = time.time()
for i, batch in enumerate(train_csv_loader):
    if i == 50:
        break
end = time.time()
print(end - start) # 71.06

And for comparison, PyTorch’s version emits data to the DataLoader over 100x faster.

# and officially from PyTorch
train_dataset = datasets.MNIST(root='data',
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=100,
                          shuffle=True)

# check the output
start = time.time()
for i, batch in enumerate(train_loader):
    if i == 50:
        break
end = time.time() 
print(end - start)   # 0.52

I am confused as to what is causing such a slowdown when I am using nearly the same strategy (the Dataset class), with more operations for NLP (also from a CSV file), and it works very fast.

Thanks for your consideration and time!

localh · October 5, 2020, 1:48am

I believe I have found the general source of the slow downs. The culprit comes down to the initializer:

# replace the initialize with this:
        # initialize
        self.data_frame = pd.read_csv(csv_file)
        self.features = self.data_frame[self.data_frame.columns[:-1]]
        self.target = self.data_frame[self.data_frame.columns[-1]]
        self.transform = transform