I am experimenting/learning with using the torch.utils.data.Dataset
feature with a common data set, MNIST, in its CSV format. When I use this same code, involving more complex operations on an NLP data set, it works wonderfully fast and as expected. When I tried to port my code over to another data set, MNIST in CSV format, I am stunned at how slow it emits data from the DataLoader
when I compare it to PyTorch’s datasets.MNIST
.
Consider the following:
import time
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import Dataset, Subset
from torch.utils.data import random_split, DataLoader, RandomSampler
# Create Dataset
class CSVDataset(Dataset):
"""MNIST dataset."""
def __init__(self, csv_file, target, transform=None):
"""
Args:
csv_file (string): Path to the csv file with annotations.
features (string): column(s) containing feature data.
target (string): column containing the dependent variable.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
# initialize
self.data_frame = pd.read_csv(csv_file)
self.features = [x for x in self.data_frame.columns if x != str(target)]
self.target = target
self.transform = transform
return None
# get length of df
def __len__(self):
return len(self.data_frame)
# get target
def __get_target__(self):
return self.data_frame[self.target]
# get df filtered by indices
def __get_values__(self, indices):
return self.data_frame.iloc[indices]
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
# pull a sample of data
features = self.data_frame.iloc[idx][self.features]
target = self.data_frame.iloc[idx][self.target]
# hold sample in a dict
sample = {'features': torch.tensor(features),
'target': torch.tensor(target),
'idx': torch.tensor(idx)}
if self.transform:
sample = self.transform(sample)
return sample
# instantiate the lazy data set
csv_dataset = CSVDataset(csv_file='https://datahub.io/machine-learning/mnist_784/r/mnist_784.csv',
target='class',
transform=None)
# set train, valid, and test size
train_size = int(0.8 * len(csv_dataset))
valid_size = int(0.1 * len(csv_dataset))
# use random split to create three data sets;
train_ds, valid_ds, test_ds = torch.utils.data.random_split(csv_dataset, [train_size, valid_size, valid_size])
train_csv_loader = DataLoader(dataset=train_ds,
batch_size=100,
shuffle=True)
# check the output
start = time.time()
for i, batch in enumerate(train_csv_loader):
if i == 50:
break
end = time.time()
print(end - start) # 71.06
And for comparison, PyTorch’s version emits data to the DataLoader over 100x faster.
# and officially from PyTorch
train_dataset = datasets.MNIST(root='data',
train=True,
transform=transforms.ToTensor(),
download=True)
train_loader = DataLoader(dataset=train_dataset,
batch_size=100,
shuffle=True)
# check the output
start = time.time()
for i, batch in enumerate(train_loader):
if i == 50:
break
end = time.time()
print(end - start) # 0.52
I am confused as to what is causing such a slowdown when I am using nearly the same strategy (the Dataset class), with more operations for NLP (also from a CSV file), and it works very fast.
Thanks for your consideration and time!