I am trying to build a model that takes two arrays from the dataloader. The problem is that the dataloader is extremely slow: iterating over the entire dataset takes around 3-4 hours on an A100 GPU with num_workers=32. Is there any way I can speed up the dataloader? I have tried setting pin_memory=True, which actually made it slower.
This is my implementation
class VQADataset(torch.utils.data.Dataset):
    """VQA dataset yielding, per example:
    (question id, question text, audio features, image features, image id, answer index).

    Feature tensors are loaded lazily from per-example .npy files in
    __getitem__, so each worker performs two disk reads per sample.
    """

    # Class-level answer vocabulary (index -> answer string), shared by all
    # instances; populated once by load_vocab() when the 'train' split is built.
    ans_vocab = {}

    def __init__(self, data_dir, qafile, img_dir, phase, img_def='images_resnet',
                 raw_images=True, raw_audio=True, lang='French'):
        self.data_dir = data_dir
        # BUG FIX: the original if/else on `phase` read the same file in both
        # branches, and the handle was never closed (a leak multiplied across
        # num_workers processes). One `with` block does the same job safely.
        with open(os.path.join(data_dir, qafile), 'r') as f:
            self.examples = f.readlines()
        if phase == 'train':
            self.load_vocab(data_dir)
        self.img_dir = img_dir
        self.phase = phase  # original assigned this twice; once is enough
        self.raw_images = raw_images
        self.raw_audio = raw_audio
        self.lang = lang
        self.unk_emb = 8000
        self.img_def = img_def

    def load_vocab(self, data_dir):
        """Populate the shared answer vocabulary from ans_itos.tsv (index<TAB>answer)."""
        ans_vocab_file = os.path.join(data_dir, 'ans_itos.tsv')
        with open(ans_vocab_file) as f:
            for line in f:
                # BUG FIX: rstrip('\n') before splitting — the original kept the
                # trailing newline on every stored vocabulary value.
                parts = line.rstrip('\n').split('\t')
                VQADataset.ans_vocab[parts[0]] = parts[1]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # Each example line: audio_id \t ques_id \t question \t image_id \t answer
        audio_id, ques_id, ques, imgid, ans = self.examples[idx].strip().split('\t')
        # (Removed the stray `int(imgid)))` fragment that made the pasted
        # original a SyntaxError.)
        img = np.load('{0}/features/{2}/{4}/COCO_{1}_{3:012d}.npy'.format(
            self.data_dir, self.img_dir, self.phase, int(imgid),
            self.img_def)).squeeze()
        audio_features = np.load(os.path.join(
            self.data_dir, 'features', 'w2v2_' + self.phase, self.lang,
            f"{self.lang}_{audio_id}.npy")).squeeze()
        # torch.from_numpy shares the array's memory instead of copying
        # (torch.tensor copies) — cheaper per sample in worker processes.
        return (ques_id, ques, torch.from_numpy(audio_features),
                torch.from_numpy(img), imgid, int(ans))
My training dataset contains around 440,000 samples and my validation set contains around 200,000 samples.
train_dataloader = DataLoader(training_data, batch_size=256, num_workers=32)
Thank You