I have created a map-style Dataset class by inheriting from data.Dataset. I do some preprocessing and extraction of word vectors in this class and then pass the tensors (holding the embeddings) to the DataLoader.
The DataLoader is very slow when I iterate through it. I have tried num_workers = 12 but the situation doesn't improve (it actually gets worse).
The code is attached below.
Things I have already tried:
- Increasing num_workers
- Reducing IO operations inside the constructor (by passing tensors directly)
Some questions:
- I was surprised that even after increasing the batch size to 1000 and num_workers to 12, I could not see any increase in the memory consumption of my individual cores. I cannot figure out why the code just gets stuck when I set num_workers.
The dataset (a JSON file) is not huge (~6 MB).
class Dataset(data.Dataset):
def __init__(self, binPath, data_path,hi_embeddings,en_embeddings,list_uids=None, list_data=None):
self.word2ids = torch.load(os.path.join(binPath, "word2ids.pth"))
# self.uid2hin = torch.load(os.path.join(binPath, "uid2hin.pth"))
# self.ids2word = torch.load(os.path.join(binPath, "ids2word.pth"))
# TODO: Fix this mapping,
self.ids2word = {j: i for i, j in self.word2ids.items()}
self.word2id_en = torch.load(open(os.path.join(binPath, "embeddings", "word2id_en.pth"), "rb"))
self.word2id_hi = torch.load(open(os.path.join(binPath, "embeddings", "word2id_hi.pth"), "rb"))
self.data = json.load(open(data_path, 'rb'))
self.list_uids = list(self.data.keys())
# self.hi_embeddings = self.load_embeddings(path=os.path.join(binPath, "embeddings"), lang='hi')
# self.en_embeddings = self.load_embeddings(path=os.path.join(binPath, "embeddings"), lang='en')
self.hi_embeddings = hi_embeddings
self.en_embeddings = en_embeddings
self.uid2hi_index = torch.load(open(os.path.join(binPath, "embeddings","uid2hi_idx.pth"), "rb"))
self.embed_dim = 300
def __len__(self):
return len(self.data)
def __getitem__(self, index):
id = self.list_uids[index]
X = self.tokenize(id, self.word2ids)
idx_hi_list = set(self.uid2hi_index[id])
X_ = self.tensorify_sentences(idx_hi_list, X)
# print(X_)
X_ = X_.permute(1, 0)
y = self.data[id]["sent"]
return X_, y
def tensorify_sentences(self, idx_hi_list: Set[int], X: List[int]) -> torch.Tensor:
array = np.zeros((self.embed_dim, len(X)))
for i, _ in enumerate(X):
word = self.ids2word[X[i]]
if i in idx_hi_list:
array[:, i] = self._lookup_embeddings(word, lang='hi')
array[:, i] = self._lookup_embeddings(word, lang='en')
resulting_tensor = torch.from_numpy(array)
return resulting_tensor