Hi,
I have written a dataset class that loads two text files, one with source sentences and one with target sentences, for neural machine translation. Each file has 93,577,946 lines and takes about 8 GB on disk.
The class looks like this:
import codecs

import numpy as np
import torch
from torch.utils.data import Dataset


class LoadUniModal(Dataset):
    def __init__(self, src, trg, src_vocab, trg_vocab):
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        # Instance attributes (class-level lists would be shared across instances)
        self.sources = []
        self.targets = []
        self.lengths = []
        self.maxlen = 0
        # Read and tokenize the source file, tracking the longest sentence
        with codecs.open(src, encoding="utf-8") as f:
            for line in f:
                tokens = line.replace("\n", "").split()
                self.maxlen = max(self.maxlen, len(tokens))
                self.sources.append(tokens)
        # Read and tokenize the target file; lengths include <START> and <END>
        with codecs.open(trg, encoding="utf-8") as f:
            for line in f:
                tokens = line.replace("\n", "").split()
                self.maxlen = max(self.maxlen, len(tokens))
                self.targets.append(tokens)
                self.lengths.append(len(tokens) + 2)

    # Override to give PyTorch access to any example in the dataset
    def __getitem__(self, index):
        # Source sentence processing: map tokens to ids, pad to maxlen + 1
        tokens = self.sources[index]
        ntokens = [self.src_vocab['<START>']]
        for a in range(self.maxlen):
            if a <= (len(tokens) - 1):
                if tokens[a] in self.src_vocab:
                    ntokens.append(self.src_vocab[tokens[a]])
                else:
                    ntokens.append(self.src_vocab['<UNK>'])
            elif a == len(tokens):
                ntokens.append(self.src_vocab['<END>'])
            else:  # a > len(tokens)
                ntokens.append(self.src_vocab['<PAD>'])
        source = torch.from_numpy(np.asarray(ntokens)).long()

        # Target sentence processing: same scheme with the target vocabulary
        tokens = self.targets[index]
        ntokens = [self.trg_vocab['<START>']]
        for a in range(self.maxlen):
            if a <= (len(tokens) - 1):
                if tokens[a] in self.trg_vocab:
                    ntokens.append(self.trg_vocab[tokens[a]])
                else:
                    ntokens.append(self.trg_vocab['<UNK>'])
            elif a == len(tokens):
                ntokens.append(self.trg_vocab['<END>'])
            else:  # a > len(tokens)
                ntokens.append(self.trg_vocab['<PAD>'])
        target = torch.from_numpy(np.asarray(ntokens)).long()

        length = self.lengths[index]
        return [0], source, target, length

    def __len__(self):
        return len(self.sources)
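For context, the vocabularies are plain Python dicts that map tokens to integer ids and contain the special entries <START>, <END>, <UNK> and <PAD>. A tiny, made-up example of how the class is used (the file names and tokens below are only illustrative, not my real data):

    # Toy vocabularies with the special tokens the class expects
    src_vocab = {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3, 'hello': 4, 'world': 5}
    trg_vocab = {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3, 'hallo': 4, 'welt': 5}

    # "train.en" / "train.de" stand in for small files with one sentence per line
    dataset = LoadUniModal("train.en", "train.de", src_vocab, trg_vocab)
    _, source, target, length = dataset[0]
    print(source.shape, target.shape, length)  # each tensor has maxlen + 1 entries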
I use the class to load the dataset as follows:
import ast
import os

from torch.utils.data import DataLoader


def load_text_train_data(train_dir, src_vocab, trg_vocab, lang_pair, batch_size):
    # lang_pair is a string containing a Python tuple literal; element 1 is the
    # source language suffix and element 2 the target language suffix
    tpl = ast.literal_eval(lang_pair)
    slang = tpl[1]
    tlang = tpl[2]
    strain_file = os.path.join(train_dir, "train" + slang)
    ttrain_file = os.path.join(train_dir, "train" + tlang)
    data_iter = LoadUniModal(strain_file, ttrain_file, src_vocab, trg_vocab)
    data_iter = DataLoader(data_iter, batch_size=batch_size)
    return data_iter
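And this is roughly how the resulting loader is consumed in training (only schematic; the lang_pair string below is just an example of the format the function parses, not my real configuration):

    # Hypothetical call -- "('text', 'en', 'de')" only illustrates the lang_pair format
    train_iter = load_text_train_data("data/", src_vocab, trg_vocab, "('text', 'en', 'de')", batch_size=32)
    for _, source, target, lengths in train_iter:
        # source and target are LongTensors of shape (batch_size, maxlen + 1)
        pass  # feed the batch to the model here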
When I try to load the data, I get a memory error.
How can I load the data without running out of memory?
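For example, would it be a reasonable direction to keep only the byte offset of each line in memory and read and tokenize lines lazily inside __getitem__? A rough, untested sketch of that idea (the LazyTextDataset name and everything in it is just illustrative, not code I am running):

    from torch.utils.data import Dataset

    class LazyTextDataset(Dataset):
        """Sketch: store one byte offset per line, read lines on demand."""
        def __init__(self, path):
            self.path = path
            self.offsets = []
            with open(path, "rb") as f:
                offset = 0
                for line in f:
                    self.offsets.append(offset)
                    offset += len(line)

        def __getitem__(self, index):
            # Seek to the stored offset and tokenize only the requested line
            with open(self.path, "rb") as f:
                f.seek(self.offsets[index])
                return f.readline().decode("utf-8").split()

        def __len__(self):
            return len(self.offsets)

I am not sure whether keeping ~93M offsets per file in a plain Python list is already too much, or whether seeking into the file on every __getitem__ call would be too slow with multiple DataLoader workers.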
Thanks,