Loading huge text files for neural machine translation

Hi,

I have written a dataset loading class for loading two text files as sources and targets, for neural machine translation. Each file has 93,577,946 lines and takes up about 8 GB on disk.

The class is as the following:

import codecs

import numpy as np
import torch
from torch.utils.data import Dataset


class LoadUniModal(Dataset):

    def __init__(self, src, trg, src_vocab, trg_vocab):
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.sources = []
        self.targets = []
        self.lengths = []
        self.maxlen = 0

        with codecs.open(src, encoding="utf-8") as f:
            for line in f:
                tokens = line.replace("\n", "").split()
                self.maxlen = max(self.maxlen, len(tokens))
                self.sources.append(tokens)
        with codecs.open(trg, encoding="utf-8") as f:
            for line in f:
                tokens = line.replace("\n", "").split()
                self.maxlen = max(self.maxlen, len(tokens))
                self.targets.append(tokens)
                self.lengths.append(len(tokens) + 2)

    # Override to give PyTorch access to any sample in the dataset
    def __getitem__(self, index):

        # Source sentence processing
        tokens = self.sources[index]
        ntokens = [self.src_vocab['<START>']]
        for a in range(self.maxlen):
            if a <= (len(tokens) - 1):
                if tokens[a] in self.src_vocab.keys():
                    ntokens.append(self.src_vocab[tokens[a]])
                else:
                    ntokens.append(self.src_vocab['<UNK>'])
            elif a == len(tokens):
                ntokens.append(self.src_vocab['<END>'])
            elif a > len(tokens):
                ntokens.append(self.src_vocab['<PAD>'])

        source = torch.from_numpy(np.asarray(ntokens)).long()

        # Target sentence processing
        tokens = self.targets[index]
        ntokens = [self.trg_vocab['<START>']]
        for a in range(self.maxlen):
            if a <= (len(tokens) - 1):
                if tokens[a] in self.trg_vocab.keys():
                    ntokens.append(self.trg_vocab[tokens[a]])
                else:
                    ntokens.append(self.trg_vocab['<UNK>'])
            elif a == len(tokens):
                ntokens.append(self.trg_vocab['<END>'])
            elif a > len(tokens):
                ntokens.append(self.trg_vocab['<PAD>'])

        target = torch.from_numpy(np.asarray(ntokens)).long()

        length = self.lengths[index]

        return [0], source, target, length

    def __len__(self):
        return len(self.sources)

I use the class to load the dataset as follows:

import ast
import os

from torch.utils.data import DataLoader


def load_text_train_data(train_dir, src_vocab, trg_vocab, lang_pair, batch_size):

    tpl = ast.literal_eval(lang_pair)
    slang = tpl[1]
    tlang = tpl[2]

    strain_file = os.path.join(train_dir, "train" + slang)
    ttrain_file = os.path.join(train_dir, "train" + tlang)

    data_iter = LoadUniModal(strain_file, ttrain_file, src_vocab, trg_vocab)
    data_iter = DataLoader(data_iter, batch_size=batch_size)

    return data_iter

When I try to load the data, I get a memory error.

How would it be possible to load the data without running into memory problems?

Thanks,

One approach I have heard good things about is to store the whole corpus as one huge tensor in a file and memory-map it. Then you only need to solve the indexing.
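A rough sketch of what I mean, purely for illustration (the file paths, dtype, and shapes below are made-up assumptions; you would write the padded token-id matrices to disk once in a separate preprocessing pass):

import numpy as np
import torch
from torch.utils.data import Dataset


class MemmapTranslationDataset(Dataset):
    # Assumes a preprocessing pass has already written each corpus as a
    # (num_rows, max_len) int32 matrix of padded token ids in a raw binary file.
    def __init__(self, src_path, trg_path, num_rows, src_len, trg_len):
        # np.memmap keeps the data on disk; rows are paged in lazily on access.
        self.src = np.memmap(src_path, dtype=np.int32, mode="r",
                             shape=(num_rows, src_len))
        self.trg = np.memmap(trg_path, dtype=np.int32, mode="r",
                             shape=(num_rows, trg_len))

    def __getitem__(self, index):
        # Copy only the requested row into RAM before handing it to torch.
        source = torch.from_numpy(np.array(self.src[index])).long()
        target = torch.from_numpy(np.array(self.trg[index])).long()
        return source, target

    def __len__(self):
        return self.src.shape[0]

That way only the rows touched by the current batch ever reach RAM, while the 8 GB files stay on disk.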

Another, perhaps even more obvious, choice would be to split the dataset into chunks, load the right chunk only when it is needed, and adapt the DataLoader's random sampler to draw from block-wise permutations; see the sketch below.
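A sketch of that sampler idea, again only illustrative (BlockShuffleSampler, dataset_size, and block_size are names I made up, not an existing API):

import random

from torch.utils.data import Sampler


class BlockShuffleSampler(Sampler):
    # Visits the dataset one contiguous block at a time: the block order is
    # random, and indices within each block are shuffled too, so disk reads
    # stay roughly sequential while training still sees shuffled data.
    def __init__(self, dataset_size, block_size):
        self.dataset_size = dataset_size
        self.block_size = block_size

    def __iter__(self):
        starts = list(range(0, self.dataset_size, self.block_size))
        random.shuffle(starts)                       # shuffle block order
        for start in starts:
            stop = min(start + self.block_size, self.dataset_size)
            indices = list(range(start, stop))
            random.shuffle(indices)                  # shuffle within the block
            yield from indices

    def __len__(self):
        return self.dataset_size

You would then pass it to the DataLoader via sampler=BlockShuffleSampler(len(dataset), block_size) instead of shuffle=True.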

I look forward to hearing about your experience.

Best regards

Thomas