I’m pretty new to torchtext, and I’ve recently dug through a lot of NLP tutorials for it, although most of them use APIs that have now been moved to torchtext.legacy. The current PyTorch docs use functional methods and don’t have a standard, conventional way of implementing a data pipeline the way torchvision does. What I currently do is extend torch.utils.data.Dataset, like below:
class Multi30kDe2En(Dataset):
    """German->English Multi30k translation dataset.

    Downloads and extracts the raw Multi30k text files, builds one
    torchtext vocab per language, and yields (de_tensor, en_tensor)
    pairs of token indices, one sentence pair per item.
    """

    # Reserved indices for the special symbols below (order matters:
    # it must match SPECIAL_SYMBOLS, which build_vocab_from_iterator
    # inserts first).
    UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
    SPECIAL_SYMBOLS = ['<unk>', '<pad>', '<bos>', '<eos>']

    def __init__(self, base_url, urls):
        """
        Args:
            base_url: URL prefix that the archive names in ``urls`` are
                appended to.
            urls: pair of archive names, (german, english) — e.g.
                ``('train.de.gz', 'train.en.gz')``.
        """
        super().__init__()
        self.base_url = base_url
        self.urls = urls
        # Paths to the extracted text files (e.g. train.de, train.en),
        # one sentence per line.
        self.paths = [extract_archive(download_from_url(self.base_url + url))[0]
                      for url in self.urls]
        # spaCy-backed tokenizers (the spaCy models must be installed).
        self.de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
        self.en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
        self.de_vocab, self.en_vocab = self._load_vocabs()
        # All sentences, one per item. Context managers close the file
        # handles promptly — the original ``list(io.open(...))`` leaked them.
        with io.open(self.paths[0], encoding='utf8') as f:
            self.de_texts = f.readlines()
        with io.open(self.paths[1], encoding='utf8') as f:
            self.en_texts = f.readlines()

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.en_texts)

    def __getitem__(self, index):
        """Return (german, english) 1-D ``long`` tensors of vocab indices."""
        # rstrip('\n') keeps the trailing newline from being tokenized
        # into a spurious token that would map to <unk>.
        de_text = self.de_texts[index].rstrip('\n')
        en_text = self.en_texts[index].rstrip('\n')
        de_tensor = torch.tensor([self.de_vocab[token] for token in self.de_tokenizer(de_text)],
                                 dtype=torch.long)
        en_tensor = torch.tensor([self.en_vocab[token] for token in self.en_tokenizer(en_text)],
                                 dtype=torch.long)
        return de_tensor, en_tensor

    def _load_vocabs(self):
        """Build and return ``(de_vocab, en_vocab)`` from the corpora."""

        def yield_tokens(filepath, tokenizer):
            # Stream line by line so the whole corpus is never held in
            # memory; the ``with`` block closes the file when exhausted.
            with io.open(filepath, encoding='utf8') as f:
                for text in f:
                    yield tokenizer(text.rstrip('\n'))

        de_vocab = build_vocab_from_iterator(yield_tokens(self.paths[0], self.de_tokenizer),
                                             specials=self.SPECIAL_SYMBOLS)
        en_vocab = build_vocab_from_iterator(yield_tokens(self.paths[1], self.en_tokenizer),
                                             specials=self.SPECIAL_SYMBOLS)
        # Out-of-vocabulary tokens map to <unk> instead of raising.
        de_vocab.set_default_index(self.UNK_IDX)
        en_vocab.set_default_index(self.UNK_IDX)
        return de_vocab, en_vocab

    @classmethod
    def collate_fn(cls, batch):
        """Collate (de, en) tensor pairs into padded (batch, time) batches.

        Wraps each sequence in <bos>/<eos> markers, then pads with
        <pad> up to the longest sequence in the batch.
        """
        # Constant marker tensors hoisted out of the loop.
        bos = torch.tensor([cls.BOS_IDX])
        eos = torch.tensor([cls.EOS_IDX])
        de_batch, en_batch = [], []
        for de, en in batch:
            de_batch.append(torch.cat([bos, de, eos], dim=0))
            en_batch.append(torch.cat([bos, en, eos], dim=0))
        # batch_first=True replaces the original
        # ``pad_sequence(...).permute(1, 0)`` and yields a contiguous
        # (batch, time) tensor directly.
        de_batch = pad_sequence(de_batch, padding_value=cls.PAD_IDX, batch_first=True)
        en_batch = pad_sequence(en_batch, padding_value=cls.PAD_IDX, batch_first=True)
        return de_batch, en_batch
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    # Smoke test: download the training split, build the dataset, and
    # print the first collated batch of each language.
    base_url = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
    urls = ('train.de.gz', 'train.en.gz')

    dataset = Multi30kDe2En(base_url, urls)
    loader = DataLoader(dataset, batch_size=1, collate_fn=Multi30kDe2En.collate_fn)
    first_de, first_en = next(iter(loader))
    print(f'German Tensor Batch : {first_de}')
    print(f'English Tensor Batch: {first_en}')
I’d appreciate any suggestions on alternative approaches. Thanks!