Torchtext - loading a sequence labelling dataset

Hello,

I am trying to load a sequence labelling dataset using torchtext but am running into errors.
The ultimate goal is to train an LSTM to do sequence tagging (labelling whether each word is an entity or not).

Parts of the code are given below. This is the error I get when I try to load up my dataset:

Traceback (most recent call last):
  File "/Users/salman/dev/ents_rnn/data/test_torchtext_ents.py", line 19, in <module>
    train, valid, test = LabellingDataset.splits(questions, entity_labels)
  File "/Users/salman/dev/ents_rnn/data/dataset_ents.py", line 42, in splits
    'labelling': ('labelling', label_field)}
  File "/Users/salman/anaconda3/lib/python3.6/site-packages/torchtext-0.1.1-py3.6.egg/torchtext/data.py", line 345, in splits
  File "/Users/salman/anaconda3/lib/python3.6/site-packages/torchtext-0.1.1-py3.6.egg/torchtext/data.py", line 420, in __init__
  File "/Users/salman/anaconda3/lib/python3.6/site-packages/torchtext-0.1.1-py3.6.egg/torchtext/data.py", line 420, in <listcomp>
  File "/Users/salman/anaconda3/lib/python3.6/site-packages/torchtext-0.1.1-py3.6.egg/torchtext/data.py", line 251, in fromJSON
  File "/Users/salman/anaconda3/lib/python3.6/json/__init__.py", line 354, in loads
    return _default_decoder.decode(s)
  File "/Users/salman/anaconda3/lib/python3.6/json/decoder.py", line 339, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/Users/salman/anaconda3/lib/python3.6/json/decoder.py", line 355, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 39 (char 38)

Process finished with exit code 1

Here is the sample dataset:

{"question": "The dog ate the apple", labelling: "NOT ENT NOT NOT ENT"}
{"question": "Everybody read that book", labelling: "NOT NOT NOT ENT"}
{"question": "John lives there", labelling: "ENT NOT NOT"}

This is my dataset class:

import os

from torchtext import data


# most basic tokenizer - split on whitespace
def my_tokenizer():
    return lambda text: text.split()

class LabellingDataset(data.ZipDataset, data.TabularDataset):

    @staticmethod
    def sort_key(ex):
        return len(ex.question)

    @classmethod
    def splits(cls, text_field, label_field, root='.',
                train='train.jsonl', validation='valid.jsonl', test='test.jsonl'):
        # path = some path
        prefix_fname = 'sequence_labelled_entities_'
        return super(LabellingDataset, cls).splits(
                    os.path.join(path, prefix_fname), train, validation, test,
                    format='JSON', fields={'question': ('question', text_field),
                                           'labelling': ('labelling', label_field)}
                )

    @classmethod
    def iters(cls, batch_size=32, device=0, root='.', wv_dir='.',
              wv_type=None, wv_dim='300d', **kwargs):
        TEXT = data.Field(sequential=True, tokenize=my_tokenizer())
        LABEL = data.Field(sequential=True, tokenize=my_tokenizer())

        train, val, test = cls.splits(TEXT, LABEL, root=root, **kwargs)

        TEXT.build_vocab(train, wv_dir=wv_dir, wv_type=wv_type, wv_dim=wv_dim)
        LABEL.build_vocab(train)

        return data.BucketIterator.splits(
            (train, val, test), batch_size=batch_size, device=device)

And this is how I am calling my code:

data_cache = "data_cache"
vector_cache = "vector_cache/ents_input_vectors.pt"
word_vectors = "glove.6B"
d_embed = 50
batch_size = 2
epochs = 2
gpu_device = -1

questions = data.Field(lower=True)
entity_labels = data.Field(sequential=True)

train, valid, test = LabellingDataset.splits(questions, entity_labels)

# build vocab for questions
questions.build_vocab(train, valid, test)

# load word vectors if already saved or else load it from start and save it
if os.path.isfile(vector_cache):
    questions.vocab.vectors = torch.load(vector_cache)
else:
    questions.vocab.load_vectors(wv_dir=data_cache, wv_type=word_vectors, wv_dim=d_embed)
    os.makedirs(os.path.dirname(vector_cache), exist_ok=True)
    torch.save(questions.vocab.vectors, vector_cache)

# build vocab for relations
entity_labels.build_vocab(train, valid, test)

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
            (train, valid, test), batch_size=batch_size, device=gpu_device)
train_iter.repeat = False

print("train iter")
for epoch in range(epochs):
    train_iter.init_epoch()
    for batch_idx, batch in enumerate(train_iter):
        print(batch.batch_size)
    print("-" * 50)

It looks like your code is fine, but the dataset isn't quite JSON. JSON requires every string, including object keys, to be wrapped in double quotes; in your example "question" is quoted but labelling is not. The error even points at the problem: line 1 column 39 is exactly where the unquoted labelling key begins in your first line. You can either write your own extension of TabularDataset that tolerates this not-quite-JSON, or write a script to fix the data file separately.
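For the second option, here is a minimal sketch of such a clean-up script. The file names are placeholders, and the regex only quotes bare keys that directly follow { or , (enough for the labelling key in your sample); since that heuristic could misfire on trickier data, each repaired line is run through json.loads before being written out.

import json
import re

# Placeholder file names -- substitute your actual train/valid/test files.
src = "train.jsonl"
dst = "train_fixed.jsonl"

# Quote any bare key that directly follows '{' or ','. This works for
# keys like `labelling` above, but is a heuristic, not a JSON parser.
bare_key = re.compile(r'([{,]\s*)(\w+)(\s*:)')

with open(src) as fin, open(dst, "w") as fout:
    for line in fin:
        if not line.strip():
            continue  # skip blank lines
        fixed = bare_key.sub(r'\1"\2"\3', line)
        json.loads(fixed)  # raises JSONDecodeError if the line is still invalid
        fout.write(fixed)

After regenerating your train/valid/test files this way, the LabellingDataset.splits call should get past the fromJSON step.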
