Hello,
I am trying to load a sequence labelling dataset using torchtext but running into errors.
The ultimate goal is to train a LSTM and to sequence tagging (whether each word is an entity or not).
Parts of the code are given below. This is the error I get when I try to load up my dataset:
Traceback (most recent call last):
File "/Users/salman/dev/ents_rnn/data/test_torchtext_ents.py", line 19, in <module>
train, valid, test = LabellingDataset.splits(questions, entity_labels)
File "/Users/salman/dev/ents_rnn/data/dataset_ents.py", line 42, in splits
'labelling': ('labelling', label_field)}
File "/Users/salman/anaconda3/lib/python3.6/site-packages/torchtext-0.1.1-py3.6.egg/torchtext/data.py", line 345, in splits
File "/Users/salman/anaconda3/lib/python3.6/site-packages/torchtext-0.1.1-py3.6.egg/torchtext/data.py", line 420, in __init__
File "/Users/salman/anaconda3/lib/python3.6/site-packages/torchtext-0.1.1-py3.6.egg/torchtext/data.py", line 420, in <listcomp>
File "/Users/salman/anaconda3/lib/python3.6/site-packages/torchtext-0.1.1-py3.6.egg/torchtext/data.py", line 251, in fromJSON
File "/Users/salman/anaconda3/lib/python3.6/json/__init__.py", line 354, in loads
return _default_decoder.decode(s)
File "/Users/salman/anaconda3/lib/python3.6/json/decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/Users/salman/anaconda3/lib/python3.6/json/decoder.py", line 355, in raw_decode
obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 39 (char 38)
Process finished with exit code 1
Here is the sample dataset:
{"question": "The dog ate the apple", labelling: "NOT ENT NOT NOT ENT"}
{"question": "Everybody read that book", labelling: "NOT NOT NOT ENT"}
{"question": "John lives there", labelling: "ENT NOT NOT"}
This is my dataset class:
# most basic tokenizer - split on whitespace
def my_tokenizer():
return lambda text: [tok for tok in text.split()]
class LabellingDataset(data.ZipDataset, data.TabularDataset):
@staticmethod
def sort_key(ex):
return len(ex.question)
@classmethod
def splits(cls, text_field, label_field, root='.',
train='train.jsonl', validation='valid.jsonl', test='test.jsonl'):
# path = some path
prefix_fname = 'sequence_labelled_entities_'
return super(SimpleQaEntityDataset, cls).splits(
os.path.join(path, prefix_fname), train, validation, test,
format='JSON', fields={'question': ('question', text_field),
'labelling': ('labelling', label_field)}
)
@classmethod
def iters(cls, batch_size=32, device=0, root='.', wv_dir='.',
wv_type=None, wv_dim='300d', **kwargs):
TEXT = data.Field(sequential=True, tokenize=my_tokenizer())
LABEL = data.Field(sequential=True, tokenize=my_tokenizer())
train, val, test = cls.splits(TEXT, LABEL, root=root, **kwargs)
TEXT.build_vocab(train, wv_dir=wv_dir, wv_type=wv_type, wv_dim=wv_dim)
LABEL.build_vocab(train)
return data.BucketIterator.splits(
(train, val, test), batch_size=batch_size, device=device)
And this is how I am calling my code:
data_cache = "data_cache"
vector_cache = "vector_cache/ents_input_vectors.pt"
word_vectors = "glove.6B"
d_embed = 50
batch_size = 2
epochs = 2
gpu_device = -1
questions = data.Field(lower=True)
entity_labels = data.Field(sequential=True)
train, valid, test = LabellingDataset.splits(questions, entity_labels)
# build vocab for questions
questions.build_vocab(train, valid, test)
# load word vectors if already saved or else load it from start and save it
if os.path.isfile(vector_cache):
questions.vocab.vectors = torch.load(vector_cache)
else:
questions.vocab.load_vectors(wv_dir=data_cache, wv_type=word_vectors, wv_dim=d_embed)
os.makedirs(os.path.dirname(vector_cache), exist_ok=True)
torch.save(questions.vocab.vectors, vector_cache)
# build vocab for relations
entity_labels.build_vocab(train, valid, test)
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
(train, valid, test), batch_size=batch_size, device=gpu_device)
train_iter.repeat = False
print("train iter")
for epoch in range(epochs):
train_iter.init_epoch()
for batch_idx, batch in enumerate(train_iter):
print(batch.batch_size)
print("-" * 50)