From the documentation and the various tutorials I have seen, a torchtext.data.TabularDataset is created from a CSV, TSV, or JSON file. I have a list of dictionaries of the form:
[{'text': "Anything of the type", 'label': 0}, {second sample}, {third sample}]
I need to create a custom tabular dataset for a text classification problem. Can someone explain how this can be done? Also, on Stack Exchange, someone has suggested the solution shown below.
Also, say I have three lists right now, corresponding to train_dict, test_dict and val_dict.
class TabularDataset_From_List(data.Dataset):
    """Dataset built from an in-memory list of records instead of a file.

    Each item of the input list is converted to an ``Example`` by the
    constructor selected through ``format``; the resulting examples and the
    flattened field list are then handed to ``data.Dataset``.
    """

    def __init__(self, input_list, format, fields, skip_header=False, **kwargs):
        """Create the dataset from ``input_list``.

        Args:
            input_list: Iterable of records, one per example. The expected
                record type depends on ``format`` (presumably dicts for
                ``'dict'`` and raw strings for the other formats; confirm
                against the installed ``Example`` constructors).
            format: One of ``'json'``, ``'dict'``, ``'tsv'`` or ``'csv'``
                (case-insensitive).
            fields: Field specification passed to the ``Example``
                constructor. For ``'json'`` and ``'dict'`` it must be a
                mapping whose values are a field or a list of fields.
            skip_header: If true, the first item of ``input_list`` is
                discarded before examples are built.
            **kwargs: Forwarded unchanged to ``data.Dataset.__init__``.

        Raises:
            KeyError: If ``format`` is not one of the supported names.
            AttributeError: If ``Example`` lacks the constructor for the
                requested format.
        """
        fmt = format.lower()
        # Constructors are looked up by name, only for the requested format,
        # so a constructor missing from ``Example`` cannot break the others.
        constructor_names = {
            'json': 'fromJSON', 'dict': 'fromdict',
            'tsv': 'fromTSV', 'csv': 'fromCSV'}
        make_example = getattr(Example, constructor_names[fmt])

        # Work on an iterator so any iterable is accepted and the header can
        # be dropped without copying the input.
        rows = iter(input_list)
        if skip_header:
            next(rows, None)
        examples = [make_example(item, fields) for item in rows]

        if fmt in ('json', 'dict'):
            # For these formats ``fields`` is a mapping; the base class is
            # given a flat list, so flatten the mapping's values.
            fields, field_dict = [], fields
            for field in field_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)
        super(TabularDataset_From_List, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, path=None, root='.data', train=None, validation=None,
               test=None, **kwargs):
        """Build one dataset per supplied split.

        Args:
            path: Unused; kept so existing callers keep working. The data is
                already in memory, so nothing is read from disk.
            root: Unused; kept for the same reason as ``path``.
            train: Record list for the training split, or ``None``.
            validation: Record list for the validation split, or ``None``.
            test: Record list for the test split, or ``None``.
            **kwargs: Forwarded to the class constructor for every split;
                must therefore supply ``format`` and ``fields``.

        Returns:
            A tuple with the datasets for the splits that were given, in the
            order train, validation, test. Splits passed as ``None`` are
            omitted.
        """
        # No download step: the splits are in-memory lists, so there is
        # nothing to fetch and ``path`` / ``root`` are never consulted.
        train_data = None if train is None else cls(train, **kwargs)
        val_data = None if validation is None else cls(validation, **kwargs)
        test_data = None if test is None else cls(test, **kwargs)
        return tuple(d for d in (train_data, val_data, test_data)
                     if d is not None)
Can someone also explain exactly what is happening in the two functions, assuming the solution from the Stack Exchange post is correct?