Hi all,
I am working with the dataset from the Toxic Comment Classification Challenge, which contains comments as the data and toxicity levels as the labels. My plan is to use the HAN (Hierarchical Attention Network) architecture, but I run into a problem when preparing the dataset for it with the torchtext data loader. I adapted the code below from another thread on this forum, and when I run it on my data, TabularDataset.splits takes forever to finish. Is there any advice you can give me on preparing the data for this project?
Thanks! Here is the code I am running:
import spacy
from torchtext.data import Field, NestedField, TabularDataset, get_tokenizer

# load the spaCy model once at module level and reuse it for sentence
# splitting (loading it inside the tokenizer function would reload it per call)
nlp = spacy.load('en_core_web_sm')

def tokenizer1(text):
    # split a comment into sentences; NestedField then passes each
    # sentence on to TEXT's word-level tokenizer
    doc = nlp(text)
    return [sent.text for sent in doc.sents]

TEXT = Field(sequential=True, tokenize=get_tokenizer("basic_english"), lower=False)
LABEL = Field(sequential=False, use_vocab=False)
nesting_field = NestedField(nesting_field=TEXT, include_lengths=True, tokenize=tokenizer1)
r1 = "data dir"  # placeholder for the directory where the data lives
tv_datafields = [('id', None),                      # ignore the id column
                 ('comment_text', nesting_field),
                 ('severe_toxic', LABEL)]

data11 = TabularDataset.splits(
    path=r1,
    train='train1.csv',
    format='csv',
    skip_header=True,           # train1.csv has a header row
    fields=tv_datafields)[0]    # splits returns a tuple; take the train set
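
For context, after loading the dataset I was planning to build the vocabulary and batches roughly like this (only a sketch; train_iter is a name I made up, and I am not certain the sort_key is the right choice for a NestedField):

from torchtext.data import BucketIterator

# NestedField builds a shared word vocabulary and propagates it to the inner TEXT field
nesting_field.build_vocab(data11)

train_iter = BucketIterator(
    data11,
    batch_size=32,
    sort_key=lambda ex: len(ex.comment_text),  # group comments with similar sentence counts
)

batch = next(iter(train_iter))
# with include_lengths=True, batch.comment_text should be a tuple:
# (padded word ids, per-comment sentence lengths, per-sentence word lengths)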
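
One thing I suspect is that running the full spaCy pipeline (tagger, parser, NER) on every comment is expensive when all I need is sentence boundaries. Here is a lighter sentence splitter I am considering (again just a sketch; nlp_fast and fast_sentence_tokenizer are names I made up, and the add_pipe('sentencizer') call is the spaCy 3 API):

import spacy

# a blank English pipeline with only the rule-based sentencizer, so no
# tagger/parser/NER runs on each comment (spaCy 3 API; on spaCy 2 this
# would be nlp_fast.add_pipe(nlp_fast.create_pipe('sentencizer')))
nlp_fast = spacy.blank('en')
nlp_fast.add_pipe('sentencizer')

def fast_sentence_tokenizer(text):
    # return the list of sentence strings for one comment
    return [sent.text for sent in nlp_fast(text).sents]

# then pass it to the nested field:
# nesting_field = NestedField(nesting_field=TEXT, include_lengths=True,
#                             tokenize=fast_sentence_tokenizer)

Would swapping this in for tokenizer1 be the right way to speed things up, or is the bottleneck somewhere else?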