Preparing dataset using Torchtext for Hierarchical Attention Network

Hi all,

I am going to use a dataset from the Toxic Comment Classification Challenge, which contains comments as the data and levels of toxicity as the labels. I plan to use the HAN architecture, but my problem arises when I prepare the dataset for this architecture with the torchtext data loader. I grabbed the code below from another post on this forum, and when I run it on my data, TabularDataset.splits takes forever to finish. Do you have any advice on how I can prepare the data for my project?

Thanks,

import spacy

from spacy.tokenizer import Tokenizer
from torchtext.data import Field, NestedField, get_tokenizer

# Load the spaCy pipeline ONCE at module import time.  The original code
# called spacy.load('en_core_web_sm') inside tokenizer1, i.e. once per
# comment: model loading is by far the most expensive step, which is why
# TabularDataset.splits appeared to run forever.
_nlp = spacy.load('en_core_web_sm')


def tokenizer1(text):
    """Split *text* into a list of sentence strings using spaCy.

    Used as the outer (sentence-level) tokenizer of the NestedField: each
    comment becomes a list of sentences, and the inner Field then tokenizes
    every sentence into words.

    Note: the original function also contained unreachable code after the
    return (a second spacy.load and an unused Tokenizer(nlp.vocab)); that
    dead code has been removed.
    """
    doc = _nlp(text)
    return [sent.text for sent in doc.sents]

# Inner (word-level) field: tokenizes each sentence with torchtext's
# built-in basic_english tokenizer; lower=False keeps the original casing.
TEXT = Field(sequential=True,tokenize=get_tokenizer("basic_english"), lower=False)

# Label field: the toxicity label is already numeric, so no vocab is built
# and the value is used as-is.
LABEL = Field(sequential=False, use_vocab=False)

# Hierarchical field for the HAN architecture: tokenize=tokenizer1 splits a
# comment into sentences (outer level), and each sentence is then tokenized
# into words by TEXT (inner level).  include_lengths=True makes batching
# also return per-example lengths — presumably for packing/attention masks
# in the model; confirm against the model code.
nesting_field = NestedField(nesting_field = TEXT,include_lengths = True, tokenize=tokenizer1)


from torchtext.data import TabularDataset

# Root directory that contains train1.csv.
r1 = "data dir"

# Positional column -> Field mapping for the CSV.  ('id', None) drops the
# id column entirely.  NOTE(review): this mapping is positional, so it
# assumes the third column of train1.csv is severe_toxic — verify against
# the file header, since the original Kaggle CSV has six label columns
# (toxic, severe_toxic, obscene, ...) and any extra trailing columns are
# silently ignored.
tv_datafields = [('id',None),('comment_text', nesting_field),('severe_toxic', LABEL)]

# TabularDataset.splits returns one dataset per split keyword supplied;
# only train= is given here, so the result is a 1-tuple and [0] extracts
# the single Dataset.
data11 = TabularDataset.splits(

           path=r1, # the root directory where the data lies

           train='train1.csv',

           format='csv',

           skip_header = True,# ADDING THIS PART AND ZERO INDEX

           fields=tv_datafields)[0]