I am currently trying to tokenize large text files, but I have a lot of files in the directory that I want to tokenize, and doing this one by one is very time consuming.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import os
from nltk.tokenize import word_tokenize

tokenizer = AutoTokenizer.from_pretrained("joeddav/distilbert-base-uncased-go-emotions-student")
model = AutoModelForSequenceClassification.from_pretrained("joeddav/distilbert-base-uncased-go-emotions-student")

txt = "…"
words_input_dir = "/content/sample_data/"

for filename in os.listdir(words_input_dir):
    if filename.endswith(".txt"):
        # join the directory and the filename, otherwise open() looks in the working directory
        with open(os.path.join(words_input_dir, filename), "r") as input_file:
            text = input_file.read()  # read once; a second .read() on the same handle returns an empty string
        input_tokens = word_tokenize(text)  # plain word-level tokens (NLTK); not used by the model below
        tokens = tokenizer.encode_plus(text, add_special_tokens=False, return_tensors="pt")
        print(filename, tokens["input_ids"].shape[1])  # number of subword tokens in this file

tokens  # notebook display of the encoding for the last file processed
Before I added the loop, the original read:
tokens = tokenizer.encode_plus(txt, add_special_tokens=False, return_tensors="pt")
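In case it helps, from the docs it looks like calling the tokenizer object directly is the newer equivalent of encode_plus (which I believe is deprecated in recent transformers versions). Here is a minimal per-file sketch of what I mean; the file path is made up and the truncation/max_length part is my own assumption, added because the model only takes up to 512 tokens:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("joeddav/distilbert-base-uncased-go-emotions-student")

# hypothetical single file, just to illustrate the per-file call
with open("/content/sample_data/example.txt", "r") as input_file:
    text = input_file.read()

# direct call does the same thing as encode_plus; truncation/max_length are my assumption,
# since DistilBERT models accept at most 512 tokens per input
tokens = tokenizer(text, add_special_tokens=False, truncation=True, max_length=512, return_tensors="pt")
print(tokens["input_ids"].shape)  # (1, number_of_tokens)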
TYIA.