I want to preprocess text data — for example, converting each word to an index and adding padding (for seq2seq learning). Is the approach below a good way to handle this?
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that loads and preprocesses one file per sample.

    Each index corresponds to one entry of ``data_dir``; the file is loaded
    and preprocessed lazily in ``__getitem__`` rather than up front.
    """

    def __init__(self, data_dir='data_dir'):
        """Collect the entries of *data_dir* in sorted order.

        Args:
            data_dir: Directory whose entries become the samples. Defaults
                to ``'data_dir'``.

        Raises:
            OSError: If *data_dir* cannot be listed (e.g. it does not exist).
        """
        super().__init__()
        self.data_dir = data_dir
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # idx -> file mapping stable between runs.
        # NOTE(review): entries are bare names, not joined with data_dir --
        # presumably load_file resolves them; confirm against its definition.
        self.data_files = sorted(os.listdir(data_dir))

    def __getitem__(self, idx):
        """Load the file at position *idx* and return it preprocessed.

        ``load_file`` and ``preprocess_data`` are defined outside this class;
        whatever ``preprocess_data`` returns is returned unchanged.
        """
        data = load_file(self.data_files[idx])
        data = preprocess_data(data)  # preprocess
        return data

    def __len__(self):
        """Return the number of files found in the data directory."""
        return len(self.data_files)
# Build the dataset and wrap it in a DataLoader with 8 worker processes.
# With num_workers > 0 the dataset's __getitem__ (including its preprocessing
# step) is executed inside the worker processes.
dset = MyDataset()
loader = torch.utils.data.DataLoader(dset, num_workers=8)