I’m trying to create a data loader for my NER data as following (Assumption1: we have two files one for sequences and another one for tag sequences. Assumtion2: Word_to_idx and Tags_to_idx are ready). Everything seems correct, but dataloader doesn’t work with batch size greater than 1.
class NERdata(Dataset):
__xs = []
__ys = []
def __init__(self,DataFolder,Words_to_idx, Tags_to_idx):
f = open(DataFolder+'/eng.train_seq','r')
ls = f.readlines()
f.close()
f = open(DataFolder+'/eng.train_labels','r')
ll = f.readlines()
f.close()
i = 0
while(i<len(ll)):
features = []
tags = []
a = ls[i].strip().split()
b = ll[i].strip().split()
j = 0
while(j<len(a)):
if a[j] in Words_to_idx:
features.append(Words_to_idx[a[j].lower()])
else:
features.append(Words_to_idx['UNK'])
tags.append(Tags_to_idx[b[j]])
j += 1
self.__xs.append(features)
self.__ys.append(tags)
i += 1
def __getitem__(self,index):
Data = self.__xs[index]
Tags = self.__ys[index]
return (torch.tensor(Data),torch.tensor(Tags))
def __len__(self):
return len(self.__xs)
When I am trying to use dataloader with above defined class like:
dataset = NERdata('data',Words_to_idx,Tags_to_idx)
Data = DataLoader(dataset=dataset,batch_size=2,shuffle=True,num_workers=2)
print(type(Data))
for batch_idx, (data, target) in enumerate(Data):
print("Data: ", batch_idx, data, target)
I am getting the following runtime error:
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 30 and 35 in dimension 1 at /pytorch/aten/src/TH/generic/THTensorMoreMath.cpp:1333
any idea about the causes of the problem?