Error when creating a DataLoader, even though the Dataset itself is working properly!
Command History and Logs
- train_dataset = QNA_Dataset(train_encodings)
- train_dataset.getitem(5)
STDOUT ::
{'input_ids': tensor([ 0, 1620, 23, 144, 97, 6630, 6, 10579, 9038, 18,
521, 422, 10, 346, 9, 340, 433, 6639, 4, 20,
1117, 1294, 12, 2962, 6639, 680, 130, 9911, 6, 258,
10, 3188, 8, 2384, 1992, 6, 8, 484, 15829, 8,
28059, 4, 1456, 8215, 25, 10, 65, 12, 8596, 8812,
11, 772, 504, 5067, 6, 5, 1811, 1168, 11599, 4320,
16, 1167, 2330, 3708, 8, 1449, 7, 28, 5, 7763,
11152, 25161, 5362, 11, 5, 315, 532, 4, 20, 97,
4320, 6, 20, 45011, 1371, 6, 16, 703, 2330, 10,
76, 8, 7235, 15, 1294, 13144, 8, 14129, 4, 20,
25336, 76, 6298, 16, 1027, 6333, 4, 20, 9911, 33,
15958, 5362, 3168, 6, 19, 20, 15815, 1027, 1230, 8,
4412, 2207, 2737, 8, 97, 340, 6, 8, 29796, 30,
521, 31, 258, 10579, 9038, 8, 6130, 2708, 18, 1821,
4, 8280, 1811, 1168, 11599, 8, 20, 25336, 6, 20,
15815, 16, 41, 2222, 5362, 8, 473, 45, 33, 10,
7998, 11220, 50, 143, 8161, 9233, 31, 5, 589, 4,
96, 11735, 6, 77, 103, 521, 2047, 14, 20, 15815,
880, 7, 311, 10, 3354, 9415, 6, 10, 6176, 2924,
6, 9732, 19484, 21, 1027, 4, 21371, 6, 11, 4999,
6, 77, 97, 521, 2047, 14, 5, 2225, 969, 10,
6176, 9415, 6, 5, 3354, 2225, 3445, 18448, 439, 88,
931, 4, 9081, 2225, 16, 1027, 25, 747, 25, 20,
15815, 131, 959, 6, 70, 130, 32, 7664, 7, 2,
2, 1779, 222, 5, 1811, 1168, 11599, 10202, 9, 10579,
385, 4344, 1642, 10467, 116, 2]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
'start_positions': tensor(51),
'end_positions': tensor(54)}
- train_loader = DataLoader(train_dataset,shuffle=True, batch_size=8)
STDOUT
----> 1 train_loader = DataLoader(train_dataset,shuffle=True, batch_size=8)
2 val_loader = DataLoader(val_dataset, shuffle=True, batch_size=8)
2 frames
/usr/local/lib/python3.8/dist-packages/torch/utils/data/sampler.py in num_samples(self)
112 # dataset size might change at runtime
113 if self._num_samples is None:
--> 114 return len(self.data_source)
115 return self._num_samples
116
TypeError: 'list' object cannot be interpreted as an integer
P.S.: I modified the encodings produced by RobertaTokenizerFast by adding start_positions and end_positions so that they can be used with its pretrained model:
RobertaForQuestionAnswering.from_pretrained('roberta-base')
Snippet of code for tokenisation and modifying the encodings generated
# Tokenize the (context, question) pairs, truncating at max_length tokens.
train_encodings = tokenizer(
    train_df['context'],
    train_df['question'],
    truncation=True,
    padding=True,
    max_length=10,
)
#adding token positions
def update_token_positions(encodings, dataframe):
    """Add answer-span token positions to a tokenizer BatchEncoding in place.

    For each row of ``dataframe``, converts the character offsets in the
    ``answer_start`` / ``answer_end`` columns into token indices via
    ``encodings.char_to_token`` and stores them under the keys
    ``start_positions`` / ``end_positions`` (the names expected by
    RobertaForQuestionAnswering).

    Args:
        encodings: BatchEncoding-like object exposing ``char_to_token(i, pos)``
            and ``update(mapping)``.
        dataframe: table with integer columns ``answer_start`` and
            ``answer_end`` (character offsets into the context), one row per
            encoded example.  NOTE(review): assumes ``answer_end`` may point
            one past (or inside whitespace after) the last answer character —
            confirm against how the dataframe was built.
    """
    start_positions = []
    end_positions = []
    for i in range(len(dataframe)):
        start = encodings.char_to_token(i, dataframe['answer_start'][i])
        end = encodings.char_to_token(i, dataframe['answer_end'][i])

        # If the start position is None, the answer passage was truncated
        # away; point at the (out-of-range) sentinel used by the original code.
        if start is None:
            start = tokenizer.model_max_length

        # If the end character maps to no token (e.g. it points just past the
        # answer, or at whitespace), walk back one character at a time.
        # Unlike an unbounded `while`, this cannot loop forever or probe
        # negative character indices when the answer was truncated entirely.
        if end is None:
            answer_end = dataframe['answer_end'][i]
            for offset in range(1, answer_end + 1):
                end = encodings.char_to_token(i, answer_end - offset)
                if end is not None:
                    break
            if end is None:
                # Whole answer truncated: fall back to the same sentinel.
                end = tokenizer.model_max_length

        start_positions.append(start)
        end_positions.append(end)

    # Expose the new fields alongside input_ids / attention_mask so the
    # Dataset's __getitem__ picks them up.
    encodings.update({'start_positions': start_positions,
                      'end_positions': end_positions})