DataLoader error after Dataset is created successfully!

I get an error when creating a DataLoader, even though the Dataset itself works properly!

Command History and Logs

train_dataset = QNA_Dataset(train_encodings)
train_dataset[5]

STDOUT:
{'input_ids': tensor([    0,  1620,    23,   144,    97,  6630,     6, 10579,  9038,    18,
           521,   422,    10,   346,     9,   340,   433,  6639,     4,    20,
          1117,  1294,    12,  2962,  6639,   680,   130,  9911,     6,   258,
            10,  3188,     8,  2384,  1992,     6,     8,   484, 15829,     8,
         28059,     4,  1456,  8215,    25,    10,    65,    12,  8596,  8812,
            11,   772,   504,  5067,     6,     5,  1811,  1168, 11599,  4320,
            16,  1167,  2330,  3708,     8,  1449,     7,    28,     5,  7763,
         11152, 25161,  5362,    11,     5,   315,   532,     4,    20,    97,
          4320,     6,    20, 45011,  1371,     6,    16,   703,  2330,    10,
            76,     8,  7235,    15,  1294, 13144,     8, 14129,     4,    20,
         25336,    76,  6298,    16,  1027,  6333,     4,    20,  9911,    33,
         15958,  5362,  3168,     6,    19,    20, 15815,  1027,  1230,     8,
          4412,  2207,  2737,     8,    97,   340,     6,     8, 29796,    30,
           521,    31,   258, 10579,  9038,     8,  6130,  2708,    18,  1821,
             4,  8280,  1811,  1168, 11599,     8,    20, 25336,     6,    20,
         15815,    16,    41,  2222,  5362,     8,   473,    45,    33,    10,
          7998, 11220,    50,   143,  8161,  9233,    31,     5,   589,     4,
            96, 11735,     6,    77,   103,   521,  2047,    14,    20, 15815,
           880,     7,   311,    10,  3354,  9415,     6,    10,  6176,  2924,
             6,  9732, 19484,    21,  1027,     4, 21371,     6,    11,  4999,
             6,    77,    97,   521,  2047,    14,     5,  2225,   969,    10,
          6176,  9415,     6,     5,  3354,  2225,  3445, 18448,   439,    88,
           931,     4,  9081,  2225,    16,  1027,    25,   747,    25,    20,
         15815,   131,   959,     6,    70,   130,    32,  7664,     7,     2,
             2,  1779,   222,     5,  1811,  1168, 11599, 10202,     9, 10579,
           385,  4344,  1642, 10467,   116,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'start_positions': tensor(51),
 'end_positions': tensor(54)}
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

Traceback:

----> 1 train_loader = DataLoader(train_dataset,shuffle=True, batch_size=8)
      2 val_loader = DataLoader(val_dataset, shuffle=True, batch_size=8)

2 frames
/usr/local/lib/python3.8/dist-packages/torch/utils/data/sampler.py in num_samples(self)
    112         # dataset size might change at runtime
    113         if self._num_samples is None:
--> 114             return len(self.data_source)
    115         return self._num_samples
    116 

TypeError: 'list' object cannot be interpreted as an integer
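
For context on the traceback: with shuffle=True the DataLoader builds a RandomSampler, and, as the frame above shows, its num_samples property calls len(self.data_source), i.e. the Dataset's __len__. So this TypeError means __len__ handed back a list rather than an integer. A minimal sketch with a hypothetical class (not my actual one) that reproduces the same error:

import torch
from torch.utils.data import Dataset, DataLoader

class BadDataset(Dataset):
    # hypothetical illustration of the failure mode
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return self.encodings['input_ids']  # bug: returns the list, not its length

loader = DataLoader(BadDataset({'input_ids': [[0, 1], [2, 3]]}), shuffle=True, batch_size=8)
# TypeError: 'list' object cannot be interpreted as an integer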

P.S.: I modified the encodings produced by the fast Roberta tokenizer (RobertaTokenizerFast) by adding start_positions and end_positions, so that they can be used with the pretrained model
RobertaForQuestionAnswering.from_pretrained('roberta-base')
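
For reference, this is roughly how those extra fields get consumed during training: RobertaForQuestionAnswering accepts start_positions and end_positions and returns the QA loss. A sketch, where batch is assumed to be one dict yielded by the (working) train_loader:

from transformers import RobertaForQuestionAnswering

model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

# 'batch' is assumed to be one dict of tensors yielded by train_loader
outputs = model(input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                start_positions=batch['start_positions'],
                end_positions=batch['end_positions'])
loss = outputs.loss  # cross-entropy over the start/end logits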

Snippet of the code for tokenisation and for modifying the generated encodings:

train_encodings = tokenizer(train_df['context'], train_df['question'], truncation=True, padding=True, max_length=10)

# adding token positions
def update_token_positions(encodings, dataframe):
    start_positions = []
    end_positions = []
    for i in range(len(dataframe)):
        start_positions.append(encodings.char_to_token(i, dataframe['answer_start'][i]))
        end_positions.append(encodings.char_to_token(i, dataframe['answer_end'][i]))

        # if the start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length

        # if the end position was not found, shift back one character at a time
        offset = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, dataframe['answer_end'][i] - offset)
            offset += 1

    # updating the encodings dict with our start and end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
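
Note that update_token_positions mutates the encodings in place (it returns None), so it is called for its side effect before building the Dataset. Roughly how the pieces above fit together:

update_token_positions(train_encodings, train_df)
train_dataset = QNA_Dataset(train_encodings)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)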

My bad!
The Dataset's __len__ wasn't returning len(self.encodings.input_ids), so the sampler got a list instead of an integer. Fixed!
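
For anyone landing here with the same error, a minimal sketch of what the fixed Dataset class looks like; the __getitem__ body is assumed from the printed item above, and the __len__ return was the actual fix:

import torch
from torch.utils.data import Dataset

class QNA_Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # returns the dict of tensors shown in the STDOUT above (assumed)
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        # the fix: return the integer length, not the list itself
        return len(self.encodings.input_ids)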