BERT Sentence Pair Classification Task

I am doing sentence pair classification with BERT. First, I encode the sentence pairs as

    train_encode = tokenizer(train1, train2, padding="max_length", truncation=True)
    test_encode = tokenizer(test1, test2, padding="max_length", truncation=True)

where train1 and train2 are lists containing the first and second sentences of each pair (and likewise test1/test2 for the test set).
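
For reference, tokenizer and checkpoint are set up roughly like this ("bert-base-uncased" is just an example; any BERT checkpoint that returns token_type_ids behaves the same):

    from transformers import AutoTokenizer

    checkpoint = "bert-base-uncased"  # example checkpoint name
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)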

Then I did:

    train_seq = torch.tensor(train_encode['input_ids'])
    train_mask = torch.tensor(train_encode['attention_mask'])
    train_token = torch.tensor(train_encode['token_type_ids'])
    train_y = torch.tensor(y_train.tolist())

And created the dataset for the DataLoader as

    train_data = TensorDataset(train_seq, train_mask, train_token, train_y)
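
train_dataloader, used in the training loop below, is just this dataset wrapped in a DataLoader, roughly like this (batch_size=8 matches the label tensor shown further down):

    from torch.utils.data import TensorDataset, DataLoader

    train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)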

The model is defined as

    model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=5)

    class BERT_Arch(nn.Module):
        def __init__(self, bert):
            super(BERT_Arch, self).__init__()
            self.bert = bert
            self.dropout = nn.Dropout(0.1)
            self.relu = nn.ReLU()
            self.fc1 = nn.Linear(768, 512)
            self.fc2 = nn.Linear(512, 5)
            self.softmax = nn.LogSoftmax(dim=1)

        def forward(self, input_ids, attn_masks, token_type_ids):
            _, cls_hs = self.bert(input_ids, attn_masks, token_type_ids)
            x = self.fc1(cls_hs)
            x = self.relu(x)
            x = self.dropout(x)
            x = self.fc2(x)
            x = self.softmax(x)
            return x
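
For reference, the two-value unpack in forward follows the pattern of the plain BertModel, which returns a (sequence_output, pooled_output) tuple when return_dict=False; cls_hs is meant to be that 768-dimensional pooled output. A sketch of that pattern (not part of my script):

    from transformers import BertModel

    base = BertModel.from_pretrained(checkpoint)
    sequence_output, pooled_output = base(input_ids=train_seq[:2],
                                          attention_mask=train_mask[:2],
                                          token_type_ids=train_token[:2],
                                          return_dict=False)
    print(pooled_output.shape)  # torch.Size([2, 768]) for bert-base, matching fc1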

I then wrapped the pretrained model in BERT_Arch and set up the optimizer and device as

    model1 = BERT_Arch(model)
    optimizer = AdamW(model1.parameters(), lr=0.01)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model1.to(device)
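
For completeness, the snippets above assume roughly these imports (I believe AdamW behaves the same here whether it comes from torch.optim or transformers):

    import numpy as np
    import torch
    import torch.nn as nn
    from torch.optim import AdamW
    from transformers import BertForSequenceClassification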

I created the training loop as

    EPOCHS = 5
    criterion = nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):
        model1.train()
        total_loss = 0
        total_preds = []

        for step, batch in enumerate(train_dataloader):
            # move the whole batch to the GPU once, then unpack it
            batch = [r.to(device) for r in batch]
            input_id, attention_mask, token_type_id, y = batch

            optimizer.zero_grad()
            prediction = model1(input_id, attention_mask, token_type_id)
            loss = criterion(prediction, y)
            total_loss = total_loss + loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model1.parameters(), 1.0)
            optimizer.step()

            total_preds.append(prediction.detach().cpu().numpy())

        avg_loss = total_loss / len(train_dataloader)
        total_preds = np.concatenate(total_preds, axis=0)
        print(avg_loss)

I get the following error: ValueError: not enough values to unpack (expected 2, got 1), which as far as I can tell comes from the _, cls_hs = self.bert(...) line in forward.

I am not sure what I am doing wrong here. Any suggestions?
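
My guess is that the problem is in what self.bert(...) returns. A quick check outside the training loop (just a debugging sketch, not part of the script):

    with torch.no_grad():
        out = model(train_seq[:2].to(device),
                    attention_mask=train_mask[:2].to(device),
                    token_type_ids=train_token[:2].to(device))
    print(type(out))         # SequenceClassifierOutput, not a plain tuple
    print(out.logits.shape)  # torch.Size([2, 5])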

Printing the unpacked batch, i.e. the output of

    for step, batch in enumerate(train_dataloader):
        batch = [r.to(device) for r in batch]
        input_id, attention_mask, token_type_id, y = batch

gives (input IDs, attention mask, token type IDs, and labels, in that order):

 tensor([[  101,  3191,  1999,  ...,     0,     0,     0],
        [  101,  2023, 11204,  ...,     0,     0,     0],
        [  101,  6140,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2023, 11204,  ...,     0,     0,     0],
        [  101,  2275,  2039,  ...,     0,     0,     0],
        [  101,  2023,  2240,  ...,     0,     0,     0]], device='cuda:0')
 

     tensor([[1, 1, 1,  ..., 0, 0, 0],
            [1, 1, 1,  ..., 0, 0, 0],
            [1, 1, 1,  ..., 0, 0, 0],
            ...,
            [1, 1, 1,  ..., 0, 0, 0],
            [1, 1, 1,  ..., 0, 0, 0],
            [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')

    tensor([[0, 0, 0,  ..., 0, 0, 0],
            [0, 0, 0,  ..., 0, 0, 0],
            [0, 0, 0,  ..., 0, 0, 0],
            ...,
            [0, 0, 0,  ..., 0, 0, 0],
            [0, 0, 0,  ..., 0, 0, 0],
            [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')

    tensor([3, 4, 1, 3, 1, 4, 2, 1], device='cuda:0')

If I don’t create a custom BERT model and just use

    model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=5)

with the same training loop as above, I get: TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not SequenceClassifierOutput
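
Both errors seem to point at the same thing: the model's output is a SequenceClassifierOutput object whose only populated field is logits, so neither the two-value unpack nor passing it straight into CrossEntropyLoss works. Is pulling out the logits inside the batch loop, roughly as below, the right way to handle this, or am I missing something more fundamental?

    output = model(input_id, attention_mask=attention_mask, token_type_ids=token_type_id)
    logits = output.logits       # tensor of shape (batch_size, 5)
    loss = criterion(logits, y)  # CrossEntropyLoss expects this tensor, not the output object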