ValueError: Expected input batch_size (1) to match target batch_size (26)

I’m getting the following error when using BERT with BiLSTM (my batch_size on BERT is 26). I want to concat last 4 hidden layer of BERT then feed it to BiLSTM. Here is my model:

from transformers import BertPreTrainedModel, BertModel
import torch.nn as nn
import torch
import torch.nn.functional as F

class BERT(BertPreTrainedModel):
    def __init__(self, config):
        super(BERT, self).__init__(config)
        self.device = config.device
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.1)
        
        self.lstm = nn.LSTM(input_size=config.hidden_size * 4, hidden_size=500, num_layers=3, dropout=0.5, bidirectional=True)
        self.qa_outputs = nn.Linear(500*2, config.num_labels)
        
        self.weight_class = config.weight_class
        self.init_weights()
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        with torch.no_grad():
            outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            cls_output = torch.cat((outputs[2][-4][:,0, ...],outputs[2][-3][:,0, ...], outputs[2][-2][:,0, ...], outputs[2][-1][:,0, ...]),-1)
            cls_output = self.lstm(cls_output.unsqueeze(0))[0]
            logits = self.qa_outputs(cls_output)
            return logits

    def loss(self, input_ids, attention_mask, token_type_ids, label):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_output = torch.cat((outputs[2][-4][:,0, ...],outputs[2][-3][:,0, ...], outputs[2][-2][:,0, ...], outputs[2][-1][:,0, ...]),-1)
        cls_output = self.lstm(cls_output.unsqueeze(0))[0]
        logits = self.qa_outputs(cls_output)
        
        target = label
        loss = F.cross_entropy(logits, target)

        predict_value = torch.max(logits, 1)[1]
        list_predict = predict_value.cpu().numpy().tolist()
        list_target = target.cpu().numpy().tolist()

        return loss, list_predict, list_target

Really don’t know how to debug this. Any solution for this error. Thanks in advance.

the problem is on my CrossEntropy. Here is an example of my logits and target I pass to CrossEntropy

tensor([[[-0.0006, -0.0089],
         [ 0.0044, -0.0024],
         [-0.0049,  0.0009],
         [-0.0042, -0.0018],
         [-0.0041, -0.0021],
         [ 0.0028, -0.0017],
         [-0.0033, -0.0095],
         [-0.0022, -0.0034],
         [-0.0064, -0.0046],
         [ 0.0048, -0.0077],
         [-0.0050, -0.0023],
         [ 0.0005, -0.0038],
         [-0.0037, -0.0002],
         [ 0.0008, -0.0033],
         [-0.0006, -0.0026],
         [-0.0013, -0.0122],
         [-0.0124, -0.0082],
         [ 0.0015, -0.0039],
         [ 0.0065, -0.0008],
         [ 0.0003, -0.0117],
         [ 0.0024, -0.0064],
         [-0.0017, -0.0041],
         [-0.0045, -0.0033],
         [ 0.0049,  0.0017],
         [-0.0013, -0.0082],
         [-0.0001, -0.0014]]], device='cuda:1', grad_fn=<AddBackward0>)
tensor([0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 1], device='cuda:1')