BERT fine tuning: Target size (torch.Size([3, 1])) must be the same as input size (torch.Size([3, 2]))

Hey, I’m trying to fine-tune BER, using BoolQ dataset from SuperGLUE

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


df = pd.read_json(file_train, lines=True)

train_df, dev_df = train_test_split(df, test_size=0.1, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and format the data
train_encodings = tokenizer(train_df['question'].tolist(),
                            train_df['passage'].tolist(),
                            padding=True,
                            truncation=True,
                            max_length=512,
                            add_special_tokens=True, # Add [CLS] and [SEP] tokens.
                            return_tensors='pt')

dev_encodings = tokenizer(dev_df['question'].tolist(),
                          dev_df['passage'].tolist(),
                          padding=True,
                          truncation=True,
                          max_length=512,
                          add_special_tokens=True, # Add [CLS] and [SEP] tokens.
                          return_tensors='pt')

import torch
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Set up the device (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the hyperparameters
epochs = 3 
batch_size = 3  
learning_rate = 2e-5 
eps = 1e-8  

# Initialize the model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2,
                                                        output_attentions = False,
                                                        output_hidden_states = False) 
model.to(device)


train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'],
                                               train_encodings['attention_mask'],
                                               torch.tensor(train_df['label'].tolist()).float().unsqueeze(1))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


dev_dataset = torch.utils.data.TensorDataset(dev_encodings['input_ids'],
                                             dev_encodings['attention_mask'],
                                             torch.tensor(dev_df['label'].tolist()).float().unsqueeze(1))

dev_loader =  DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)

total_steps = len(train_dataset) * epochs // batch_size
warmup_steps = total_steps // 10  # 10% of the total number of steps. Usually good.

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

loss_fn = torch.nn.CrossEntropyLoss().to(device)

# Define the training loop
def train_eval_loop(train_loader, dev_loader, model, loss_fn, optimizer, scheduler, device):
    model.train()

    train_loss_meter = AverageMeter()
    


    total_loss = 0.0
    for batch in train_loader:
        # Unpack the batch
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        print(labels)
        print(input_ids)
        print(input_ids.shape)
        print(attention_mask.shape)

        # Zero  gradients
        optimizer.zero_grad()

        outputs, logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = loss_fn(outputs.logits, labels)

        loss.backward()

        # Clip the gradients to avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights
        optimizer.step()
        scheduler.step()

        train_loss_meter.update(loss.item())

    

    dev_loss_meter = AverageMeter()
    model.eval()

    for batch in dev_loader:

    # batch_inputs = batch_inputs.to(device)
    # batch_targets = batch_targets.to(device)

        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)


        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            outputs.to(device)
            loss = loss_fn(outputs.logits, labels)

    dev_loss_meter.update(loss.item())

    print(f'\t Train loss:{train_loss_meter.avg}')
    print(f'\t Val(dev) loss:{dev_loss_meter.avg}')

But when I train

# Train and eval the model
for epoch in range(epochs):
    train_loss = train_eval_loop(train_loader, dev_loader, model, loss_fn, optimizer, scheduler, device)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')

I’m having ValueError: Target size (torch.Size([3, 1])) must be the same as input size (torch.Size([3, 2]))

Any ideas, what may be wrong?
my attention_mask and input_ids are [3, 512], and labels are [3, 1]

nn.CrossEntropyLoss expects model outputs in the shape [batch_size, nb_classes, *] containing ogits and targets in the shape [batch_size, *] containing class indices in the range [0, nb_classes-1].
Based on the error message I would guess your target might already contain the right values but uses the additional dimension in dim1. You could remove it via target = target.squeeze(1) and it should work.