ValueError: Using a target size (torch.Size([4])) that is different to the input size (torch.Size([4, 4])) is deprecated - how do I fix this for a binary classification problem?

Hi, I need help with this ValueError when running PyTorch for a binary classification problem.
I'm using a Kaggle dataset (News Category Dataset | Kaggle)
and am adapting the code from here (which is for multi-class classification): Google Colab

# Importing the libraries needed
import pandas as pd
import numpy as np
import pickle
import json
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

# load the dataset
with open('.../News_Category_Dataset_v3.json','r') as f:
    jdata = f.read()

jdata2  = [json.loads(line) for line in jdata.split('\n') if line]
df = pd.DataFrame.from_records(jdata2)

df["category"] = np.where(df["category"] == "CRIME", 1, 0) # binomial classification

y = df['category'].astype(int)

X = df['short_description']

# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.short_description[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.category[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Creating the customized model

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-cased")  # same checkpoint as the tokenizer
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = DistillBERTClass()
model.to(device)

# Creating the loss function and optimizer
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Function to calculate the accuracy of the model

def calculate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calculate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

for epoch in range(EPOCHS):
    train(epoch)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [36], in <cell line: 1>()
      1 for epoch in range(EPOCHS):
----> 2     train(epoch)

Input In [35], in train(epoch)
     12 targets = data['targets'].to(device, dtype = torch.long)
     14 outputs = model(ids, mask)
---> 15 loss = loss_function(outputs, targets)
     16 tr_loss += loss.item()
     17 big_val, big_idx = torch.max(outputs.data, dim=1)
.....
File ~\anaconda3\lib\site-packages\torch\nn\functional.py:3089, in binary_cross_entropy(input, target, weight, size_average, reduce, reduction)
   3087     reduction_enum = _Reduction.get_enum(reduction)
   3088 if target.size() != input.size():
-> 3089     raise ValueError(
   3090         "Using a target size ({}) that is different to the input size ({}) is deprecated. "
   3091         "Please ensure they have the same size.".format(target.size(), input.size())
   3092     )
   3094 if weight is not None:
   3095     new_size = _infer_size(target.size(), weight.size())

ValueError: Using a target size (torch.Size([4])) that is different to the input size (torch.Size([4, 4])) is deprecated. Please ensure they have the same size.

Can someone help me understand this error, please?
In addition, if someone could direct me to a PyTorch tutorial on how to get precision/recall metrics in PyTorch, I would be really grateful…

There are a few issues in your code:
torch.nn.BCELoss() expects probabilities as the model output and is used for binary or multi-label classification use cases. In the latter case each sample can contain zero, one, or multiple active classes.
Your model currently outputs raw logits in the shape [batch_size, 4], which indicates you are working with 4 classes.
If you are working on a multi-class classification use case, where each sample contains exactly one class, you should use nn.CrossEntropyLoss instead.
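For illustration, here is a minimal sketch (using random stand-in tensors, not your data) of the shapes each loss expects:

import torch
import torch.nn as nn

batch_size = 4

# multi-class (4 classes): nn.CrossEntropyLoss takes raw logits of shape
# [batch_size, 4] and class-index targets of shape [batch_size]
logits = torch.randn(batch_size, 4)
targets = torch.randint(0, 4, (batch_size,))
loss = nn.CrossEntropyLoss()(logits, targets)

# binary: nn.BCEWithLogitsLoss takes one raw logit per sample and a float
# target of the same shape
logit = torch.randn(batch_size, 1)
target = torch.randint(0, 2, (batch_size, 1)).float()
loss = nn.BCEWithLogitsLoss()(logit, target)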

Hi, thank you - I have a binary classification problem (sorry, I typed binomial first!), that's why I used BCELoss(). How can I fix it?

I'm not used to PyTorch at all - I have mostly used TensorFlow.

In a binary classification use case your model should output a single logit or probability.
Change the self.classifier to nn.Linear(768, 1), use nn.BCEWithLosgitsLoss, and make sure the target has the shape [batch_size, 1] containing values in [0, 1].

I am now getting

ValueError: Using a target size (torch.Size([4])) that is different to the input size (torch.Size([4, 1])) is deprecated. Please ensure they have the same size.

-------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Input In [46], in <cell line: 2>()
      1 # Creating the loss function and optimizer
----> 2 loss_function = torch.nn.BCEWithLosgitsLoss()
      3 optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

AttributeError: module 'torch.nn' has no attribute 'BCEWithLosgitsLoss'

Hi, I guess if you squeeze the tensor of size torch.Size([4, 1]), you can get rid of the extra dimension, and it will probably match the dimensions of the target.

https://pytorch.org/docs/stable/generated/torch.squeeze.html

Squeezing the model output or (better) unsqueezing the target in dim 1 should fix the issue.
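To make that concrete, here is a minimal sketch with stand-in tensors showing both options (assuming nn.BCEWithLogitsLoss and a model that outputs a single logit per sample):

import torch
import torch.nn as nn

loss_fn = nn.BCEWithLogitsLoss()
outputs = torch.randn(4, 1)               # model output: shape [batch_size, 1]
targets = torch.tensor([1., 0., 0., 1.])  # targets: shape [batch_size], float

# option 1: squeeze the model output down to [4]
loss_a = loss_fn(outputs.squeeze(1), targets)

# option 2 (preferred above): unsqueeze the target up to [4, 1]
loss_b = loss_fn(outputs, targets.unsqueeze(1))

print(loss_a.item(), loss_b.item())       # same value either way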

Sorry, that was a typo on my side: it should be torch.nn.BCEWithLogitsLoss.
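Putting the suggested changes together, a minimal sketch of the binary setup (the pooled tensor below is a stand-in for the [CLS] hidden state, not your full model):

import torch
import torch.nn as nn

classifier = nn.Linear(768, 1)            # a single output logit for binary classification
loss_function = nn.BCEWithLogitsLoss()    # note the spelling: Logits

pooled = torch.randn(4, 768)              # stand-in for the pooled [CLS] hidden state
outputs = classifier(pooled)              # shape [4, 1]
targets = torch.tensor([1., 0., 0., 1.]).unsqueeze(1)  # shape [4, 1], float values in [0, 1]
loss = loss_function(outputs, targets)

Note that both nn.BCELoss and nn.BCEWithLogitsLoss expect float targets, so the dtype=torch.long cast on data['targets'] in your train() loop would also need to become dtype=torch.float.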