CrossEntropyLoss RuntimeError: Expected target size [2, 800], got [2]

I have been trying to fine-tune a BERT model with a custom classifier layer, first on a single GPU and later with multi-GPU distributed training. However, I can’t figure out this error.

import numpy as np
import torch
from collections import defaultdict
from transformers import (BertTokenizerFast, BertForSequenceClassification,
                          get_linear_schedule_with_warmup)

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", max_length=512)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS, id2label=id2label, label2id=label2id, return_dict=False)
class Classifier(torch.nn.Module):
    def __init__(self, bert_model):
        super(Classifier, self).__init__()
        self.embedding = bert_model.bert.embeddings
        self.encoder = bert_model.bert.encoder
        self.classifier = bert_model.classifier
        
    def forward(self, input_ids, token_type_ids = None, attention_mask = None, labels = None):
        emb_out = self.embedding(input_ids)
        print(f'Shape after Embedding: {emb_out.shape}')
        enc_out = self.encoder(emb_out)
        #print(f'Shape after encoder: {enc_out.shape}')
        classifier_out = self.classifier(enc_out[0])
        print(f'Shape after classifier: {classifier_out.shape}')
        return classifier_out  
 
bert = Classifier(model)
EPOCHS = 2
LR = 1e-5

optimizer = torch.optim.AdamW(bert.parameters(), lr = LR)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                           num_warmup_steps = 0, 
                                           num_training_steps = total_steps)

loss_fn = torch.nn.CrossEntropyLoss()

def train_model(model, data_loader, loss_fn, optimizer, scheduler, n_examples):
    model = model.train() # Explicitly setting model to train state
    losses = []
    correct_predictions = 0
    
    for d in data_loader:
        input_ids = d['input_ids']
        attention_mask = d['attention_mask']
        # Reshaping attention mask as per the input shape of tensor in the forward pass
        reshaped_attention_mask = attention_mask.reshape(d['attention_mask'].shape[0], 1, 1, d['attention_mask'].shape[1])
        targets = d['labels']
        print(f'Target shape:{targets.shape}')
        outputs= model(input_ids = input_ids, attention_mask = reshaped_attention_mask)
        print(f'Output shape:{outputs.shape}')
        _, preds = torch.max(outputs, dim = 1)
        
        loss = loss_fn(outputs, targets)
        
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        
        loss.backward()
        # Clip the gradients of the model to prevent exploding gradients using clip_grad_norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids']
            attention_mask = d['attention_mask']
            # Reshaping attention mask as per the input shape of tensor in the forward pass
            reshaped_attention_mask = attention_mask.reshape(d['attention_mask'].shape[0], 1, 1, d['attention_mask'].shape[1])
            targets = d['labels']
            
            outputs = model(input_ids = input_ids, attention_mask = reshaped_attention_mask)
            _, preds = torch.max(outputs, dim = 1)
            
            loss = loss_fn(outputs, targets)
            
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            
        return correct_predictions.double() / n_examples, np.mean(losses)

%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    
    train_acc, train_loss = train_model(bert, train_data_loader, loss_fn, optimizer, scheduler, len(df_train))
    print(f'Train Loss: {train_loss} ; Train Accuracy: {train_acc}')
    
    val_acc, val_loss = eval_model(bert, val_data_loader, loss_fn, len(df_val))
    print(f'Val Loss: {val_loss} ; Val Accuracy: {val_acc}')
    
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    if val_acc > best_accuracy:
        torch.save(bert.state_dict(), 'multi_gpu_bert_best_model_state.bin')
        best_accuracy = val_acc

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
File <timed exec>:8

Cell In[60], line 18, in train_model(model, data_loader, loss_fn, optimizer, scheduler, n_examples)
     15 _, preds = torch.max(outputs, dim = 1)
---> 18 loss = loss_fn(outputs, targets)
     20 correct_predictions += torch.sum(preds == targets)
     21 losses.append(loss.item())

File /opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
   1190 # If we don't have any hooks, we want to skip the rest of the logic in
   1191 # this function, and just call forward.
   1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194     return forward_call(*input, **kwargs)
   1195 # Do not call functions when jit is used
   1196 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/nn/modules/loss.py:1174, in CrossEntropyLoss.forward(self, input, target)
   1173 def forward(self, input: Tensor, target: Tensor) -> Tensor:
-> 1174     return F.cross_entropy(input, target, weight=self.weight,
   1175                            ignore_index=self.ignore_index, reduction=self.reduction,
   1176                            label_smoothing=self.label_smoothing)

File /opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/nn/functional.py:3026, in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   3024 if size_average is not None or reduce is not None:
   3025     reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3026 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)

RuntimeError: Expected target size [2, 800], got [2]

Target shape:torch.Size([2])
Shape after Embedding: torch.Size([2, 512, 768])
Shape after classifier: torch.Size([2, 512, 800])
Output shape:torch.Size([2, 512, 800])

My data has two columns: the first column is the text data and the second is the label. I tried to adapt this code for distributed parallel training on multiple GPUs, but I haven’t been able to do so. Do you have any suggestions? Thanks in advance!

Hi Matrix!

CrossEntropyLoss is expecting either that your outputs (the input
to CrossEntropyLoss) has a shape of [2, 512] or that your targets
(the target passed to CrossEntropyLoss) has a shape of [2, 800].

I don’t know what your dimensions mean, but let’s say that your outputs
has a batch size of 2, a number of classes of 512, and a sequence length
of 800. Your targets should then provide the ground-truth class (as an
integer class label that runs from 0 to 511) for each element of the
predicted sequence of length 800 for each of the 2 samples in your batch.

That is, you would want targets to have shape [2, 800] and consist of
integers between 0 and 511 (inclusive).
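
To make the shape rules concrete, here is a minimal self-contained sketch (made-up tensors, not your actual data) of the two input/target combinations CrossEntropyLoss accepts, plus the mismatch that reproduces your error:

import torch

loss_fn = torch.nn.CrossEntropyLoss()

# Plain classification: input [N, C], target [N] of class indices in [0, C)
logits = torch.randn(2, 800)                  # batch of 2, 800 classes
labels = torch.randint(0, 800, (2,))          # one class index per sample
loss_fn(logits, labels)                       # works

# Per-position classification: input [N, C, d], target [N, d]
logits_seq = torch.randn(2, 512, 800)         # 2 samples, 512 classes, sequence length 800
labels_seq = torch.randint(0, 512, (2, 800))  # one class index per position
loss_fn(logits_seq, labels_seq)               # works

# Your case: input [2, 512, 800] with target [2]
# loss_fn(torch.randn(2, 512, 800), torch.randint(0, 512, (2,)))
# RuntimeError: Expected target size [2, 800], got [2]

Once the input has more than two dimensions, dimension 1 is treated as the class dimension and a target index is expected for every remaining position.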

Best.

K. Frank

Thank you so much for helping me. My data table consists of two columns with about 800 rows. The first column is the text of a review and the other is the topic class of that review. I am wondering if my output shape needs to be reshaped at this point. I would appreciate more input from everyone. Thank you!

Hi Matrix!

I don’t understand the details of your use case or what your data table is.

Is each row in your data table an independent sample? It sounds like the
second column is some sort of class label, either an integer or something
that gets transformed into an integer. Is this correct? What is the first
column? Is each row in the first column some sort of data structure that
contains text (or encoded text)?

What is the shape of the input_ids that you input to your model? What do all
of the dimensions of input_ids mean? Does input_ids have a batch
dimension?

You say that the output of your model, outputs, has shape [2, 512, 800].
What is the meaning of each of those dimensions? Does it have a batch
dimension? Does that dimension of size 800 have anything to do with the
“about 800 rows” of your data table?

targets has shape [2]. What does the 2 mean?

What is the meaning of the 512 in the shapes of emb_out and
classifier_out (that then shows up in outputs)?
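
One quick way to pin down these shapes would be to print a single batch straight from your data loader. A small sketch (it assumes the train_data_loader and the 'input_ids' / 'attention_mask' / 'labels' keys from your code above):

d = next(iter(train_data_loader))
print(d['input_ids'].shape)       # is there a batch dimension? what is the sequence length?
print(d['attention_mask'].shape)
print(d['labels'].shape)          # one label per sample, or one per token position?
print(d['labels'][:10])           # are these integer class indices?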

Best.

K. Frank