Confusion about multilabels/softmax in BERT

Hello all,

I’m trying to train a BERT model for multi-class and multi-label classification (4 different labels for 5 different classes). I’m working from this tutorial, which does multi-label classification by treating each label as a binary classification problem.
I tried to modify the code to fit my problem by changing the criterion to CrossEntropyLoss and the output in forward to softmax instead of sigmoid (see the commented-out parts in __init__ and forward).

class MovieTagger(pl.LightningModule):

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.CrossEntropyLoss()  # was nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    output = torch.softmax(output)  # was torch.sigmoid(output)
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):
    
    labels = []
    predictions = []
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)

    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)

    for i, name in enumerate(df[9:]):
      class_roc_auc = auroc(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)


  def configure_optimizers(self):

    optimizer = AdamW(self.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

When I try to run

_, predictions = model(sample_batch["input_ids"], sample_batch["attention_mask"])

I get the error

softmax() received an invalid combination of arguments - got (Tensor), but expected one of:

  • (Tensor input, int dim, torch.dtype dtype)
  • (Tensor input, name dim, *, torch.dtype dtype)

If I keep sigmoid, I get predictions, but then a different error from CrossEntropyLoss when evaluating:

RuntimeError: 1D target tensor expected, multi-target not supported

How can I modify the BERT model correctly to handle my multi-label classification?
I don’t understand most of what’s going on in the Tagger class well enough to figure it out myself right now.
Any help or resources are greatly appreciated!

The error is due to the fact that torch.softmax(...) expects both a tensor and the dimension along which to compute the softmax. You can either use the torch.nn.Softmax(dim=-1) module or call torch.softmax(output, dim=-1) to apply it across the last dimension (the class dimension here).
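
For example, either of these works (a minimal standalone sketch; the [8, 5] shape is just a placeholder for the classifier output):

import torch
import torch.nn as nn

logits = torch.randn(8, 5)  # stand-in for self.classifier(output.pooler_output)

# Functional form: the dim argument is required
probs = torch.softmax(logits, dim=-1)

# Module form: fix the dim once when constructing the module, then call it
softmax = nn.Softmax(dim=-1)
probs = softmax(logits)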

Thank you, you’re right, that small change fixed that error :grinning_face_with_smiling_eyes:
I’m still stuck on the evaluation though…
I tried

criterion = nn.CrossEntropyLoss()
labels = sample_batch["labels"]
criterion(predictions, labels)

but I get the same RuntimeError: 1D target tensor expected, multi-target not supported error.
And I think there is something wrong with my predictions in general.
My labels are of shape [8, 5] but the predictions are [8, 5509]. Shouldn’t they be the same shape? Or something like [8, 5, 4]?
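
For reference, nn.CrossEntropyLoss expects raw logits together with integer class indices as targets, and when there are several prediction "heads" the class dimension has to be in position 1. A minimal shape check, assuming the problem really is 5 independent choices with 4 classes each (batch size 8, as in the shapes above):

import torch
import torch.nn as nn

batch_size, n_heads, n_classes = 8, 5, 4   # hypothetical sizes taken from the shapes above

criterion = nn.CrossEntropyLoss()

# CrossEntropyLoss applies log-softmax itself, so it takes raw logits (no softmax)
# with the class dimension second, and one integer class index per head as the target.
logits = torch.randn(batch_size, n_classes, n_heads)           # [8, 4, 5]
targets = torch.randint(0, n_classes, (batch_size, n_heads))   # [8, 5], values in 0..3
loss = criterion(logits, targets)

# If the model produces [8, 5, 4] instead, move the class dimension into position 1 first:
logits_alt = torch.randn(batch_size, n_heads, n_classes)       # [8, 5, 4]
loss_alt = criterion(logits_alt.permute(0, 2, 1), targets)

Also, since the predictions come from nn.Linear(hidden_size, n_classes), an [8, 5509] output means the model was constructed with n_classes=5509, so that argument is worth double-checking.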