Hello all,
I’m trying to train a BERT model for multiclass and multilabel classification (4 different labels for 5 different classes). I’m using this tutorial for multi-class classification which does binary classification.
I tried to modify the code to fit my problem by changing the criterion to CrossEntropyLoss and the output in forward to softmax instead of sigmoid (see commented out sections in init and forward).
class MovieTagger(pl.LightningModule):
def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
super().__init__()
self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
self.n_training_steps = n_training_steps
self.n_warmup_steps = n_warmup_steps
self.criterion = nn.CrossEntropyLoss()#nn.BCELoss()
def forward(self, input_ids, attention_mask, labels=None):
output = self.bert(input_ids, attention_mask=attention_mask)
output = self.classifier(output.pooler_output)
output = torch.softmax(output) #torch.sigmoid(output)
loss = 0
if labels is not None:
loss = self.criterion(output, labels)
return loss, output
def training_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("train_loss", loss, prog_bar=True, logger=True)
return {"loss": loss, "predictions": outputs, "labels": labels}
def validation_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("val_loss", loss, prog_bar=True, logger=True)
return loss
def test_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("test_loss", loss, prog_bar=True, logger=True)
return loss
def training_epoch_end(self, outputs):
labels = []
predictions = []
for output in outputs:
for out_labels in output["labels"].detach().cpu():
labels.append(out_labels)
for out_predictions in output["predictions"].detach().cpu():
predictions.append(out_predictions)
labels = torch.stack(labels).int()
predictions = torch.stack(predictions)
for i, name in enumerate(df[9:]):
class_roc_auc = auroc(predictions[:, i], labels[:, i])
self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)
def configure_optimizers(self):
optimizer = AdamW(self.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=self.n_warmup_steps,
num_training_steps=self.n_training_steps
)
return dict(
optimizer=optimizer,
lr_scheduler=dict(
scheduler=scheduler,
interval='step'
)
)
When I try to run
_, predictions = model(sample_batch["input_ids"], sample_batch["attention_mask"])
I get the error
'softmax() received an invalid combination of arguments - got (Tensor), but expected one of:
- (Tensor input, int dim, torch.dtype dtype)
- (Tensor input, name dim, *, torch.dtype dtype)’
If I keep sigmoid I get predictions but get a different error because of the crossentropyloss when evaluating:
RuntimeError: 1D target tensor expected, multi-target not supported
How can I modify the Bert model correctly to handle my multilabel classification?
I don’t understand most of what’s going on in the Tagger class well enough to figure it out myself right now.
Any help or resources are greatly appreciated!