Model not learning and I get the same results every epoch

Hey!
I'm trying to train a model on top of a pre-trained BERT model, with two outputs for category and subcategory.
The problem is that the model is not learning, and I get the same metrics every epoch…

The training loop:

for epoch in range(1, epochs + 1):
    if epoch > 1:
        train_dataloader = classifier.create_dynamic_padding(db=train_data,
                                                             batch_size=batch_size,
                                                             product_targets='products',
                                                             action_targets='actions',
                                                             random_index=17)
    classifier.train()
    loss_train_total = 0
    status_bar = trange(0, len(train_dataloader['py_inputs']), leave=True, position=0, desc=f'Epoch {epoch} / {epochs}')

    for batch in status_bar:
        status_bar.set_postfix({'Average loss': loss_train_total / (batch + 1)})
        classifier.model.zero_grad()

        b_input_ids = train_dataloader['py_inputs'][batch].to(classifier.device)
        b_input_mask = train_dataloader['py_attn_masks'][batch].to(classifier.device)
        b_products = train_dataloader['py_products'][batch].to(classifier.device)
        b_actions = train_dataloader['py_actions'][batch].to(classifier.device)

        outputs = classifier(input_ids=b_input_ids,
                             attention_mask=b_input_mask,
                             products_targets=b_products,
                             actions_targets=b_actions)

        products_loss = criterion(torch.tensor(outputs[0], requires_grad=True), b_products)
        actions_loss = criterion(torch.tensor(outputs[1], requires_grad=True), b_actions)

        loss = (0.5*products_loss + 0.5*actions_loss)
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(classifier.model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

The model class:

class TwoLevelsClassificator(nn.Module):
    """
    A class to create a model to classified product
    """
    def __init__(self, products, actions, checkpoints):
        super(TwoLevelsClassificator, self).__init__()

        self.num_of_products = len(products)
        self.num_of_actions = len(actions)

        self.tokenizer = AutoTokenizer.from_pretrained(checkpoints)

        self.model = AutoModel.from_pretrained(checkpoints,
                                               config=AutoConfig.from_pretrained(checkpoints,
                                                                                 output_attentions=True,
                                                                                 output_hidden_states=True))
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

        self.model.config.label2id = {'products': {product: product_id for product_id, product in enumerate(products)},
                                      'actions': {action: action_id for action_id, action in enumerate(actions)}}
        self.model.config.id2label = {'products': {product_id: product for product, product_id in
                                                   self.model.config.label2id['products'].items()},
                                      'actions': {action_id: action for action, action_id in
                                                  self.model.config.label2id['actions'].items()}}

        self.products_out_layer = nn.Linear(self.model.config.hidden_size, self.num_of_products)
        self.actions_out_layer = nn.Linear(self.model.config.hidden_size, self.num_of_actions)
        self.embedding = nn.Embedding(self.model.config.vocab_size, self.model.config.hidden_size, padding_idx=self.model.config.pad_token_id)
        self.out_softmax = nn.Softmax(dim=1)

    def forward(self, input_ids=None, attention_mask=None, products_targets=None, actions_targets=None):
        bert_out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        bert_last_hidden_state = bert_out[0]
        # product head on the [CLS] token
        products_output_layer = self.products_out_layer(bert_last_hidden_state[:, 0, :].view(-1, 768)).detach().numpy()
        # map the argmax predictions back to label strings, then tokenize them
        products_labels = [f'{self.model.config.id2label["products"][np.argmax(preds_vector)].replace("_", " ")}' for preds_vector in products_output_layer]
        tokenizer_labels = self.tokenizer(products_labels)
        if False in [len(i) == len(tokenizer_labels['input_ids']) for i in tokenizer_labels['input_ids']]:
            tokenizer_labels = self.padd_to_max_size(tokenized_seq=tokenizer_labels)
        # embed the predicted product labels and feed them, concatenated with the hidden states, into the action head
        embedded_products_layer = self.embedding(torch.tensor(tokenizer_labels['input_ids']))
        actions_output_layer = self.actions_out_layer(torch.cat((bert_last_hidden_state, embedded_products_layer.expand(bert_last_hidden_state.shape[0], -1, -1)), dim=1)[:, 0, :].view(-1, 768)).detach().numpy()

        return products_output_layer, actions_output_layer

Your model's parameters don't get any gradients and are never updated, because you detach the computation graph by rewrapping the output tensors:

products_loss = criterion(torch.tensor(outputs[0], requires_grad=True), b_products)
actions_loss = criterion(torch.tensor(outputs[1], requires_grad=True), b_actions)
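
To see the effect, here is a minimal sketch with a plain nn.Linear standing in for the classifier:

import torch

lin = torch.nn.Linear(4, 2)
out = lin(torch.randn(3, 4))

# rewrapping copies the values into a brand-new leaf tensor with no history,
# just like torch.tensor(outputs[0], requires_grad=True) above
rewrapped = torch.tensor(out.detach().numpy(), requires_grad=True)
rewrapped.sum().backward()

print(lin.weight.grad)  # None: the gradient never reaches the layer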

Pass outputs[0] and outputs[1] directly to the criterion instead. Note that the .detach().numpy() calls inside forward cut the graph in the same way, so forward has to return the raw tensors as well.
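
With both changes in place, the loss lines reduce to:

products_loss = criterion(outputs[0], b_products)
actions_loss = criterion(outputs[1], b_actions)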

Thank you very much,
that helped!