Help with accuracy calculation and decoding the output of a transformer model:

Hi everyone, I’m new here and new to PyTorch overall, I am training a model to diacritize Arabic sentences (multilabel classification) and I used those classes below a basic self-attention transformer class and another one with a diacritization head on top

import torch
import torch.nn as nn
from torch.nn.modules.activation import MultiheadAttention

# This code is based on NAACL 2019 tutorial here https://tinyurl.com/NAACLTransfer

class Transformer(nn.Module):
    '''
    This class defines a basic self-attention transformer.

    Args:
        embed_dim (int): dimension of the embeddings used in the transformer attnetion blocks
        hidden_dim (int): size of the fully connected layer connecting attention blocks
        num_embeddings (int): vocbulary size
        num_max_positions (int): maximum sequence length. used for positional embeddings
        num_heads (int): number of attention heads used in Multihead Attention.
        num_layer (int): number of layers. Each layer is an attention block and a fully connected layer.
        dropout (float): dropout probability
        causal (bool): whether to attend only to previous positions.

    '''
    def __init__(self, embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads, num_layers, dropout, causal):
        super().__init__()
        self.causal = causal
        self.tokens_embeddings = nn.Embedding(num_embeddings, embed_dim)
        self.position_embeddings = nn.Embedding(num_max_positions, embed_dim)
        self.dropout = nn.Dropout(dropout)

        self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
        self.layer_norms_1, self.layer_norms_2 = nn.ModuleList(), nn.ModuleList()
        for _ in range(num_layers):
            self.attentions.append(MultiheadAttention(embed_dim, num_heads, dropout=dropout))
            self.feed_forwards.append(nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                                                    nn.ReLU(),
                                                    nn.Linear(hidden_dim, embed_dim)))
            self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
            self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-12))

    def forward(self, x, padding_mask=None):
        """ x has shape [seq length, batch], padding_mask has shape [batch, seq length] """
        positions = torch.arange(len(x), device=x.device).unsqueeze(-1)
        h = self.tokens_embeddings(x)
        h = h + self.position_embeddings(positions).expand_as(h)
        h = self.dropout(h)

        attn_mask = None

        if self.causal:
            attn_mask = torch.full((len(x), len(x)), -float('Inf'), device=h.device, dtype=h.dtype)
            attn_mask = torch.triu(attn_mask, diagonal=1)

        for layer_norm_1, attention, layer_norm_2, feed_forward in zip(self.layer_norms_1, self.attentions,
                                                                       self.layer_norms_2, self.feed_forwards):
            h = layer_norm_1(h)
            x, _ = attention(h, h, h, attn_mask=attn_mask, need_weights=False, key_padding_mask=padding_mask)
            x = self.dropout(x)
            h = x + h

            h = layer_norm_2(h)
            x = feed_forward(h)
            x = self.dropout(x)
            h = x + h
        return h
import torch
import torch.nn as nn

class TransformerWithDiacritizationHead(nn.Module):
    def __init__(self, config):
        """" Transformer with a diacritization head on top"""
        super().__init__()
        self.config = config
        self.transformer = Transformer(config.embed_dim, config.hidden_dim, config.num_embeddings,
                                       config.num_max_positions, config.num_heads, config.num_layers,
                                       config.dropout, causal=config.causal)


        self.diac_head = nn.Linear(config.embed_dim, config.num_diac_labels, bias=False)

    def init_weights(self, module):
        """ initialize weights - nn.MultiheadAttention is already initalized by PyTorch (xavier) """
        if isinstance(module, (nn.Linear, nn.Embedding, nn.LayerNorm)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, (nn.Linear, nn.LayerNorm)) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, x, labels=None, padding_mask=None, label_ignore_idx=-1):
        """ x has shape [seq length, batch], padding_mask has shape [batch, seq length] """
        hidden_states = self.transformer(x, padding_mask)

        logits = self.diac_head(hidden_states) # seq_len x batch x num_labels
        # print("len logits:{} - shape logits:{}".format(len(logits),logits.shape))
        # print("len labels:{} - shape labels:{}".format(len(labels),labels.shape))
        if labels is not None:

            assert labels.size(0) == logits.size(0), "logits and labels dimension mismatch"

            #shift_logits = logits[:-1] if self.transformer.causal else logits
            #shift_labels = labels[1:] if self.transformer.causal else labels

            loss_fct = nn.CrossEntropyLoss(ignore_index=label_ignore_idx)
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
            return logits, loss

        return logits

the arguments for the model

from collections import namedtuple

Config = namedtuple('Config',
field_names="embed_dim, hidden_dim, num_max_positions, num_embeddings      , num_heads, num_layers,"
              "dropout,causal,num_diac_labels, initializer_range, batch_size, lr, max_norm, n_epochs, n_warmup,"
              "mlm, gradient_accumulation_steps, device, log_dir, dataset_cache")
diac_args = Config( 512      , 512      , 256              , 38, 8       , 10        ,
               0.1  ,False ,9, 0.02             , 64        , 0.0001, 5.0 ,1     , 1000    ,
               False, 4, "cuda" if torch.cuda.is_available() else "cpu", "/kaggle/working/Arabic_DIACRITIZATION/trained_models"   , "/kaggle/working/Arabic_DIACRITIZATION/dataset_cache.bin")

prepare training loop

from ignite.metrics import Accuracy
optimizer = torch.optim.Adam(diacritization_model.parameters(), lr=diac_args.lr)

# Training function and trainer
def update(engine, batch):
    diacritization_model.train()
    batch, labels = (t.to(diac_args.device) for t in batch)
    inputs = batch.transpose(0, 1).contiguous()  # to shape [seq length, batch]
    labels = labels.transpose(0, 1).contiguous()
    predicted_label, loss = diacritization_model(inputs, labels = labels, padding_mask=None,label_ignore_idx=-1)
    loss = loss / diac_args.gradient_accumulation_steps
    loss.backward()
    torch.nn.utils.clip_grad_norm_(diacritization_model.parameters(), diac_args.max_norm)
    if engine.state.iteration % diac_args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()
trainer = Engine(update)

# Evaluation function and evaluator (evaluator output is the input of the metrics)
def inference(engine, batch):
    diacritization_model.eval()
    with torch.no_grad():
        batch, labels = (t.to(diac_args.device) for t in batch)
        inputs = batch.transpose(0, 1).contiguous()  # to shape [seq length, batch]
        labels = labels.transpose(0, 1).contiguous()
        logits = diacritization_model(inputs, labels = labels, padding_mask=None,label_ignore_idx=-1)
    return logits[0], labels
evaluator = Engine(inference)

# Attache metric to evaluator & evaluation to trainer: evaluate on valid set after each epoch
# Accuracy().attach(evaluator, "accuracy")
# @trainer.on(Events.EPOCH_COMPLETED)
# def log_validation_results(engine):
#     evaluator.run(valid_loader)
#     print(f"Validation Epoch: {engine.state.epoch} Error rate: {100*(1 - evaluator.state.metrics['accuracy'])}")

# Learning rate schedule: linearly warm-up to lr and then to zero
scheduler = PiecewiseLinear(optimizer, 'lr', [(0, 0.0), (diac_args.n_warmup, diac_args.lr),
                                              (len(train_loader)*diac_args.n_epochs, 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

# Add progressbar with loss
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

# Save checkpoints and finetuning config
checkpoint_handler = ModelCheckpoint(diac_args.log_dir, 'finetuning_checkpoint', save_interval=1, require_empty=False)
# trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': diacritization_model})
torch.save(diac_args, os.path.join(diac_args.log_dir, 'fine_tuning_args.bin'))

as you can see I have already commented the part of the code responsible for the validation accuracy calculation, cause whenever the epoch ends it breaks with error of not compatible shapes for the true labels and logits, for the first time the inference function was returning logits as tuple and the error was
the logits or y_pred has no attribute called ndimension cause it is tuple and then I made it return logits[0] so it returns the array of shape [256, 64, 9] → [seq_length, batch_size, num_categories]
and the labels are transposed to be in shape [seq_length, batch_size] so whenever the accuracy metric is being called it throws errors all of them around shape mismatch so anyone can help me with this cause I am training the model with loss being calculated and I want at least to see the result of the test accuracy, and if anyone can help me with the decode thing, I want to decode the logits value for the input sentence, like I read I saw that it needs softmax and argmax to get the index of the most probable output of the model then decode it like the encoded labels vocabulary, if you guys can help me with the decode code I’ll appreciate it.
thanks all and sorry for taking so long. :smiling_face_with_tear:

@ptrblck
Can you please help me with that?

I see that you are using nn.CrossEntropyLoss as the criterion:

loss_fct = nn.CrossEntropyLoss(ignore_index=label_ignore_idx)loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

and that you are flattening the output and targets to a 2D and 1D tensor, respectively.
Your shape is given as: [seq_length, batch_size, num_categories] so be careful with the flattening operation.
nn.CrossEntropyLoss expects a model output containing logits in the shape [batch_size, nb_classes, seq_len] (for a temporal classification use case) and labels in the shape [batch_size, seq_len] containing class indices in [0, nb_classes-1].
You might thus need to permute the output and target instead of simply flattening them, so double check it.

I’m also unsure what exactly the error is you are seeing, but I assume your model output might still be 3D while the target would be 2D. In this case, use torch.argmax on the model output to create the predicted class indices.
Assuming your output was permuted and has the shape [batch_size, nb_classes, seq_len], this would work: preds = torch.argmax(output, dim=1).

Sorry I think I miss understand you, where is the flattening operation happening exactly?
Cause I just transposed the shapes to be accepted in the model in update and inference function and once the epoch ends and the accuracy.py from ignite want to do his work it breaks
I tried to reshape the logits and labels shapes but didn’t get it right and I don’t know if I am doing it in the right place in the code also, for the decode I think I get it today I was trying but there still some bugs need to be fixed, I trained the model for 10 epochs and the loss went to 0.0262 although I don’t know the training accuracy and the validation one, if you can be more specific in the place I am missing the right thing to do I’ll be thankful, and on more thing, I have 9 classes yes but the padding has -1 encoding label the other classes have encoding from 0 to 8, is that an issue for the model or I just deal with the real classes and ignore the padding as logits output when taking the softmax and argmax, thanks a lot!

Here in the loss calculation:

loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

so make sure the outputs and targets have the previously described shapes.

I have already tried to make them like you said before returning them from the inference function but it didn’t work, where should I reshape them?
and if the loss is decreasing and I finished training the model, should I retrain after this problem being solved?
Sorry for my questions but like I said I am new in dealing with Pytorch, thanks!

You should permute the output instead of reshaping as the latter would interleave the values.
Assuming your original shape is still [seq_length, batch_size, num_categories] this should work to reshape the activation to [batch_size, num_categories, seq_length]:

seq_length = 3
batch_size = 2
num_categories = 4

x = torch.arange(seq_length*batch_size*num_categories).view(seq_length, batch_size, num_categories)
print(x)
# tensor([[[ 0,  1,  2,  3],
#          [ 4,  5,  6,  7]],

#         [[ 8,  9, 10, 11],
#          [12, 13, 14, 15]],

#         [[16, 17, 18, 19],
#          [20, 21, 22, 23]]])

y = x.permute(1, 2, 0).contiguous()
print(y)
# tensor([[[ 0,  8, 16],
#          [ 1,  9, 17],
#          [ 2, 10, 18],
#          [ 3, 11, 19]],

#         [[ 4, 12, 20],
#          [ 5, 13, 21],
#          [ 6, 14, 22],
#          [ 7, 15, 23]]])

If you’ve trained your model on a wrong output, I would not just retrain the model using the right outputs but train it from scratch instead.

Sir I just trained it as it should be labels and inputs should be in shape [seq_length, batch_size]
So before I pass them to the model I transposed them and the loss was decreasing in a proper way you can say, my problem was after each epoch the accuracy was not able to get calculated, does my loss function parameters have any relation with the parameters should be returned for the accuracy calculation?
My labels are from 0-8 and -1 for padding and the loss has labels and logits in it’s input is there any problem with that?
Cause training 10 epochs would take 1000 minutes of training on gpu p100 and I am not ready to do it again especially I have used all my available time on kaggle, I will try to reshape the logits and labels but you didn’t tell me where?
Is it before the inference return or what?
And labels how should I reshape them so I can get them compatible with logits?
Thanks a lot!

What I mean is that how would I train the model on a wrong output if the loss is responsible for detecting the training phase and telling you somehow about how your model is training and I think that it’s parameters are correct. Am I wrong?, I need the accuracy to check if the model may reach an over fitting state or not especially when calculating the test accuracy and also I may retrain to get the graphics for loss and accuracy cause our supervised professor wants that
Sorry for taking so long.

I still have the same problem, what I miss?


this is where I modified the code

Please post a minimal and executable code snippet using your shape creating random tensors to reproduce the issue. I’ve given the expected shapes a few times already and am unsure why comparing your tensors to the shapes I’ve posted won’t help.

labels = torch.arange(256*64).view(256,64)
logits = torch.arange(256*64*9).view(256,64,9)

I have tried a lot of making sense codes and solutions but I still got errors and my project has to be finished during the upcoming 3 to 4 days…
I have tried to make the labels shape [64,1,256] as [seq_length, labels.unsqueeze(1), batch_size] but also got this error message:


this is the last modification before returning the inference output, I have tried to figure out how the accuracy.py from ignite works but didn’t get all of it right, there are some twists I don’t get and I’m running out of time, if there is anyway to calculate the accuracy without engine or manually it would help I think…
at least after each batch, calculate the accuracy and append it to a matrix so I can plot a curve for it, thanks!

Unfortunately, you didn’t post an executable code snippet but a tensor creation, which is most likely wrong. Based on the shape I assume you are dealing with 9 classes, so labels should contain values in [0, 8], not [0, 256*64].
In any case, this code works again for me, so compare it to yours:

from collections import OrderedDict

import torch
from torch import nn, optim

from ignite.engine import *
from ignite.handlers import *
from ignite.metrics import *
from ignite.utils import *
from ignite.contrib.metrics.regression import *
from ignite.contrib.metrics import *

# create default evaluator for doctests

def eval_step(engine, batch):
    return batch

default_evaluator = Engine(eval_step)

# create default optimizer for doctests

param_tensor = torch.zeros([1], requires_grad=True)
default_optimizer = torch.optim.SGD([param_tensor], lr=0.1)

# create default trainer for doctests
# as handlers could be attached to the trainer,
# each test must define his own trainer using `.. testsetup:`

def get_default_trainer():

    def train_step(engine, batch):
        return batch

    return Engine(train_step)

# create default model for doctests

default_model = nn.Sequential(OrderedDict([
    ('base', nn.Linear(4, 2)),
    ('fc', nn.Linear(2, 1))
]))

manual_seed(2809)

metric = Accuracy()
metric.attach(default_evaluator, "accuracy")
y_true = torch.randint(0, 9, (256, 64))
y_pred = torch.randn(256, 64, 9)
# permute to [batch_size, nb_classes, seq_len]
y_pred = y_pred.permute(0, 2, 1).contiguous()
state = default_evaluator.run([[y_pred, y_true]])
print(state.metrics["accuracy"])

Sorry about that but I have already posted the blocks of code that are important, my data is too big and I can’t but the full code here cause it needs the data, how can I share it with so you can see what ever you want, yes my classes are from 0 to 8 but they are stored in array of shape [256, 64] and the logits is stored in array of shape [256, 64, 9] I just need them to get accepted but the evaluator so that I can get my accuracy and validation accuracy right without errors, my block of code is up in the first section of the topic and the last modification is in the last comment of mine, if you need the notebook or anything else this is my telegram username @HussienAlfx
Sorry for taking your time.

Sir I just got another error index out of range in metric.py when calling this part of code when the epoch ends

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    evaluator.run(valid_loader)
    print(f"Validation Epoch: {engine.state.epoch} Error rate: {100*(1 - evaluator.state.metrics['accuracy'])}")

I think the previous error solution was to put logits and labels like you did I didn’t know that [[y_pred, y_true]] so I changed the return of the inference to this

# Evaluation function and evaluator (evaluator output is the input of the metrics)
def inference(engine, batch):
    diacritization_model.eval()
    with torch.no_grad():
        batch, labels = (t.to(diac_args.device) for t in batch)
        inputs = batch.transpose(0, 1).contiguous()  # to shape [seq length, batch]
        labels = labels.transpose(0, 1).contiguous()
        print("labels shape in inference ",labels.shape)
        logits = diacritization_model(inputs, labels = labels, padding_mask=None,label_ignore_idx=-1)
        logits = logits[0].permute(1, 2, 0).contiguous()
        labels = labels.transpose(0, 1).contiguous()
    return [[logits[0], labels]]

this is the error message

and I have tried to see why that happens to me:

from collections import OrderedDict

import torch
from torch import nn, optim

from ignite.engine import *
from ignite.handlers import *
from ignite.metrics import *
from ignite.utils import *
from ignite.contrib.metrics.regression import *
from ignite.contrib.metrics import *

# create default evaluator for doctests

def eval_step(engine, batch):
    return batch

default_evaluator = Engine(eval_step)

# create default optimizer for doctests

param_tensor = torch.zeros([1], requires_grad=True)
default_optimizer = torch.optim.SGD([param_tensor], lr=0.1)

# create default trainer for doctests
# as handlers could be attached to the trainer,
# each test must define his own trainer using `.. testsetup:`

def get_default_trainer():

    def train_step(engine, batch):
        return batch

    return Engine(train_step)

# create default model for doctests

default_model = nn.Sequential(OrderedDict([
    ('base', nn.Linear(4, 2)),
    ('fc', nn.Linear(2, 1))
]))

manual_seed(2809)

metric = Accuracy()
metric.attach(default_evaluator, "accuracy")
y_true = torch.randint(0, 9, (256, 64))
y_pred = torch.randn(256, 64, 9)
# permute to [batch_size, nb_classes, seq_len]
y_pred = y_pred.permute(0, 2, 1).contiguous()
state = default_evaluator.run([[y_pred, y_true]])
print(state.metrics["accuracy"])
x = [[y_pred, y_true]]

len(x[1])
 # ----> 1 len(x[1])
#IndexError: list index out of range

but your code works fine as it should be, I didn’t get why my matrix has this issue.