Transformer-Encoder model for binary classification

Hello community,

I am working on a binary classifier using the Transformer encoder architecture, but I have not been able to get the model to learn, i.e. the training and validation losses remain essentially unchanged from epoch to epoch. The code is based on the Annotated Transformer: http://nlp.seas.harvard.edu/2018/04/03/attention.html

import copy
import torch
import torch.nn as nn

# MultiHeadedAttention, FeedForwardLayer, PositionalEncoding, EncoderLayer,
# Encoder and Embeddings are the modules from the Annotated Transformer.

class TransformerModel(nn.Module):
    """Transformer encoder followed by a linear layer for binary classification."""

    def __init__(self, n_token, n_dim_model, n_head, n_hidden, n_blocks, dropout=0.5):
        super(TransformerModel, self).__init__()

        # Multi-headed attention layer
        self_attention = MultiHeadedAttention(n_head, n_dim_model)
        # Position-wise feed-forward layer
        feed_forward = FeedForwardLayer(n_dim_model, n_hidden, dropout)
        # Positional encoding
        positional_encoding = PositionalEncoding(n_dim_model, dropout)

        # Stack of n_blocks encoder layers; each layer gets its own copy of the sub-modules
        encoder_layer = EncoderLayer(n_dim_model, copy.deepcopy(self_attention), copy.deepcopy(feed_forward), dropout)
        self.encoder = Encoder(encoder_layer, n_blocks)

        # Token embedding followed by positional encoding
        embedding = Embeddings(n_dim_model, n_token)
        self.src_embed = nn.Sequential(embedding, copy.deepcopy(positional_encoding))

        # Fully-connected classification layer (2 output logits)
        self.fc = nn.Linear(n_dim_model, 2)

    def forward(self, x):
        # x: [seq_len, batch_size]; permute to [batch_size, seq_len] before embedding
        embedded_sents = self.src_embed(x.permute(1, 0))
        # encoded_sents: [batch_size, seq_len, n_dim_model]
        encoded_sents = self.encoder(embedded_sents)

        # Use the representation of the last position as the sentence feature
        final_feature_map = encoded_sents[:, -1, :]
        final_out = self.fc(final_feature_map)

        return final_out

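For reference, this is roughly how I build the model and check the output shape. The hyperparameter values and the dummy batch below are just placeholders, not my actual training configuration:

# Quick shape check with placeholder hyperparameters (vocab size, dims, etc. are arbitrary here)
model = TransformerModel(n_token=10000, n_dim_model=512, n_head=8, n_hidden=2048, n_blocks=6, dropout=0.5)

# Dummy batch of token indices, shaped [seq_len, batch_size] as expected by forward()
x = torch.randint(0, 10000, (20, 64))

logits = model(x)
print(logits.shape)  # expected: torch.Size([64, 2])
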
I am unsure whether this architecture is correct. I would really appreciate any suggestions!
Thank you