I am working on a binary classifier using the Transformer-Encoder architecture, but I have not been able to get the model to learn, i.e. the training and validation loss stay nearly the same in every epoch. The code is based on the Annotated Transformer - http://nlp.seas.harvard.edu/2018/04/03/attention.html
import copy

import torch.nn as nn

# MultiHeadedAttention, FeedForwardLayer, PositionalEncoding, EncoderLayer,
# Encoder, and Embeddings follow the implementations in the post linked above.

class TransformerModel(nn.Module):
    """Transformer-Encoder based binary classifier."""

    def __init__(self, n_token, n_dim_model, n_head, n_hidden, n_blocks, dropout=0.5):
        super(TransformerModel, self).__init__()

        # Multi-headed attention layer
        self_attention = MultiHeadedAttention(n_head, n_dim_model)
        # Position-wise feed-forward layer
        feed_forward = FeedForwardLayer(n_dim_model, n_hidden, dropout)
        # Positional encoding
        positional_encoding = PositionalEncoding(n_dim_model, dropout)

        encoder_layer = EncoderLayer(n_dim_model, copy.deepcopy(self_attention),
                                     copy.deepcopy(feed_forward), dropout)
        self.encoder = Encoder(encoder_layer, n_blocks)

        embedding = Embeddings(n_dim_model, n_token)
        self.src_embed = nn.Sequential(embedding, copy.deepcopy(positional_encoding))

        # Fully-connected output layer (2 classes)
        self.fc = nn.Linear(n_dim_model, 2)

    def forward(self, x):
        # x has shape [seq_len, batch_size = 64]
        embedded_sents = self.src_embed(x.permute(1, 0))  # [batch_size, seq_len, n_dim_model]
        encoded_sents = self.encoder(embedded_sents)
        # Use the representation at the last position as the sentence feature
        final_feature_map = encoded_sents[:, -1, :]
        final_out = self.fc(final_feature_map)
        return final_out
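For reference, here is a rough sketch of how I instantiate the model and compute a loss; the hyperparameter values, the dummy batch, and the cross-entropy criterion below are illustrative placeholders rather than my exact setup:

import torch

# Illustrative hyperparameters, not my actual values
vocab_size = 10000  # n_token
model = TransformerModel(n_token=vocab_size, n_dim_model=512, n_head=8,
                         n_hidden=2048, n_blocks=6, dropout=0.5)

# Dummy batch of token ids with shape [seq_len, batch_size], as forward() expects
dummy_batch = torch.randint(0, vocab_size, (20, 64))
logits = model(dummy_batch)               # shape [64, 2], one logit per class
print(logits.shape)

# Cross-entropy over the two class logits (placeholder labels)
criterion = torch.nn.CrossEntropyLoss()
dummy_labels = torch.randint(0, 2, (64,))
loss = criterion(logits, dummy_labels)
print(loss.item())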
I am unsure whether this architecture is correct. I would really appreciate any suggestions on this!