Transformer models (Only encoder side)

Hi everyone,
I would like to design a model based on the encoder side of Transformer model (“Attention is all you need” paper). However, the “loss” function gives me something between 0.6 to 0.8 and it does not decrease! My dataset includes 4000 samples. I want to be sure about my implementation. Would you please check it and help me to identify where it goes wrong?
Thanks

class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=149):
        super(PositionalEncoding, self).__init__()
        batch_size = 5
        self.dropout = torch.nn.Dropout(p=dropout)
        pe = torch.zeros(batch_size, max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, :, 0::2] = torch.sin(position * div_term)
        pe[:, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
    def forward(self, x):
        x = x + self.pe[:x.size(0),:]
        return self.dropout(x)
   
class PosTransformerModel(nn.Module):
    def __init__(self, embedding_size, nhead, ffn_dim, nlayers, dropout=0.5):
        super(PosTransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(embedding_size, dropout)
        encoder_layers = TransformerEncoderLayer(embedding_size, nhead, ffn_dim, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding_size = embedding_size
    def forward(self, src):
        src = src * math.sqrt(self.embedding_size)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        return output
    
class TransformerModel(nn.Module):
    def __init__(self, embedding_size, nhead, ffn_dim, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        encoder_layers = TransformerEncoderLayer(embedding_size, nhead, ffn_dim, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding_size = embedding_size

    def forward(self, src):
        output = self.transformer_encoder(src)
        return output
    

class Main(nn.Module):
    def __init__(self, number_of_sequences, sequence_vector_size, embedding_size, nhead, ffn_dim, nlayers, dropout):
        super(Main, self).__init__()
        self.non_position_transformer = TransformerModel(embedding_size=sequence_vector_size, nhead=nhead, ffn_dim=ffn_dim, nlayers=nlayers, dropout=dropout)
        self.patients_linear = torch.nn.Linear(number_of_sequences*sequence_vector_size, 512)
        self.linear1024 = torch.nn.Linear(512, 128)
        self.linear512 = torch.nn.Linear(128, 32)
        self.linear16 = torch.nn.Linear(32, 8)
        self.last_linear = torch.nn.Linear(8, 1)
        self.embedding_size = embedding_size
        self.sequence_vector_size = sequence_vector_size
        self.position_transformers = torch.nn.ModuleList([PosTransformerModel(embedding_size=embedding_size, nhead=nhead, ffn_dim=ffn_dim, nlayers=nlayers, dropout=dropout) for i in range(number_of_sequences)])
        self.linears = torch.nn.ModuleList([torch.nn.Linear(149*embedding_size, sequence_vector_size) for i in range(number_of_sequences)])
            
    def forward(self, X_array):
        number_of_sequences = 327
        outs = []
        for i in range(number_of_sequences):
            seq_number_of_words = 149   
            out = self.position_transformers[i](X_array[:,i])
            out = out.view(-1, seq_number_of_words*self.embedding_size)
            out = self.linears[i](out)
            outs.append(out)    
        second_stage_tensor = outs[0]
        for i in range(1,number_of_sequences):
            second_stage_tensor = torch.cat((second_stage_tensor, outs[i]), 1)
        second_stage_tensor = second_stage_tensor.view(-1, number_of_sequences, self.sequence_vector_size)
        out = self.non_position_transformer(second_stage_tensor)
        out = out.view(-1, number_of_sequences*self.sequence_vector_size)
        out = self.patients_linear(out)
        out = self.linear1024(out)
        out = self.linear512(out)
        out = self.linear16(out)
        out = self.last_linear(out)
        out = torch.sigmoid(out)
        return out


number_of_sequences = 327
em_size = 20
sequence_vector_size = 10
nhead = 5
ffn_dim = 10
nlayers = 1
dropout = 0.5
num_epoch = 5
batch_size = 5 
fold_num = 1
main = Main(number_of_sequences, sequence_vector_size, em_size, nhead, ffn_dim, nlayers, dropout)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(main.parameters(), lr=0.001)
the_number = 1
for epoch in range(num_epoch):
    print("----> Epoch: " + str(epoch))
    for file in os.listdir("DATASET/"):
        if(file.find("test") == -1 and file.find("training") != -1):
            batch_list = pickle.load(open("DATASET/" + file,"rb"))
            if(len(batch_list["X"]) < 5):
                print(file.split("batch")[1].split(".pickle")[0] + " ---> " +  str(len(batch_list["X"])))
            batch_list = pickle.load(open("DATASET/" + file,"rb"))
            X_batch = torch.tensor(batch_list["X"],dtype=torch.long)
            Y_batch = torch.tensor(batch_list["Y"], dtype=torch.float32)
            y_hat = main(X_batch)
            loss = criterion(y_hat, Y_batch)
            print(the_number, loss.data)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            the_number += 1

Does the optimizer have problem? or backward ?
I don’t have any idea! The loss of the model does not improve in each iteration and epoch!

I have the same problems with transformers. See Transformer model doesn't improve even when fed the same single example over and over

Did you ever figure out a solution to your problem?

Hi Calebh,
Unfortunately, not yet!

Did any of you find a solution?