Hi everyone,
I would like to design a model based on the encoder side of Transformer model (“Attention is all you need” paper). However, the “loss” function gives me something between 0.6 to 0.8 and it does not decrease! My dataset includes 4000 samples. I want to be sure about my implementation. Would you please check it and help me to identify where it goes wrong?
Thanks
class PositionalEncoding(torch.nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=149):
super(PositionalEncoding, self).__init__()
batch_size = 5
self.dropout = torch.nn.Dropout(p=dropout)
pe = torch.zeros(batch_size, max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, :, 0::2] = torch.sin(position * div_term)
pe[:, :, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:x.size(0),:]
return self.dropout(x)
class PosTransformerModel(nn.Module):
def __init__(self, embedding_size, nhead, ffn_dim, nlayers, dropout=0.5):
super(PosTransformerModel, self).__init__()
self.model_type = 'Transformer'
self.pos_encoder = PositionalEncoding(embedding_size, dropout)
encoder_layers = TransformerEncoderLayer(embedding_size, nhead, ffn_dim, dropout)
self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
self.embedding_size = embedding_size
def forward(self, src):
src = src * math.sqrt(self.embedding_size)
src = self.pos_encoder(src)
output = self.transformer_encoder(src)
return output
class TransformerModel(nn.Module):
def __init__(self, embedding_size, nhead, ffn_dim, nlayers, dropout=0.5):
super(TransformerModel, self).__init__()
self.model_type = 'Transformer'
encoder_layers = TransformerEncoderLayer(embedding_size, nhead, ffn_dim, dropout)
self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
self.embedding_size = embedding_size
def forward(self, src):
output = self.transformer_encoder(src)
return output
class Main(nn.Module):
def __init__(self, number_of_sequences, sequence_vector_size, embedding_size, nhead, ffn_dim, nlayers, dropout):
super(Main, self).__init__()
self.non_position_transformer = TransformerModel(embedding_size=sequence_vector_size, nhead=nhead, ffn_dim=ffn_dim, nlayers=nlayers, dropout=dropout)
self.patients_linear = torch.nn.Linear(number_of_sequences*sequence_vector_size, 512)
self.linear1024 = torch.nn.Linear(512, 128)
self.linear512 = torch.nn.Linear(128, 32)
self.linear16 = torch.nn.Linear(32, 8)
self.last_linear = torch.nn.Linear(8, 1)
self.embedding_size = embedding_size
self.sequence_vector_size = sequence_vector_size
self.position_transformers = torch.nn.ModuleList([PosTransformerModel(embedding_size=embedding_size, nhead=nhead, ffn_dim=ffn_dim, nlayers=nlayers, dropout=dropout) for i in range(number_of_sequences)])
self.linears = torch.nn.ModuleList([torch.nn.Linear(149*embedding_size, sequence_vector_size) for i in range(number_of_sequences)])
def forward(self, X_array):
number_of_sequences = 327
outs = []
for i in range(number_of_sequences):
seq_number_of_words = 149
out = self.position_transformers[i](X_array[:,i])
out = out.view(-1, seq_number_of_words*self.embedding_size)
out = self.linears[i](out)
outs.append(out)
second_stage_tensor = outs[0]
for i in range(1,number_of_sequences):
second_stage_tensor = torch.cat((second_stage_tensor, outs[i]), 1)
second_stage_tensor = second_stage_tensor.view(-1, number_of_sequences, self.sequence_vector_size)
out = self.non_position_transformer(second_stage_tensor)
out = out.view(-1, number_of_sequences*self.sequence_vector_size)
out = self.patients_linear(out)
out = self.linear1024(out)
out = self.linear512(out)
out = self.linear16(out)
out = self.last_linear(out)
out = torch.sigmoid(out)
return out
number_of_sequences = 327
em_size = 20
sequence_vector_size = 10
nhead = 5
ffn_dim = 10
nlayers = 1
dropout = 0.5
num_epoch = 5
batch_size = 5
fold_num = 1
main = Main(number_of_sequences, sequence_vector_size, em_size, nhead, ffn_dim, nlayers, dropout)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(main.parameters(), lr=0.001)
the_number = 1
for epoch in range(num_epoch):
print("----> Epoch: " + str(epoch))
for file in os.listdir("DATASET/"):
if(file.find("test") == -1 and file.find("training") != -1):
batch_list = pickle.load(open("DATASET/" + file,"rb"))
if(len(batch_list["X"]) < 5):
print(file.split("batch")[1].split(".pickle")[0] + " ---> " + str(len(batch_list["X"])))
batch_list = pickle.load(open("DATASET/" + file,"rb"))
X_batch = torch.tensor(batch_list["X"],dtype=torch.long)
Y_batch = torch.tensor(batch_list["Y"], dtype=torch.float32)
y_hat = main(X_batch)
loss = criterion(y_hat, Y_batch)
print(the_number, loss.data)
optimizer.zero_grad()
loss.backward()
optimizer.step()
the_number += 1