I was a Keras user and am now migrating my work to PyTorch. However, I've noticed that some models that learn normally in Keras don't learn at all (the loss doesn't decrease) when I implement and train them in PyTorch. I suspect there's a bug in my code, since I'm using exactly the same model structure, hyperparameters, optimizer, and even the same initializer.
For example, take this basic LSTM network. The Keras model:
def Word2VecModel(embedding_matrix, num_words, embedding_dim, seq_length,
                  num_hidden_lstm, output_dim, dropout_rate):
    print("Creating text model...")
    model = Sequential()
    # Frozen embedding layer initialized from the pre-trained matrix
    model.add(Embedding(num_words, embedding_dim,
                        weights=[embedding_matrix],
                        input_length=seq_length, trainable=False))
    model.add(LSTM(units=num_hidden_lstm, return_sequences=True,
                   input_shape=(seq_length, embedding_dim)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=num_hidden_lstm, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(output_dim, activation='relu'))
    # model.add(Dense(output_dim, activation='tanh'))
    return model
lstm_model = Word2VecModel(embedding_matrix, num_words, embedding_dim,
                           seq_length, num_hidden_lstm, output_dim,
                           dropout_rate)

print("Creating Blind LSTM model")
y_input = Input(shape=(seq_length,), name='y_input')
lstm_result = lstm_model(y_input)
d1 = Dense(128, activation='tanh')(lstm_result)
d2 = Dense(num_classes, activation='softmax')(d1)
new_model = Model(inputs=[y_input], outputs=d2)

new_model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
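The fit call isn't shown above; it's just the standard one, roughly like this (X_train/y_train etc. are placeholders for my actual arrays):

new_model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_val, y_val))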
The PyTorch model:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Embedding layer: load pre-trained weights and freeze them,
        # mirroring trainable=False in the Keras model
        self.embed = nn.Embedding(400001, 300)
        embedding_matrix = ebd.load()
        print(embedding_matrix.shape)
        self.embed.weight.data = torch.Tensor(embedding_matrix)
        self.embed.weight.requires_grad = False
        self.lstm1 = nn.LSTM(300, 128, 1, batch_first=True)
        self.lstm2 = nn.LSTM(128, 128, 1, batch_first=True)
        self.fc1 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 28)

    def forward(self, question):
        embed = self.embed(question)      # (batch, seq_length, 300)
        lstm_out, _ = self.lstm1(embed)   # (batch, seq_length, 128)
        lstm_out = F.dropout(lstm_out, 0.5)
        lstm_out, _ = self.lstm2(lstm_out)
        lstm_out = F.dropout(lstm_out, 0.5)
        x = lstm_out[:, -1]               # last time step, like return_sequences=False
        x = F.relu(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = self.fc3(x)                   # raw logits; CrossEntropyLoss applies log-softmax
        return x
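One thing I'm not sure about: my DataLoader also yields q_len, but I never use it. I just take lstm_out[:, -1], which for padded batches is the hidden state at the padded final position. If that matters, I could pack the sequences instead; a sketch of what I mean (not in my current code, dropout omitted for brevity):

from torch.nn.utils.rnn import pack_padded_sequence

def forward_packed(self, question, q_len):
    # Alternative forward pass that respects the true sequence lengths
    embed = self.embed(question)
    packed = pack_padded_sequence(embed, q_len, batch_first=True,
                                  enforce_sorted=False)
    packed_out, _ = self.lstm1(packed)
    _, (h_n, _) = self.lstm2(packed_out)
    x = h_n[-1]  # last *valid* hidden state of each sequence
    x = F.relu(self.fc1(x))
    x = torch.tanh(self.fc2(x))
    return self.fc3(x)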
def train(epoch):
    training_set = My_Data2(split='val')
    train_set = DataLoader(training_set, batch_size=batch_size, num_workers=1)
    pbar = tqdm(iter(train_set))
    moving_acc = 0  # running accuracy over the epoch
    net.train(True)
    for iter_id, (question, q_len, answer) in enumerate(pbar):
        q_len = q_len.tolist()
        question = question.type(torch.LongTensor)
        question, answer = question.to(device), answer.to(device)

        net.zero_grad()
        output = net(question)
        loss = criterion(output, answer)
        loss.backward()
        optimizer.step()

        # batch accuracy
        correct = (output.detach().argmax(1) == answer).float().sum() / batch_size
        if moving_acc == 0:
            moving_acc = correct
        else:
            moving_acc = (moving_acc * iter_id + correct) / (iter_id + 1)
        pbar.set_description(
            'Epoch: {}; Loss: {:.5f}; Current_Acc: {:.5f}; Total_Acc: {:.5f}'.format(
                epoch + 1, loss.item(), correct, moving_acc
            )
        )
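net and device are defined globally; the part not shown above is essentially:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = Net().to(device)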
batch_size = 64
n_epoch = 20
dim = 512

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=1e-4)

for epoch in range(10000):
    print('========== %d epoch ==========' % epoch)
    train(epoch)
    acc = valid(epoch)
The Keras model converges within 20 epochs (about 85% train accuracy, 65% test accuracy), while the PyTorch model's loss just fluctuates and accuracy stays around 40%. Since the training data is unbalanced, always predicting the majority class already gives about 40%, so the PyTorch model is effectively not learning.
Is there a problem with my implementation? Are there any obvious bugs that prevent the model from learning?
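In case it helps with diagnosing, this is the kind of sanity check I can run right after loss.backward() to confirm that gradients actually reach the trainable parameters:

for name, p in net.named_parameters():
    if p.requires_grad:
        grad_norm = p.grad.norm().item() if p.grad is not None else 0.0
        print(name, grad_norm)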
Thanks!