I'm not sure what I'm missing. I'm trying to implement a 6-class multi-label network. Keras gives the following results:
             precision    recall  f1-score   support

          0       0.77      0.82      0.79      7829
          1       0.71      0.79      0.75      8176
          2       0.68      0.69      0.69      6982
          3       0.73      0.67      0.70      7146
          4       0.72      0.82      0.77      7606
          5       0.78      0.84      0.80      8310

avg / total       0.73      0.78      0.75     46049
whereas PyTorch does slightly better on precision but much worse on recall:
             precision    recall  f1-score   support

          0       0.81      0.62      0.70      7715
          1       0.77      0.51      0.62      7941
          2       0.76      0.46      0.58      6937
          3       0.82      0.40      0.54      7231
          4       0.81      0.60      0.69      7821
          5       0.81      0.63      0.71      7894

avg / total       0.80      0.54      0.64     45539
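For reference, the reports come from sklearn's classification_report on the thresholded sigmoid outputs, along these lines (the 0.5 threshold is an assumption here; the models are defined further down, and y_test is a NumPy array at this point):

from sklearn.metrics import classification_report

# Keras: predict probabilities, then threshold at 0.5
keras_probs = nnet.predict(X_test)
print(classification_report(y_test, (keras_probs > 0.5).astype(int)))

# PyTorch: eval() disables dropout at inference time
baseline_model.eval()
torch_probs = baseline_model(to_var(X_test)).data.cpu().numpy()
print(classification_report(y_test, (torch_probs > 0.5).astype(int)))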
I noticed that PyTorch produces values that are a lot more extreme, i.e. the output of the sigmoid is heavily clumped around 0 and 1, whereas Keras produces a more balanced distribution across all values between 0 and 1. This leads me to believe that Keras is doing some regularization magic, but I wasn't able to find anything like that in the documentation.
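To make that concrete, this is how I'd compare the two output distributions (reusing keras_probs / torch_probs from the snippet above):

import numpy as np

bins = np.linspace(0, 1, 11)
print(np.histogram(keras_probs.ravel(), bins=bins)[0])  # mass spread across [0, 1]
print(np.histogram(torch_probs.ravel(), bins=bins)[0])  # mass piled up near 0 and 1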
Everything except the code shown here is identical (features, scaling, etc.).
This is the Keras code:
import keras

nnet = keras.models.Sequential()
nnet.add(keras.layers.Dense(64, activation="relu", input_shape=(X_train.shape[1],)))
nnet.add(keras.layers.Dropout(0.3))
nnet.add(keras.layers.Dense(32, activation="relu"))
nnet.add(keras.layers.Dropout(0.3))
# One sigmoid per class: the standard multi-label head with binary cross-entropy
nnet.add(keras.layers.Dense(y.shape[1], activation="sigmoid"))
nnet.compile(optimizer="rmsprop", metrics=["binary_accuracy"], loss="binary_crossentropy")
history = nnet.fit(X_train, y_train, validation_data=(X_test, y_test),
                   epochs=20, batch_size=256)
And the PyTorch version:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class BaselineNet(nn.Module):
    def __init__(self, D_in, num_targets=1):
        super(BaselineNet, self).__init__()
        D_hidden_1 = 64
        D_hidden_2 = 32
        dropout_ratio = 0.3
        self.num_targets = num_targets
        self.net = nn.Sequential(
            nn.Linear(D_in, D_hidden_1),
            nn.ReLU(),
            nn.Dropout(dropout_ratio),
            nn.Linear(D_hidden_1, D_hidden_2),
            nn.ReLU(),
            nn.Dropout(dropout_ratio),
            nn.Linear(D_hidden_2, self.num_targets),
        )
        # PyTorch requires casting the model to CUDA manually
        # (use_cuda is a flag defined elsewhere in my setup)
        if torch.cuda.is_available() and use_cuda:
            self.cuda()
        self.init_weights()

    def forward(self, x):
        h = self.net(x)
        return F.sigmoid(h)
    def init_weights(self):
        """
        Reproduce the Keras Dense defaults: glorot (xavier) uniform for the
        weight matrices and zeros for the biases.
        """
        weights = (p.data for name, p in self.named_parameters() if 'weight' in name)
        biases = (p.data for name, p in self.named_parameters() if 'bias' in name)
        for t in weights:
            nn.init.xavier_uniform(t)
        for t in biases:
            nn.init.constant(t, 0)
class StableBCELoss(nn.modules.Module):
    """
    Numerically stable binary cross-entropy computed from raw logits:
    max(x, 0) - x * t + log(1 + exp(-|x|)).
    """
    def __init__(self):
        super(StableBCELoss, self).__init__()

    def forward(self, input, target):
        neg_abs = -input.abs()
        loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
        return loss.mean()
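# Sanity check for the loss above: StableBCELoss is the standard numerically
# stable form of binary cross-entropy over raw logits,
#   max(x, 0) - x*t + log(1 + exp(-|x|)),
# and should match the built-in (assuming this torch build ships
# F.binary_cross_entropy_with_logits; _logits / _targets are made-up tensors):
#
#   _logits = to_var(torch.randn(8, 6))
#   _targets = to_var((torch.rand(8, 6) > 0.5).float())
#   print(StableBCELoss()(_logits, _targets).data[0])
#   print(F.binary_cross_entropy_with_logits(_logits, _targets).data[0])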
train_loader = torch.utils.data.DataLoader(
    dataset=torch.utils.data.TensorDataset(X_train, y_train),
    batch_size=256,
    shuffle=True)

input_size = X_train.shape[1]
baseline_model = BaselineNet(input_size, len(targets))
criterion = StableBCELoss()
optimizer = optim.RMSprop(baseline_model.parameters())

for t in range(20):
    baseline_model.train()
    avg_loss = []
    for batch_idx, (data, target) in enumerate(train_loader):
        # to_var wraps tensors in (CUDA) Variables; defined elsewhere in my setup
        data, target = to_var(data), to_var(target)
        # Forward pass: compute predicted y by passing x to the model
        y_pred = baseline_model(data)
        # Compute the loss
        loss = criterion(y_pred, target.float())
        # Zero gradients, perform a backward pass, and update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss.append(loss.data[0])
    print('Train Epoch: {} Loss: {:.6f}'.format(t, np.mean(avg_loss)))
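Note that both scripts rely on each framework's RMSprop defaults, and the documented defaults differ (Keras 2.x: lr=0.001, rho=0.9; torch: lr=0.01, alpha=0.99). Pinning them explicitly for an apples-to-apples run would look something like this (a sketch; epsilon and decay are left at their respective defaults, which still differ slightly):

# Match optimizer hyperparameters across frameworks, mirroring the Keras defaults
nnet.compile(optimizer=keras.optimizers.RMSprop(lr=0.001, rho=0.9),
             loss="binary_crossentropy", metrics=["binary_accuracy"])
optimizer = optim.RMSprop(baseline_model.parameters(), lr=0.001, alpha=0.9)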