Hello everyone:
I recently read a paper reporting that it is better to apply L2 regularization to the weight tensors only and to leave the biases unregularized. The implementation I can think of is to put the weight and bias tensors into two separate parameter lists and pass a different L2 regularization hyper-parameter to each list explicitly, but that feels quite cumbersome. Can you think of a simpler implementation?
Thank you for your advice!
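For reference, the explicit two-list approach I described would look roughly like this (just a sketch; the toy model, the weight_decay values, and the name check are placeholders, not my actual setup):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 2))

decay, no_decay = [], []
for name, param in model.named_parameters():
    # biases go into the group that gets weight_decay = 0
    (no_decay if name.endswith('bias') else decay).append(param)

optimizer = torch.optim.Adam(
    [{'params': decay, 'weight_decay': 1e-4},   # L2 on weights only
     {'params': no_decay, 'weight_decay': 0.0}],  # biases unregularized
    lr=1e-3)

It works, but splitting the parameters by hand is exactly the part that feels clumsy, which is why I am asking whether there is something simpler.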
Here is what I did so far, based on this:
import torch
import torch.nn as nn

factor = 0.00005  # regularization coefficient

model = NeuralNet(input_size, hidden_size, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # combines nn.LogSoftmax() and nn.NLLLoss() in one class
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
l1_reg_criterion = nn.L1Loss()  # mean absolute error, used here as an L1 penalty

# Prepare the data
X_train_RFE_vals = X_train_RFE.values
y_train_vals = y_train.values.flatten()
X_test_RFE_vals = X_test_RFE.values
y_test_vals = y_test.values.flatten()

# Train the model
n_total_steps = X_train_RFE_vals.shape[0]
for epoch in range(1, num_epochs + 1):
    n_correct = 0
    n_samples = 0
    for i in range(0, X_train_RFE_vals.shape[0], batch_size):
        x = torch.as_tensor(X_train_RFE_vals[i:i+batch_size], dtype=torch.float).to(device)
        y = torch.as_tensor(y_train_vals[i:i+batch_size], dtype=torch.long).to(device)

        outputs = model(x)
        loss = criterion(outputs, y)

        # Regularize only the weight of the first layer (model.l1), not its bias.
        # named_parameters() keeps the tensors attached to the autograd graph;
        # state_dict() returns detached tensors, so no gradient would flow
        # through the penalty.
        reg_loss = 0
        for name, param in model.l1.named_parameters():
            if name == 'weight':
                # mean absolute value of the weight tensor
                reg_loss = reg_loss + l1_reg_criterion(param, torch.zeros_like(param))
        loss = loss + factor * reg_loss

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(outputs, dim=1)
        n_samples += y.size(0)
        n_correct += (predicted == y).sum().item()

    print(f'Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}, '
          f'Acc: {100.0 * n_correct / n_samples:.4f}, reg_loss: {reg_loss.item():.6f}')
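I also sketched a more general version of the manual penalty that skips every bias instead of only touching the first layer; I have not verified it beyond a quick run, so please treat the name check and the 1e-5 factor as assumptions:

# Sketch: L1 penalty over all weight tensors, skipping every bias parameter.
def l1_penalty(model, factor=1e-5):
    penalty = 0.0
    for name, param in model.named_parameters():
        if not name.endswith('bias'):
            penalty = penalty + param.abs().sum()
    return factor * penalty

# usage inside the training loop:
# loss = criterion(outputs, y) + l1_penalty(model)

Is there a built-in way to get the same effect without looping over the parameters myself?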