I have implemented a simple network with nn.Module and I encountered the problem about regularization. The optimizer provides ‘weight_decay’, but it includes all the parameters. I want to do L2 excluding bias with which may cause underfitting.

I calculated the regular cost and separated the weight and bias. However when I applied them to torch.optim.Adam using param_groups the loss did not drop.

I wondered if there was any wrong with my code.

```
class CatNetwork(nn.Module):
def __init__(self, in_dim, n_hidden_1, out_dim):
super(CatNetwork, self).__init__()
self.layer1 = nn.Sequential(
nn.Linear(in_dim, n_hidden_1),
nn.ReLU(True)
)
self.layer2 = nn.Sequential(
nn.Linear(n_hidden_1, out_dim),
nn.Sigmoid()
)
def forward(self, x):
x = self.layer1(x)
x = self.layer2(x)
return x
# Regularization cost
def regularization(self, model, weight_decay):
parameter_list = model.state_dict().keys() # parameter list odict_keys(['layer1.0.weight', 'layer1.0.bias', 'layer2.0.weight', 'layer2.0.bias'])
loss_reg = 0
for para in parameter_list:
loss_reg += torch.sum(torch.square(model.state_dict()[para]))
return weight_decay * loss_reg
model = CatNetwork(n_input, 16, 1)
# Separate weight and bias
weight_list = []; bias_list = []
for i in range(1, 3):
para_weight = 'layer' + str(i) + '.0.weight'
para_bias = 'layer' + str(i) + '.0.bias'
weight_list += [model.state_dict()[para_weight]]
bias_list += [model.state_dict()[para_bias]]
# loss function
criterion = nn.BCELoss()
# optimizer
optimizer = torch.optim.Adam([{'params': weight_list, 'weight_decay':1}, {'params': bias_list, 'weight_decay':0}], lr=lr)
...
for i in range(epoch_number);
out = model(x)
loss = criterion(out, y) + model.regularization(model, weight_decay) # loss + reg_cost
# I'm not sure whether I should add the regularization cost as I don't know how the 'weight_decay' method of optimizer really works.
# But it has no difference to the result whether I keep it or drop it.
...
```