The following helper builds optimizer param groups so that weight decay is applied only to multi-dimensional weights, not to biases or other 1-D parameters:
def add_weight_decay(net, l2_value, skip_list=()):
    """Split *net*'s trainable parameters into two optimizer param groups.

    Weight decay (L2 regularization) is applied only to multi-dimensional
    weights; 1-D parameters (biases, normalization scales), any parameter
    whose name ends in ".bias", and names listed in *skip_list* get a
    weight_decay of 0.

    Args:
        net: a torch.nn.Module whose named_parameters() are grouped.
        l2_value: weight_decay coefficient for the decayed group.
        skip_list: parameter names to exempt from decay (default: none).

    Returns:
        A list of two param-group dicts suitable for a torch optimizer:
        [{'params': no_decay, 'weight_decay': 0.},
         {'params': decay,    'weight_decay': l2_value}]
    """
    decay, no_decay = [], []
    for name, param in net.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights are never updated; grouping is moot
        # 1-D tensors are biases/norm scales by convention — decaying them
        # hurts rather than regularizes, so they go in the no-decay group.
        if param.ndim == 1 or name.endswith(".bias") or name in skip_list:
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {'params': no_decay, 'weight_decay': 0.},
        {'params': decay, 'weight_decay': l2_value},
    ]
# Build the two param groups (biases / 1-D params exempt from decay) and pass
# them to SGD; per-group 'weight_decay' overrides any optimizer-level default.
# NOTE(review): `net` is defined elsewhere — presumably a torch.nn.Module.
params = add_weight_decay(net, 2e-5)
sgd = torch.optim.SGD(params, lr=0.05)
Adapted from: https://raberrytv.wordpress.com/2017/10/29/pytorch-weight-decay-made-easy/