I just learned that it is possible to set `weight_decay` in AdamW to a value greater than 1. I thought weight decay had to be between 0 and 1 — am I missing something?
def make_optimizer(model, decoder_weight_decay, default_weight_decay=0.0):
    """Build an AdamW optimizer with a separate weight decay for decoder weights.

    Each parameter is placed in its own param group so weight decay can be
    controlled per parameter. Parameters whose name starts with
    ``'decoder.weight'`` use ``decoder_weight_decay``; every other parameter
    uses ``default_weight_decay``.

    Args:
        model: module whose ``named_parameters()`` will be optimized.
        decoder_weight_decay: weight decay for decoder weight parameters.
        default_weight_decay: weight decay for all other parameters
            (0.0 by default, i.e. no decay — matches the common convention).

    Returns:
        An ``AdamW`` optimizer with one param group per parameter; each group
        also carries a ``"name"`` key for later inspection/logging.
    """
    groups = []
    for name, param in model.named_parameters():
        # BUG FIX: the original assigned ``weight_decay`` only inside the
        # decoder branch, so non-decoder parameters either raised NameError
        # (first iteration) or silently reused a stale value from a previous
        # iteration. Always choose an explicit decay here.
        if name.startswith('decoder.weight'):
            decay = decoder_weight_decay
        else:
            decay = default_weight_decay
        groups.append({
            "params": param,  # AdamW accepts a bare tensor and wraps it in a list
            "weight_decay": decay,
            "name": name,
        })
    return AdamW(groups)