Hello, I get an error when I resume the optimizer.
I define the optimizer like this:
# Reduce the learning rate of the common (shared) layers; this is important
all_parameters = set(model.parameters())
nas_layers_params = []
for m in model.modules():
    if isinstance(m, BlockSwitch):
        nas_layers_params += list(m.parameters())
nas_layers_params = set(nas_layers_params)
comm_layers_params = all_parameters - nas_layers_params
nas_layers_params = list(nas_layers_params)
comm_layers_params = list(comm_layers_params)
optimizer = torch.optim.Adam(
    [{"params": nas_layers_params},
     {"params": comm_layers_params, "lr": args.learning_rate / model.num_blocks_per_layer}  # the common layers' learning rate should be the average
    ],
    args.learning_rate,
    # momentum=args.momentum,
    weight_decay=args.weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, float(args.epochs), eta_min=args.learning_rate_min, last_epoch=-1)
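For background, here is a small self-contained sketch (a toy two-layer model; the names and values are placeholders, not my real code) showing how the Adam state dict indexes parameters purely by their position in the param groups, which is what load_state_dict has to match up later:

import torch
import torch.nn as nn

# Toy model and optimizer with two param groups, mirroring the structure above
toy = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
opt = torch.optim.Adam(
    [{"params": toy[0].parameters()},
     {"params": toy[1].parameters(), "lr": 1e-4}],
    lr=1e-3)

# Run one step so Adam creates its per-parameter state (exp_avg, exp_avg_sq)
toy(torch.randn(1, 4)).sum().backward()
opt.step()

sd = opt.state_dict()
# 'param_groups' stores integer indices, not the tensors themselves, and
# 'state' is keyed by the same indices. When load_state_dict restores a
# checkpoint, the saved state at index i is applied to the i-th parameter
# of the current optimizer, so the parameter ordering has to match.
print([g["params"] for g in sd["param_groups"]])   # e.g. [[0, 1], [2, 3]]
print({k: v["exp_avg"].shape for k, v in sd["state"].items()})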
I save the model like this:
# save the states of this epoch
state = {
    'epoch': epoch,
    'args': args,
    'optimizer_state': optimizer.state_dict(),
    'supernet_state': model.state_dict(),
    'scheduler_state': scheduler.state_dict()
}
path = './super_train/{}/super_train_states.pt.tar'.format(args.exp_name)
torch.save(state, path)
And I load the optimizer like this:
if args.resume:
    resume_path = './super_train/{}/super_train_states.pt.tar'.format(args.exp_name)
    if os.path.isfile(resume_path):
        print("Loading checkpoint '{}'".format(resume_path))
        checkpoint = torch.load(resume_path)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['supernet_state'])
        optimizer.load_state_dict(checkpoint['optimizer_state'])
        scheduler.load_state_dict(checkpoint['scheduler_state'])
    else:
        raise ValueError("No checkpoint found at '{}'".format(resume_path))
But I get this error:
File "train.py", line 197, in main
train(args, epoch, train_data, device, model, criterion=criterion, optimizer=optimizer, my_choice=choice)
File "train.py", line 77, in train
optimizer.step()
File "/data/limingyao/miniconda3/envs/py38/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 66, in wrapper
return wrapped(*args, **kwargs)
File "/data/limingyao/miniconda3/envs/py38/lib/python3.8/site-packages/torch/optim/adam.py", line 95, in step
exp_avg.mul_(beta1).add_(1 - beta1, grad)
RuntimeError: The size of tensor a (80) must match the size of tensor b (240) at non-singleton dimension 0
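In case it helps, here is a small diagnostic I could run right after loading the checkpoint, comparing the exp_avg shapes saved in the checkpoint against the shapes of the parameters currently in the optimizer (just a sketch, using the variable names from my resume code above):

# Sketch only: compare the checkpoint's Adam state with the current parameter order
ckpt_state = checkpoint['optimizer_state']['state']
current_params = [p for group in optimizer.param_groups for p in group['params']]
for idx, p in enumerate(current_params):
    if idx in ckpt_state:
        saved_shape = tuple(ckpt_state[idx]['exp_avg'].shape)
        if saved_shape != tuple(p.shape):
            print("mismatch at param", idx, saved_shape, "vs", tuple(p.shape))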
Is the way I load optimizer.state_dict wrong?
Thank you