I encounter the following problem:
RuntimeError: Error(s) in loading state_dict for SphereFace: While copying the parameter named "fc2.weight", whose dimensions in the model are torch.Size([81391, 512]) and whose dimensions in the checkpoint are torch.Size([81931, 512]).
The error is not telling me what’s wrong.
Is this a bug?
The very strange thing is that if I train a smaller model with class size 10572 instead of 81931, the same code works: the trained model loads fine. But with the larger model (class size 81931) it complains. How can the model simply being bigger cause an error?
The following is how I save the model:
# Gather the complete training state — model weights, optimizer state, and the
# hyperparameters needed to rebuild the network — then persist it to disk.
checkpoint = {
    'epoch': epoch,
    'arch': args.arch,
    'model_config': {
        'num_classes': model.num_classes,
        'use_prelu': model.use_prelu,
        'use_se': model.use_se,
        'weight_scale': model.weight_scale,
        'feature_scale': model.feature_scale,
    },
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'loss': best_loss,
    'prec1': top1.avg,
    'prec5': top5.avg,
}
save_checkpoint(checkpoint, is_best, savedir)
def save_checkpoint(state, is_best, savedir):
    """Serialize a training checkpoint to disk.

    Writes ``state`` to ``<savedir>/checkpoint.pth.tar`` on every call; when
    ``is_best`` is true, additionally copies that file to
    ``<savedir>/<arch>_epoch<N>.pth.tar`` so the best snapshot survives later
    overwrites of the rolling checkpoint.

    Args:
        state: Dict to serialize with ``torch.save``. Must contain the keys
            ``'arch'`` (str) and ``'epoch'`` (int), used to name the best file.
        is_best: Whether this checkpoint is the best seen so far.
        savedir: Target directory; created (including parents) if missing.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs(),
    # which can raise FileExistsError when two processes save concurrently.
    os.makedirs(savedir, exist_ok=True)
    checkpoint_savepath = os.path.join(savedir, 'checkpoint.pth.tar')
    torch.save(state, checkpoint_savepath)
    if is_best:
        # e.g. "SphereFace_epoch12.pth.tar"
        best_savename = '_'.join([state['arch'], 'epoch' + str(state['epoch'])]) + '.pth.tar'
        best_savepath = os.path.join(savedir, best_savename)
        shutil.copyfile(checkpoint_savepath, best_savepath)