Here’s a similar problem, but I don’t think it’s an initialization issue; torch’s default initialization should be good enough. Let’s look at the loss first:
# first run: saved the model at epoch 8; the loss at that point
epoch:8 loss:2.6038
# second run: reloaded, then saved again at epoch 10
epoch:8 loss:7.8922
epoch:10 loss:4.8546
# third run: reloaded, then saved again at epoch 27
epoch:10 loss:6.0920
epoch:27 loss:2.1613
# fourth run: reloaded and continued training
epoch:27 loss:6.0920
As you can see, every time I reload the model and run one epoch of training, the loss rises instead of continuing to descend. It’s really strange.

I use gradient accumulation (stepping the optimizer only every few batches) because my machine cannot support a big batch size; the setups were as follows (a minimal sketch of the pattern comes after this list):
- The first time I used four GPUs with a batch size of 16 and backpropagated the accumulated loss every 2 steps.
- The second time I used two GPUs with a batch size of 8 and backpropagated the accumulated loss every 4 steps.
- The third time I used four GPUs with a batch size of 16 and backpropagated the accumulated loss every 2 steps.
- The fourth time I used four GPUs with a batch size of 16 and backpropagated the accumulated loss every 2 steps.
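
By the pattern I mean roughly the following (a minimal sketch, not my exact loop; `accum_steps`, `train_loader`, and `criterion` are placeholder names):

```python
# Minimal gradient-accumulation sketch; accum_steps, train_loader,
# criterion, model, and optimizer are placeholder names.
accum_steps = 2
optimizer.zero_grad()
for i, (images, targets) in enumerate(train_loader):
    loss = criterion(model(images), targets)
    (loss / accum_steps).backward()   # accumulate scaled gradients
    if (i + 1) % accum_steps == 0:
        optimizer.step()              # one update per accum_steps batches
        optimizer.zero_grad()
```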
Looking at the first through third runs, I thought it was a BatchNorm problem: the second run used a different batch size, so the affine parameters gamma and beta (and the running statistics) learned under one batch size might not suit another.
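
(As a quick check that those parameters could even be lost: gamma, beta, and the running statistics are all part of `state_dict()`, so `torch.save` should keep them; for example, on a standalone layer:)

```python
import torch.nn as nn

bn = nn.BatchNorm2d(16)
# gamma is stored as 'weight', beta as 'bias'; the running statistics
# are buffers and are saved alongside them in the state_dict.
print(list(bn.state_dict().keys()))
# -> ['weight', 'bias', 'running_mean', 'running_var', 'num_batches_tracked']
```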
But surprisingly, the fourth run hit a similar situation even though its setup matches the third exactly, so it really confuses me. Did I miss some parameter that should be saved? Here are my saving and reloading scripts. I am sure my use of model.train() and model.eval() is correct.
# saving script
def save_model(self, path, epoch, model, optimizer):
    # if epoch % 4 != 0:
    #     return
    # unwrap DataParallel so keys are saved without the 'module.' prefix
    if isinstance(model, torch.nn.DataParallel):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    torch.save({
        'epoch': epoch,
        'model_state_dict': state_dict,
        'optimizer_state_dict': optimizer.state_dict(),
        'learning_rate': optimizer.state_dict()['param_groups'][0]['lr'],
        # 'loss': loss
    }, path)
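
The only other training state I can think of that this checkpoint omits is the RNG state (and an LR scheduler, if one were used). A sketch of extra entries one could merge into the dict above (the scheduler line is hypothetical, and I’m not sure this is my issue):

```python
import torch

# Extra state one could add to the checkpoint dict above (sketch only).
extra = {
    'torch_rng_state': torch.get_rng_state(),          # CPU RNG
    'cuda_rng_state': torch.cuda.get_rng_state_all(),  # one entry per GPU
    # 'scheduler_state_dict': scheduler.state_dict(),
}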
# reloading script
def load_model(self, model, optimizer, resume, strict=True):
    checkpoint = torch.load(resume, map_location=lambda storage, loc: storage)
    print('loaded weights from {}, epoch {}'.format(resume, checkpoint['epoch']))
    state_dict_ = checkpoint['model_state_dict']
    state_dict = {}
    # strip a leading 'module.' prefix left over from DataParallel
    for k in state_dict_:
        if k.startswith('module') and not k.startswith('module_list'):
            state_dict[k[7:]] = state_dict_[k]
        else:
            state_dict[k] = state_dict_[k]
    model_state_dict = model.state_dict()
    if not strict:
        # fall back to the model's own weights for mismatched or missing keys
        for k in state_dict:
            if k in model_state_dict:
                if state_dict[k].shape != model_state_dict[k].shape:
                    print('Skip loading parameter {}, required shape {}, '
                          'loaded shape {}.'.format(k, model_state_dict[k].shape, state_dict[k].shape))
                    state_dict[k] = model_state_dict[k]
            else:
                print('Drop parameter {}.'.format(k))
        for k in model_state_dict:
            if k not in state_dict:
                print('No param {}.'.format(k))
                state_dict[k] = model_state_dict[k]
    model.load_state_dict(state_dict, strict=False)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # move optimizer state tensors (e.g. Adam moments) back onto the GPU
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cuda()
    epoch = checkpoint['epoch']
    # loss = checkpoint['loss']
    return model, optimizer, epoch
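
To rule out the load path itself, I plan to compare the loss on a single fixed batch right before saving and right after reloading, both in eval mode; if the state round-trips correctly, the two numbers should match exactly. A sketch using the two methods above (`fixed_images`, `fixed_targets`, `criterion`, `fresh_model`, and the `trainer` instance are placeholders):

```python
# Sanity-check sketch (placeholder names throughout): the eval-mode loss
# on one fixed batch should be identical before saving and after reloading.
model.eval()
with torch.no_grad():
    loss_before = criterion(model(fixed_images), fixed_targets).item()

trainer.save_model('check.pth', epoch, model, optimizer)
fresh_model, optimizer, epoch = trainer.load_model(fresh_model, optimizer, 'check.pth')

fresh_model.cuda().eval()  # load_model maps to CPU, so move back to GPU
with torch.no_grad():
    loss_after = criterion(fresh_model(fixed_images), fixed_targets).item()

print(loss_before, loss_after)  # should match if everything was saved and restored
```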