Hi, I can't resume properly from a checkpoint. Each time I try to resume, the training statistics seem to be either invalid or missing, because the accuracy becomes very bad.
For instance, I save a checkpoint at epoch 80, where I get 62.5% accuracy. When I resume from this very checkpoint, the accuracy drops to 34%!
What am I doing wrong here? Here is the snippet for saving and resuming:
# Optionally resume from a checkpoint: only attempted when a path was given
# via args.resume.
if args.resume:
    if not os.path.isfile(args.resume):
        # Path was supplied but is not a regular file; log it and carry on
        # without restoring anything.
        print_log(
            "=> no checkpoint found at '{}'".format(args.resume),
            log,
        )
    else:
        print_log(
            "=> loading checkpoint '{}'".format(args.resume),
            log,
        )
        checkpoint = torch.load(args.resume)

        # Restore everything stored in the checkpoint dict: the epoch to
        # continue from, the best_prec1 value, the model weights and the
        # optimizer state.
        args.start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])

        print_log(
            "=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']
            ),
            log,
        )
# Persist the training state. The stored epoch is epoch + 1, i.e. the value
# that is read back into args.start_epoch when resuming.
save_checkpoint(
    state=dict(
        epoch=epoch + 1,
        arch=args.arch,
        state_dict=model.state_dict(),
        best_prec1=best_prec1,
        optimizer=optimizer.state_dict(),
    ),
    is_best=is_best,
    filename=filename,
    bestname=bestname,
)

# Record the time elapsed since start_time, then reset the reference point.
epoch_time.update(time.time() - start_time)
start_time = time.time()
def save_checkpoint(state, is_best, filename, bestname):
    """Write ``state`` to ``filename`` and optionally keep a "best" copy.

    Args:
        state: Object handed to ``torch.save``.
        is_best: When truthy, the file just written is also copied to
            ``bestname``.
        filename: Destination path for the checkpoint.
        bestname: Destination path for the copy made when ``is_best`` holds.
    """
    torch.save(state, filename)
    if not is_best:
        return
    # Duplicate the freshly written checkpoint under the "best" name.
    shutil.copyfile(filename, bestname)
Any help is greatly appreciated.