import copy
import random
from typing import final

import numpy as np
import torch
from torch.optim import Optimizer


@final
class Checkpoint:
    def __init__(self, epoch: int, accuracy: float, loss: float,
                 model: Net,
                 optimizer: Optimizer
                 ) -> None:
        self.epoch = epoch
        self.accuracy = accuracy
        self.loss = loss
        # deep-copy the state dicts so later training steps don't mutate the snapshot
        self.model = copy.deepcopy(model.state_dict())
        self.optimizer = copy.deepcopy(optimizer.state_dict())
        # capture all RNG states so a restored run continues from the same random streams
        self.rng_state = torch.get_rng_state()
        self.cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
        self.numpy_rng_state = np.random.get_state()
        self.python_rng_state = random.getstate()

    def restore(self, model: Net, optimizer: Optimizer) -> None:
        assert self.model
        model.load_state_dict(self.model)
        assert self.optimizer
        optimizer.load_state_dict(self.optimizer)
        # older checkpoint versions didn't have the RNG states
        if hasattr(self, "rng_state"):
            torch.set_rng_state(self.rng_state)
        if hasattr(self, "cuda_rng_state"):
            if torch.cuda.is_available() and self.cuda_rng_state is not None:
                torch.cuda.set_rng_state(self.cuda_rng_state)
        if hasattr(self, "numpy_rng_state"):
            np.random.set_state(self.numpy_rng_state)
        if hasattr(self, "python_rng_state"):
            random.setstate(self.python_rng_state)
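For context, the round trip looks roughly like this (a simplified sketch, not my exact training script; the epoch/metric values are placeholders and pickling the whole object with torch.save is just one way to persist it):

model = Net()
optimizer = torch.optim.Adadelta(model.parameters())

# ... train for some epochs ...

ckpt = Checkpoint(epoch=10, accuracy=0.97, loss=0.08, model=model, optimizer=optimizer)
torch.save(ckpt, "checkpoint.pt")  # pickles the whole Checkpoint object

# later, possibly in a fresh process
ckpt = torch.load("checkpoint.pt")  # newer PyTorch may need weights_only=False here
model = Net()
optimizer = torch.optim.Adadelta(model.parameters())
ckpt.restore(model, optimizer)  # weights, optimizer state, and RNG states come back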
The lr is confirmed to be the same, but the rate of change of the loss is initially high after restoring, even though the loss had reached a steady-state plateau before saving. It is as though the Adadelta state is being recomputed from scratch, yet I can see that the state is successfully reloaded: the optimizer state dict contains 10 per-parameter entries, corresponding to the 5 layers, and each entry has "step", "square_avg", and "acc_delta" tensors.
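To be concrete about "successfully reloaded", this is the kind of sanity check I mean (a hypothetical snippet continuing from the objects above, not lifted verbatim from my code):

# Compare the deep-copied state stored in the checkpoint with the live optimizer
# after restore; note "step" may be a plain int rather than a tensor.
saved_state = ckpt.optimizer["state"]
restored_state = optimizer.state_dict()["state"]
for param_id, saved in saved_state.items():
    restored = restored_state[param_id]
    for key in ("step", "square_avg", "acc_delta"):
        s, r = saved[key], restored[key]
        if torch.is_tensor(s):
            assert torch.equal(s.cpu(), r.cpu()), f"{param_id}/{key} differs"
        else:
            assert s == r, f"{param_id}/{key} differs"
print("Adadelta state round-trips exactly")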
Any similar experiences?