(Using Adadelta) After reloading a checkpoint, the loss exhibits a high rate of change compared to when it was saved

import copy
import random
from typing import final

import numpy as np
import torch
from torch.optim import Optimizer

# Net is my model class (an nn.Module subclass); its definition is omitted here.

@final
class Checkpoint:
    def __init__(self, epoch: int, accuracy: float, loss: float,
                 model: Net, optimizer: Optimizer) -> None:
        self.epoch = epoch
        self.accuracy = accuracy
        self.loss = loss
        # Deep-copy the state dicts so later training steps don't mutate the checkpoint.
        self.model = copy.deepcopy(model.state_dict())
        self.optimizer = copy.deepcopy(optimizer.state_dict())
        # Capture all RNG states so a restored run can continue reproducibly.
        self.rng_state = torch.get_rng_state()
        self.cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
        self.numpy_rng_state = np.random.get_state()
        self.python_rng_state = random.getstate()

    def restore(self, model: Net, optimizer: Optimizer) -> None:
        assert self.model
        model.load_state_dict(self.model)
        assert self.optimizer
        optimizer.load_state_dict(self.optimizer)
        # Older checkpoints were saved before the RNG states were added, hence the hasattr guards.
        if hasattr(self, "rng_state"):
            torch.set_rng_state(self.rng_state)
        if hasattr(self, "cuda_rng_state"):
            if torch.cuda.is_available() and self.cuda_rng_state is not None:
                torch.cuda.set_rng_state(self.cuda_rng_state)
        if hasattr(self, "numpy_rng_state"):
            np.random.set_state(self.numpy_rng_state)
        if hasattr(self, "python_rng_state"):
            random.setstate(self.python_rng_state)
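
For context, a minimal sketch of the save/restore round trip, assuming the Checkpoint object is pickled with torch.save (the file name, lr value, and loop variables below are placeholders, not my real training code):

# Minimal round-trip sketch (file name and variables are placeholders).
ckpt = Checkpoint(epoch, accuracy, loss, model, optimizer)
torch.save(ckpt, "checkpoint.pt")

# ... later, possibly in a fresh process ...
ckpt = torch.load("checkpoint.pt")
model = Net()
optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0)
ckpt.restore(model, optimizer)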

The learning rate is confirmed to be the same, but the rate of change of the loss is initially high, even though the loss had reached a steady-state plateau before saving. It is as though the Adadelta state is being recomputed from scratch, yet I can see that the state is successfully reloaded: there are 10 state entries (one per parameter tensor), corresponding to 5 layers, and each entry has "step", "square_avg", and "acc_delta" tensors.
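
By "successfully reloaded" I mean a check along the lines of the sketch below, run right after restore (here `optimizer` is assumed to be the Adadelta instance that was just restored into):

# Inspect the restored Adadelta state (sketch; run after ckpt.restore(model, optimizer)).
state = optimizer.state_dict()["state"]
print(len(state))  # 10 entries, presumably weight and bias for each of the 5 layers
for idx, s in state.items():
    print(idx, s["step"], s["square_avg"].shape, s["acc_delta"].shape)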

Any similar experiences?