from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    ShardedOptimStateDictConfig,
    ShardedStateDictConfig,
    StateDictType,
)

log_gpu_memory_usage('fsdp checkpoint load 1.2', logger=None)
# Load the sharded model/optimizer state dicts with CPU offload enabled.
state_dict_cfg = ShardedStateDictConfig(offload_to_cpu=True)
optim_cfg = ShardedOptimStateDictConfig(offload_to_cpu=True)
with FSDP.state_dict_type(self.model, StateDictType.SHARDED_STATE_DICT, state_dict_cfg, optim_cfg):
    self.model.load_state_dict(model_state_dict)
    if self.optimizer is not None:
        self.optimizer.load_state_dict(optimizer_state_dict)
log_gpu_memory_usage('fsdp checkpoint load 1.3', logger=None)
Running this, I got the following output:
fsdp checkpoint load 1.2, memory allocated (GB): 52.07300519943237, memory reserved (GB): 61.4921875
fsdp checkpoint load 1.3, memory allocated (GB): 65.82867097854614, memory reserved (GB): 67.294921875
This does not match my expectations: why does loading the parameters increase GPU memory usage? Is this a bug, or is there something I'm not aware of?
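
For context, here is a minimal check I would run to see whether the extra usage is memory that is actually held after loading or just allocator caching. This is only a sketch under the assumption that model_state_dict and optimizer_state_dict are the checkpoint dicts from the snippet above and are no longer needed afterwards; the label strings are made up.

import gc
import torch

def report(tag: str) -> None:
    # memory_allocated counts live tensors; memory_reserved counts what the
    # caching allocator holds, which is what the log lines above report.
    alloc = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f'{tag}: allocated {alloc:.2f} GB, reserved {reserved:.2f} GB')

report('after load_state_dict')

# Drop the checkpoint dicts and flush the caching allocator. If "allocated"
# falls back toward the pre-load value, the increase came from temporary
# copies made during loading rather than memory permanently held by the
# model/optimizer.
del model_state_dict, optimizer_state_dict  # assumed names from the snippet above
gc.collect()
torch.cuda.empty_cache()
report('after del + empty_cache')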