Saving and loading modules

Hi,
I am currently running on Google Colab. I have saved my model during training to a directory on Drive, but when I reload it I get the following error (saving the model does not raise any error):

Traceback (most recent call last):
  File "multi_task_game_v3.py", line 215, in <module>
    main()
  File "multi_task_game_v3.py", line 205, in main
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  File "/usr/local/lib/python3.7/dist-packages/torch/optim/optimizer.py", line 135, in load_state_dict
    state_dict = deepcopy(state_dict)
  File "/usr/lib/python3.7/copy.py", line 150, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python3.7/copy.py", line 241, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python3.7/copy.py", line 150, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python3.7/copy.py", line 241, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python3.7/copy.py", line 150, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python3.7/copy.py", line 241, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python3.7/copy.py", line 161, in deepcopy
    y = copier(memo)
  File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 63, in __deepcopy__
    new_storage = self.storage().__deepcopy__(memo)
  File "/usr/local/lib/python3.7/dist-packages/torch/storage.py", line 48, in __deepcopy__
    new_storage = self.clone()
  File "/usr/local/lib/python3.7/dist-packages/torch/storage.py", line 64, in clone
    return type(self)(self.size()).copy_(self)
  File "/usr/local/lib/python3.7/dist-packages/torch/cuda/__init__.py", line 484, in _lazy_new
    return super(_CudaBase, cls).__new__(cls, *args, **kwargs)
RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 15.90 GiB total capacity; 14.87 GiB already allocated; 105.75 MiB free; 14.92 GiB reserved in total by PyTorch)

The code is as follows:

def train(args, model, device, train_loader, optimizer, epoch, criterion, metrices,
          train_path):
    model.train()
    input_array = []
    loss_a1 = 0.0
    loss_a2 = 0.0
    const = 16

    for batch_idx, (images, tasks, targets, similarity_bits, summary, roles, index) in enumerate(train_loader):
        optimizer.zero_grad()
        tasks = tasks.float().to(device)

        targets_a1, targets_a2, out_agent1, out_agent2 = do(images, tasks, targets, similarity_bits,
                                                            summary, roles, index, device, model,
                                                            epoch, metrices, args)

        loss_a1 = criterion(out_agent1, targets_a1)
        loss_a2 = criterion(out_agent2, targets_a2)
        total_loss = torch.mean(loss_a1 + loss_a2)
        #metrices.record_loss(torch.add(loss_a1, loss_a2).detach())
        total_loss.backward()
        optimizer.step()

        lr_d = optimizer.param_groups[0]['lr']

        l_con, l_cat = metrices.get_loss(loss_a1.detach(), loss_a2.detach(), tasks.detach())
        F1_con, F1_cat = metrices.get_F1(targets_a1.detach(), targets_a2.detach(), out_agent1.detach(), out_agent2.detach(), tasks.detach())
        F1_con = np.mean(np.asarray(F1_con))
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, \tF1: {:.6f}, \tLr:{:.5f}'.format(
                epoch, batch_idx * len(index), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), total_loss.item(), F1_con, lr_d))

    metrices.per_epoch()
    if epoch % args.srate == 0:
        metrices.print(epoch)
        # save model and optimizer state so training can be resumed later
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }, train_path + 'model.pt')

    metrices.reinitialize()


wd = os.getcwd()
os.makedirs(wd+"/"+args.name+"/train/", exist_ok = True)
os.makedirs(wd+"/"+args.name+"/test/", exist_ok = True)
train_path = wd+"/"+args.name+"/train/"
train_metrices = metrices(wd+"/"+args.name+"/train/", args.com_d, args.steps)
test_metrices = metrices(wd+"/"+args.name+"/test/", args.com_d, args.steps)
epochs_elapsed = 1
if args.restore:
    checkpoint = torch.load(train_path + 'model.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epochs_elapsed = checkpoint['epoch']

print(args.lr)
for epoch in range(epochs_elapsed, args.epochs + 1):
    scheduler.step()
    train(args, model, device, train_loader, optimizer, epoch, criterion, train_metrices, train_path)
    test(args, model, device, test_loader, epoch, criterion, test_metrices)

if __name__ == '__main__':
  main()

Your GPU might already be using most of its device memory, and loading the stored state_dict allocates additional CUDA tensors, which could cause the out-of-memory error. You might need to restart the notebook and load the state_dict at the very beginning, before the GPU memory fills up.
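Another option that often helps on Colab is to map the checkpoint to the CPU when loading it, so that torch.load (and the deepcopy inside optimizer.load_state_dict, which is where your traceback fails) does not have to allocate new CUDA tensors while the GPU is already full. A minimal sketch of that restore step, reusing the names from your code:

import torch

# Load the checkpoint onto the CPU first so no new CUDA tensors are
# allocated while the GPU is already (almost) full.
checkpoint = torch.load(train_path + 'model.pt', map_location='cpu')

model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)  # move the restored model to the GPU afterwards
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epochs_elapsed = checkpoint['epoch']

If tensors from a previous run are still alive on the GPU, restarting the runtime (or deleting the old model/optimizer and calling torch.cuda.empty_cache()) before loading is the safest way to free that memory.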
