Resuming training raises error "CUDA out of memory"

After I trained my model for 1 epoch, I interrupted the process from the terminal with CTRL+Z.
When I tried to resume the training, I got this error:

THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
  File "train.py", line 174, in <module>
    train(train_loader, model, optimizer, epoch)
  File "train.py", line 97, in train
    loss1 = CE(atts, gts)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/loss.py", line 500, in forward
    reduce=self.reduce)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/functional.py", line 1516, in binary_cross_entropy_with_logits
    max_val = (-input).clamp(min=0)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu:58

The code that manages everything is this:

import wandb
import torch
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np
import pdb, os, argparse
from datetime import datetime

from model.CPD_models import CPD_VGG
from model.CPD_ResNet_models import CPD_ResNet
from data import get_loader
from utils import clip_gradient, adjust_lr


parser = argparse.ArgumentParser()
parser.add_argument('--epoch', type=int, default=10, help='epoch number')
parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
parser.add_argument('--batchsize', type=int, default=1, help='training batch size')
parser.add_argument('--trainsize', type=int, default=352, help='training dataset size')
parser.add_argument('--clip', type=float, default=0.5, help='gradient clipping margin')
parser.add_argument('--is_ResNet', type=bool, default=False, help='VGG or ResNet backbone')
parser.add_argument('--decay_rate', type=float, default=0.1, help='decay rate of learning rate')
parser.add_argument('--decay_epoch', type=int, default=50, help='every n epochs decay learning rate')
parser.add_argument('--model_id', type=str, required=True, help='required unique id for trained model name')
parser.add_argument('--resume', type=str, default='', help='path to resume model training from checkpoint')
parser.add_argument('--wandb', type=bool, default=False, help='enable wandb tracking model training')
opt = parser.parse_args()

model_id = opt.model_id
WANDB_EN = opt.wandb
if WANDB_EN:
	wandb.init(entity="albytree", project="cpd-train")

# Add all parsed config in one line
if WANDB_EN:
	wandb.config.update(opt)
tot_epochs = opt.epoch
print("Training Info")
print("EPOCHS: {}".format(opt.epoch))
print("LEARNING RATE: {}".format(opt.lr))
print("BATCH SIZE: {}".format(opt.batchsize))
print("TRAIN SIZE: {}".format(opt.trainsize))
print("CLIP: {}".format(opt.clip))
print("USING ResNet backbone: {}".format(opt.is_ResNet))
print("DECAY RATE: {}".format(opt.decay_rate))
print("DECAY EPOCH: {}".format(opt.decay_epoch))
print("MODEL ID: {}".format(opt.model_id))

# build models
if opt.is_ResNet:
	model = CPD_ResNet()
else:
	model = CPD_VGG()

model.cuda()
params = model.parameters()
optimizer = torch.optim.Adam(params, opt.lr)
# If no previous training, 0 epochs passed
last_epoch = 0
resume_model_path = opt.resume
if resume_model_path:
	print("Loading previous trained model:"+resume_model_path)
	checkpoint = torch.load(resume_model_path)
	model.load_state_dict(checkpoint['model_state_dict'])
	optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
	last_epoch = checkpoint['epoch']
	last_loss = checkpoint['loss']

dataset_name = 'ECSSD'
image_root = '../../DATASETS/TEST/'+dataset_name+'/im/'
gt_root = '../../DATASETS/TEST/'+dataset_name+'/gt/'
train_loader = get_loader(image_root, gt_root, batchsize=opt.batchsize, trainsize=opt.trainsize)
total_step = len(train_loader)
print("Total step per epoch: {}".format(total_step))

CE = torch.nn.BCEWithLogitsLoss()

####################################################################################################

def train(train_loader, model, optimizer, epoch):
	model.train()
	for i, pack in enumerate(train_loader, start=1):
		optimizer.zero_grad()
		images, gts = pack
		images = Variable(images)
		gts = Variable(gts)
		images = images.cuda()
		gts = gts.cuda()

		atts, dets = model(images)
		loss1 = CE(atts, gts)
		loss2 = CE(dets, gts)
		loss = loss1 + loss2
		loss.backward()

		clip_gradient(optimizer, opt.clip)
		optimizer.step()
		if WANDB_EN:
			wandb.log({'Loss': loss})
		if i % 100 == 0 or i == total_step:
			print('{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], Loss1: {:.4f} Loss2: {:0.4f}'.
				  format(datetime.now(), epoch, opt.epoch, i, total_step, loss1.data, loss2.data))

	# Save model and optimizer training data
	trained_model_data = {
		'model_state_dict': model.state_dict(),
		'optimizer_state_dict': optimizer.state_dict(),
		'epoch': epoch,
		'loss': loss
	}

	if opt.is_ResNet:
		save_path = 'models/CPD_Resnet/'
	else:
		save_path = 'models/CPD_VGG/'

	if not os.path.exists(save_path):
		print("Making trained model folder [{}]".format(save_path))
		os.makedirs(save_path)

	torch_model_ext = '.pth'
	wandb_model_ext = '.h5'
	model_unique_id = model_id+'_'+'ep'+'_'+'%d' % epoch
	trained_model_name = 'CPD_train' 
	save_full_path_torch = save_path + trained_model_name + '_' + model_unique_id + torch_model_ext 
	save_full_path_wandb = save_path + trained_model_name + '_' + model_unique_id + wandb_model_ext
	if os.path.exists(save_full_path_torch):
		print("Torch model with name ["+save_full_path_torch+"] already exists!")
		answ = raw_input("Do you want to replace it? [y/n] ")
		if("y" in answ):
			torch.save(trained_model_data, save_full_path_torch) 
			print("Saved torch model in "+save_full_path_torch)
	else:
			torch.save(trained_model_data, save_full_path_torch) 
			print("Saved torch model in "+save_full_path_torch)

	if WANDB_EN:
		if os.path.exists(save_full_path_wandb):	
			print("Wandb model with name ["+save_full_path_wandb+"] already exists!")
			answ = raw_input("Do you want to replace it? [y/n] ")
			if("y" in answ):
				wandb.save(save_full_path_wandb)
				print("Saved wandb model in "+save_full_path_wandb)
		else:
				wandb.save(save_full_path_wandb)
				print("Saved wandb model in "+save_full_path_wandb)


####################################################################################################

print("Training on dataset: "+dataset_name)
print("Train images path: "+image_root)
print("Train gt path: "+gt_root)
print("Let's go!")

if WANDB_EN:
	wandb.watch(model, log="all")
for epoch in range(last_epoch+1, tot_epochs+1):
	adjust_lr(optimizer, opt.lr, epoch, opt.decay_rate, opt.decay_epoch)
	train(train_loader, model, optimizer, epoch)
print("TRAINING DONE!")

It seems that there's something wrong with the loss, but I cannot understand what the problem is.

Are you only seeing the out of memory error after resuming the suspended process?
Is your script working fine if you don't suspend the training?

I trained the model for 2 epochs without errors and then I interrupted the process.
I also killed the process that was left in GPU memory.
When I then tried to resume from the models saved at epoch 1 and epoch 2, I got the same CUDA error, but in a different part of the code:

THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
  File "train.py", line 191, in <module>
    train(train_loader, model, optimizer, epoch)
  File "train.py", line 112, in train
    atts, dets = model(images)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/albytree/TESI/CODICE/Workspace/ALGS/CPD/model/CPD_models.py", line 131, in forward
    detection = self.agg2(x5_2, x4_2, x3_2)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/albytree/TESI/CODICE/Workspace/ALGS/CPD/model/CPD_models.py", line 86, in forward
    x3_2 = torch.cat((x3_1, self.conv_upsample5(self.upsample(x2_2))), 1)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu:58

Moreover, I tried to test the models saved at epoch 1 and epoch 2 and got this error:

Traceback (most recent call last):
  File "test.py", line 45, in <module>
    model.load_state_dict(torch.load(opt.model_path))
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 721, in load_state_dict
    self.__class__.__name__, "\n\t".join(error_msgs)))
RuntimeError: Error(s) in loading state_dict for CPD_VGG:
	Missing key(s) in state_dict: "vgg.conv1.conv1_1.bias", "vgg.conv1.conv1_1.weight", "vgg.conv1.conv1_2.bias", "vgg.conv1.conv1_2.weight", "vgg.conv2.conv2_1.bias", "vgg.conv2.conv2_1.weight", "vgg.conv2.conv2_2.bias", "vgg.conv2.conv2_2.weight", "vgg.conv3.conv3_1.bias", "vgg.conv3.conv3_1.weight", "vgg.conv3.conv3_2.bias", "vgg.conv3.conv3_2.weight", "vgg.conv3.conv3_3.bias", "vgg.conv3.conv3_3.weight", "vgg.conv4_1.conv4_1_1.bias", "vgg.conv4_1.conv4_1_1.weight", "vgg.conv4_1.conv4_2_1.bias", "vgg.conv4_1.conv4_2_1.weight", "vgg.conv4_1.conv4_3_1.bias", "vgg.conv4_1.conv4_3_1.weight", "vgg.conv5_1.conv5_1_1.bias", "vgg.conv5_1.conv5_1_1.weight", "vgg.conv5_1.conv5_2_1.bias", "vgg.conv5_1.conv5_2_1.weight", "vgg.conv5_1.conv5_3_1.bias", "vgg.conv5_1.conv5_3_1.weight", "vgg.conv4_2.conv4_1_2.bias", "vgg.conv4_2.conv4_1_2.weight", "vgg.conv4_2.conv4_2_2.bias", "vgg.conv4_2.conv4_2_2.weight", "vgg.conv4_2.conv4_3_2.bias", "vgg.conv4_2.conv4_3_2.weight", "vgg.conv5_2.conv5_1_2.bias", "vgg.conv5_2.conv5_1_2.weight", "vgg.conv5_2.conv5_2_2.bias", "vgg.conv5_2.conv5_2_2.weight", "vgg.conv5_2.conv5_3_2.bias", "vgg.conv5_2.conv5_3_2.weight", "rfb3_1.branch0.0.bias", "rfb3_1.branch0.0.weight", "rfb3_1.branch1.0.bias", "rfb3_1.branch1.0.weight", "rfb3_1.branch1.1.bias", "rfb3_1.branch1.1.weight", "rfb3_1.branch1.2.bias", "rfb3_1.branch1.2.weight", "rfb3_1.branch1.3.bias", "rfb3_1.branch1.3.weight", "rfb3_1.branch2.0.bias", "rfb3_1.branch2.0.weight", "rfb3_1.branch2.1.bias", "rfb3_1.branch2.1.weight", "rfb3_1.branch2.2.bias", "rfb3_1.branch2.2.weight", "rfb3_1.branch2.3.bias", "rfb3_1.branch2.3.weight", "rfb3_1.branch3.0.bias", "rfb3_1.branch3.0.weight", "rfb3_1.branch3.1.bias", "rfb3_1.branch3.1.weight", "rfb3_1.branch3.2.bias", "rfb3_1.branch3.2.weight", "rfb3_1.branch3.3.bias", "rfb3_1.branch3.3.weight", "rfb3_1.conv_cat.bias", "rfb3_1.conv_cat.weight", "rfb3_1.conv_res.bias", "rfb3_1.conv_res.weight", "rfb4_1.branch0.0.bias", "rfb4_1.branch0.0.weight", "rfb4_1.branch1.0.bias", "rfb4_1.branch1.0.weight", "rfb4_1.branch1.1.bias", "rfb4_1.branch1.1.weight", "rfb4_1.branch1.2.bias", "rfb4_1.branch1.2.weight", "rfb4_1.branch1.3.bias", "rfb4_1.branch1.3.weight", "rfb4_1.branch2.0.bias", "rfb4_1.branch2.0.weight", "rfb4_1.branch2.1.bias", "rfb4_1.branch2.1.weight", "rfb4_1.branch2.2.bias", "rfb4_1.branch2.2.weight", "rfb4_1.branch2.3.bias", "rfb4_1.branch2.3.weight", "rfb4_1.branch3.0.bias", "rfb4_1.branch3.0.weight", "rfb4_1.branch3.1.bias", "rfb4_1.branch3.1.weight", "rfb4_1.branch3.2.bias", "rfb4_1.branch3.2.weight", "rfb4_1.branch3.3.bias", "rfb4_1.branch3.3.weight", "rfb4_1.conv_cat.bias", "rfb4_1.conv_cat.weight", "rfb4_1.conv_res.bias", "rfb4_1.conv_res.weight", "rfb5_1.branch0.0.bias", "rfb5_1.branch0.0.weight", "rfb5_1.branch1.0.bias", "rfb5_1.branch1.0.weight", "rfb5_1.branch1.1.bias", "rfb5_1.branch1.1.weight", "rfb5_1.branch1.2.bias", "rfb5_1.branch1.2.weight", "rfb5_1.branch1.3.bias", "rfb5_1.branch1.3.weight", "rfb5_1.branch2.0.bias", "rfb5_1.branch2.0.weight", "rfb5_1.branch2.1.bias", "rfb5_1.branch2.1.weight", "rfb5_1.branch2.2.bias", "rfb5_1.branch2.2.weight", "rfb5_1.branch2.3.bias", "rfb5_1.branch2.3.weight", "rfb5_1.branch3.0.bias", "rfb5_1.branch3.0.weight", "rfb5_1.branch3.1.bias", "rfb5_1.branch3.1.weight", "rfb5_1.branch3.2.bias", "rfb5_1.branch3.2.weight", "rfb5_1.branch3.3.bias", "rfb5_1.branch3.3.weight", "rfb5_1.conv_cat.bias", "rfb5_1.conv_cat.weight", "rfb5_1.conv_res.bias", "rfb5_1.conv_res.weight", "agg1.conv_upsample1.bias", "agg1.conv_upsample1.weight", 
"agg1.conv_upsample2.bias", "agg1.conv_upsample2.weight", "agg1.conv_upsample3.bias", "agg1.conv_upsample3.weight", "agg1.conv_upsample4.bias", "agg1.conv_upsample4.weight", "agg1.conv_upsample5.bias", "agg1.conv_upsample5.weight", "agg1.conv_concat2.bias", "agg1.conv_concat2.weight", "agg1.conv_concat3.bias", "agg1.conv_concat3.weight", "agg1.conv4.bias", "agg1.conv4.weight", "agg1.conv5.bias", "agg1.conv5.weight", "rfb3_2.branch0.0.bias", "rfb3_2.branch0.0.weight", "rfb3_2.branch1.0.bias", "rfb3_2.branch1.0.weight", "rfb3_2.branch1.1.bias", "rfb3_2.branch1.1.weight", "rfb3_2.branch1.2.bias", "rfb3_2.branch1.2.weight", "rfb3_2.branch1.3.bias", "rfb3_2.branch1.3.weight", "rfb3_2.branch2.0.bias", "rfb3_2.branch2.0.weight", "rfb3_2.branch2.1.bias", "rfb3_2.branch2.1.weight", "rfb3_2.branch2.2.bias", "rfb3_2.branch2.2.weight", "rfb3_2.branch2.3.bias", "rfb3_2.branch2.3.weight", "rfb3_2.branch3.0.bias", "rfb3_2.branch3.0.weight", "rfb3_2.branch3.1.bias", "rfb3_2.branch3.1.weight", "rfb3_2.branch3.2.bias", "rfb3_2.branch3.2.weight", "rfb3_2.branch3.3.bias", "rfb3_2.branch3.3.weight", "rfb3_2.conv_cat.bias", "rfb3_2.conv_cat.weight", "rfb3_2.conv_res.bias", "rfb3_2.conv_res.weight", "rfb4_2.branch0.0.bias", "rfb4_2.branch0.0.weight", "rfb4_2.branch1.0.bias", "rfb4_2.branch1.0.weight", "rfb4_2.branch1.1.bias", "rfb4_2.branch1.1.weight", "rfb4_2.branch1.2.bias", "rfb4_2.branch1.2.weight", "rfb4_2.branch1.3.bias", "rfb4_2.branch1.3.weight", "rfb4_2.branch2.0.bias", "rfb4_2.branch2.0.weight", "rfb4_2.branch2.1.bias", "rfb4_2.branch2.1.weight", "rfb4_2.branch2.2.bias", "rfb4_2.branch2.2.weight", "rfb4_2.branch2.3.bias", "rfb4_2.branch2.3.weight", "rfb4_2.branch3.0.bias", "rfb4_2.branch3.0.weight", "rfb4_2.branch3.1.bias", "rfb4_2.branch3.1.weight", "rfb4_2.branch3.2.bias", "rfb4_2.branch3.2.weight", "rfb4_2.branch3.3.bias", "rfb4_2.branch3.3.weight", "rfb4_2.conv_cat.bias", "rfb4_2.conv_cat.weight", "rfb4_2.conv_res.bias", "rfb4_2.conv_res.weight", "rfb5_2.branch0.0.bias", "rfb5_2.branch0.0.weight", "rfb5_2.branch1.0.bias", "rfb5_2.branch1.0.weight", "rfb5_2.branch1.1.bias", "rfb5_2.branch1.1.weight", "rfb5_2.branch1.2.bias", "rfb5_2.branch1.2.weight", "rfb5_2.branch1.3.bias", "rfb5_2.branch1.3.weight", "rfb5_2.branch2.0.bias", "rfb5_2.branch2.0.weight", "rfb5_2.branch2.1.bias", "rfb5_2.branch2.1.weight", "rfb5_2.branch2.2.bias", "rfb5_2.branch2.2.weight", "rfb5_2.branch2.3.bias", "rfb5_2.branch2.3.weight", "rfb5_2.branch3.0.bias", "rfb5_2.branch3.0.weight", "rfb5_2.branch3.1.bias", "rfb5_2.branch3.1.weight", "rfb5_2.branch3.2.bias", "rfb5_2.branch3.2.weight", "rfb5_2.branch3.3.bias", "rfb5_2.branch3.3.weight", "rfb5_2.conv_cat.bias", "rfb5_2.conv_cat.weight", "rfb5_2.conv_res.bias", "rfb5_2.conv_res.weight", "agg2.conv_upsample1.bias", "agg2.conv_upsample1.weight", "agg2.conv_upsample2.bias", "agg2.conv_upsample2.weight", "agg2.conv_upsample3.bias", "agg2.conv_upsample3.weight", "agg2.conv_upsample4.bias", "agg2.conv_upsample4.weight", "agg2.conv_upsample5.bias", "agg2.conv_upsample5.weight", "agg2.conv_concat2.bias", "agg2.conv_concat2.weight", "agg2.conv_concat3.bias", "agg2.conv_concat3.weight", "agg2.conv4.bias", "agg2.conv4.weight", "agg2.conv5.bias", "agg2.conv5.weight", "HA.gaussian_kernel". 
	Unexpected key(s) in state_dict: "loss", "optimizer_state_dict", "model_state_dict", "epoch".

Maybe I'm not saving the states as intended?
The weird thing is that before adding the resume-training code I was saving the model at every epoch with just torch.save(model.state_dict(), save_full_path_torch): I trained the model for 10 epochs that way and it still works during testing.

Could you check the memory usage of the GPU before running the "resume training" script?
CTRL+Z will suspend, but not kill, the process, so you should make sure that you have enough memory on your device for further training.
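If you want to check it from inside Python as well, something like this should work (just a sketch; torch.cuda.memory_cached was renamed to torch.cuda.memory_reserved in later releases):

import torch

# Rough check of how much memory this process holds on GPU 0.
# Memory held by other (e.g. suspended) processes only shows up in nvidia-smi.
print("allocated: %.1f MiB" % (torch.cuda.memory_allocated(0) / 1024.0 ** 2))
print("cached:    %.1f MiB" % (torch.cuda.memory_cached(0) / 1024.0 ** 2))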

For the state_dict error:
It seems you have stored more than the model.state_dict() in the checkpoint, but are trying to load this checkpoint directly via model.load_state_dict.
Try to index the checkpoint dict via:

model.load_state_dict(checkpoint['model_state_dict'])
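i.e. in your test script, something along these lines (a rough sketch using the keys written by your train.py):

checkpoint = torch.load(opt.model_path)
# index the dict instead of passing the whole checkpoint to load_state_dict
model.load_state_dict(checkpoint['model_state_dict'])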

This is the GPU state before the first 2 epochs:

Thu Apr 30 22:00:46 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 750 Ti  On   | 00000000:01:00.0  On |                  N/A |
| 34%   34C    P0     2W /  38W |    237MiB /  2000MiB |      1%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1606      G   /usr/lib/xorg/Xorg                            15MiB |
|    0      1698      G   /usr/bin/gnome-shell                          48MiB |
|    0      2057      G   /usr/lib/xorg/Xorg                            90MiB |
|    0      2190      G   /usr/bin/gnome-shell                          77MiB |
+-----------------------------------------------------------------------------+


This is the GPU state during the 2-epoch training:

Thu Apr 30 22:01:10 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 750 Ti  On   | 00000000:01:00.0  On |                  N/A |
| 34%   42C    P0    36W /  38W |   1858MiB /  2000MiB |    100%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1606      G   /usr/lib/xorg/Xorg                            15MiB |
|    0      1698      G   /usr/bin/gnome-shell                          48MiB |
|    0      2057      G   /usr/lib/xorg/Xorg                            90MiB |
|    0      2190      G   /usr/bin/gnome-shell                          84MiB |
|    0     11215      C   python                                      1603MiB |
+-----------------------------------------------------------------------------+

This is the GPU state after I suspended the process with CTRL+Z and killed the process left in GPU memory with the command nvidia-smi | grep 'python' | awk '{ print $3 }' | xargs -n1 kill -9:

Thu Apr 30 22:18:14 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 750 Ti  On   | 00000000:01:00.0  On |                  N/A |
| 34%   44C    P0     3W /  38W |    244MiB /  2000MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1606      G   /usr/lib/xorg/Xorg                            15MiB |
|    0      1698      G   /usr/bin/gnome-shell                          48MiB |
|    0      2057      G   /usr/lib/xorg/Xorg                            90MiB |
|    0      2190      G   /usr/bin/gnome-shell                          84MiB |
+-----------------------------------------------------------------------------+

I still got the CUDA error, but the point in the code where it happened changed once again:

THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
  File "train.py", line 191, in <module>
    train(train_loader, model, optimizer, epoch)
  File "train.py", line 112, in train
    atts, dets = model(images)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/albytree/TESI/CODICE/Workspace/ALGS/CPD/model/CPD_models.py", line 126, in forward
    x4_2 = self.vgg.conv4_2(x3_2)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/container.py", line 91, in forward
    input = module(input)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/activation.py", line 46, in forward
    return F.threshold(input, self.threshold, self.value, self.inplace)
  File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/functional.py", line 601, in threshold
    return torch._C._nn.threshold(input, threshold, value)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu:58

I don't think the problem is insufficient memory to resume, because this error occurs even when resuming from epoch 1 after training for 2 epochs.

About the testing: I hadn't yet updated the test code with

model.load_state_dict(checkpoint['model_state_dict'])

After doing so, it now works even with the models from the interrupted trainings.

If I understand it correctly, you are not seeing the out of memory issue if you are not loading the state_dict?
If so, could you push the model to the CPU before saving the state_dicts or use torch.load(checkpoint_path, map_location='cpu')?
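Something along these lines when resuming, for example (a rough sketch; the loop at the end moves the optimizer state back to the GPU, which may or may not be needed depending on the PyTorch version):

# Load everything onto the CPU first, then move it back to the GPU.
checkpoint = torch.load(resume_model_path, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
last_epoch = checkpoint['epoch']
del checkpoint                 # drop the extra CPU copy before training starts
model.cuda()                   # parameters back to the GPU
for state in optimizer.state.values():
    for k, v in state.items():
        if torch.is_tensor(v):
            state[k] = v.cuda()  # Adam buffers (exp_avg, exp_avg_sq) back to the GPU as well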

Sorry, I might have just confused you: I meant that your solution to the state_dict error should've been obvious to me, and I should've simply updated the testing code (another script) with it instead of asking.
By testing I mean using a trained model to make predictions on a dataset.

To recap what happens:

  • During training I get no error
  • If I interrupt the training during an epoch and then try to resume it after a completed epoch, I get the error
  • If I test a model, whether its training was interrupted or not, the testing works

EDIT:

Using the pdb Python debugger I stepped through the train function while resuming and found something.
When resuming from epoch 2, the GPU state after epoch 3 step 1 is:

Fri May  1 10:56:13 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 750 Ti  On   | 00000000:01:00.0  On |                  N/A |
| 34%   33C    P0     1W /  38W |   1897MiB /  2000MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1651      G   /usr/lib/xorg/Xorg                            15MiB |
|    0      1685      G   /usr/bin/gnome-shell                          48MiB |
|    0      2214      G   /usr/lib/xorg/Xorg                            88MiB |
|    0      2347      G   /usr/bin/gnome-shell                          94MiB |
|    0     23531      C   python                                      1633MiB |
+-----------------------------------------------------------------------------+

At epoch 3 step 2, after executing atts, dets = model(images), the GPU state is:

Fri May  1 10:56:33 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 750 Ti  On   | 00000000:01:00.0  On |                  N/A |
| 34%   34C    P0     3W /  38W |   1996MiB /  2000MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1651      G   /usr/lib/xorg/Xorg                            15MiB |
|    0      1685      G   /usr/bin/gnome-shell                          48MiB |
|    0      2214      G   /usr/lib/xorg/Xorg                            88MiB |
|    0      2347      G   /usr/bin/gnome-shell                          94MiB |
|    0     23531      C   python                                      1732MiB |
+-----------------------------------------------------------------------------+

and I get the error (the same happens if I resume from epoch 1): at least one step of the new epoch completes, but on the next one I run out of memory.

Does this OOM only occur if you suspend the process, or also if you let it train for 3 epochs?

I trained the model for 3 epochs, interrupted the training, and got the same error when I tried to resume from epoch 3.

Is there anything else I can do to solve this problem?

I’ve got the same problem.

I never had any problems with restoring models, but I recently increased the size of my model. I was able to train it for 20k iterations without any problems or errors. Now I want to resume the training process, but I get a CUDA out of memory error even though I'm resuming on the same PC and the GPU's VRAM is empty. It seems like restoring the model requires more VRAM than simply starting a new run. It does look different from a "normal" CUDA out of memory error, though. I'm using PyTorch 1.6 and mixed precision. Maybe that's the reason? Here is the stack trace:

Traceback (most recent call last):
  File "scripts/train.py", line 449, in <module>
    main(sys.argv[1:])
  File "scripts/train.py", line 230, in main
    scaled_loss.backward()
  File "/disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/tensor.py", line 185, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/autograd/__init__.py", line 127, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 110.00 MiB (GPU 0; 10.76 GiB total capacity; 9.46 GiB already allocated; 75.00 MiB free; 9.62 GiB reserved in total by PyTorch)
Exception raised from malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:272 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f8e5447b1e2 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1e64b (0x7f8e546d164b in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1f464 (0x7f8e546d2464 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x1faa1 (0x7f8e546d2aa1 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x11e (0x7f8de3f0a90e in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0xf33949 (0x7f8de2344949 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xf4d777 (0x7f8de235e777 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0x10e9c7d (0x7f8e1d0fac7d in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0x10e9f97 (0x7f8e1d0faf97 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #9: at::empty(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0xfa (0x7f8e1d205a1a in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #10: at::native::empty_like(at::Tensor const&, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x49e (0x7f8e1ce83c3e in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0x12880c1 (0x7f8e1d2990c1 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x12c3863 (0x7f8e1d2d4863 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #13: at::empty_like(at::Tensor const&, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x101 (0x7f8e1d1e8b31 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: at::Tensor at::native::(anonymous namespace)::host_softmax_backward<at::native::(anonymous namespace)::LogSoftMaxBackwardEpilogue, true>(at::Tensor const&, at::Tensor const&, long, bool) + 0xa7 (0x7f8de3794a97 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #15: at::native::log_softmax_backward_cuda(at::Tensor const&, at::Tensor const&, long, at::Tensor const&) + 0x65a (0x7f8de377efaa in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #16: <unknown function> + 0xf215c0 (0x7f8de23325c0 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #17: <unknown function> + 0x11141d6 (0x7f8e1d1251d6 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #18: at::_log_softmax_backward_data(at::Tensor const&, at::Tensor const&, long, at::Tensor const&) + 0x119 (0x7f8e1d1b3649 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #19: <unknown function> + 0x2ec639f (0x7f8e1eed739f in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #20: <unknown function> + 0x11141d6 (0x7f8e1d1251d6 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #21: at::_log_softmax_backward_data(at::Tensor const&, at::Tensor const&, long, at::Tensor const&) + 0x119 (0x7f8e1d1b3649 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #22: torch::autograd::generated::LogSoftmaxBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x1d7 (0x7f8e1ed53057 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #23: <unknown function> + 0x3375bb7 (0x7f8e1f386bb7 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #24: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x7f8e1f382400 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #25: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x7f8e1f382fa1 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #26: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7f8e1f37b119 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #27: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7f8e5522d4ba in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #28: <unknown function> + 0xbd6df (0x7f8e2c9b96df in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #29: <unknown function> + 0x76db (0x7f8e5a03f6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #30: clone + 0x3f (0x7f8e5a378a3f in /lib/x86_64-linux-gnu/libc.so.6)

I also got a RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED the first few times I tried to resume it. Now I only get the OOM error:

Traceback (most recent call last):
  File "scripts/train.py", line 449, in <module>
    main(sys.argv[1:])
  File "scripts/train.py", line 230, in main
    scaled_loss.backward()
  File "/disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/tensor.py", line 185, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/autograd/__init__.py", line 127, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED
Exception raised from createCuDNNHandle at /pytorch/aten/src/ATen/cudnn/Handle.cpp:9 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fe80cfcc1e2 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xfef088 (0x7fe799da0088 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #2: at::native::getCudnnHandle() + 0x108d (0x7fe799da196d in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xebcaec (0x7fe799c6daec in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xeb800e (0x7fe799c6900e in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0xeb9bfb (0x7fe799c6abfb in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #6: at::native::cudnn_convolution_backward_input(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) + 0xb2 (0x7fe799c6b152 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0xf1f35b (0x7fe799cd035b in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #8: <unknown function> + 0xf4f178 (0x7fe799d00178 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #9: at::cudnn_convolution_backward_input(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) + 0x1ad (0x7fe7d4ba888d in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #10: at::native::cudnn_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool, std::array<bool, 2ul>) + 0x223 (0x7fe799c69823 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #11: <unknown function> + 0xf1f445 (0x7fe799cd0445 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #12: <unknown function> + 0xf4f1d4 (0x7fe799d001d4 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #13: at::cudnn_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool, std::array<bool, 2ul>) + 0x1e2 (0x7fe7d4bb7242 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x2ec9c62 (0x7fe7d687ac62 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0x2ede224 (0x7fe7d688f224 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #16: at::cudnn_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool, std::array<bool, 2ul>) + 0x1e2 (0x7fe7d4bb7242 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::generated::CudnnConvolutionBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x258 (0x7fe7d6701c38 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #18: <unknown function> + 0x3375bb7 (0x7fe7d6d26bb7 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #19: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x7fe7d6d22400 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #20: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x7fe7d6d22fa1 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #21: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7fe7d6d1b119 in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #22: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7fe804e604ba in /disk/no_backup/user/D4LCN/env/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #23: <unknown function> + 0xbd6df (0x7fe7e43596df in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #24: <unknown function> + 0x76db (0x7fe8119df6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #25: clone + 0x3f (0x7fe811d18a3f in /lib/x86_64-linux-gnu/libc.so.6)

The cuDNN error might be a red herring and could be raised if you are running out of memory.
Could you post a code snippet (using random data) to reproduce this issue, so that we could run it on our nodes and check the memory usage?

If you are training with more than one GPU, this can happen when the resume operation puts everything on the master GPU:
for example, the state_dict was saved from the master GPU, the optimizer state was saved from the master GPU, and so on.
That can make the resume fail in your multi-GPU training processes.
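For example (a rough sketch; checkpoint_path and local_rank are just placeholders here), you can remap the checkpoint away from the master GPU when loading it:

# Load onto the CPU first so nothing is forced onto GPU 0 ...
checkpoint = torch.load(checkpoint_path, map_location='cpu')
# ... or remap the cuda:0 storages directly to this process's own GPU:
# checkpoint = torch.load(checkpoint_path, map_location={'cuda:0': 'cuda:%d' % local_rank})
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])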
