CPU RAM usage grows within each epoch and keeps growing across epochs (OSError: [Errno 12] Cannot allocate memory)

After monitoring CPU RAM usage, I find that RAM usage grows in every epoch: memory increases steadily while an epoch runs, it is not freed when the epoch ends, and it keeps growing when the next epoch starts. So memory usage never plateaus after the first epoch, as it should. Eventually, after some number of epochs, this leads to an OOM error on the CPU. Note that GPU memory stays constant after the first epoch.
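
For reference, this is roughly how I log CPU RAM during training (a minimal sketch using psutil; where exactly to call log_ram, and the names used here, are just illustrative):

import os
import psutil

process = psutil.Process(os.getpid())

def log_ram(tag=""):
    # Resident set size (RSS) of the training process, in MiB
    rss_mib = process.memory_info().rss / (1024 ** 2)
    print(f"[{tag}] CPU RAM: {rss_mib:.1f} MiB")

# e.g. log_ram(f"epoch {epoch} start") / log_ram(f"epoch {epoch} end")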

From reading related posts I believe the problem is in my custom Dataset implementation, although I can't pinpoint where. I gather all image file names in a list, save them as a CSV, and then use the dataframe loaded from that CSV in the MyDataset class to load data, since people on the forum have cautioned against holding the data in plain Python lists.

Some things I have tried that didn't work:

  1. Setting num_workers = 0
  2. Using a numpy array of file names instead of a Python list to load data (see the sketch after this list)
  3. Decreasing the batch size to 1
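
For item 2, this is the kind of change I mean (a minimal sketch; my understanding from other threads is that a plain Python list of strings shared with DataLoader workers can grow via copy-on-write triggered by reference counting, whereas a single numpy array does not, but I may be wrong about that):

import numpy as np

# image_paths is the Python list of file names gathered earlier
image_paths = ["0001.png", "0002.png"]  # placeholder values

# keep one contiguous numpy array of fixed-width strings instead of the list
image_paths_np = np.array(image_paths)

# inside the Dataset:
#     path = str(self.image_paths_np[index])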

For 100k images and a batch size of 50, the error shows up at the 25th epoch; for 50k images it showed up at the 50th epoch.

This is a 2x super-resolution script with (16,16,4) input images and (32,32,4) output images.
Code:

  • train.py -> Main training loop and gathering file names (added here)
  • progressive_loader.py -> Custom dataset implementation (added here)
  • prosrs.yaml -> hyperparameters & configuration file
  • generators.py -> model (a modified DenseNet-style architecture, ~50+ layers)

train.py:

import csv
import os
from time import time

import torch
from numpy.random import randint
from torch.utils.data import DataLoader

from progressive_loader import MyDataset
# Other project-specific imports (prosr, ProSR, get_filenames, IMG_EXTENSIONS)
# are omitted from this excerpt.


def load_dataset(args):
    files = {'train':{},'test':{}}

    for phase in ['train','test']:
        for ft in ['source','target']:
            if args[phase].dataset.path[ft]:
                files[phase][ft] = get_filenames(
                    args[phase].dataset.path[ft], image_format=IMG_EXTENSIONS)
            else:
                files[phase][ft] = []

    return files['train'],files['test']


def main(args):

    ############### loading datasets #################
    train_files,test_files = load_dataset(args)
    print("Dataset images retrieved")

    num_images_to_train = 100000
    train_files['target'] = train_files['target'][:num_images_to_train]
    test_files['target'] = train_files['target'].copy()

    with open('imagepaths.csv', "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        for val in train_files['target']:
            writer.writerow([val])


    # Dataset passing

    training_dataset = MyDataset(
        prosr.Phase.TRAIN,
        scale=args.data.scale,
        input_size=args.data.input_size,
        args=args,
        **args.train.dataset)

    training_data_loader = DataLoader(
        training_dataset, batch_size=args.train.batch_size, shuffle=False, num_workers=4)

    if len(test_files['target']):
        testing_dataset = MyDataset(
                prosr.Phase.VAL,
                scale=args.data.scale,
                input_size=None,
                args=args,
                **args.test.dataset)
        testing_data_loader = DataLoader(testing_dataset, batch_size=1, shuffle=False, num_workers=4)
    else:
        testing_dataset = None
        testing_data_loader = None


    start_epoch = 0
    lr = args.train.lr
    # save_dir = args.cmd.output
    steps_per_epoch = len(training_data_loader)
    total_steps = start_epoch * steps_per_epoch


    ############# start training ##############

    batchsize = args.train.batch_size
    print("Batch size = ", batchsize)
    print("Num batches size = ", len(training_data_loader))
    loss = []
    psnr_list = []
    # output_imgs = torch.zeros((len(trainer.training_dataset)*batchsize, 4, 32, 32))
    num_random = 100
    HR_imgs = torch.zeros((num_random, 4, 32, 32))
    output_imgs = torch.zeros((num_random, 4, 32, 32))
    random_indices = randint(0, (len(training_data_loader)*batchsize)-1, num_random)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    args.G.max_scale = max(args.data.scale)
    net_G = ProSR(**args.G).cuda()
    optimizer_G = torch.optim.Adam(
        [p for p in net_G.parameters() if p.requires_grad],
        lr=args.train.lr,
        betas=(0.9, 0.999),
        eps=1.0e-08)
    l1_criterion = torch.nn.L1Loss()


    #########################################################################
    for epoch in range(start_epoch + 1, args.train.epochs + 1):
        iter_start_time = time()
        epoch_start_time = time()
        net_G.train()
        epoch_loss = 0
        print("Epoch: ", epoch)
        for i, data in enumerate(training_data_loader):

            # Forward and backward pass
            lr = data['input'].cuda()
            hr = data['target'].cuda()
            interpolated = data['bicubic'].cuda()
            output_batch = net_G(lr, upscale_factor=2) + interpolated
            optimizer_G.zero_grad()
            l1_loss = l1_criterion(output_batch, hr)
            l1_loss.backward()
            optimizer_G.step()

            epoch_loss += l1_loss
            total_steps += 1     
            #################################################################

progressive_loader.py:

from math import floor

import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms


def pil_loader(path, args, mode='RGBA'):
    with open(path, 'rb') as f:
        with Image.open(f) as img:
            return img.convert(mode)

def downscale_by_ratio(img, ratio=2, method=Image.BICUBIC):
    if ratio == 1:
        return img
    w, h = img.size
    w, h = floor(w / ratio), floor(h / ratio)
    return img.resize((w, h), method)



class MyDataset(Dataset):

    def __init__(self, phase, scale, input_size, args, mean,
                 stddev, downscale, **kwargs):

        self.phase = phase
        self.scale = 2
        self.mean = mean
        self.stddev = stddev
        self.args = args
        self.image_loader = pil_loader
        self.downscale=downscale

        self.data_frame = pd.read_csv("imagepaths.csv")

        # Input normalization
        self.normalize_fn = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(self.mean, self.stddev)
        ])

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, index):
        return self.get(index)

    def get(self, index, scale=2):

        if torch.is_tensor(index):
            index = index.tolist()

        scale = 2
        ret_data = {}
        ret_data['scale'] = scale

        # Load target image
        if len(self.data_frame):
            target_img = self.image_loader(self.data_frame.iloc[index, 0], self.args)

            ret_data['target'] = target_img
            ret_data['target_fn'] = self.data_frame.iloc[index, 0]
            ret_data['input'] = downscale_by_ratio(
                ret_data['target'], scale, method=Image.BICUBIC)
            ret_data['input_fn'] = self.data_frame.iloc[index, 0]



            # Change Image.BICUBIC to Image.BILINEAR
            ret_data['bicubic'] = downscale_by_ratio(
                ret_data['input'], 1 / scale, method=Image.BICUBIC)

            ret_data['input'] = self.normalize_fn(ret_data['input'])
            ret_data['bicubic'] = self.normalize_fn(ret_data['bicubic'])
            ret_data['target'] = self.normalize_fn(ret_data['target'])

        return ret_data

Can someone suggest where the problem is?


It seems you are storing the complete computation graph in this line of code:

epoch_loss += l1_loss

If you only want to use epoch_loss for printing/debugging purposes (i.e. you never intend to call epoch_loss.backward() later), you should detach l1_loss before accumulating it via:

epoch_loss += l1_loss.detach() # or .item()
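
For context, a minimal sketch of how the accumulation in your loop could look after the fix (using .item(), which returns a plain Python float so nothing from the graph is kept alive):

epoch_loss = 0.0
for i, data in enumerate(training_data_loader):
    lr = data['input'].cuda()
    hr = data['target'].cuda()
    interpolated = data['bicubic'].cuda()

    output_batch = net_G(lr, upscale_factor=2) + interpolated
    optimizer_G.zero_grad()
    l1_loss = l1_criterion(output_batch, hr)
    l1_loss.backward()
    optimizer_G.step()

    # accumulate a float, not a tensor attached to the graph
    epoch_loss += l1_loss.item()

print("Mean L1 loss: ", epoch_loss / len(training_data_loader))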

Yup, this was the issue. Appreciate your help.


Thanks, your solution resolved the issue in my case as well.