CUDA out of memory but desired allocation is less than free memory

Hi. I’m getting the error:

Tried to allocate 3.39 GiB (GPU 0; 12.00 GiB total capacity; 170.94 MiB already allocated; 9.81 GiB free; 182.00 MiB reserved in total by PyTorch)

This doesn’t make sense, because clearly 3.39 < 9.81. Does anyone have any idea why this could be happening?

Sorry if this is in the wrong category; I don’t really know where to put it, as I’m very new here.

Could you post more information about the use case, the code that is causing it, the setup you are using, etc.?

My use case is training a segmentation model built around a pretrained model. My training loop looks like this:


    model.to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)
    torchinfo.summary(model)
    model.to(device)
    model.train()
    tv_loss_fn = tversky_focal_loss
    optimizer = timm.optim.create_optimizer_v2(model, optimizer_name='adamw', learning_rate=args.lr,
                                               momentum=args.momentum, amsgrad=True, weight_decay=1e-2)
    lookahead_optimizer = timm.optim.Lookahead(optimizer, alpha=0.5, k=6)
    print(lookahead_optimizer)
    lookahead_optimizer.zero_grad()

    torch.backends.cudnn.benchmark = True

    scaler = torch.cuda.amp.GradScaler()
    for epoch in range(args.epochs):
        running_loss = 0.0
        c_loss = 0.0
        for i, input_data in enumerate(train_dl, 0):
            # print(i)
            # print(torch.cuda.mem_get_info(torch.cuda.current_device()))
            inputs, labels = input_data
            inputs = inputs.to(device)

            with torch.cuda.amp.autocast():
                outputs = model(inputs)
            outputs_cpu = outputs.cpu()
            labels_cpu = labels.cpu()
            loss = tv_loss_fn(outputs_cpu[:,1,:,:].unsqueeze(1), labels_cpu[:,1,:,:].unsqueeze(1)).to(device)
            scaler.scale(loss).backward()
            if (i + 1) % 2 == 0 or (i + 1) == len(train_dl):
                scaler.step(lookahead_optimizer)
                scaler.update()
                lookahead_optimizer.zero_grad(set_to_none=True)
            running_loss += float(loss)
            c_loss = loss.item()
            if i % args.log_interval == 0:
                logger.info(
                    "Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}".format(
                        epoch,
                        i * len(inputs),
                        len(train_dl.sampler),
                        100.0 * i / len(train_dl),
                        loss.item(),
                    )
                )
        logger.debug(f"Epoch {epoch} finished")
        lookahead_optimizer.sync_lookahead()

and the traceback is:

Traceback (most recent call last):
  File "D:\Programs\Programming\seg5\SegTest3-local.py", line 457, in <module>
    _train(args=parser.parse_args())
  File "D:\Programs\Programming\seg5\SegTest3-local.py", line 309, in _train
    outputs = model(inputs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\parallel\data_parallel.py", line 169, in forward
    return self.module(*inputs[0], **kwargs[0])
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\Programs\Programming\seg5\SegTest3-local.py", line 199, in forward
    x = self.DLV3_model(x)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\segmentation_models_pytorch\base\model.py", line 29, in forward
    features = self.encoder(x)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\segmentation_models_pytorch\encoders\timm_universal.py", line 30, in forward
    features = self.model(x)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\timm\models\features.py", line 282, in forward
    x = module(x)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\timm\models\xception_aligned.py", line 108, in forward
    x = self.stack(x)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\container.py", line 204, in forward
    input = module(input)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\timm\models\xception_aligned.py", line 74, in forward
    x = self.conv_pw(x)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\conv.py", line 463, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "C:\Users\camma\.virtualenvs\seg5\lib\site-packages\torch\nn\modules\conv.py", line 459, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.38 GiB (GPU 0; 12.00 GiB total capacity; 1007.72 MiB already allocated; 9.00 GiB free; 1020.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Process finished with exit code 1

I’m using Windows 10, PyTorch 1.13.0+cu117, and an NVIDIA RTX 3060 to train on.

Something to note: if I manually reset the GPU using PowerShell, it will work for a while, and it only breaks intermittently.
EDIT: Turns out that isn’t working anymore (or perhaps it never worked?). I now have to completely restart my system when this problem starts occurring in order to fix it.
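
For reference, the kind of memory check I had commented out at the top of the training loop looks roughly like this (a minimal sketch for a single-GPU setup; the helper name is just for illustration):

    import torch

    # Compare the driver's view of free memory with PyTorch's caching allocator.
    # mem_get_info returns (free, total) in bytes as reported by the CUDA driver;
    # memory_allocated/memory_reserved report what PyTorch has handed out/cached.
    def log_gpu_memory(device_index=0):
        free, total = torch.cuda.mem_get_info(device_index)
        print(f"driver free/total: {free / 1024**3:.2f} / {total / 1024**3:.2f} GiB")
        print(f"allocated by torch: {torch.cuda.memory_allocated(device_index) / 1024**3:.2f} GiB")
        print(f"reserved by torch:  {torch.cuda.memory_reserved(device_index) / 1024**3:.2f} GiB")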

Hope this helps elucidate things.

Based on your latest output, it seems you are trying to allocate 8.38 GiB while 9.00 GiB is free. If the free memory is fragmented, the allocation might fail.
Your original issue showed a different behavior. Assuming you can reproduce it, could you share a minimal and executable code snippet, please?
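
For context, the max_split_size_mb hint from the error message is configured via the PYTORCH_CUDA_ALLOC_CONF environment variable, and torch.cuda.memory_summary() shows how much reserved memory is sitting in inactive or split blocks, which is a rough indicator of fragmentation. A minimal sketch (the 512 MB value is just an illustration, not a tuned recommendation):

    import os

    # Must be set before the first CUDA allocation (or exported in the shell
    # before launching the script) so the caching allocator picks it up.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

    import torch

    x = torch.randn(1024, 1024, device="cuda")  # any CUDA work initializes the allocator
    # Per-device breakdown of allocated/reserved/inactive blocks.
    print(torch.cuda.memory_summary(device=0, abbreviated=True))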

I have been unable to get it back into the 3 GiB state; I don’t remember exactly which settings I used. Regardless, here’s the code for the latest output. Is this helpful?

import glob
import os
import PIL.Image as Image
import torch
import torch.distributed as dist
import torch.utils.data as data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.transforms.functional as tf
import segmentation_models_pytorch as smp
import torchinfo
import numpy as np
import torch.nn as nn
import timm.optim
import logging
import sys

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


class RoadDatasetSkippingEmpties(data.Dataset):
    def __init__(self, folder_path, aspect_ratio: tuple = (64, 64)):
        super(RoadDatasetSkippingEmpties, self).__init__()
        self.mask_images = glob.glob(os.path.join(folder_path, "train_annotation", "*.PNG"))
        #         self.mask_images = glob.glob(os.path.join(folder_path, "train_annotation", "*.png"))
        self.mask_images.sort(key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
        self.raw_images = glob.glob(os.path.join(folder_path, "train", "*.JPEG"))
        #         self.raw_images = glob.glob(os.path.join(folder_path, "train", "*.jpg"))
        self.raw_images.sort(key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
        self.empty_masks = []
        self.empty_raw_images = []
        for mask_path in self.mask_images:
            with Image.open(mask_path) as mask:
                mask_sum = np.sum(np.array(mask))
                if mask_sum == 0:
                    self.empty_masks.append(mask_path)
                    raw_path = os.path.join(folder_path, "train",
                                            os.path.splitext(os.path.basename(mask_path))[0] + ".JPEG")
                    #                     os.path.splitext(os.path.basename(mask_path))[0] + ".jpg")
                    self.empty_raw_images.append(raw_path)
        for empty_mask_path in self.empty_masks:
            self.mask_images.remove(empty_mask_path)
        for empty_raw_path in self.empty_raw_images:
            self.raw_images.remove(empty_raw_path)
        self.aspect_ratio = aspect_ratio

    def __getitem__(self, index, use_empties=False):
        if self.mask_images[index] not in self.empty_masks or use_empties:
            raw_path = self.raw_images[index]
            mask_path = self.mask_images[index]
            d = tf.resize(Image.open(raw_path), self.aspect_ratio)
            if d.mode != "RGB":
                rgbimg = Image.new("RGB", d.size)
                rgbimg.paste(d)
                d = rgbimg
            y = tf.resize(Image.open(mask_path).convert('L'), self.aspect_ratio, Image.Resampling.NEAREST)
            nptensor = tf.to_tensor(y).numpy()
            negative_class = np.where(nptensor[0] == 0, 1.0, 0.0)
            positive_class = np.where(nptensor[0] == 0, 0.0, 1.0)
            mask = np.array([negative_class, positive_class])
            return tf.normalize(tf.to_tensor(d), [0.4920, 0.4848, 0.4776], [0.1457, 0.1340, 0.1368]), torch.from_numpy(
                mask)
        else:
            return self.__getitem__(index, use_empties=use_empties)

    def __len__(self):
        return len(self.raw_images)


class MyResize(nn.Module):
    def __init__(self, ratio: tuple):
        super(MyResize, self).__init__()
        self.fc = transforms.Resize(ratio)

    def forward(self, x):
        x = self.fc(x)
        return x


class BoostContrast(nn.Module):
    def __init__(self, factor=2.0):
        super(BoostContrast, self).__init__()
        self.factor = factor

    def forward(self, x):
        x = tf.adjust_contrast(x, contrast_factor=self.factor)
        return x


class CrackNet(nn.Module):
    def __init__(self, pth_to_weights=None, contrast_factor=2.0, aspect_ratio: tuple = (64, 64)):
        super(CrackNet, self).__init__()
        if not pth_to_weights:
            self.DLV3_model = smp.DeepLabV3Plus(encoder_name="tu-xception71",
                                                encoder_weights="imagenet",
                                                in_channels=3,
                                                classes=2
                                                )
        else:
            self.DLV3_model = smp.DeepLabV3Plus(encoder_name="tu-xception71",
                                                encoder_weights="imagenet",
                                                in_channels=3,
                                                classes=2
                                                )

        for i in self.DLV3_model.children():
            if i._get_name() == "SegmentationHead" or i._get_name() == "DeepLabV3PlusDecoder":
                for m in i.modules():
                    for p in m.parameters():
                        p.requires_grad = True
            else:
                for m in i.modules():
                    for p in m.parameters():
                        p.requires_grad = False

        self.resize = MyResize(aspect_ratio)
        self.resize.requires_grad = False
        self.adjust_contrast = BoostContrast(factor=contrast_factor)
        self.adjust_contrast.requires_grad = False
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.bn2d = nn.BatchNorm2d(2)

    def forward(self, x):
        x = self.adjust_contrast(x)
        x = self.resize(x)
        x = self.DLV3_model(x)
        x = self.bn2d(x)
        return self.softmax(x)


def _train():
    is_distributed = False
    use_cuda = True
    kwargs = {"num_workers": 0, "pin_memory": True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    # set the seed for generating random numbers
    torch.manual_seed(100)

    resize_aspect_ratio = (640, 1024)
    ds = RoadDatasetSkippingEmpties(folder_path=(r".\Processed_Data_Unflipped"),
                                    aspect_ratio=resize_aspect_ratio)
    train_size = int(0.95 * len(ds))
    test_size = len(ds) - train_size
    train_ds, test_ds = data.random_split(ds, [train_size, test_size])
    train_sampler = (torch.utils.data.distributed.DistributedSampler(train_ds) if is_distributed else None)
    train_dl = data.DataLoader(train_ds, num_workers=0, batch_size=4, shuffle=train_sampler is None,
                               sampler=train_sampler)

    model = CrackNet(contrast_factor=1.5, aspect_ratio=resize_aspect_ratio)

    model.to(device)
    model = nn.DataParallel(model)
    torchinfo.summary(model)
    model.to(device)
    model.train()
    loss_fn = nn.BCELoss()
    optimizer = timm.optim.create_optimizer_v2(model, optimizer_name='adamw', learning_rate=0.001,
                                               momentum=0.7066, amsgrad=True, weight_decay=1e-2)
    lookahead_optimizer = timm.optim.Lookahead(optimizer, alpha=0.5, k=6)
    print(lookahead_optimizer)
    lookahead_optimizer.zero_grad()

    torch.backends.cudnn.benchmark = True

    scaler = torch.cuda.amp.GradScaler()
    for epoch in range(100):
        running_loss = 0.0
        c_loss = 0.0
        for i, input_data in enumerate(train_dl, 0):

            inputs, labels = input_data
            inputs = inputs.to(device)

            with torch.cuda.amp.autocast():
                outputs = model(inputs)
                outputs_cpu = outputs.cpu().float()
                # print(outputs_cpu)
                labels_cpu = labels.cpu().float()
                loss = loss_fn(outputs_cpu[:, 1, :, :].unsqueeze(1), labels_cpu[:, 1, :, :].unsqueeze(1)).to(device)
            scaler.scale(loss).backward()

            if (i + 1) % 2 == 0 or (i + 1) == len(train_dl):
                scaler.step(lookahead_optimizer)
                scaler.update()
                lookahead_optimizer.zero_grad(set_to_none=True)
            running_loss += float(loss)
            c_loss = loss.item()
            if i % 8 == 0:
                logger.info(
                    "Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}".format(
                        epoch,
                        i * len(inputs),
                        len(train_dl.sampler),
                        100.0 * i / len(train_dl),
                        loss.item(),
                    )
                )
        logger.debug(f"Epoch {epoch} finished")
        lookahead_optimizer.sync_lookahead()


if __name__ == "__main__":
    _train()

@ptrblck Did you manage to get any of this working? I’m still trying to solve this issue.

Your code is unfortunately not executable, as it depends on a specific dataset. Could you remove that dependency, update the code to use random inputs (e.g. something along the lines of the sketch below), and check if the issue still reproduces, please?
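
This is only a rough sketch: it keeps your CrackNet model but replaces the dataset with random tensors of the shapes used in your snippet (batch size 4, 3×640×1024 inputs, 2-channel masks), swaps the timm Lookahead optimizer for plain AdamW, and steps every iteration instead of accumulating gradients:

    import torch

    device = torch.device("cuda")
    model = CrackNet(contrast_factor=1.5, aspect_ratio=(640, 1024)).to(device)
    model.train()
    loss_fn = torch.nn.BCELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    scaler = torch.cuda.amp.GradScaler()

    for i in range(50):
        # Random inputs/targets with the same shapes as the real data.
        inputs = torch.randn(4, 3, 640, 1024, device=device)
        labels = torch.randint(0, 2, (4, 2, 640, 1024), device=device).float()

        with torch.cuda.amp.autocast():
            outputs = model(inputs)

        # Compute the loss in float32 outside of autocast (BCELoss is not
        # autocast-safe), mirroring the float cast in the original loop.
        loss = loss_fn(outputs.float()[:, 1].unsqueeze(1), labels[:, 1].unsqueeze(1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)
        print(i, loss.item())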