Same code OOMs when training on a big dataset but is fine on a small one!

Hi,
I am trying to train a ResNet on my own dataset. The images are not a fixed size and I don't want to resize them, so I replaced BatchNorm with GroupNorm in the ResNet code, like this:

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = conv1x1(inplanes, planes)
        # self.bn1 = nn.BatchNorm2d(planes)
        self.bn1 = nn.GroupNorm(32, planes)
        self.conv2 = conv3x3(planes, planes, stride)
        # self.bn2 = nn.BatchNorm2d(planes)
        self.bn2 = nn.GroupNorm(32, planes)
        self.conv3 = conv1x1(planes, planes * self.expansion)
        # self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.bn3 = nn.GroupNorm(32, planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride
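
(For reference, newer torchvision versions also accept a norm_layer argument, so the same swap can be done without editing the ResNet source; a rough sketch using the same 32 groups as above, assuming your installed torchvision exposes norm_layer:)

import torch.nn as nn
from torchvision.models import resnet50

# Wrap GroupNorm so it can be called with just the channel count,
# which is how torchvision's ResNet invokes norm_layer.
def group_norm(num_channels):
    return nn.GroupNorm(32, num_channels)

# Builds ResNet-50 with GroupNorm in place of BatchNorm everywhere.
model = resnet50(norm_layer=group_norm)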

And here is the training and validation code:

def train_model(model, dataloaders, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    for epoch in range(num_epochs):
        print('\nEpoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            count = 0

            # Iterate over data.
            for inputs, labels, path in dataloaders[phase]:
                count += 1
                print(count)
                inputs = inputs.to(device)
                labels = labels.to(device)
                # print(labels, '\t\t', path)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)
                    # print('target:', labels.item(), 'pred:', preds.item())

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

        if phase == 'val' and (epoch + 1) % 5 == 0:
            print("Saving model.....")
            torch.save(model.state_dict(), './checkpoint/resnet101-GN-' + str(epoch + 1) + '.pth')

    print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    return model

The problem is that when training on the whole dataset (80 classes, around 40,000 images), I get an OOM error somewhere during training, like this (note that the batch size is always 1, and the numbers printed below are the index of the image currently being processed):

Epoch 0/44
----------
1
2
.
.
.
58
59
Traceback (most recent call last):
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 249, in <module>
    model_ft = train_model(model_ft, dataloaders, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=num_epochs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 159, in train_model
    outputs = model(inputs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 166, in forward
    x = self.layer2(x)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 85, in forward
    out = self.bn1(out)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/normalization.py", line 233, in forward
    input, self.num_groups, self.weight, self.bias, self.eps)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/functional.py", line 1662, in group_norm
    torch.backends.cudnn.enabled)
RuntimeError: CUDA out of memory. Tried to allocate 46.12 MiB (GPU 1; 10.92 GiB total capacity; 9.94 GiB already allocated; 23.50 MiB free; 319.47 MiB cached)

and sometimes it crashes at a different iteration:

Epoch 0/44
----------
1
2
3
4
5
Traceback (most recent call last):
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 249, in <module>
    model_ft = train_model(model_ft, dataloaders, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=num_epochs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 159, in train_model
    outputs = model(inputs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 167, in forward
    x = self.layer3(x)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 93, in forward
    out = self.bn3(out)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/normalization.py", line 233, in forward
    input, self.num_groups, self.weight, self.bias, self.eps)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/functional.py", line 1662, in group_norm
    torch.backends.cudnn.enabled)
RuntimeError: CUDA out of memory. Tried to allocate 53.62 MiB (GPU 1; 10.92 GiB total capacity; 10.07 GiB already allocated; 15.50 MiB free; 172.92 MiB cached)

BUT, when I split the dataset into a small one (3 classes, around 1.5k images), training and validation complete without any problem.

Could anyone tell me where the problem is?
Thanks a lot!

Do you shuffle the data loader? If so, it might be the same too-large image that makes your training crash, just at different iterations since the order changes…

The first intuition that comes to mind is that one (or several) images are so large that the activations and gradients computed for them on the GPU fill its memory, while the other images fit. If shuffling is enabled, I would disable it, see which image(s) make it crash, resize only those, and check again (you will probably have to do it image by image).

If it’s not shuffled, I have absolutely no idea what could happen :slightly_frowning_face:

BTW, since not all of your images are included in the small dataset, it's entirely possible that it doesn't crash simply because the too-large images are not in it…
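
If it helps, you can also find candidate images without running the network at all by scanning the dataset for unusually large resolutions. A rough sketch (the root folder and pixel threshold are placeholders you would need to adapt):

import os
from PIL import Image

ROOT = "/path/to/your/dataset"   # placeholder: your image root
MAX_PIXELS = 2000 * 2000         # placeholder threshold: tune to your GPU

for dirpath, _, filenames in os.walk(ROOT):
    for name in filenames:
        if not name.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        path = os.path.join(dirpath, name)
        with Image.open(path) as img:  # lazy: reads the header, not the pixel data
            w, h = img.size
        if w * h > MAX_PIXELS:
            print("{}: {}x{} ({} pixels)".format(path, w, h, w * h))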


Hi, thank you for your reply!
I tried what you suggested and you are right, that helped a lot: some images are too large, which makes training crash, and the reason training completes on the small dataset is that it contains no large images!
But now I wonder why it always crashes in the normalization layer; as you can see above, the end of the error shows it crashing in group_norm.
I have also tried BatchNorm, and it crashes in the same place:

Traceback (most recent call last):
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 255, in <module>
    model_ft = train_model(model_ft, dataloaders, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=num_epochs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGNTrainVal.py", line 165, in train_model
    outputs = model(inputs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 182, in forward
    x = self.layer3(x)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/code/ResNet50/ModifyBatchLayer/ResnetWithGN.py", line 96, in forward
    out = self.bn3(out)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py", line 76, in forward
    exponential_average_factor, self.eps)
  File "/home/xsl/.conda/envs/xsltf/lib/python3.6/site-packages/torch/nn/functional.py", line 1623, in batch_norm
    training, momentum, eps, torch.backends.cudnn.enabled
RuntimeError: CUDA out of memory. Tried to allocate 126.62 MiB (GPU 0; 11.91 GiB total capacity; 11.07 GiB already allocated; 107.06 MiB free; 93.97 MiB cached)

I’m really not sure how the internals are handled, but if memory is allocated in the order of execution, it would make sense that it now crashes on the same image at the same location every time.

For example, say your network has three layers: PyTorch allocates GPU memory for the first layer given some input size X, then for the second, but the third layer cannot allocate its memory because X is too large and the first two layers have already almost filled the GPU…
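
You can watch this happening by logging the allocator statistics inside the training loop; a minimal sketch using torch.cuda.memory_allocated (assuming a single CUDA device, and reusing the path your dataloader already yields):

import torch

def log_gpu_memory(tag, device):
    # Current and peak memory held by PyTorch's caching allocator on this device.
    allocated = torch.cuda.memory_allocated(device) / 1024 ** 2
    peak = torch.cuda.max_memory_allocated(device) / 1024 ** 2
    print('{}: allocated {:.1f} MiB, peak {:.1f} MiB'.format(tag, allocated, peak))

# Example usage inside your loop, right after outputs = model(inputs):
#   log_gpu_memory('{} {}'.format(path, tuple(inputs.shape)), device)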

You could try resizing the image that crashes your program to smaller and smaller sizes and check whether the error appears at the same location; this may also help you determine the maximum size your GPU can handle with this network architecture!
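
If you want to handle this automatically instead of resizing files by hand, one option is a transform that only shrinks images whose longer side exceeds some cap, keeping the aspect ratio. A rough sketch (the 1500-pixel cap is just an example value to tune to your GPU):

from PIL import Image

class CapLongerSide(object):
    """Downscale a PIL image only if its longer side exceeds max_size,
    keeping the aspect ratio; smaller images pass through unchanged."""

    def __init__(self, max_size=1500):  # example cap, tune to your GPU
        self.max_size = max_size

    def __call__(self, img):
        w, h = img.size
        longer = max(w, h)
        if longer <= self.max_size:
            return img
        scale = self.max_size / float(longer)
        new_size = (int(round(w * scale)), int(round(h * scale)))
        return img.resize(new_size, Image.BILINEAR)

# Example: place it before ToTensor() in your transforms.Compose([...]).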


Thanks a lot!
I have resized the images that were too large and the error is gone.
Thanks again. :smile:
