BatchNorm2d CUDNN_STATUS_NOT_SUPPORTED may be non-contiguous

Dear experts,
During some early tests of a project I was using, as-is, this Torch code of a denoiser:

## DnCNN
## https://github.com/cszn/DnCNN/blob/master/TrainingCodes/dncnn_pytorch/main_train.py
## https://arxiv.org/pdf/1608.03981.pdf
    
class DnCNN(nn.Module):
    """DnCNN residual denoiser (Zhang et al., arXiv:1608.03981).

    The network predicts the noise residual of the input; ``forward``
    returns ``input - residual`` (the denoised image).

    Args:
        depth: total number of conv layers (first conv, depth-2 middle
            conv+BN+ReLU stages, final conv).
        n_channels: feature channels in the hidden layers.
        image_channels: channels of the input/output image (1 = grayscale).
        use_bnorm: insert BatchNorm2d in the middle stages when True.
        kernel_size: square conv kernel size (odd values keep spatial size).
    """

    def __init__(self, depth=17, n_channels=64, image_channels=1, use_bnorm=True, kernel_size=3):
        super(DnCNN, self).__init__()
        # Bug fix: the original shadowed `kernel_size` with a hard-coded 3 and
        # silently ignored `use_bnorm`; both parameters are now honored.
        # "same" padding for odd kernels — 1 when kernel_size=3, as before.
        padding = kernel_size // 2
        layers = []

        layers.append(nn.Conv2d(in_channels=image_channels, out_channels=n_channels, kernel_size=kernel_size, padding=padding, bias=True))
        layers.append(nn.ReLU(inplace=True))
        for _ in range(depth - 2):
            layers.append(nn.Conv2d(in_channels=n_channels, out_channels=n_channels, kernel_size=kernel_size, padding=padding, bias=False))
            if use_bnorm:
                layers.append(nn.BatchNorm2d(n_channels, eps=0.0001, momentum=0.95))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Conv2d(in_channels=n_channels, out_channels=image_channels, kernel_size=kernel_size, padding=padding, bias=False))
        self.dncnn = nn.Sequential(*layers)
        self._initialize_weights()

    def forward(self, x):
        """Return the denoised image: x minus the predicted noise residual."""
        y = x
        out = self.dncnn(x)
        return y - out

    def _initialize_weights(self):
        """Orthogonal init for conv weights; BN gamma=1, all biases=0."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.orthogonal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)

I was able to use it nicely with 512x512 gray-scale image with a 64 batch size.

Then, I was asked to try it with 2000 x 2000 images (that I cannot downsize). To fit within the new memory requirement, I have decreased the batch size to 10. With these parameters, there is no longer a RuntimeError: CUDA out of memory. Tried to allocate .... But I face a new type of error that I do not master:

RuntimeError: cuDNN error: CUDNN_STATUS_NOT_SUPPORTED. This error may appear if you passed in a non-contiguou
s input.

I have tried, during training, to make the tensor contiguous as follows:

        # One optimizer step: forward pass, MSE loss, backward pass, update.
        # NOTE(review): calling .contiguous() on the input cannot help here —
        # the traceback shows the failure inside an intermediate BatchNorm2d,
        # not on this input tensor. TODO confirm against cuDNN size limits.
        optimizer.zero_grad()
        outputs = model(new_img_batch_noisy.contiguous())  # .contiguous() is a no-op if already contiguous
        loss = F.mse_loss(outputs, new_img_batch)  # denoised output vs. clean original images
        train_loss += loss.item()  # .item() extracts a Python float; keeps the running epoch loss out of the graph
        loss.backward()
        optimizer.step()

But I have had no success, as the traceback is still the same:

  File "donut_train.py", line 82, in train
    outputs = model(new_img_batch_noisy.contiguous()) # new contiguous for AuxTel        
  File "/gpfslocalsup/pub/anaconda-py3/2021.05/envs/pytorch-1.8.1+py3.8.8-lts/lib/python3.8/site-packages/tor
ch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/gpfsdswork/projects/rech/ixh/ufd72rp/donut/kapa_network.py", line 302, in forward
    out = self.dncnn(x)
  File "/gpfslocalsup/pub/anaconda-py3/2021.05/envs/pytorch-1.8.1+py3.8.8-lts/lib/python3.8/site-packages/tor
ch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/gpfslocalsup/pub/anaconda-py3/2021.05/envs/pytorch-1.8.1+py3.8.8-lts/lib/python3.8/site-packages/tor
ch/nn/modules/container.py", line 119, in forward
    input = module(input)
  File "/gpfslocalsup/pub/anaconda-py3/2021.05/envs/pytorch-1.8.1+py3.8.8-lts/lib/python3.8/site-packages/tor
ch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/gpfslocalsup/pub/anaconda-py3/2021.05/envs/pytorch-1.8.1+py3.8.8-lts/lib/python3.8/site-packages/tor
ch/nn/modules/batchnorm.py", line 135, in forward
    return F.batch_norm(
  File "/gpfslocalsup/pub/anaconda-py3/2021.05/envs/pytorch-1.8.1+py3.8.8-lts/lib/python3.8/site-packages/tor
ch/nn/functional.py", line 2149, in batch_norm
    return torch.batch_norm(
RuntimeError: cuDNN error: CUDNN_STATUS_NOT_SUPPORTED. This error may appear if you passed in a non-contiguou
s input.

I’d be grateful if you could point me in the right direction. Thanks

PS: I do not show all the Python code, as it would be too long. Reminder: I am able to run the whole training/testing code using 512x512 images in batches of size 64.

Could you post a minimal and executable code snippet reproducing the issue?

Ok, here it is (I thought that it would be more complicated than it is…)

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import numpy as np

class DnCNN(nn.Module):
    """DnCNN residual denoiser (Zhang et al., arXiv:1608.03981): the stack
    predicts the noise, and ``forward`` returns input minus that residual.

    Args:
        depth: total conv-layer count (first conv + depth-2 middle stages + last conv).
        n_channels: hidden feature channels.
        image_channels: input/output image channels.
        use_bnorm: add BatchNorm2d in each middle stage when True.
        kernel_size: square conv kernel size (odd sizes preserve spatial dims).
    """

    def __init__(self, depth=17, n_channels=64, image_channels=1, use_bnorm=True, kernel_size=3):
        super(DnCNN, self).__init__()
        # Fix: honor `kernel_size` (previously shadowed by a hard-coded 3)
        # and `use_bnorm` (previously ignored). Defaults are unchanged:
        # kernel_size=3 gives padding=1, exactly the original layout.
        padding = kernel_size // 2
        layers = []

        layers.append(nn.Conv2d(in_channels=image_channels, out_channels=n_channels, kernel_size=kernel_size, padding=padding, bias=True))
        layers.append(nn.ReLU(inplace=True))
        for _ in range(depth - 2):
            layers.append(nn.Conv2d(in_channels=n_channels, out_channels=n_channels, kernel_size=kernel_size, padding=padding, bias=False))
            if use_bnorm:
                layers.append(nn.BatchNorm2d(n_channels, eps=0.0001, momentum=0.95))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Conv2d(in_channels=n_channels, out_channels=image_channels, kernel_size=kernel_size, padding=padding, bias=False))
        self.dncnn = nn.Sequential(*layers)
        self._initialize_weights()

    def forward(self, x):
        """Return x minus the predicted noise residual (the denoised image)."""
        y = x
        out = self.dncnn(x)
        return y - out

    def _initialize_weights(self):
        """Orthogonal conv weights; BatchNorm gamma=1; all biases=0."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.orthogonal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)


#####
# Environment report: torch build and whether CUDA is usable.
print("torch verion: ",torch.__version__)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Cuda: ",device)
#####

# Default 17-layer DnCNN, moved to GPU (or CPU fallback).
model=DnCNN()
model.to(device)

# Fixed-seed synthetic batch: 10 single-channel 2000x2000 float32 "images".
np.random.seed(0)
data = torch.from_numpy(np.random.normal(size=(10,1,2000,2000)).astype(np.float32)).to(device)

# With size=(10,1,2000,2000) the forward pass fails in BatchNorm2d:
#    return torch.batch_norm(
# RuntimeError: cuDNN error: CUDNN_STATUS_NOT_SUPPORTED. This error may appear
# if you passed in a non-contiguous input.
#
# NOTE(review): the hidden activations hold 10*64*2000*2000 = 2.56e9 elements,
# which exceeds the 2**31-1 elements that 32-bit-indexed cuDNN kernels can
# address — presumably the real cause, not contiguity. TODO confirm; possible
# workarounds: smaller batches/tiles, or torch.backends.cudnn.enabled = False.

out = model(data)
print("ou shape:",out.shape)

####
print("Bye")

Here are some elements on the GPU device & torch version

torch verion:  1.8.0a0+56b43f4
 CUDA Version: 12.2 
Tesla V100-SXM2-32GB