Runtime cuDNN error when using DataParallel to run training on multiple GPUs

I am having a problem running training on multiple GPUs with DataParallel. The code works fine when only one GPU is used for training. I have pasted my code below.

batch_loader.py:

from torch.utils import data
import random
import os
import numpy as np
import torch
    
class TrainFolder(data.Dataset):
    def __init__(self, file):
        super(TrainFolder, self).__init__()

        self.images = []
        fid = file
        for x in fid:
            labelfile = x.replace("input", "target")
            info = (x, labelfile)
            self.images.append(info)

        random.shuffle(self.images)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image_file, label_file = self.images[index]
        img = np.load(image_file)
        lab = np.load(label_file)

        img = np.rollaxis(img, 2, 0)
        lab = np.rollaxis(lab, 2, 0)

        img = torch.from_numpy(img[:, :, :])
        lab = torch.from_numpy(lab[:, :, :])
        return img, lab

network.py:

import math
import torch
import torch.nn as nn

def gen_initialization(m):
    if type(m) == nn.Conv2d:
        sh = m.weight.shape
        nn.init.normal_(m.weight, std=math.sqrt(2.0 / (sh[0]*sh[2]*sh[3])))
        nn.init.constant_(m.bias, 0)
    elif type(m) == nn.BatchNorm2d:
        nn.init.normal_(m.weight)
        nn.init.normal_(m.bias)

class TripleConv(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(TripleConv, self).__init__()
        mid_ch = (in_ch + out_ch) // 2
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, mid_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU(),
            nn.Conv2d(mid_ch, mid_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU(),
            nn.Conv2d(mid_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU()
        )
        self.conv.apply(gen_initialization)

    def forward(self, x):
        return self.conv(x)


class Down(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Down, self).__init__()
        self.triple_conv = TripleConv(in_ch, out_ch)
        self.avg_pool_conv = nn.AvgPool2d(2, 2)
        self.in_ch = in_ch
        self.out_ch = out_ch

    def forward(self, x):
        self.cache = self.triple_conv(x)
        pad = torch.zeros(x.shape[0], self.out_ch - self.in_ch, x.shape[2], x.shape[3], device=x.device)
        x = torch.cat((x, pad), dim=1)
        self.cache += x
        return self.avg_pool_conv(self.cache)


class Center(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Center, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU()
        )
        self.conv.apply(gen_initialization)

    def forward(self, x):
        return self.conv(x)


class Up(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Up, self).__init__()
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear',
                                    align_corners=True)
        self.triple_conv = TripleConv(in_ch, out_ch)

    def forward(self, x, cache):
        x = self.upsample(x)
        x = torch.cat((x, cache), dim=1)
        x = self.triple_conv(x)
        return x


class UNet(nn.Module):
    def __init__(self, in_ch, first_ch=None):
        super(UNet, self).__init__()

        if not first_ch:
            first_ch = 32

        self.down1 = Down(in_ch, first_ch)
        self.down2 = Down(first_ch, first_ch*2)
        self.down3 = Down(first_ch*2, first_ch*4)
        self.down4 = Down(first_ch*4, first_ch*8)
        self.center = Center(first_ch*8, first_ch*8)
        self.up4 = Up(first_ch*8*2, first_ch*4)
        self.up3 = Up(first_ch*4*2, first_ch*2)
        self.up2 = Up(first_ch*2*2, first_ch)
        self.up1 = Up(first_ch*2, first_ch)
        self.output = nn.Conv2d(first_ch, in_ch, kernel_size=3, stride=1,
                                padding=1, bias=True)
        self.output.apply(gen_initialization)

    def forward(self, x):

        x = self.down1(x)
        x = self.down2(x)
        x = self.down3(x)
        x = self.down4(x)
        x = self.center(x)
        x = self.up4(x, self.down4.cache)
        x = self.up3(x, self.down3.cache)
        x = self.up2(x, self.down2.cache)
        x = self.up1(x, self.down1.cache)
        x = self.output(x)
        return x

train.py:

from configobj import ConfigObj
from tqdm import tqdm
import os
import network
import glob
import random
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from batch_loader import TrainFolder
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

def init_parameters():
    tc, vc = ConfigObj(), ConfigObj()
    tc.batch_size, vc.batch_size = 20, 4
    tc.n_channels, vc.n_channels = 2, 2
    tc.image_size, vc.image_size = 256, 256
    tc.use_fp16, vc.use_fp16 = False, False   # enable to use fp16 float precision instead of fp32
    return tc, vc

if __name__ == '__main__':
    num_workers = 10
    torch.manual_seed(47)
    torch.backends.cudnn.benchmark = True

    train_samples = glob.glob('/home/data/nas/Processed_Data/training_data/spa_network/npyfiles/train/input/*.npy')
    valid_samples = glob.glob('/home/data/nas/Processed_Data/training_data/spa_network/npyfiles/valid/input/*.npy')

    random.shuffle(train_samples)

    trainData = TrainFolder(train_samples)
    validData = TrainFolder(valid_samples)

    train_config, valid_config = init_parameters()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    input = torch.Tensor(train_config.batch_size, train_config.n_channels, train_config.image_size, train_config.image_size).to(device)
    input.requires_grad = False
    label = torch.Tensor(train_config.batch_size, train_config.n_channels, train_config.image_size, train_config.image_size).to(device)
    label.requires_grad = False

    valid_input = torch.Tensor(valid_config.batch_size, valid_config.n_channels, valid_config.image_size, valid_config.image_size).to(device)
    valid_input.requires_grad = False
    valid_label = torch.Tensor(valid_config.batch_size, valid_config.n_channels, valid_config.image_size, valid_config.image_size).to(device)
    valid_label.requires_grad = False

    train_data_loader = DataLoader(dataset=trainData, num_workers=num_workers, batch_size=train_config.batch_size, shuffle=True, drop_last=False, pin_memory=True)
    valid_data_loader = DataLoader(dataset=validData, num_workers=num_workers, batch_size=valid_config.batch_size, shuffle=True, drop_last=False, pin_memory=True)

    netG = network.UNet(2, first_ch=32)
    if torch.cuda.device_count() > 1:
        print("Using ", torch.cuda.device_count(), "GPUs!")
        netG = nn.DataParallel(netG)

    netG.to(device)
    optimizerG = optim.Adam(netG.parameters(), lr=1e-3, betas=(0.9, 0.999))

    # Initialize MSELoss function
    criterion = nn.MSELoss().to(device=device)

    scalerG = torch.cuda.amp.GradScaler(enabled=train_config.use_fp16)
    print('Start training')
    niter = 10000

    for epoch in range(niter):
        netG.train()
        train_g_mse_error = 0
        for i, data in enumerate(tqdm(train_data_loader)):
            input.copy_(data[0])
            label.copy_(data[1])

            # train the generator over here
            netG.zero_grad()
            optimizerG.zero_grad()

            with torch.cuda.amp.autocast(enabled=train_config.use_fp16):
                output = netG(input)            
                errG_mse = torch.mean(torch.abs(output - label))
            
            scalerG.scale(errG_mse).backward()
            train_g_mse_error += errG_mse.mean()
                       
            scalerG.step(optimizerG)
            scalerG.update()

        train_g_mse_error = train_g_mse_error / len(train_data_loader)

        netG.eval()
        with torch.no_grad():
            valid_g_mse_error = 0
            for i, batch in enumerate(tqdm(valid_data_loader)):
                valid_input.copy_(batch[0])
                valid_label.copy_(batch[1])

                with torch.cuda.amp.autocast(enabled=valid_config.use_fp16):
                    G_output = netG(valid_input)
                    valid_errG_mse = torch.mean(torch.abs(G_output - valid_label))
                valid_g_mse_error += valid_errG_mse.mean()

            valid_g_mse_error = valid_g_mse_error / len(valid_data_loader)
        
        if epoch % 5 == 0:
            torch.save(netG.state_dict(), f'model/network_epoch{epoch}.pth')

Error:
Traceback (most recent call last):
  File "train.py", line 85, in <module>
    scalerG.scale(errG_mse).backward()
  File "/usr/local/lib/python3.6/dist-packages/torch/tensor.py", line 185, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py", line 127, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM
Exception raised from operator() at /pytorch/aten/src/ATen/native/cudnn/Conv.cpp:1141 (most recent call first):

Environment:
Ubuntu: 18.04
CUDA: 10.2
PyTorch: 1.6.0
cuDNN: 7.5
GPU 0: RTX 1080
GPU 1: RTX 2080

Could you update to the latest stable PyTorch release (1.7.1) and rerun the code?
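
After upgrading, a quick sanity check like the following confirms which build is actually being imported (just a sketch, nothing specific to this issue):

import torch

print(torch.__version__)           # should report 1.7.1 after the upgrade
print(torch.cuda.is_available())   # CUDA should still be detected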

I updated to torch 1.7.1 and got this error:

Traceback (most recent call last):
  File "train.py", line 90, in <module>
    scalerG.scale(errG_mse).backward()
  File "/usr/local/lib/python3.6/dist-packages/torch/tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py", line 132, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM
You can try to repro this exception using the following code snippet. If that doesn’t trigger the error, please include your original repro script when reporting this issue.

import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True
data = torch.randn([10, 80, 128, 128], dtype=torch.float, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(80, 80, kernel_size=[3, 3], padding=[1, 1], stride=[1, 1], dilation=[1, 1], groups=1)
net = net.cuda().float()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()

ConvolutionParams
data_type = CUDNN_DATA_FLOAT
padding = [1, 1, 0]
stride = [1, 1, 0]
dilation = [1, 1, 0]
groups = 1
deterministic = false
allow_tf32 = true
input: TensorDescriptor 0x7fd7080399b0
type = CUDNN_DATA_FLOAT
nbDims = 4
dimA = 10, 80, 128, 128,
strideA = 1310720, 16384, 128, 1,
output: TensorDescriptor 0x7fd70804b250
type = CUDNN_DATA_FLOAT
nbDims = 4
dimA = 10, 80, 128, 128,
strideA = 1310720, 16384, 128, 1,
weight: FilterDescriptor 0x7fd70808a2f0
type = CUDNN_DATA_FLOAT
tensor_format = CUDNN_TENSOR_NCHW
nbDims = 4
dimA = 80, 80, 3, 3,
Pointer addresses:
input: 0x7fd600000000
output: 0x7fd3fd800000
weight: 0x7fd79b68b800
Additional pointer addresses:
grad_output: 0x7fd3fd800000
grad_weight: 0x7fd79b68b800
Backward filter algorithm: 1

Thanks for the update. Could you post the input shapes you are using so that I could try to reproduce it locally?

Input shape: [24, 2, 256, 256]

Here:

Batch Size: 24
Channels: 2
Image Size (Height): 256
Image Size (Width): 256


Thanks. I cannot reproduce the issue using this code:

# your model definitions

print(torch.cuda.get_device_name(0))
torch.backends.cudnn.benchmark = True

model = UNet(2).cuda()
x = torch.randn(24, 2, 256, 256, device='cuda')
target = torch.randn(24, 2, 256, 256, device='cuda')
out = model(x)
print(out.shape)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999))

criterion = nn.MSELoss()

scaler = torch.cuda.amp.GradScaler()

print('running amp')
for _ in range(10):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        out = model(x)
        loss = torch.mean(torch.abs(out - target))
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    print(out.shape)

using an RTX 2080 Ti with PyTorch 1.6.0+CUDA10.2, 1.7.1+CUDA10.2, and 1.7.1+CUDA11.0.
Are you seeing the crash using a single GPU as well or only in this particular setup with two different GPUs?

There is no crash when using a single GPU. The crash happens only in this particular setup with two different GPUs.
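
For the single-GPU comparison, each card can be exposed to the process on its own before CUDA is initialized; a minimal sketch, reusing the CUDA_DEVICE_ORDER setting from train.py:

import os

# Expose only one physical card to the process; it then shows up as cuda:0.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"   # rerun with "1" to test the other card alone

import torch
print(torch.cuda.device_count(), torch.cuda.get_device_name(0))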

OK, unfortunately I won’t be able to easily reproduce this issue as I don’t have a machine ready with these particular GPUs.

OK. I also got this warning before training starts. I don't think it should be a problem, but I would like to know your opinion on it.

Using 2 GPUs!
/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/data_parallel.py:30: UserWarning:
There is an imbalance between your GPUs. You may want to exclude GPU 0 which
has less than 75% of the memory or cores of GPU 1. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))

I tried reducing the batch size, but still ended up getting the crash.

This warning is raised because the two GPUs you are using have different compute performance: the slower one would become the bottleneck of the application, so the data parallel approach might not be beneficial. As always, it depends on the config and use case.
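
What the warning suggests would look roughly like this in the posted train.py (a minimal sketch that simply excludes GPU 0; it addresses the imbalance warning, not necessarily the cuDNN error):

import torch.nn as nn
import network  # the UNet definition posted above

# device_ids[0] becomes the primary device, so the model's parameters (and the
# input/label tensors in the training loop) need to live on cuda:1.
netG = network.UNet(2, first_ch=32)
netG = nn.DataParallel(netG, device_ids=[1]).to('cuda:1')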

Can you make sure that the CUDA version used by PyTorch and the CUDA version in your PATH (or under /usr/local/cuda) are the same?

I faced a similar issue. I am not sure if this was the root cause, but it is worth checking.
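
A quick way to compare the versions from inside the environment (a minimal check; /usr/local/cuda refers to the suggestion above):

import torch

print(torch.version.cuda)               # CUDA version this PyTorch build was compiled against
print(torch.backends.cudnn.version())   # cuDNN version PyTorch is using
# Compare these with the toolkit on the system, e.g. `nvcc --version` or the
# target of the /usr/local/cuda symlink.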

I am running this script in a Docker container. The Docker image was built from:

nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04

The CUDA version for both PyTorch and the Docker environment is 10.2.

I still get the same cuDNN error and crash.