I am having a problem running training on multiple GPUs with DataParallel. The code works fine when only one GPU is used for training. I have pasted my code below.
batch_loader.py:
from torch.utils import data
import random
import os
import numpy as np
import torch
class TrainFolder(data.Dataset):
    def __init__(self, file):
        super(TrainFolder, self).__init__()
        self.images = []
        fid = file
        for x in fid:
            labelfile = x.replace("input", "target")
            info = (x, labelfile)
            self.images.append(info)
        random.shuffle(self.images)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image_file, label_file = self.images[index]
        img = np.load(image_file)
        lab = np.load(label_file)
        img = np.rollaxis(img, 2, 0)
        lab = np.rollaxis(lab, 2, 0)
        img = torch.from_numpy(img[:, :, :])
        lab = torch.from_numpy(lab[:, :, :])
        return img, lab
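For reference, this is roughly how I sanity-check the dataset (a minimal sketch; the paths here are placeholders, not my real ones). It shows the input/target pairing convention: each target path is derived by replacing "input" with "target" in the input path.

# Sanity-check sketch for TrainFolder (hypothetical paths).
import glob
from batch_loader import TrainFolder

files = glob.glob('/tmp/train/input/*.npy')   # placeholder location
dataset = TrainFolder(files)
img, lab = dataset[0]                         # each item is a (C, H, W) tensor pair
print(img.shape, img.dtype, lab.shape, lab.dtype)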
network.py:
import math
import torch
import torch.nn as nn
def gen_initialization(m):
    if type(m) == nn.Conv2d:
        sh = m.weight.shape
        nn.init.normal_(m.weight, std=math.sqrt(2.0 / (sh[0]*sh[2]*sh[3])))
        nn.init.constant_(m.bias, 0)
    elif type(m) == nn.BatchNorm2d:
        nn.init.normal_(m.weight)
        nn.init.normal_(m.bias)

class TripleConv(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(TripleConv, self).__init__()
        mid_ch = (in_ch + out_ch) // 2
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, mid_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU(),
            nn.Conv2d(mid_ch, mid_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU(),
            nn.Conv2d(mid_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU()
        )
        self.conv.apply(gen_initialization)

    def forward(self, x):
        return self.conv(x)

class Down(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Down, self).__init__()
        self.triple_conv = TripleConv(in_ch, out_ch)
        self.avg_pool_conv = nn.AvgPool2d(2, 2)
        self.in_ch = in_ch
        self.out_ch = out_ch

    def forward(self, x):
        self.cache = self.triple_conv(x)
        pad = torch.zeros(x.shape[0], self.out_ch - self.in_ch, x.shape[2], x.shape[3], device=x.device)
        x = torch.cat((x, pad), dim=1)
        self.cache += x
        return self.avg_pool_conv(self.cache)

class Center(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Center, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU()
        )
        self.conv.apply(gen_initialization)

    def forward(self, x):
        return self.conv(x)

class Up(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Up, self).__init__()
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear',
                                    align_corners=True)
        self.triple_conv = TripleConv(in_ch, out_ch)

    def forward(self, x, cache):
        x = self.upsample(x)
        x = torch.cat((x, cache), dim=1)
        x = self.triple_conv(x)
        return x

class UNet(nn.Module):
    def __init__(self, in_ch, first_ch=None):
        super(UNet, self).__init__()
        if not first_ch:
            first_ch = 32
        self.down1 = Down(in_ch, first_ch)
        self.down2 = Down(first_ch, first_ch*2)
        self.down3 = Down(first_ch*2, first_ch*4)
        self.down4 = Down(first_ch*4, first_ch*8)
        self.center = Center(first_ch*8, first_ch*8)
        self.up4 = Up(first_ch*8*2, first_ch*4)
        self.up3 = Up(first_ch*4*2, first_ch*2)
        self.up2 = Up(first_ch*2*2, first_ch)
        self.up1 = Up(first_ch*2, first_ch)
        self.output = nn.Conv2d(first_ch, in_ch, kernel_size=3, stride=1,
                                padding=1, bias=True)
        self.output.apply(gen_initialization)

    def forward(self, x):
        x = self.down1(x)
        x = self.down2(x)
        x = self.down3(x)
        x = self.down4(x)
        x = self.center(x)
        x = self.up4(x, self.down4.cache)
        x = self.up3(x, self.down3.cache)
        x = self.up2(x, self.down2.cache)
        x = self.up1(x, self.down1.cache)
        x = self.output(x)
        return x
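A quick shape check of the network on its own (a sketch using the same settings as train.py: two input channels, 256x256 patches) passes on CPU and on a single GPU, with the output matching the input shape:

# Shape smoke test for the UNet above (sketch).
import torch
import network

net = network.UNet(2, first_ch=32)
x = torch.randn(1, 2, 256, 256)
with torch.no_grad():
    y = net(x)
print(y.shape)  # expected: torch.Size([1, 2, 256, 256])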
train.py:
from configobj import ConfigObj
from tqdm import tqdm
import os
import network
import glob
import random
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from batch_loader import TrainFolder
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
def init_parameters():
    tc, vc = ConfigObj(), ConfigObj()
    tc.batch_size, vc.batch_size = 20, 4
    tc.n_channels, vc.n_channels = 2, 2
    tc.image_size, vc.image_size = 256, 256
    tc.use_fp16, vc.use_fp16 = False, False  # set to True to use fp16 precision instead of fp32
    return tc, vc

if __name__ == '__main__':
    num_workers = 10
    torch.manual_seed(47)
    torch.backends.cudnn.benchmark = True
    train_samples = glob.glob('/home/data/nas/Processed_Data/training_data/spa_network/npyfiles/train/input/*.npy')
    valid_samples = glob.glob('/home/data/nas/Processed_Data/training_data/spa_network/npyfiles/valid/input/*.npy')
    random.shuffle(train_samples)
    trainData = TrainFolder(train_samples)
    validData = TrainFolder(valid_samples)
    train_config, valid_config = init_parameters()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    input = torch.Tensor(train_config.batch_size, train_config.n_channels, train_config.image_size, train_config.image_size).to(device)
    input.requires_grad = False
    label = torch.Tensor(train_config.batch_size, train_config.n_channels, train_config.image_size, train_config.image_size).to(device)
    label.requires_grad = False
    valid_input = torch.Tensor(valid_config.batch_size, valid_config.n_channels, valid_config.image_size, valid_config.image_size).to(device)
    valid_input.requires_grad = False
    valid_label = torch.Tensor(valid_config.batch_size, valid_config.n_channels, valid_config.image_size, valid_config.image_size).to(device)
    valid_label.requires_grad = False
    train_data_loader = DataLoader(dataset=trainData, num_workers=num_workers, batch_size=train_config.batch_size, shuffle=True, drop_last=False, pin_memory=True)
    valid_data_loader = DataLoader(dataset=validData, num_workers=num_workers, batch_size=valid_config.batch_size, shuffle=True, drop_last=False, pin_memory=True)
    netG = network.UNet(2, first_ch=32)
    if torch.cuda.device_count() > 1:
        print("Using ", torch.cuda.device_count(), "GPUs!")
        netG = nn.DataParallel(netG)
    netG.to(device)
    optimizerG = optim.Adam(netG.parameters(), lr=1e-3, betas=(0.9, 0.999))
    # Initialize the MSELoss criterion
    criterion = nn.MSELoss().to(device=device)
    scalerG = torch.cuda.amp.GradScaler(enabled=train_config.use_fp16)
    print('Start training')
    niter = 10000
    for epoch in range(niter):
        netG.train()
        train_g_mse_error = 0
        for i, data in enumerate(tqdm(train_data_loader)):
            input.copy_(data[0])
            label.copy_(data[1])
            # train the generator over here
            netG.zero_grad()
            optimizerG.zero_grad()
            with torch.cuda.amp.autocast(enabled=train_config.use_fp16):
                output = netG(input)
                errG_mse = torch.mean(torch.abs(output - label))
            scalerG.scale(errG_mse).backward()
            train_g_mse_error += errG_mse.mean()
            scalerG.step(optimizerG)
            scalerG.update()
        train_g_mse_error = train_g_mse_error / len(train_data_loader)
        netG.eval()
        with torch.no_grad():
            valid_g_mse_error = 0
            for i, batch in enumerate(tqdm(valid_data_loader)):
                valid_input.copy_(batch[0])
                valid_label.copy_(batch[1])
                with torch.cuda.amp.autocast(enabled=valid_config.use_fp16):
                    G_output = netG(valid_input)
                    valid_errG_mse = torch.mean(torch.abs(G_output - valid_label))
                valid_g_mse_error += valid_errG_mse.mean()
            valid_g_mse_error = valid_g_mse_error / len(valid_data_loader)
        if epoch % 5 == 0:
            torch.save(netG.state_dict(), f'model/network_epoch{epoch}.pth')
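To isolate the problem from the data pipeline, a minimal repro along these lines (a sketch with random tensors in place of my .npy data, shapes taken from init_parameters above) exercises the same DataParallel forward/backward path:

# Minimal DataParallel repro sketch, no real data involved.
import torch
import torch.nn as nn
import network

device = torch.device('cuda:0')
net = nn.DataParallel(network.UNet(2, first_ch=32)).to(device)
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

x = torch.randn(20, 2, 256, 256, device=device)
y = torch.randn(20, 2, 256, 256, device=device)

out = net(x)                            # batch is split across both GPUs
loss = torch.mean(torch.abs(out - y))
loss.backward()                         # this is where the cuDNN error appears
opt.step()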
Error:
Traceback (most recent call last):
  File "train.py", line 85, in <module>
    scalerG.scale(errG_mse).backward()
  File "/usr/local/lib/python3.6/dist-packages/torch/tensor.py", line 185, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py", line 127, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM
Exception raised from operator() at /pytorch/aten/src/ATen/native/cudnn/Conv.cpp:1141 (most recent call first):
Environment:
Ubuntu 18.04
CUDA: 10.2
PyTorch: 1.6.0
cuDNN: 7.5
GPU 0: RTX 1080
GPU 1: RTX 2080
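Since the two cards are different GPU architectures, here is a small diagnostic sketch (standard torch.cuda / torch.backends calls only) that reports what each device and the cuDNN build look like on this machine:

# Report per-device name and compute capability plus the cuDNN build in use.
import torch

print(torch.__version__, torch.version.cuda, torch.backends.cudnn.version())
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i), torch.cuda.get_device_capability(i))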