Memory waste on GPU

Hi, I am reproducing the paper "xUnit: Learning a Spatial Activation Function for Efficient Image Restoration" but get an out-of-memory error.
The code is implemented at https://github.com/kligvasser/xUnit
The main code is here:

import torch
import torch.nn as nn

class Gaussian(nn.Module):
    # Gaussian activation: exp(-x^2), applied element-wise
    def forward(self, input):
        return torch.exp(-torch.mul(input, input))

class Modulecell(nn.Module):
    def __init__(self, in_channels=3, out_channels=64, kernel_size=3, skernel_size=9):
        super(Modulecell, self).__init__()
        # feature branch: a single convolution
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=((kernel_size - 1) // 2)))
        # xUnit branch: BN -> ReLU -> depthwise conv -> BN -> Gaussian gate
        self.module = nn.Sequential(
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=skernel_size, stride=1, padding=((skernel_size - 1) // 2), groups=out_channels),
            nn.BatchNorm2d(out_channels),
            Gaussian())

    def forward(self, x):
        x1 = self.features(x)
        x2 = self.module(x1)
        # gate the features with the spatial activation map
        x = torch.mul(x1, x2)
        return x

class xDnCNN(nn.Module):
    def __init__(self, channels=64):
        super(xDnCNN, self).__init__()
        # first block maps the RGB input to `channels` feature maps
        blocks = [Modulecell(in_channels=3, out_channels=channels, kernel_size=3)]
        # seven further BatchNorm + Modulecell blocks
        for _ in range(7):
            blocks.append(nn.Sequential(
                nn.BatchNorm2d(channels),
                Modulecell(in_channels=channels, out_channels=channels, kernel_size=3)))
        self.body = nn.Sequential(*blocks)
        # final 3x3 convolution maps back to RGB
        self.joints = nn.Conv2d(channels, 3, kernel_size=3, padding=1)

    def forward(self, x):
        x = self.body(x)
        x = self.joints(x)
        return x
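A quick shape sanity check (the input size is just an example) confirms the network maps an RGB image back to RGB at the same resolution:

net = xDnCNN()
x = torch.randn(1, 3, 64, 64)  # dummy RGB patch
print(net(x).shape)            # torch.Size([1, 3, 64, 64])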
The repository does not include the training loop, so here is mine:
import time
import torch.backends.cudnn as cudnn
import torch.utils.data as data

def train(net, optimizer, criterion):
    # net: the xDnCNN shown above
    # optimizer: Adam
    # criterion: MSE loss
    # (opt, VOCDetection, preproc, adjust_learning_rate come from my training script)
    net = torch.nn.DataParallel(net, device_ids=list(range(opt.ngpu)))
    net.cuda()
    cudnn.benchmark = True
    net.train()
    epoch = 0
    print('Loading Dataset...')
    dataset = VOCDetection(root='/home/jgw/VOC', nroot='/home/jgw/noisedata/VOCG25',
                           image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
                           preproc=preproc(512))
    epoch_size = len(dataset) // opt.batch_size
    max_iter = opt.max_epoch * epoch_size
    stepvalues_VOC = (150 * epoch_size, 200 * epoch_size, 250 * epoch_size)
    step_index = 0
    lr = opt.lr

    for iteration in range(max_iter):
        if iteration % epoch_size == 0:
            batch_iterator = iter(data.DataLoader(dataset, opt.batch_size, shuffle=False,
                                                  num_workers=opt.num_workers))
            if (epoch % 10 == 0 and epoch > 0) or (epoch % 5 == 0 and epoch > 200):
                torch.save(net.state_dict(), '/home/jgw/Downloads' + '_' + repr(epoch) + '.pth')
            epoch += 1
        load_t0 = time.time()
        if iteration in stepvalues_VOC:
            step_index += 1
        lr = adjust_learning_rate(optimizer, 0.1, epoch, step_index, iteration, epoch_size)
        images, nimages = next(batch_iterator)
        # NHWC -> NCHW
        images = images.permute(0, 3, 1, 2)
        nimages = nimages.permute(0, 3, 1, 2)
        if opt.cuda:
            images = images.float().cuda()
            nimages = nimages.float().cuda()
        out = net(nimages)                 # denoise the noisy batch
        optimizer.zero_grad()
        loss_l2 = criterion(out, images)   # compare against the clean batch
        loss_l2.backward()
        optimizer.step()
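adjust_learning_rate is a standard step-decay helper; for completeness, here is a minimal sketch consistent with the call above (the warm-up detail is my assumption, borrowed from common SSD-style training scripts):

def adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size):
    # linear warm-up over the first epoch, then decay by gamma at each milestone
    if epoch < 1:
        lr = opt.lr * (iteration + 1) / epoch_size
    else:
        lr = opt.lr * (gamma ** step_index)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr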

In the original paper, the authors say this work was run on an Nvidia 1080 Ti.
The batch size is 64, and I am trying it on the VOC0712 dataset.
But I have 4x12GB = 48GB of GPU memory in total and still get:
RuntimeError: CUDA out of memory.
I suspect there are bugs in my code that lead to memory waste.
How can I fix this?
Thank you for your time and consideration! :grin:

Since you are using nn.DataParallel, your model will be replicated onto each specified GPU and the data will be split along the batch dimension, so that each replica on each device can compute its forward and backward pass independently. The 4x12GB = 48GB are thus not usable together as one device for your model.
Could you lower the batch size, remove nn.DataParallel, and check which batch size works for a single-GPU workflow?
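As a minimal sketch of what happens under the hood (device ids and shapes are just an example): with a global batch of 32 on 4 GPUs, each replica sees a chunk of 8 samples, so what matters is the per-GPU footprint, not the summed 48GB.

import torch
import torch.nn as nn

net = nn.DataParallel(xDnCNN(), device_ids=[0, 1, 2, 3]).cuda()
x = torch.randn(32, 3, 512, 512).cuda()  # global batch of 32
out = net(x)  # each of the 4 replicas processes a chunk of 8 samples
# every GPU stores its own copy of the parameters plus the activations
# for its 8-sample chunk, so each chunk must fit into 12GB on its own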

Thank you for your answer. I tried reducing the batch size and found that batch_size=16 works well.
But I am confused: under nn.DataParallel with batch_size=32, each GPU actually works on only 8 images. To my knowledge, 8 images per GPU should be fine, because the capacity of this model is relatively small compared to other high-level tasks (object detection, semantic segmentation, etc.). So I want to know whether my code has bugs, or whether there is another way to solve this while keeping the batch size.