Work is not evenly distributed across the GPUs

I’m working on classification of ImageNet2012 using VGG16.

I'm currently working with 8 GPUs (GTX 1080 Ti).

The problem is that the work is allocated almost entirely to GPU 0, so I cannot run inference with a large batch size.

Below is my code. (The VGG16 parameters were extracted from the pre-trained model.)

from __future__ import print_function

import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
from torch.autograd import Variable
import torchvision.models as models

from utils import progress_bar

import os
import VGG16_conv1
import VGG16_conv2
import VGG16_conv3
import VGG16_conv4
import VGG16_conv5
import VGG16_conv6
import VGG16_conv7
import VGG16_conv8
import VGG16_conv9
import VGG16_conv10
import VGG16_conv11
import VGG16_conv12
import VGG16_conv13
import VGG16_linear1
import VGG16_linear2
import VGG16_linear3

use_cuda = torch.cuda.is_available()
best_acc = 0  # best test accuracy

valdir = os.path.join("/home/mhha/", 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Scale(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=100, shuffle=False,
        num_workers=4, pin_memory=True)

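# Pretrained VGG16; its weights are copied into the per-layer modules below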
net = models.vgg16(pretrained=True)

net_conv1 = VGG16_conv1.VGG16_CONV1()
net_conv2 = VGG16_conv2.VGG16_CONV2()
net_conv3 = VGG16_conv3.VGG16_CONV3()
net_conv4 = VGG16_conv4.VGG16_CONV4()
net_conv5 = VGG16_conv5.VGG16_CONV5()
net_conv6 = VGG16_conv6.VGG16_CONV6()
net_conv7 = VGG16_conv7.VGG16_CONV7()
net_conv8 = VGG16_conv8.VGG16_CONV8()
net_conv9 = VGG16_conv9.VGG16_CONV9()
net_conv10 = VGG16_conv10.VGG16_CONV10()
net_conv11 = VGG16_conv11.VGG16_CONV11()
net_conv12 = VGG16_conv12.VGG16_CONV12()
net_conv13 = VGG16_conv13.VGG16_CONV13()
net_linear1 = VGG16_linear1.VGG16_LINEAR1()
net_linear2 = VGG16_linear2.VGG16_LINEAR2()
net_linear3 = VGG16_linear3.VGG16_LINEAR3()

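# Copy the pretrained weights and biases, layer by layer, into the standalone modules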
print("saving CONV1 weights")
net_conv1.conv1[0].weight = net.features[0].weight
net_conv1.conv1[0].bias = net.features[0].bias

print("saving CONV2 weights")
net_conv2.conv2[0].weight = net.features[2].weight
net_conv2.conv2[0].bias = net.features[2].bias

print("saving CONV3 weights")
net_conv3.conv3[0].weight = net.features[5].weight
net_conv3.conv3[0].bias = net.features[5].bias

print("saving CONV4 weights")
net_conv4.conv4[0].weight = net.features[7].weight
net_conv4.conv4[0].bias = net.features[7].bias

print("saving CONV5 weights")
net_conv5.conv5[0].weight = net.features[10].weight
net_conv5.conv5[0].bias = net.features[10].bias

print("saving CONV6 weights")
net_conv6.conv6[0].weight = net.features[12].weight
net_conv6.conv6[0].bias = net.features[12].bias

print("saving CONV7 weights")
net_conv7.conv7[0].weight = net.features[14].weight
net_conv7.conv7[0].bias = net.features[14].bias

print("saving CONV8 weights")
net_conv8.conv8[0].weight = net.features[17].weight
net_conv8.conv8[0].bias = net.features[17].bias

print("saving CONV9 weights")
net_conv9.conv9[0].weight = net.features[19].weight
net_conv9.conv9[0].bias = net.features[19].bias

print("saving CONV10 weights")
net_conv10.conv10[0].weight = net.features[21].weight
net_conv10.conv10[0].bias = net.features[21].bias

print("saving CONV11 weights")
net_conv11.conv11[0].weight = net.features[24].weight
net_conv11.conv11[0].bias = net.features[24].bias

print("saving CONV12 weights")
net_conv12.conv12[0].weight = net.features[26].weight
net_conv12.conv12[0].bias = net.features[26].bias

print("saving CONV13 weights")
net_conv13.conv13[0].weight = net.features[28].weight
net_conv13.conv13[0].bias = net.features[28].bias

print("saving FC1 weights")
net_linear1.linear1[0].weight = net.classifier[0].weight
net_linear1.linear1[0].bias = net.classifier[0].bias

print("saving FC2 weights")
net_linear2.linear2[0].weight = net.classifier[3].weight
net_linear2.linear2[0].bias = net.classifier[3].bias

print("saving FC3 weights")
net_linear3.linear3[0].weight = net.classifier[6].weight
net_linear3.linear3[0].bias = net.classifier[6].bias

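# Move every sub-module to the GPUs and wrap it in DataParallel so each batch is split across all visible devices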
if use_cuda:
    net_conv1.cuda()
    net_conv1 = torch.nn.DataParallel(net_conv1, device_ids=range(torch.cuda.device_count()))
    net_conv2.cuda()
    net_conv2 = torch.nn.DataParallel(net_conv2, device_ids=range(torch.cuda.device_count()))
    net_conv3.cuda()
    net_conv3 = torch.nn.DataParallel(net_conv3, device_ids=range(torch.cuda.device_count()))
    net_conv4.cuda()
    net_conv4 = torch.nn.DataParallel(net_conv4, device_ids=range(torch.cuda.device_count()))
    net_conv5.cuda()
    net_conv5 = torch.nn.DataParallel(net_conv5, device_ids=range(torch.cuda.device_count()))
    net_conv6.cuda()
    net_conv6 = torch.nn.DataParallel(net_conv6, device_ids=range(torch.cuda.device_count()))
    net_conv7.cuda()
    net_conv7 = torch.nn.DataParallel(net_conv7, device_ids=range(torch.cuda.device_count()))
    net_conv8.cuda()
    net_conv8 = torch.nn.DataParallel(net_conv8, device_ids=range(torch.cuda.device_count()))
    net_conv9.cuda()
    net_conv9 = torch.nn.DataParallel(net_conv9, device_ids=range(torch.cuda.device_count()))
    net_conv10.cuda()
    net_conv10 = torch.nn.DataParallel(net_conv10, device_ids=range(torch.cuda.device_count()))
    net_conv11.cuda()
    net_conv11 = torch.nn.DataParallel(net_conv11, device_ids=range(torch.cuda.device_count()))
    net_conv12.cuda()
    net_conv12 = torch.nn.DataParallel(net_conv12, device_ids=range(torch.cuda.device_count()))
    net_conv13.cuda()
    net_conv13 = torch.nn.DataParallel(net_conv13, device_ids=range(torch.cuda.device_count()))
    net_linear1.cuda()
    net_linear1 = torch.nn.DataParallel(net_linear1, device_ids=range(torch.cuda.device_count()))
    net_linear2.cuda()
    net_linear2 = torch.nn.DataParallel(net_linear2, device_ids=range(torch.cuda.device_count()))
    net_linear3.cuda()
    net_linear3 = torch.nn.DataParallel(net_linear3, device_ids=range(torch.cuda.device_count()))
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()

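# Run the validation set through the sixteen sub-modules in sequence and track loss and accuracy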
def test():
    global best_acc
    net_conv1.eval()
    net_conv2.eval()
    net_conv3.eval()
    net_conv4.eval()
    net_conv5.eval()
    net_conv6.eval()
    net_conv7.eval()
    net_conv8.eval()
    net_conv9.eval()
    net_conv10.eval()
    net_conv11.eval()
    net_conv12.eval()
    net_conv13.eval()
    net_linear1.eval()
    net_linear2.eval()
    net_linear3.eval()
    test_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(val_loader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = Variable(inputs), Variable(targets)

        outputs_conv1 = net_conv1(inputs)
        outputs_conv2 = net_conv2(outputs_conv1)
        outputs_conv3 = net_conv3(outputs_conv2)
        outputs_conv4 = net_conv4(outputs_conv3)
        outputs_conv5 = net_conv5(outputs_conv4)
        outputs_conv6 = net_conv6(outputs_conv5)
        outputs_conv7 = net_conv7(outputs_conv6)
        outputs_conv8 = net_conv8(outputs_conv7)
        outputs_conv9 = net_conv9(outputs_conv8)
        outputs_conv10 = net_conv10(outputs_conv9)
        outputs_conv11 = net_conv11(outputs_conv10)
        outputs_conv12 = net_conv12(outputs_conv11)
        outputs_conv13 = net_conv13(outputs_conv12)
        outputs_linear1 = net_linear1(outputs_conv13)
        outputs_linear2 = net_linear2(outputs_linear1)
        outputs_linear3 = net_linear3(outputs_linear2)

        loss = criterion(outputs_linear3, targets)

        test_loss += loss.data
        _, predicted = torch.max(outputs_linear3.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()

        progress_bar(batch_idx, len(val_loader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
            % (test_loss/(batch_idx+1), 100.*float(correct)/float(total), correct, total))

    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net_conv1': net_conv1.module if use_cuda else net_conv1,
            'net_conv2': net_conv2.module if use_cuda else net_conv2,
            'net_conv3': net_conv3.module if use_cuda else net_conv3,
            'net_conv4': net_conv4.module if use_cuda else net_conv4,
            'net_conv5': net_conv5.module if use_cuda else net_conv5,
            'net_conv6': net_conv6.module if use_cuda else net_conv6,
            'net_conv7': net_conv7.module if use_cuda else net_conv7,
            'net_conv8': net_conv8.module if use_cuda else net_conv8,
            'net_conv9': net_conv9.module if use_cuda else net_conv9,
            'net_conv10': net_conv10.module if use_cuda else net_conv10,
            'net_conv11': net_conv11.module if use_cuda else net_conv11,
            'net_conv12': net_conv12.module if use_cuda else net_conv12,
            'net_conv13': net_conv13.module if use_cuda else net_conv13,
            'net_linear1': net_linear1.module if use_cuda else net_linear1,
            'net_linear2': net_linear2.module if use_cuda else net_linear2,
            'net_linear3': net_linear3.module if use_cuda else net_linear3,
            'acc': acc,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt_vgg16_layer_by_layer_v1.t7')
        best_acc = acc

# Train+inference vs. Inference
print("Layer by layer VGG16 Test \n")
test()

Where is the problem?

Can you print the result of torch.cuda.device_count()?
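For reference, a quick check (assuming CUDA_VISIBLE_DEVICES is not hiding any cards) would be:

import torch
print(torch.cuda.device_count())  # should print 8 if all GTX 1080 Ti cards are visible to PyTorch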

All 8 GPUs are being used, but GPU 0 is doing more work than the others.

What happens if you increase the num_workers of your DataLoader to 4 * torch.cuda.device_count()?
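Something along these lines (just a sketch of the suggested change; val_dataset stands for the ImageFolder dataset you already build):

import torch

val_loader = torch.utils.data.DataLoader(
        val_dataset,                                 # your existing ImageFolder dataset
        batch_size=100, shuffle=False,
        num_workers=4 * torch.cuda.device_count(),   # 32 workers on an 8-GPU machine
        pin_memory=True)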

EDIT: Could you show the output of nvidia-smi?

And if you increase the batch_size to the maximum, are they all used equally, or is GPU 0 running out of memory?

GPU 0 runs out of memory.

Have you tried finding the maximal batch size that fits on a single GPU and multiplying that value by 8, or did you simply pick an arbitrarily high batch size?
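I.e., something like this (max_batch_per_gpu is a hypothetical value you would determine by trial on one card):

import torch

max_batch_per_gpu = 64                                      # found empirically on a single GPU
batch_size = max_batch_per_gpu * torch.cuda.device_count()  # scale up for DataParallel across 8 cards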

I just used a large size and then reduced it.

I think the problem is GPU 0, but I don't know how to solve it.