Low GPU and high CPU utilization with DataParallel

I am running the following code on 8 GPUs with nn.DataParallel. GPU utilization stays very low, around 10%, while CPU utilization is high. Can someone please help me understand what is wrong with the code? (A small timing check is included after the code.)

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.autograd import Variable
from torchvision import datasets, transforms


def conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

class ResNet(nn.Module):

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()

        dtype = torch.cuda.FloatTensor
        n_cells = 2
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)
        self.relu = nn.ReLU()

        # making a list of layers
        layer_list = [self.layer1[0], self.layer1[1], self.layer1[2],
                      self.layer2[0], self.layer2[1], self.layer2[2], self.layer2[3],
                      self.layer3[0], self.layer3[1], self.layer3[2], self.layer3[3],
                      self.layer3[4], self.layer3[5],
                      self.layer4[0], self.layer4[1], self.layer4[2]]
        self.layer_list = nn.ModuleList(layer_list)
    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)
    def forward(self, x, new_weights):
        dtype = torch.cuda.FloatTensor
        device_ids = [0, 1, 2, 3, 4, 5, 6, 7]

        n_samples = int(x.shape[0])
        batch_size = n_samples
        n_cells = 2
    
        conv = Variable(torch.zeros(1, 1, 3, 3).cuda())
        rnn = nn.RNNCell(10, n_cells).cuda()
        linear = nn.Linear(n_cells, 1).cuda()
        sigmoid = nn.Sigmoid().cuda()

        # offsets into the flat new_weights vector
        v1 = int(10*n_cells)
        v2 = v1 + int(n_cells**2)
        v3 = v2 + int(n_cells)
        v4 = v3 + int(n_cells)
        v5 = v4 + 3*3
        v6 = v5 + int(n_cells)
        v7 = v6 + 1

        new_weights = new_weights.data

        value = new_weights[:v1].cuda()
        rnn.weight_ih = torch.nn.Parameter(value.view(n_cells, 10))

        value = new_weights[v1:v2].cuda()
        rnn.weight_hh = torch.nn.Parameter(value.view(n_cells, n_cells))

        value = new_weights[v2:v3].cuda()
        rnn.bias_ih = torch.nn.Parameter(value.view(n_cells))

        value = new_weights[v3:v4].cuda()
        rnn.bias_hh = torch.nn.Parameter(value.view(n_cells))

        value = new_weights[v4:v5].cuda()
        conv = torch.nn.Parameter(value.view(1, 1, 3, 3))  # used as the weight in F.conv2d below

        value = new_weights[v5:v6].cuda()
        linear.weight = torch.nn.Parameter(value.view(1, n_cells))

        value = new_weights[v6:v7].cuda()
        linear.bias = torch.nn.Parameter(value.view(1))
    
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)  # F.relu(out)

        hidden = Variable(torch.zeros(n_samples, n_cells).type(dtype)).cuda()

        for layer in self.layer_list:

            path_1 = layer(out)
            path_2 = self.relu(layer.shortcut(out))  # F.relu(layer.shortcut(out))

            ht = out.shape[2]
            wd = out.shape[3]
            n_samples = out.shape[0]
            batch_size = out.shape[0]

            input_to_rnn = F.conv2d(out.sum(1).view(n_samples, 1, ht, wd), conv)
            input_to_rnn.cuda()
            input_to_rnn = self.relu(input_to_rnn)  # F.relu(input_to_rnn)
            input_to_rnn.cuda()

            if (input_to_rnn.shape[2] >= 4) and (input_to_rnn.shape[3] >= 4):
                input_to_rnn = F.avg_pool2d(input_to_rnn, 4)

            input_to_rnn = input_to_rnn.view(n_samples, 1, -1)

            if input_to_rnn.shape[2] < 10:
                repl = Variable(torch.zeros(batch_size, 1, 10 - int(input_to_rnn.shape[2])).cuda())
                input_to_rnn = torch.cat((input_to_rnn, repl), 2).cuda()

            one_d = math.floor(float(input_to_rnn.shape[2]) / 10)
            input_to_rnn = F.avg_pool1d(input_to_rnn, one_d)
            input_to_rnn = input_to_rnn.contiguous()
            input_to_rnn = input_to_rnn[:, :, :10]
            input_to_rnn = input_to_rnn.contiguous()
            input_to_rnn = input_to_rnn.view(n_samples, 10)

            hidden = rnn(input_to_rnn, hidden).cuda()
            logistic = linear(hidden).cuda()
            prob = sigmoid(logistic).cuda()

            shape = path_1.shape
            num_features = int(shape[1]*shape[2]*shape[3])

            mask = torch.bernoulli(prob).cuda()  # (device=device_ids[0])
            s = 1 - torch.mean(mask)
            s = s.cuda()  # (device=device_ids[0])
            mask = mask.repeat(1, num_features).cuda()
            mask = mask.view(batch_size, int(shape[1]), int(shape[2]), int(shape[3])).cuda()
            mask = mask.cuda()  # type('torch.cuda.FloatTensor')
            unmask = Variable(torch.ones(batch_size, int(shape[1]), int(shape[2]), int(shape[3])).cuda()) - mask

            out = mask*path_1 + self.relu(unmask*path_2)  # F.relu(unmask*path_2)

        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out



def main():
    device_ids = [0, 1, 2, 3, 4, 5, 6, 7]
    dtype = torch.cuda.FloatTensor

    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

    folder_train = 'folder_train'
    folder_test = 'folder_train'
    trainset = datasets.CIFAR10(folder_train, train=True, download=True, transform=transform_train)
    bsz = 256

    train_set = torch.utils.data.DataLoader(trainset, batch_size=bsz, shuffle=True,
                                            num_workers=2, pin_memory=True)

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

    testset = torchvision.datasets.CIFAR10(folder_test, train=False, download=True,
                                           transform=transform_test)
    bsz_test = 64

    test_set = torch.utils.data.DataLoader(testset, batch_size=bsz_test, shuffle=False,
                                           num_workers=2, pin_memory=True)

    model = ResNet(BasicBlock, [3, 4, 6, 3])  # for resnet34
    model = nn.DataParallel(model, device_ids=device_ids)
    # nn.DataParallel(model_1, device_ids=device_ids)  # .cuda(device=device_ids[0])
    model = model.cuda()


    print('gpu count', torch.cuda.device_count())

    dtype_1 = torch.cuda.LongTensor
    dtype = torch.cuda.FloatTensor

    for epoch in range(100):
        weights = Variable(torch.zeros(40*8).cuda())
        for data, target in train_set:
            data, target = Variable(data.type(dtype)), Variable(target.type(dtype_1))
            output = model(data, weights)


if __name__ == '__main__':
    main()
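
For what it's worth, below is a minimal timing check along the lines of what I can run to see whether the forward pass itself keeps the GPUs busy, or whether most of each iteration is spent outside the GPU kernels. This is just a sketch: it assumes the model, train_set and weights defined above, and the time_forward_passes helper is only for illustration, not part of the training code.

import time

def time_forward_passes(model, loader, weights, n_batches=20):
    # CUDA kernels launch asynchronously, so synchronize before and after
    # the forward pass to get meaningful wall-clock numbers.
    times = []
    for i, (data, _) in enumerate(loader):
        if i >= n_batches:
            break
        data = Variable(data.cuda())
        torch.cuda.synchronize()
        start = time.time()
        output = model(data, weights)
        torch.cuda.synchronize()
        times.append(time.time() - start)
    print('mean forward time per batch: %.4f s' % (sum(times) / len(times)))

If the measured forward time is only a small fraction of the time each training iteration actually takes, the bottleneck is presumably outside the GPU work.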