Low GPU (GTX 1070) utilization while training FCN


I am currently trying to train an FCN (fully convolutional network) on my GPU. While training, I see that the GPU's memory usage is very high, but its compute utilization is very low. Is this normal? If not, is there something that I could be doing wrong?

Some code snippets that may help

# dataset snippet
def make_dataset(segmentation_type, mode):
    """Collect (image path, mask path) pairs for one split of the dataset.

    Images are looked up in ``<root>/<mode>/images`` and masks in
    ``<root>/<mode>/<mask dir>``, where ``root`` is a module-level name
    defined outside this function.

    Args:
        segmentation_type: Kind of annotation to load. Only ``'semantic'``
            (mask directory ``labels``) is handled here.
        mode: Name of the split sub-directory under ``root``.

    Returns:
        A list of ``(image_path, mask_path)`` tuples, ordered by file name.
        Each image ``<name>.jpg`` is paired with the mask ``<name>.png``.

    Raises:
        ValueError: If ``segmentation_type`` is not supported.
    """
    if segmentation_type == 'semantic':
        mask_dir_name = 'labels'
    else:
        # Reject anything else up front instead of failing later on an
        # undefined mask directory name.
        raise ValueError('Segmentation type {} unsupported'.format(segmentation_type))

    img_path = os.path.join(root, mode, 'images')
    mask_path = os.path.join(root, mode, mask_dir_name)
    img_files = os.listdir(img_path)
    assert len(img_files) == len(os.listdir(mask_path))

    # os.listdir gives no ordering guarantee; sort so that indices are stable
    # from run to run.
    names = sorted(img.split('.jpg')[0] for img in img_files)
    return [
        (os.path.join(img_path, name + '.jpg'), os.path.join(mask_path, name + '.png'))
        for name in names
    ]

class Mapillary(Dataset):
    """Dataset of (image, mask) pairs listed by ``make_dataset``.

    Each item is loaded from disk on access, optionally passed through a
    joint transform, and then either cut into slices by ``sliding_crop`` or
    returned whole after the per-image / per-mask transforms.
    """

    def __init__(self, segmentation_type, mode, joint_transform=None,
                 sliding_crop=None, transform=None, target_transform=None):
        """Index the files for one split and store the transforms.

        Args:
            segmentation_type: One of ``'panoptic'``, ``'semantic'``,
                ``'instance'``.
            mode: One of ``'training'``, ``'validation'``.
            joint_transform: Optional callable ``(img, mask) -> (img, mask)``
                applied to both before anything else.
            sliding_crop: Optional callable ``(img, mask) ->
                (img_slices, mask_slices, slices_info)``.
            transform: Optional callable applied to the image (or to each
                image slice).
            target_transform: Optional callable applied to the mask (or to
                each mask slice).
        """
        assert segmentation_type in {'panoptic', 'semantic', 'instance'}
        assert mode in {'training', 'validation'}
        self.imgs = make_dataset(segmentation_type, mode)
        self.mode = mode
        self.joint_transform = joint_transform
        self.sliding_crop = sliding_crop
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of (image, mask) pairs."""
        return len(self.imgs)

    def __getitem__(self, idx):
        """Load and transform the sample at position ``idx``.

        Returns:
            ``(img, mask, slices_info)`` when ``sliding_crop`` is set, with
            the slices stacked along a new first dimension and
            ``slices_info`` wrapped in a ``torch.LongTensor``; otherwise
            ``(img, mask)``.
        """
        img_path, mask_path = self.imgs[idx]
        img = Image.open(img_path).convert('RGB')
        # Round-trip through numpy to force the mask to 8-bit values;
        # astype() already returns a fresh array, so no extra copy is needed.
        mask = Image.fromarray(np.array(Image.open(mask_path)).astype(np.uint8))

        if self.joint_transform is not None:
            img, mask = self.joint_transform(img, mask)

        if self.sliding_crop is not None:
            img_slices, mask_slices, slices_info = self.sliding_crop(img, mask)
            if self.transform is not None:
                img_slices = [self.transform(e) for e in img_slices]
            if self.target_transform is not None:
                mask_slices = [self.target_transform(e) for e in mask_slices]
            # torch.stack needs tensors, so the transforms are presumably
            # expected to produce them -- TODO confirm against the caller.
            img, mask = torch.stack(img_slices, 0), torch.stack(mask_slices, 0)
            return img, mask, torch.LongTensor(slices_info)

        # No sliding crop: transform the whole image and mask.
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            mask = self.target_transform(mask)
        return img, mask
# dataloader
# Training loader: shuffled batches of the configured size, fetched by two
# worker processes.
train_loader = DataLoader(
    train_set,
    batch_size=args['train_batch_size'],
    shuffle=True,
    num_workers=2,
)
# train snippet
def train(train_loader, net, criterion, optimizer, epoch, train_args):
    """Run one training epoch and log the running average loss.

    Relies on names defined outside this function: ``AverageMeter``,
    ``writer`` (called via ``add_scalar``) and ``mapillary.num_classes``.

    Args:
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        net: Model called as ``net(inputs)``.
        criterion: Loss callable ``criterion(outputs, labels)``; its result
            is divided by the batch size here.
        optimizer: Optimizer whose gradients are reset and stepped once per
            batch.
        epoch: 1-based epoch number, used to compute the global iteration
            index for logging.
        train_args: Mapping providing ``'print_freq'`` (print every that
            many iterations).
    """
    train_loss = AverageMeter()
    curr_iter = (epoch - 1) * len(train_loader)
    for i, (inputs, labels) in enumerate(train_loader):
        # Spatial dimensions of the inputs and the labels must agree.
        assert inputs.size()[2:] == labels.size()[1:]
        N = inputs.size(0)
        inputs = inputs.cuda()
        labels = labels.cuda()

        # Clear gradients left over from the previous batch before the
        # forward pass.
        optimizer.zero_grad()
        outputs = net(inputs)
        assert outputs.size()[2:] == labels.size()[1:]
        assert outputs.size()[1] == mapillary.num_classes

        loss = criterion(outputs, labels) / N
        # Backpropagate and update the weights; without these two calls the
        # loop only evaluates the network and never trains it.
        loss.backward()
        optimizer.step()

        # Record a plain Python float so the meter does not hold on to
        # tensors.
        train_loss.update(loss.item(), N)

        curr_iter += 1
        writer.add_scalar('train_loss', train_loss.avg, curr_iter)

        if (i + 1) % train_args['print_freq'] == 0:
            print('[epoch %d], [iter %d / %d], [train loss %.5f]' % (
                epoch, i + 1, len(train_loader), train_loss.avg))

I read online that increasing the num_workers argument of the DataLoader may increase GPU utilization, but when I increase num_workers I run into an out-of-memory error. Most of the code I am using was adapted from https://github.com/zijundeng/pytorch-semantic-segmentation. I am quite new to all of this, so any advice would be greatly appreciated.