Training is very slow (volatile GPU utilization is very low)

import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torchvision import transforms


class MyCustomDataset(torch.utils.data.Dataset):
    def __init__(self, cropped_1x32_dataset, targets, transforms=None):
        for i in range(32):
            self.__setattr__('data_{}'.format(i), cropped_1x32_dataset[i])
        self.targets = targets
        self.transforms = transforms
        
    def __getitem__(self, index):
        for i in range(32):
            globals()['data_{}'.format(i)] = self.__getattribute__('data_{}'.format(i))[index]
        y = self.targets[index]
        data = [globals()['data_{}'.format(i)] for i in range(32)]
        if self.transforms is not None:
            for i in range(32):
                data[i] = self.transforms(ToPIL(data[i]))
        return data,y
    
    def __len__(self):
        return len(self.data_0)

transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4824, 0.4467),
                         std=(0.2471, 0.2436, 0.2616)),
])
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
batch_size = 128
learning_rate = 0.1

# train
my_train_dataset = MyCustomDataset(train_cropped_1x32_dataset, train_dataset.targets, transform_train)
my_train_loader = torch.utils.data.DataLoader(dataset=my_train_dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=os.cpu_count())

device = 'cuda'
model = Vgg16()
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

if torch.cuda.device_count() > 0:
    print("USE", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
    cudnn.benchmark = True
else:
    print("USE ONLY CPU!")

if torch.cuda.is_available():
    model.cuda()

def train(epoch):
    model.train()
    train_loss = 0
    total = 0
    correct = 0
    
    for batch_idx, (cropped_1x32_dataset, target) in enumerate(my_train_loader):
        for i in range(32):   
            cropped_1x32_dataset[i] = cropped_1x32_dataset[i].to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output = model(cropped_1x32_dataset, return_features=False)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(output, 1)
        total += target.size(0)
        correct += predicted.eq(target).cpu().sum()
        if batch_idx % 10 == 9:
            print('Epoch: {} | Batch: {} |  Loss: ({:.4f}) | Acc: ({:.2f}%) ({}/{})'
                  .format(epoch+1, batch_idx+1, train_loss/(batch_idx+1), 100.*correct.item()/total, correct, total))

I’m running with os.environ["CUDA_VISIBLE_DEVICES"] = '1'.
But as you can see, when I check nvidia-smi, the volatile GPU utilization is almost always 0%.
Sometimes it goes up to 10~30%, so training is very slow.

How can I use the full GPU power to speed up training?
Thank you.

Could you time your data loading as done in the ImageNet example?
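Something like this rough sketch (adapted from the ImageNet example's timing pattern; data_time and batch_time are just illustrative names) would show whether the DataLoader or the GPU is the bottleneck:

import time

end = time.time()
for batch_idx, (cropped_1x32_dataset, target) in enumerate(my_train_loader):
    data_time = time.time() - end  # time spent waiting for the next batch

    # ... your existing .to(device), forward, backward, optimizer.step() ...

    batch_time = time.time() - end  # total time for this iteration
    end = time.time()
    if batch_idx % 10 == 9:
        print('Batch: {} | data {:.3f}s | batch {:.3f}s'.format(
            batch_idx + 1, data_time, batch_time))

If data_time stays close to batch_time, the GPU is mostly waiting for data and the Dataset/DataLoader is the part to optimize.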
I’m also wondering why you are using globals() inside your Dataset, as I would assume this could slow down your data loading pipeline.
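As a point of comparison, here is a rough sketch of the same Dataset that just keeps the 32 crops in a list attribute instead of going through globals() and __getattribute__ (assuming ToPIL is your torchvision ToPILImage instance):

class MyCustomDataset(torch.utils.data.Dataset):
    def __init__(self, cropped_1x32_dataset, targets, transforms=None):
        # keep the 32 cropped views in a single list
        self.data = list(cropped_1x32_dataset)
        self.targets = targets
        self.transforms = transforms

    def __getitem__(self, index):
        data = [d[index] for d in self.data]
        if self.transforms is not None:
            data = [self.transforms(ToPIL(x)) for x in data]
        return data, self.targets[index]

    def __len__(self):
        return len(self.data[0])

The indexing and transform logic is unchanged; this just avoids the repeated attribute lookups and the writes into the global namespace in __getitem__.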
