GPU is getting detected, but not utilised

I am training a GAN for mask removal from human faces.
While training, my device is reported as ‘cuda’ and my model and data are all moved to ‘cuda’,
but the training only runs on the CPU and the GPU remains unutilised.

Even during training I checked my tensor device, and it is cuda.
Everything runs fine on the CPU, but nothing runs on the GPU, even though the device is ‘cuda’.
```python
import os

import cv2
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.transforms import ToTensor, Resize
from tqdm import tqdm


class RemoveMaskDataset(Dataset):

    def __init__(self, base_dir):
        super(RemoveMaskDataset, self).__init__()

        self.base_dir = base_dir

        self.with_mask_dir_path = os.path.join(self.base_dir, 'with_mask')
        self.without_mask_dir_path = os.path.join(self.base_dir, 'without_mask')

        self.masked_images_names = os.listdir(self.with_mask_dir_path)
        self.without_mask_images_names = os.listdir(self.without_mask_dir_path)

        self.masked_images_paths = [os.path.join(self.with_mask_dir_path, name) for name in self.masked_images_names]
        self.without_masked_images_paths = [os.path.join(self.without_mask_dir_path, name) for name in self.without_mask_images_names]

        self.transform = transforms.Compose([
            ToTensor(),
            Resize((64, 64), antialias=True),
        ])

    def __len__(self):
        return len(self.masked_images_names)

    def __getitem__(self, idx):
        masked_img_path = self.masked_images_paths[idx]
        without_mask_img_path = self.without_masked_images_paths[idx]

        mask_img = cv2.imread(masked_img_path)
        without_mask = cv2.imread(without_mask_img_path)

        mask_img_rgb = cv2.cvtColor(mask_img, cv2.COLOR_BGR2RGB)
        without_mask_rgb = cv2.cvtColor(without_mask, cv2.COLOR_BGR2RGB)

        return self.transform(mask_img_rgb), self.transform(without_mask_rgb)

class Generator(nn.Module):

    def __init__(self, latent_dim):
        super(Generator, self).__init__()

        self.latent_dim = latent_dim

        self.convtr1 = nn.ConvTranspose2d(self.latent_dim, 512, 4, 1, 0, bias=False)
        self.batchnorm1 = nn.BatchNorm2d(512)
        self.relu1 = nn.ReLU()

        self.convtr2 = nn.ConvTranspose2d(512, 256, 4, 2, 1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.relu2 = nn.ReLU()

        self.convtr3 = nn.ConvTranspose2d(256, 128, 4, 2, 1, bias=False)
        self.batchnorm3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU()

        self.convtr4 = nn.ConvTranspose2d(128, 64, 4, 2, 1, bias=False)
        self.batchnorm4 = nn.BatchNorm2d(64)
        self.relu4 = nn.ReLU()

        self.convtr5 = nn.ConvTranspose2d(64, 3, 4, 2, 1, bias=False)

    def forward(self, input):
        x = self.relu1(self.batchnorm1(self.convtr1(input)))
        x = self.relu2(self.batchnorm2(self.convtr2(x)))
        x = self.relu3(self.batchnorm3(self.convtr3(x)))
        x = self.relu4(self.batchnorm4(self.convtr4(x)))
        x = self.convtr5(x)

        return x

class Discriminator(nn.Module):

    def __init__(self):
        super(Discriminator, self).__init__()

        self.conv1 = nn.Conv2d(3, 64, 4, 2, 1, bias=False)
        self.act1 = nn.LeakyReLU()

        self.conv2 = nn.Conv2d(64, 128, 4, 2, 1, bias=False)
        self.bnrm2 = nn.BatchNorm2d(128)
        self.act2 = nn.LeakyReLU(0.2)  # negative_slope

        self.conv3 = nn.Conv2d(128, 256, 4, 2, 1, bias=False)
        self.bnrm3 = nn.BatchNorm2d(256)
        self.act3 = nn.LeakyReLU(0.2)  # negative_slope

        self.conv4 = nn.Conv2d(256, 512, 4, 2, 1, bias=False)
        self.bnrm4 = nn.BatchNorm2d(512)
        self.act4 = nn.LeakyReLU()

        self.final_conv = nn.Conv2d(512, 1, 4, 1, 0, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        x = self.act1(self.conv1(input))
        x = self.act2(self.bnrm2(self.conv2(x)))
        x = self.act3(self.bnrm3(self.conv3(x)))
        x = self.act4(self.bnrm4(self.conv4(x)))

        x = self.final_conv(x)
        x = self.sigmoid(x)

        return x

D_loss_plot, G_loss_plot = [], []
for epoch in tqdm(range(1, num_epochs + 1)):
    D_loss_list, G_loss_list = [], []

    for index, (input_images, output_images) in enumerate(dataloader):

        # Discriminator training
        discriminator_optimizer.zero_grad()
        input_images, output_images = input_images.to(device), output_images.to(device)

        print(input_images.device)
        print(output_images.device)

        real_target = Variable(torch.ones(input_images.size(0)).to(device)).unsqueeze(1)
        output_target = Variable(torch.zeros(output_images.size(0)).to(device)).unsqueeze(1)

        D_real_loss = discriminator_loss(discriminator(input_images).view(-1), real_target.view(-1))
        D_real_loss.backward()

        noise_vector = torch.randn(input_images.size(0), latent_dim, 1, 1, device=device)
        noise_vector = noise_vector.to(device)

        generated_image = generator(noise_vector)
        output = discriminator(generated_image.detach())
        D_fake_loss = discriminator_loss(output.view(-1), output_target.view(-1))
        D_fake_loss.backward()

        D_total_loss = D_real_loss + D_fake_loss
        D_loss_list.append(D_total_loss.item())  # store a Python float, not the graph
        discriminator_optimizer.step()

        # Generator training
        generator_optimizer.zero_grad()
        G_loss = generator_loss(discriminator(generated_image).view(-1), real_target.view(-1))
        G_loss_list.append(G_loss.item())  # store a Python float, not the graph
        G_loss.backward()
        generator_optimizer.step()

    # Print and save results
    print('Epoch: [%d/%d]: D_loss: %.3f, G_loss: %.3f' % (
        epoch, num_epochs, torch.mean(torch.FloatTensor(D_loss_list)),
        torch.mean(torch.FloatTensor(G_loss_list))))

    D_loss_plot.append(torch.mean(torch.FloatTensor(D_loss_list)))
    G_loss_plot.append(torch.mean(torch.FloatTensor(G_loss_list)))

    torch.save(generator.state_dict(), f'./{save_dir}/generator_epoch_{epoch}.pth')
    torch.save(discriminator.state_dict(), f'./{save_dir}/discriminator_epoch_{epoch}.pth')

```

What should I do to fix this?

Hi,
you can define it in a similar way:

    import torch.nn.init as init
    class ImageClassification(ImageClassificationBase):
        def __init__(self):
            super().__init__()
            self.network = nn.Sequential(
                #image size is [1,900,300] as [channel, height,width]
                nn.Conv2d(1, 32, kernel_size = 3, padding = 1),
                nn.LeakyReLU(0.01),
                nn.BatchNorm2d(32),
                nn.AvgPool2d(kernel_size=2, stride=2),

                nn.Conv2d(32,32, kernel_size = 3,  padding = 1),
                nn.LeakyReLU(0.01),
                nn.BatchNorm2d(32),
                nn.AvgPool2d(kernel_size=2, stride=2),
            
                nn.Conv2d(32, 64, kernel_size = 3, padding = 1),
                nn.LeakyReLU(0.01),
                nn.BatchNorm2d(64),
                nn.AvgPool2d(kernel_size=2, stride=2),
            
                nn.Conv2d(64 ,64, kernel_size = 3, padding = 1),
                nn.LeakyReLU(0.01),
                nn.BatchNorm2d(64),
                nn.AvgPool2d(kernel_size=2, stride=2),
                                    
                nn.Flatten(),                
                nn.Dropout(0.3),

                nn.Linear(64 * 56 * 18, 64),  # Assuming input size after convolutional layers is 64 * 56 * 18
                nn.LeakyReLU(0.01),
                nn.BatchNorm1d(64),
                nn.Dropout(0.2),
            
                nn.Linear(64, 64),
                nn.LeakyReLU(0.01),
                nn.BatchNorm1d(64),
                nn.Dropout(0.2),
            
                nn.Linear(64, 10)  # Output layer
            )
            # Initialize the weights of convolutional layers
            #self._initialize_weights()

        #def _initialize_weights(self):
        #    for m in self.modules():
        #        if isinstance(m, nn.Conv2d):
        #            init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
        
        def forward(self, xb):
            return self.network(xb)

    def get_default_device():
        #Set Device to GPU or CPU
        if torch.cuda.is_available():
            return torch.device('cuda')
        else:
            return torch.device('cpu')
        

    def to_device(data, device):
        "Move data to the device"
        if isinstance(data,(list,tuple)):
            return [to_device(x,device) for x in data]
        return data.to(device,non_blocking = True)

    class DeviceDataLoader():
        #Wrap a dataloader to move data to a device
        
        def __init__(self, dl, device):
            self.dl = dl
            self.device = device
        
        def __iter__(self):
            #Yield a batch of data after moving it to device
            for b in self.dl:
                yield to_device(b,self.device)
                
        def __len__(self):
            #Number of batches
            return len(self.dl)

    device = get_default_device()
    device

    torch.cuda.empty_cache()
    model = ImageClassification()

    random_seed = 42
    torch.manual_seed(random_seed)

    train_loader = DeviceDataLoader(train_loader, device)
    test_loader = DeviceDataLoader(test_loader, device)

    to_device(model, device)
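
A minimal sketch of how these helpers could be wired into a training loop (`train_ds`, the optimizer, and the loss below are placeholders/assumptions, and `ImageClassificationBase` is assumed to be defined elsewhere):

```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

device = get_default_device()
model = to_device(ImageClassification(), device)   # parameters now live on `device`
train_loader = DeviceDataLoader(DataLoader(train_ds, batch_size=64, shuffle=True), device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)   # assumed choices
criterion = nn.CrossEntropyLoss()

for xb, yb in train_loader:          # batches arrive already moved to `device`
    optimizer.zero_grad()
    loss = criterion(model(xb), yb)  # forward pass and loss run on the GPU when available
    loss.backward()
    optimizer.step()
```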

How did you check and verify that the GPU is not being used?

Hi,
thanks learner1234.

I tried it this way, but I am still facing the same issue: the model is still running only on the CPU, even though the model and data are specified as cuda.

When I run it in Kaggle, the CPU usage shoots up to full, but the GPU usage is 0.

When I try it on my laptop, I open another terminal and run nvidia-smi -l 1, and the volatile GPU utilisation there is almost 0%. When I check the same in Task Manager, my CPU is always at full usage, but GPU usage is nil.
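
For reference, a minimal in-process check (just a sketch with a placeholder tensor) to confirm that allocations actually land on the GPU:

```python
import torch

x = torch.randn(1024, 1024, device='cuda')      # placeholder tensor
print(x.device)                                  # expected: cuda:0
print(torch.cuda.get_device_name(0))             # the GPU PyTorch sees
print(torch.cuda.memory_allocated() / 1024**2)   # MiB currently allocated on the GPU
```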

OK, have you checked the CUDA version and torch version compatibility?
I am using torch 1.10 and CUDA 11.2.
Please check it once.

Hi, I have torch 2.1.2 and CUDA 12.1. It's directly from Kaggle.

Could you try this command once:

```
# CUDA 12.1
pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121
```

and update the NVIDIA driver?

I ran that command , and updated the drivers too , but still facing the same issue

This doesn’t mean your GPU isn’t used, but could point towards e.g. a CPU bottleneck.
You can profile your code to check where the bottleneck is.
A quick verification can also be done via the native profiler, which should show valid CUDA kernels, e.g.:

```python
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity


model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[
        ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
# STAGE:2024-03-09 11:10:24 228565:228565 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
# STAGE:2024-03-09 11:10:24 228565:228565 ActivityProfilerController.cpp:320] Completed Stage: Collection
# STAGE:2024-03-09 11:10:24 228565:228565 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
# -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
#                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
# -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
#                                         model_inference         0.00%       0.000us         0.00%       0.000us       0.000us      85.717ms        98.08%      85.717ms      42.858ms             2  
#                                         model_inference         1.72%       2.266ms       100.00%     131.573ms     131.573ms       0.000us         0.00%       1.682ms       1.682ms             1  
#                                            aten::conv2d         0.14%     186.000us        72.58%      95.499ms       4.775ms       0.000us         0.00%       1.280ms      64.000us            20  
#                                       aten::convolution         0.12%     158.000us        72.44%      95.313ms       4.766ms       0.000us         0.00%       1.280ms      64.000us            20  
#                                      aten::_convolution         0.16%     215.000us        72.32%      95.155ms       4.758ms       0.000us         0.00%       1.280ms      64.000us            20  
#                                 aten::cudnn_convolution        36.92%      48.585ms        72.15%      94.940ms       4.747ms       1.280ms         1.46%       1.280ms      64.000us            20  
# sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nh...         0.00%       0.000us         0.00%       0.000us       0.000us     290.000us         0.33%     290.000us      96.667us             3  
#                            aten::_batch_norm_impl_index         0.14%     183.000us         5.72%       7.520ms     376.000us       0.000us         0.00%     223.000us      11.150us            20  
#                                  aten::cudnn_batch_norm         1.11%       1.460ms         5.58%       7.337ms     366.850us     223.000us         0.26%     223.000us      11.150us            20  
# sm86_xmma_fprop_implicit_gemm_indexed_tf32f32_tf32f3...         0.00%       0.000us         0.00%       0.000us       0.000us     223.000us         0.26%     223.000us      55.750us             4  
# -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
# Self CPU time total: 131.579ms
# Self CUDA time total: 87.399ms
```

PyTorch binaries ship with their own CUDA runtime dependency; your locally installed CUDA toolkit is only used if you build PyTorch from source or compile a custom CUDA extension.
To execute PyTorch workloads on the GPU, you only need to install the PyTorch binaries with CUDA support and properly install an NVIDIA driver.
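
A quick sanity check of the installed binary could look like this (minimal sketch):

```python
import torch

print(torch.__version__)          # e.g. 2.1.2+cu121 for a CUDA-enabled binary
print(torch.version.cuda)         # CUDA runtime the binary ships with, e.g. '12.1'
print(torch.cuda.is_available())  # True only with a properly installed NVIDIA driver
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```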