GPU utilization is 0% while using PyTorch, though the memory is partially used

While working on a Kaggle competition involving a binary classification task, the GPU doesn't seem to be utilized (0%). The GPU is utilized at 100% for plain tensor operations, but not for the training task at hand. A few solutions were suggested in this post (credits to @ptrblck), but unfortunately I couldn't solve the issue with them. I am a beginner in the field and am looking for help.
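
For reference, a quick sanity check like the following does push utilization to ~100% (a minimal sketch; the matrix size and iteration count are arbitrary):

import torch

device = torch.device('cuda')
x = torch.randn(4096, 4096, device=device)  # large enough to keep the GPU busy
for _ in range(1000):
    y = x @ x                               # nvidia-smi shows high utilization while this runs
torch.cuda.synchronize()                    # wait for the queued kernels to finish
print(y.sum().item())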

Here’s the code:

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision import models, transforms

df = pd.read_csv("../input/siim-isic-melanoma-classification/train.csv")
df.head(3)
meta_data = df[['image_name','target']]
meta_data.head()
meta_data.to_csv('meta_data.csv',index=False)
path = "../input/siim-isic-melanoma-classification/jpeg/train/"
class Image_Pipeline(Dataset):
    
    def __init__(self,path_dir,csv_file,transform=None):
        self.df = pd.read_csv(csv_file)
        self.path = path_dir
        self.transform = transform
        
    def __getitem__(self,index):
        image_name = self.df.image_name.values[index] + '.jpg'  # build the filename for this sample only
        image = Image.open(self.path+image_name).convert("RGB")
        #image = cv2.imread(os.path.join(self.path,image_name))
        label = torch.tensor(self.df.target.values[index],dtype = torch.long)
        
        if self.transform is not None:
            image = self.transform(image)
        return image,label
    
    def targets(self):
        label = torch.tensor(self.df.target.values,dtype = torch.float32)
        return label
    
    def __len__(self):
        return len(self.df)

batch_size = 16
val_pct = 0.2

get_transform = transforms.Compose([transforms.Resize((224,224)),
                                    transforms.ToTensor(),
                                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
dataset = Image_Pipeline(path,'meta_data.csv',transform = get_transform)
    
def split_train_val(dataset, valid_pct, batch):
    
    train_idx, valid_idx = train_test_split(np.arange(len(dataset.targets())), 
                                            test_size=valid_pct,
                                            shuffle=True,
                                            stratify=dataset.targets())
    train_set = DataLoader(dataset,batch_size=batch,sampler=SubsetRandomSampler(train_idx))#,num_workers=4)
    val_set = DataLoader(dataset,batch_size=batch,sampler=SubsetRandomSampler(valid_idx))#,num_workers=4)
    print("Training data size: {} \nValidation data size: {}".format(len(train_set),len(val_set)))
    return train_set,val_set

traindata,validation = split_train_val(dataset,val_pct,batch_size)
#Sanity check: len(DataLoader) is the number of batches, so this only roughly matches the dataset size
print((len(traindata)*batch_size)+(len(validation)*batch_size),len(dataset))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model=models.resnet34(pretrained=True)

def freeze_till_last(model):
    for param in model.parameters():
        param.requires_grad=False
        
freeze_till_last(model)
incoming = model.fc.in_features
model.fc = nn.Linear(in_features = incoming, out_features=1)

model.fc.weight.requires_grad=True
model.fc.bias.requires_grad=True
import time
import torch.optim as optim
from torch.optim import lr_scheduler
from tqdm import trange

model.to(device)

def fit(model, traind, validation, epochs=1):
    print(device)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(),lr=0.0001)
    model.train()
    
    
    torch.cuda.synchronize()
    end = time.time()
    for epoch in trange(epochs):
        for data, label in traind:
            print("elapsed time: {}".format(time.time() - end))  # time spent loading this batch
            torch.cuda.synchronize()
            data = data.to(device)
            label = label.to(device)
            optimizer.zero_grad()              # clear gradients from the previous step
            output = model(data)               # output is already on the GPU since model and data are
            loss = loss_fn(output.view(-1), label.to(torch.float))  # view(-1) also handles a smaller last batch
            loss.backward()
            optimizer.step()
            print("loss: {:.3f}".format(loss.item()))
            end = time.time()
arg = [model,traindata,validation]
fit(*arg)

Thank you in advance.