CPU maxed out while training resnext50_32x4d while the GPU is not being used, hence slow training

While training my model with a pretrained resnext50_32x4d on a virtual machine (a Kaggle VM), training is very slow: the CPU is maxed out while the GPU remains unused. I cannot find the root of the problem. It might be something simple or basic, but being new to PyTorch and less experienced, it's bugging me.
The dataset is from this Kaggle competition link.

Model
class Resnext50_32x4d(nn.Module):
    """Pretrained ResNeXt-50 (32x4d) backbone followed by a 1000 -> 1 linear head.

    The backbone's own output layer is kept as-is; ``l1`` reduces its
    1000 outputs to a single value per image.
    """

    def __init__(self):
        # Must be the real dunder constructor: without it nn.Module is never
        # initialised and ``model`` / ``l1`` are never registered as submodules.
        super().__init__()
        self.model = models.resnext50_32x4d(pretrained=True)
        self.l1 = nn.Linear(1000, 1)

    def forward(self, image, view=True):
        """Run the backbone and the linear head on ``image``.

        Args:
            image: Input batch passed straight to the backbone.
            view: Unused; kept so existing callers that pass it still work
                (it previously gated a debug print of the input shape).

        Returns:
            The output of the linear head, with no activation applied.
        """
        features = self.model(image)
        return self.l1(features)

Dataloader
class Data_Loader(Dataset):
    """Dataset that pairs an augmented image with its target value.

    Despite the name this is a ``Dataset``; it is meant to be wrapped in a
    ``torch.utils.data.DataLoader``.
    """

    def __init__(self, image_path, im_name, target, valid=False):
        """Store the lookup data; no image is loaded here.

        Args:
            image_path: Location handed to ``image_aug`` as its first argument.
            im_name: Indexable collection of image names.
            target: Indexable collection of targets, aligned with ``im_name``.
            valid: When true, ``image_aug`` is called with ``valid=True``.
        """
        self.name = im_name
        self.target = target
        self.path = image_path
        self.valid = valid

    def __len__(self):
        """Return the number of samples (one per image name)."""
        return len(self.name)

    def __getitem__(self, index):
        """Return ``(image, target)`` for sample ``index``."""
        # Keep the label local: storing it on ``self`` per sample only mutates
        # shared dataset state without anyone reading it.
        label = self.target[index]
        image_name = self.name[index]

        if self.valid:
            image = image_aug(self.path, image_name, valid=True)
        else:
            # Called without ``valid`` so image_aug's own default applies.
            image = image_aug(self.path, image_name)

        return image, label

Training
# Batch sizes for the training and validation loaders.
batch_t = 32
batch_v = 16
device = 'cuda'
image_path = '/kaggle/input/siim-isic-melanoma-classification/jpeg/train/'

# Fold 0 is held out for validation; every other fold is used for training.
train_df = df[df.kfold != 0].reset_index(drop=True)
valid_df = df[df.kfold == 0].reset_index(drop=True)

train_im = train_df.image_name.values.tolist()
train_y = train_df.target.values
valid_im = valid_df.image_name.values.tolist()
valid_y = valid_df.target.values

# NOTE: despite their names, train_dataset / valid_dataset end up holding
# DataLoader objects; the names are kept because train() iterates over them.
train_dataset = Data_Loader(image_path, train_im, train_y)
train_dataset = DataLoader(
    train_dataset,
    batch_t,
    shuffle=True,  # reshuffle training samples every epoch
    num_workers=4,
    pin_memory=True,  # page-locked host memory speeds up copies to the GPU
)
valid_dataset = Data_Loader(image_path, valid_im, valid_y)
valid_dataset = DataLoader(
    valid_dataset,
    batch_v,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

train loop
def train(fold):
    """Train ``Resnext50_32x4d`` for 25 epochs with a validation pass after each.

    Iterates over the module-level ``train_dataset`` / ``valid_dataset``
    loaders and moves batches to the module-level ``device``.

    Args:
        fold: Currently unused; the train/validation split is fixed where the
            loaders are built.

    Returns:
        None.
    """
    model = Resnext50_32x4d()
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    # NOTE: the scheduler is created but never stepped, because the AUC
    # computation it would monitor is disabled below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=3,
        threshold=0.001,
        mode="max"
    )
    criterion = nn.BCEWithLogitsLoss()
    epochs = 25
    best_score = 0

    for epoch in range(epochs):
        model.train()
        for train_data, label in train_dataset:
            train_data = train_data.to(device, non_blocking=True)
            # as_tensor converts without re-wrapping an existing tensor, which
            # torch.tensor(tensor) would copy and warn about.
            label = torch.as_tensor(label, dtype=torch.float32)
            label = label.to(device, non_blocking=True)

            optimizer.zero_grad()
            out = model(train_data)
            loss = criterion(out, label.unsqueeze(1).type_as(out))
            loss.backward()
            optimizer.step()

        model.eval()
        true = []
        pred = []
        with torch.no_grad():
            for valid_data, valid_label in valid_dataset:
                # Labels are only collected, so they never need to visit the GPU.
                valid_label = torch.as_tensor(valid_label, dtype=torch.float32)
                true.append(valid_label.cpu())

                out = model(valid_data.to(device, non_blocking=True))
                pred.append(out.cpu())

        # Validation scoring / checkpointing is currently disabled:
        #true=np.vstack((true)).ravel()#torch.tensor(true).view(-1)
        #pred=np.vstack((pred)).ravel()#torch.tensor(pred).view(-1)
        #auc_score = roc_auc_score(true,pred)
        #print("EPOCH {}  AUC Score {}".format(epoch,auc_score))

        #if auc_score>best_score:
                    #best_score = auc_score
                    #torch.save(model,'best_model.pth')
                    #print("Validation Score Improved ======>>>>>> Saving Model")

You could try to narrow down the bottleneck by profiling the code.
E.g. to isolate a data loading bottleneck, you could use the data_time class from the ImageNet example. If the data loading is the bottleneck, you could have a look at this post for potential reasons of this bottleneck and workarounds. Of course you won’t be able to change the machine, as it’s a Kaggle node.