While training my model with a pretrained resnext50_32x4d on a virtual machine (a Kaggle VM), training is very slow: the CPU is maxed out while the GPU remains unused. I cannot find the root of the problem. It might be something simple or basic, but as I am new to PyTorch and not very experienced, it is bugging me.
The dataset is from the Kaggle SIIM-ISIC Melanoma Classification competition.
Model
class Resnext50_32x4d(nn.Module):
    """Pretrained ResNeXt-50 (32x4d) backbone followed by a single-output linear head.

    The torchvision backbone is used unmodified, so it produces a
    1000-dimensional output per image; ``l1`` maps those 1000 values to one
    score. No sigmoid is applied here.
    """

    def __init__(self):
        # Must be the real ``__init__`` dunder: nn.Module only sets up its
        # parameter/sub-module registries when ``super().__init__()`` runs,
        # and without it ``self.model`` / ``self.l1`` are never created.
        super().__init__()
        self.model = models.resnext50_32x4d(pretrained=True)
        self.l1 = nn.Linear(1000, 1)

    def forward(self, image, view=True):
        """Return one unnormalised score per input image.

        Args:
            image: Batch of images passed straight to the backbone.
            view: Unused. Kept so existing callers that pass it keep working
                (it only ever guarded a debug print of ``image.shape``).

        Returns:
            The output of ``l1`` applied to the backbone output.
        """
        features = self.model(image)
        return self.l1(features)
Dataloader
class Data_Loader(Dataset):
    """Map-style dataset yielding ``(image tensor, target)`` pairs.

    Despite the name, this is a ``Dataset``; it is meant to be wrapped in a
    ``torch.utils.data.DataLoader`` for batching.
    """

    def __init__(self, image_path, im_name, target, valid=False):
        """Store the sample index.

        Args:
            image_path: Directory prefix handed to ``image_aug``.
            im_name: Sequence of image names; its length is the dataset size.
            target: Sequence of targets, indexed in parallel with ``im_name``.
            valid: When true, ``image_aug`` is called with ``valid=True``.
        """
        self.name = im_name
        self.target = target
        self.path = image_path
        self.valid = valid

    def __len__(self):
        """Return the number of samples."""
        return len(self.name)

    def __getitem__(self, index):
        """Load and transform the sample at ``index``.

        Returns:
            A tuple ``(im_tensor, target)`` where ``im_tensor`` is whatever
            ``image_aug`` returns for this image.
        """
        im = self.name[index]
        # Keep the label local: storing it on ``self`` per item only mutates
        # shared dataset state without serving any purpose.
        label = self.target[index]
        if self.valid:
            im_tensor = image_aug(self.path, im, valid=True)
        else:
            im_tensor = image_aug(self.path, im)
        return im_tensor, label
Training
# Batch sizes for the training and validation loaders.
batch_t = 32
batch_v = 16
device = "cuda"
image_path = "/kaggle/input/siim-isic-melanoma-classification/jpeg/train/"

# Fold 0 is held out for validation; every other fold is used for training.
train_df = df[df.kfold != 0].reset_index(drop=True)
valid_df = df[df.kfold == 0].reset_index(drop=True)

train_im = train_df.image_name.values.tolist()
train_y = train_df.target.values
valid_im = valid_df.image_name.values.tolist()
valid_y = valid_df.target.values

# NOTE: ``train_dataset`` / ``valid_dataset`` end up bound to DataLoader
# objects (the intermediate Dataset is rebound), because the training loop
# iterates over these names directly.
train_dataset = Data_Loader(image_path, train_im, train_y)
# Reshuffle every epoch so batches are not always seen in dataframe order.
train_dataset = DataLoader(
    train_dataset,
    batch_size=batch_t,
    shuffle=True,
    num_workers=4,
)
# ``valid=True`` selects the validation path of ``image_aug``; without it the
# held-out images would go through the training-mode transform.
valid_dataset = Data_Loader(image_path, valid_im, valid_y, valid=True)
valid_dataset = DataLoader(
    valid_dataset,
    batch_size=batch_v,
    shuffle=False,
    num_workers=4,
)
train loop
def train(fold):
    """Train ``Resnext50_32x4d`` for 25 epochs, running a validation pass after each.

    Reads the module-level ``train_dataset`` / ``valid_dataset`` loaders and
    ``device``.

    Args:
        fold: Currently unused; the train/validation split is fixed by the
            module-level loaders.

    Returns:
        None. Validation targets and predictions are collected per epoch, but
        the AUC scoring / checkpointing step is disabled (see the commented
        section at the end of the epoch loop).
    """
    model = Resnext50_32x4d()
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=3,
        threshold=0.001,
        mode="max",
    )
    criterion = nn.BCEWithLogitsLoss()
    epochs = 25
    best_score = 0
    for epoch in range(epochs):
        model.train()
        for batch, (train_data, label) in enumerate(train_dataset, start=1):
            train_data = train_data.to(device)
            # ``.to`` converts and moves in one step; wrapping an existing
            # tensor in ``torch.tensor(...)`` is a warned-against extra copy.
            label = label.to(device, dtype=torch.float32)
            optimizer.zero_grad()
            out = model(train_data)
            # The model emits (N, 1); give the targets the same shape/dtype.
            loss = criterion(out, label.unsqueeze(1).type_as(out))
            # print("EPOCH {} Loss {} batch {}".format(epoch, loss.item(), batch))
            loss.backward()
            optimizer.step()

        model.eval()
        true = []
        pred = []
        # No gradients are needed anywhere in the validation pass.
        with torch.no_grad():
            for batch, (valid_data, valid_label) in enumerate(valid_dataset, start=1):
                valid_data = valid_data.to(device)
                valid_label = valid_label.to(device, dtype=torch.float32)
                true.append(valid_label.cpu())
                out = model(valid_data)
                loss = criterion(out, valid_label.unsqueeze(1).type_as(out))
                pred.append(out.cpu())
                # print('Valid Loss {} batch {}'.format(loss.item(), batch))

        # Scoring / checkpointing is disabled below.
        # TODO: once re-enabled, also step the scheduler with the AUC
        # (it is created with mode="max" but is never stepped here).
        # print("true {} '''/n''' pred {}".format(true, pred))
        # true = np.vstack((true)).ravel()  # torch.tensor(true).view(-1)
        # pred = np.vstack((pred)).ravel()  # torch.tensor(pred).view(-1)
        # auc_score = roc_auc_score(true, pred)
        # print("EPOCH {} AUC Score {}".format(epoch, auc_score))
        # if auc_score > best_score:
        #     best_score = auc_score
        #     torch.save(model, 'best_model.pth')
        #     print("Validation Score Improved ======>>>>>> Saving Model")