Why does my model stop working when I use a validation loop?

When I predict without the validation loop it works fine, but as soon as I add a validation loop the model only predicts the first class (0). Can you please have a look at my code?
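A quick way to confirm the collapse is to count how often each class is predicted; a minimal sketch (the 4-class count and the [B, C, H, W] one-hot layout are assumptions based on the code below):

import torch

def class_distribution(score, label, num_classes=4):
    # score: [B, C, H, W] logits, label: [B, C, H, W] one-hot ground truth
    pred = torch.argmax(score, dim=1).flatten()
    gt = torch.argmax(label, dim=1).flatten()
    return torch.bincount(pred, minlength=num_classes), torch.bincount(gt, minlength=num_classes)

Printing class_distribution(score, label) inside the validation loop shows whether every pixel really ends up in class 0.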


import os
import random
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import segmentation_models_pytorch as smp
# Data, Mnet and DataPrefetcher are defined in my own modules (not shown here)

if __name__ == '__main__':
    random.seed(118)
    np.random.seed(118)
    torch.manual_seed(118)
    torch.cuda.manual_seed(118)
    torch.cuda.manual_seed_all(118)
    
    # dataset
    img_root = '/content/drive/MyDrive/MIDD/working_MIDD_CODES2/DataSet'
    save_path = './model'
    if not os.path.exists(save_path): os.mkdir(save_path)
    lr = 0.001
    batch_size = 4
    epoch = 15
    lr_dec=[21,51]
    
    #%%
    data = Data(img_root)
    ttrain1, validd = random_split(data,[5420,1352])   #GT_Sub3
    ttrain, ttest = random_split(ttrain1,[4740,680])
    loader = DataLoader(ttrain, batch_size=batch_size, shuffle=True, num_workers=1)
    loader_V=DataLoader(validd, batch_size=batch_size, shuffle=True, num_workers=1)
    #%%
    net = Mnet().cuda()
    #%%
    net.load_pretrained_modell()
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=lr, weight_decay=0.0005,momentum=0.9)
    # Dd=smp.losses.DiceLoss('multilabel')
    iter_num = len(loader)
    iter_num_k = len(loader_V)
    loss=[]
    lossf=[]
    lossf_V=[]
    loss1=[]
    loss2=[]
    loss1_V=[]
    loss2_V=[]
    loss_V=[]
    overall_accuracy=[]
    overall_accuracy_V=[]
    net.train()
    # weights=torch.tensor([ 0.26960305, 14.45963237, 17.68206829,  6.05581275])
    # aa=np.array([ 0.26, 14.45, 17.68,  6.05])
    # weights=torch.from_numpy(aa)
    accuracy=[]
    # class0_accuracy=[]
    # class1_accuracy=[]
    # class2_accuracy=[]
    # class3_accuracy=[]
#%%
    for epochi in range(0, epoch):
        if epochi in lr_dec :
            lr=lr/10
            optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=lr, weight_decay=0.0005,momentum=0.9)
            print(lr)
        prefetcher = DataPrefetcher(loader)
        rgb, t, label = prefetcher.next()
        
        r_sal_loss = 0
        sal_loss1_=0
        sal_loss2_=0
        sal_loss3_=0
        sal_loss1V_=0
        sal_loss2V_=0
        sal_loss3V_=0
        r_sal_loss_V = 0
        total_size=0
        correct=0
        total_size_V=0
        correct_V=0
        net.zero_grad()
        for i in range(1, iter_num + 1):
            # rgb=rgb.cpu()
            # t=t.cpu()
            score, score1, score2,g= net(rgb.float(), t.float())
            # label  = label.permute(0,3,1,2)
            # sal_loss= my_loss1( score.cuda(),score1.cuda(),score2.cuda(),g.cuda(),label)
            # Dd=torch.nn.CrossEntropyLoss(weight=weights.to('cuda'))
            # Dd=DiceLoss(weight=weights.to('cuda'))
            # nn.CrossEntropyLoss 
            Dd=smp.losses.DiceLoss('multilabel')
            sal_loss1=Dd(score,label)
            sal_loss2=Dd(score1,label)
            sal_loss3=Dd(score2,label)
            sal_loss=sal_loss1+sal_loss2+sal_loss3
            r_sal_loss += sal_loss.data
            sal_loss1_+=sal_loss1.data
            sal_loss2_+=sal_loss2.data
            sal_loss3_+=sal_loss3.data
            sal_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            score_acc=torch.argmax(score,dim=1)         
            label_acc1=torch.argmax(label,dim=1)
            correct += (score_acc == label_acc1).float().sum()
            # correct+=(score_acc[0]==label_acc1[0]).sum()+(score_acc[1]==label_acc1[1]).sum()+(score_acc[2]==label_acc1[2]).sum()+(score_acc[3]==label_acc1[3]).sum()
            
            total_size+=label_acc1.cpu().numpy().size
            
            if i % 250 == 0:
                print('epoch: [%2d/%2d], iter: [%5d/%5d]  ||  loss : %5.4f || accuracy : %5.2f' % (
                    epochi, epoch, i, iter_num, r_sal_loss / 100,100*correct/total_size))
                # r_sal_loss = 0
            rgb, t, label = prefetcher.next()

        loss.append(r_sal_loss.cpu().numpy()/len(loader))
        lossf.append(sal_loss1_.cpu().numpy()/len(loader))
        loss1.append(sal_loss2_.cpu().numpy()/len(loader))
        loss2.append(sal_loss3_.cpu().numpy()/len(loader))
        overall_accuracy.append(100*correct.cpu().numpy()/total_size)

        # validation
        prefetcher_V = DataPrefetcher(loader_V)
        rgb, t, label = prefetcher_V.next()
        net.eval()
        # print(correct_V,total_size_V)
        for k in range(1, iter_num_k + 1):
            score, score1, score2,g= net(rgb.float(), t.float())
            Dd=smp.losses.DiceLoss('multilabel')
            sal_loss1_V=Dd(score,label)
            sal_loss2_V=Dd(score1,label)
            sal_loss3_V=Dd(score2,label)
            sal_loss=sal_loss1_V+sal_loss2_V+sal_loss3_V
            # r_sal_loss += sal_loss.data
            r_sal_loss_V += sal_loss.data
            score_acc=torch.argmax(score,dim=1)         
            label_acc1=torch.argmax(label,dim=1)
            total_size_V+=label_acc1.cpu().numpy().size
            correct_V += (score_acc == label_acc1).float().sum()
            sal_loss1V_+=sal_loss1_V.data
            sal_loss2V_+=sal_loss2_V.data
            sal_loss3V_+=sal_loss3_V.data
            # print(correct,total_size)
            if k % 100 == 0:
                print('epoch: [%2d/%2d], iter: [%5d/%5d]  ||  loss : %5.4f || accuracy_Val: %5.2f' % (
                    epochi, epoch, k, iter_num_k, r_sal_loss_V / 100,100*correct_V/total_size_V))
            rgb, t, label = prefetcher_V.next()  
        # loss_V.append(r_sal_loss_V/len(loader_V))
        loss_V.append(r_sal_loss_V.cpu().numpy()/len(loader_V))
        lossf_V.append(sal_loss1V_.cpu().numpy()/len(loader_V))
        loss1_V.append(sal_loss2V_.cpu().numpy()/len(loader_V))
        loss2_V.append(sal_loss3V_.cpu().numpy()/len(loader_V))
        overall_accuracy_V.append(100*correct_V.cpu().numpy()/total_size_V)

        if epochi %30==0:
            torch.save(net.state_dict(), '%s/epoch_%d.pth' % (save_path, epochi))
        torch.save(net.state_dict(), '%s/final_batch1_try18wg.pth' % (save_path))
 

To me, it’s too noisy to look into. Can you please identify the blocks that are behaving weirdly?

It’s hard to detect because the loss goes down but the accuracy stays at about 93%: my dataset is unbalanced and class 0 makes up around 93% of it. Moreover, when I train without the validation loop, accuracy reaches up to 96.5%.
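For reference, per-class accuracy makes this kind of collapse visible immediately; a minimal sketch (the 4-class count is an assumption taken from the commented-out weights in the code above):

import torch

def per_class_accuracy(pred, gt, num_classes=4):
    # pred, gt: [B, H, W] class-index tensors (argmax over the channel dimension)
    accs = []
    for c in range(num_classes):
        mask = (gt == c)
        total = mask.sum().item()
        if total == 0:
            accs.append(float('nan'))   # class not present in this batch
        else:
            correct = ((pred == c) & mask).sum().item()
            accs.append(100.0 * correct / total)
    return accs

Accumulating these per epoch instead of only the overall number would show classes 1-3 dropping to zero even while the overall accuracy sits near 93%.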

net.train() was missing after the validation loop. Thanks for looking into the matter.
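For anyone who hits the same thing, a minimal sketch of the mode switching, with a stand-in model and random tensors in place of Mnet and the real loaders (the torch.no_grad() context is an extra suggestion, not part of the original script):

import torch
import torch.nn as nn

net = nn.Sequential(nn.Conv2d(3, 4, 3, padding=1), nn.BatchNorm2d(4))
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
criterion = nn.CrossEntropyLoss()

for epochi in range(2):
    net.train()                           # back to training mode at the start of every epoch
    for _ in range(5):                    # stand-in for the training loader
        x = torch.randn(4, 3, 32, 32)
        y = torch.randint(0, 4, (4, 32, 32))
        loss = criterion(net(x), y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    net.eval()                            # eval mode for validation (BatchNorm uses running stats)
    with torch.no_grad():                 # no graph needed while validating
        for _ in range(2):                # stand-in for the validation loader
            x = torch.randn(4, 3, 32, 32)
            y = torch.randint(0, 4, (4, 32, 32))
            val_loss = criterion(net(x), y)
    # Without the net.train() call at the top of the loop, every epoch after the first
    # would keep training with BatchNorm frozen in eval mode.

Wrapping validation in torch.no_grad() is not required for the fix itself, but it avoids building the autograd graph during validation and saves memory.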