Classifier predicts only zeros - from Keras to PyTorch

Hey guys!
I’m migrating from Keras to PyTorch and I’m quite new at this, so I apologise in advance if this is a simple issue that I just can’t figure out myself.
In Keras I had a simple CNN based on the VGG architecture for classifying simple images containing triangles and circles (dataset can be found here), and it actually learned something: it reached 100% accuracy after 20 or so epochs. Now, in PyTorch, I cannot for the life of me figure out what is going on: the model always predicts zeros and the accuracy never changes. I even initialized the layers according to the Keras default initialization scheme (Glorot/Xavier uniform weights, zero biases).
Below is the code. (This CNN is supposed to be part of a bigger network, but for now I am only interested in getting this model to converge on this data.)


FILE #1 - ConvMod.py

from torch import nn
class ConvMod(nn.Module):
    """Two 3x3 convolutions with ReLU activations; padding=1 preserves the spatial size."""

    def __init__(self, in_channels=3, nfilters=32, conv_filter_size=(3, 3)):
        super(ConvMod, self).__init__()
        self.in_channels = in_channels
        self.nfilters = nfilters
        self.conv_filter_size = conv_filter_size

        self.conv0 = nn.Conv2d(in_channels, self.nfilters, self.conv_filter_size, padding=1, padding_mode='zeros')
        self.relu0 = nn.ReLU()
        self.conv1 = nn.Conv2d(self.nfilters, self.nfilters, self.conv_filter_size, padding=1, padding_mode='zeros')
        self.relu1 = nn.ReLU()

    def forward(self, x):
        x = self.relu0(self.conv0(x))
        x = self.relu1(self.conv1(x))
        return x

    def _init_weights_clf(self, m):
        # Match the Keras defaults: Glorot/Xavier uniform weights, zero biases.
        if isinstance(m, nn.Conv2d):
            m.bias.data.fill_(0.0)
            nn.init.xavier_uniform_(m.weight)
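
A quick shape check (a minimal sketch, assuming the 224x224 RGB inputs used below): with 3x3 kernels and padding=1, each ConvMod keeps the spatial size and only changes the channel count.

import torch
from ConvMod import ConvMod

mod = ConvMod(in_channels=3, nfilters=32)
x = torch.randn(1, 3, 224, 224)
print(mod(x).shape)  # torch.Size([1, 32, 224, 224])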

FILE #2 - VGGClf.py

from torch import nn
from ConvMod import ConvMod
class VGGClf(nn.Module):
    def __init__(self, in_channels=3, dec_conv_filter_size=(3, 3), dec_pool_size=2, img_size=(224, 224), dropout=0.3, num_classes=2):

        super(VGGClf, self).__init__()
        self.in_channels = in_channels
        self.dec_conv_filter_size = dec_conv_filter_size
        self.dec_pool_size = dec_pool_size
        self.dropout = dropout
        self.img_size = img_size
        self.num_classes = num_classes
        
        self.conv_mod0 = ConvMod(self.in_channels, 32, self.dec_conv_filter_size)
        self.pool0 = nn.MaxPool2d(self.dec_pool_size)

        self.conv_mod1 = ConvMod(32, 64, self.dec_conv_filter_size)
        self.pool1 = nn.MaxPool2d(self.dec_pool_size)

        self.conv_mod2 = ConvMod(64, 128, self.dec_conv_filter_size)
        self.pool2 = nn.MaxPool2d(self.dec_pool_size)

        self.conv_mod3 = ConvMod(128, 256, self.dec_conv_filter_size)
        self.pool3 = nn.MaxPool2d(self.dec_pool_size)

        # The global max pool in forward() leaves a 256-dim feature vector.
        self.dense0 = nn.Linear(256, 128)
        #self.bn0 = nn.BatchNorm1d(256)  # enabled for the second experiment below
        self.actv0 = nn.Sigmoid()
        self.dropout0 = nn.Dropout(p=self.dropout)
        self.dense1 = nn.Linear(128, 128)
        self.actv1 = nn.Sigmoid()
        self.dropout1 = nn.Dropout(p=self.dropout)

        self.last_dense = nn.Linear(128, self.num_classes)
        self.softmax = nn.LogSoftmax(dim=1)  # log-probabilities, to be paired with NLLLoss
    
        self.apply(self._init_weights)
    
    def _init_weights(self, m):
        if isinstance(m, ConvMod):
            m.apply(m._init_weights_clf)
        elif isinstance(m, nn.Linear):
            # Keras defaults again: Glorot/Xavier uniform weights, zero biases.
            m.bias.data.fill_(0.0)
            nn.init.xavier_uniform_(m.weight)

    def forward(self, x):
        x = self.conv_mod0(x)
        x = self.pool0(x)

        x = self.conv_mod1(x)
        x = self.pool1(x)

        x = self.conv_mod2(x)
        x = self.pool2(x)

        x = self.conv_mod3(x)
        x = self.pool3(x)

        # Global max pool over the remaining spatial dims -> (N, 256, 1, 1).
        x = nn.functional.max_pool2d(x, kernel_size=x.size()[2:])
        x = x.view(x.size(0), -1)  # flatten to (N, 256)
        #x = self.bn0(x)  # enabled for the second experiment below
        x = self.dense0(x)
        x = self.dropout0(self.actv0(x))
       
        x = self.dense1(x)
        x = self.dropout1(self.actv1(x))

        x = self.last_dense(x)
        x = self.softmax(x)

        return x
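
A smoke test for the classifier (a minimal sketch): 224x224 inputs are halved by the four MaxPool2d(2) stages down to 14x14, the global max pool collapses that to a 256-dim vector, and the head emits log-probabilities, which is why NLLLoss is used in the training code below.

import torch
from VGGClf import VGGClf

clf = VGGClf()
clf.eval()  # disable dropout for a deterministic check
out = clf(torch.randn(2, 3, 224, 224))
print(out.shape)         # torch.Size([2, 2])
print(out.exp().sum(1))  # rows sum to 1: LogSoftmax yields log-probabilities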

FILE #3 - ExplainerClassifierCNN.py

from VGGClf import VGGClf
from tqdm import tqdm
import torch
from sklearn.metrics import accuracy_score
from torch import nn

class ExplainerClassifierCNN(nn.Module):
    def __init__(self, exp_in_channels=3, exp_conv_filter_size=(3, 3), exp_pool_size=2,
                 dec_in_channels=3, dec_conv_filter_size=(3, 3), dec_pool_size=2,
                 img_size=(224, 224), dropout=0.3, num_classes=2, clf='VGG'):
        super(ExplainerClassifierCNN, self).__init__()

        self.exp_in_channels = exp_in_channels
        self.exp_conv_filter_size = exp_conv_filter_size
        self.exp_pool_size = exp_pool_size

        self.dec_in_channels = dec_in_channels
        self.dec_conv_filter_size = dec_conv_filter_size
        self.dec_pool_size = dec_pool_size
        
        self.dropout = dropout
        self.img_size = img_size
        self.num_classes = num_classes  

        self.clf_type = clf  # only the VGG classifier is used here

        self.clf = VGGClf(in_channels=self.dec_in_channels,
                    dec_conv_filter_size=self.dec_conv_filter_size,
                    dec_pool_size=self.dec_pool_size, img_size=self.img_size,
                    dropout=self.dropout, num_classes=self.num_classes)

    def train(self, dataloader, optimizer, device):
        # NB: this shadows nn.Module.train(); see the note after this file.
        self.clf.train()  # training mode: dropout active

        dec_criterion = torch.nn.NLLLoss(reduction='mean')  # expects log-probabilities
        for batch_imgs, batch_labels in tqdm(dataloader, disable=False):
            optimizer.zero_grad()
            batch_imgs, batch_labels = batch_imgs.to(device), batch_labels.to(device)

            batch_probs = self.clf(batch_imgs)
            dec_loss = dec_criterion(batch_probs, batch_labels)

            dec_loss.backward()
            optimizer.step()  

    def validation(self, dataloader, device):
        self.clf.eval()  # evaluation mode: dropout disabled
        print('\nVAL')

        val_loss = 0
        val_acc = 0

        whole_preds = []
        whole_labels = []
        # Sum per-batch losses here; normalized after the loop.
        dec_criterion = torch.nn.NLLLoss(reduction='sum')
        with torch.no_grad():

            for batch_imgs, batch_labels in tqdm(dataloader, disable=False):
                
                batch_imgs, batch_labels = batch_imgs.to(device), batch_labels.to(device)

                batch_probs = self.clf(batch_imgs)
            
                batch_dec_loss = dec_criterion(batch_probs, batch_labels)

                val_loss += batch_dec_loss.item()

                _, batch_preds = torch.max(batch_probs, 1)  # index of the highest log-probability

                whole_labels.extend(batch_labels.data.cpu().numpy())
                whole_preds.extend(batch_preds.data.cpu().numpy())

            # Note: len(dataloader.dataset) is the full dataset size, even though
            # the sampler only covers a subset, so the train and val losses in the
            # logs below are not on the same scale.
            val_loss /= len(dataloader.dataset)
            val_acc = accuracy_score(whole_labels, whole_preds)

        return val_loss, val_acc
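
A side note on naming (a minimal sketch, not part of the model above): defining train(self, dataloader, optimizer, device) shadows nn.Module.train(mode=True). It happens to be harmless here, because self.clf.train() still resolves to the stock method on the VGGClf submodule, but a distinct name (train_epoch is a hypothetical choice) avoids the collision:

from torch import nn

class Wrapper(nn.Module):
    def __init__(self):
        super().__init__()
        self.clf = nn.Linear(4, 2)

    def train_epoch(self, dataloader, optimizer, device):  # no shadowing
        self.clf.train()  # the built-in mode switch still works as expected

w = Wrapper()
w.train()  # nn.Module.train() is intact: sets training mode, returns the module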

FILE #4 - Dataset.py

import torch
import cv2
import pandas as pd
import os
class Dataset(torch.utils.data.Dataset):
    def __init__(self, folder):
        self.df = pd.read_excel(os.path.join(folder, 'data.xlsx'), index_col=None)
        self.len = len(self.df)
        self.img_ids = self.df['imageID'].values
        self.labels = self.df['label'].values
        self.folder = folder

    def __getitem__(self, index):
        img_path = os.path.join(self.folder, self.img_ids[index])
        img = cv2.imread(img_path)    # BGR, shape (H, W, C)
        img = img / 255.              # scale to [0, 1]
        img = img.transpose(2, 1, 0)  # channels first; note this yields (C, W, H), i.e. H and W are also swapped
        img = torch.tensor(img, dtype=torch.float32)
        label = torch.tensor(int(self.labels[index]), dtype=torch.int64)

        return img, label
 
    def __len__(self):
        return self.len

FILE #5 - train.py

import os
from ExplainerClassifierCNN import ExplainerClassifierCNN
from Dataset import Dataset
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch import optim
from sklearn.model_selection import train_test_split
from torch.utils.data.sampler import SubsetRandomSampler

torch.manual_seed(0)
np.random.seed(0)
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

dataset_folder = '/media/TOSHIBA6T/ICRTO/Synthetic Dataset/simplified_no_colour/'
nr_classes = 2
img_size = (224, 224)
nr_epochs = 50
batch_size = 32
model = ExplainerClassifierCNN(num_classes=nr_classes)
model.to(device)
opt = optim.Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-7)  # Keras Adadelta defaults, including eps=1e-7


dataset = Dataset(dataset_folder)
dataset_size = len(dataset)
indices = list(range(dataset_size))
train_indices, val_indices = train_test_split(indices, stratify=dataset.labels, random_state=42, shuffle=True)
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, num_workers=2, pin_memory=True, drop_last=False)
val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler, num_workers=2, pin_memory=True, drop_last=False)


for epoch in range(nr_epochs): 
    print('Epoch %d / %d' % (epoch, nr_epochs))

    model.train(train_loader, opt, device)
    train_loss, train_acc = model.validation(train_loader, device)
    val_loss, val_acc = model.validation(val_loader, device)

    print('Train Loss %f \t Train Acc %f' % (train_loss, train_acc))
    print('Val Loss %f \tVal Acc %f' % (val_loss, val_acc))
    print()

This is the training output:

Epoch 0 / 50
Train Loss 0.500090 	 Train Acc 0.617333
Val Loss 0.166451 	Val Acc 0.620000

Epoch 1 / 50
Train Loss 0.525319 	 Train Acc 0.617333
Val Loss 0.174417 	Val Acc 0.620000

Epoch 2 / 50
Train Loss 0.505072 	 Train Acc 0.617333
Val Loss 0.168213 	Val Acc 0.620000

Epoch 3 / 50
Train Loss 0.501846 	 Train Acc 0.617333
Val Loss 0.166844 	Val Acc 0.620000

Epoch 4 / 50
Train Loss 0.502983 	 Train Acc 0.617333
Val Loss 0.167201 	Val Acc 0.620000

Epoch 5 / 50
Train Loss 0.498972 	 Train Acc 0.617333
Val Loss 0.166000 	Val Acc 0.620000

Epoch 6 / 50
Train Loss 0.508621 	 Train Acc 0.617333
Val Loss 0.169002 	Val Acc 0.620000

Epoch 7 / 50
Train Loss 0.506726 	 Train Acc 0.617333
Val Loss 0.168394 	Val Acc 0.620000

Epoch 8 / 50
Train Loss 0.499205 	 Train Acc 0.617333
Val Loss 0.166050 	Val Acc 0.620000

Epoch 9 / 50
Train Loss 0.504451 	 Train Acc 0.617333
Val Loss 0.167995 	Val Acc 0.620000

Epoch 10 / 50
Train Loss 0.503223 	 Train Acc 0.617333
Val Loss 0.167275 	Val Acc 0.620000

Epoch 11 / 50
Train Loss 0.499018 	 Train Acc 0.617333
Val Loss 0.166024 	Val Acc 0.620000

Epoch 12 / 50
Train Loss 0.503168 	 Train Acc 0.617333
Val Loss 0.167547 	Val Acc 0.620000

Epoch 13 / 50
Train Loss 0.499027 	 Train Acc 0.617333
Val Loss 0.166016 	Val Acc 0.620000

Epoch 14 / 50
Train Loss 0.499334 	 Train Acc 0.617333
Val Loss 0.166166 	Val Acc 0.620000

Epoch 15 / 50
Train Loss 0.500015 	 Train Acc 0.617333
Val Loss 0.166282 	Val Acc 0.620000

Epoch 16 / 50
Train Loss 0.501607 	 Train Acc 0.617333
Val Loss 0.166769 	Val Acc 0.620000

Epoch 17 / 50
Train Loss 0.499510 	 Train Acc 0.617333
Val Loss 0.166135 	Val Acc 0.620000

Epoch 18 / 50
Train Loss 0.499713 	 Train Acc 0.617333
Val Loss 0.166193 	Val Acc 0.620000

Epoch 19 / 50
Train Loss 0.500646 	 Train Acc 0.617333
Val Loss 0.166653 	Val Acc 0.620000

Epoch 20 / 50
Train Loss 0.510056 	 Train Acc 0.617333
Val Loss 0.169461 	Val Acc 0.620000

Epoch 21 / 50
Train Loss 0.502554 	 Train Acc 0.617333
Val Loss 0.167065 	Val Acc 0.620000

Epoch 22 / 50
Train Loss 0.499017 	 Train Acc 0.617333
Val Loss 0.166018 	Val Acc 0.620000

Epoch 23 / 50
Train Loss 0.499552 	 Train Acc 0.617333
Val Loss 0.166146 	Val Acc 0.620000

Epoch 24 / 50
Train Loss 0.499109 	 Train Acc 0.617333
Val Loss 0.166073 	Val Acc 0.620000

Epoch 25 / 50
Train Loss 0.500361 	 Train Acc 0.617333
Val Loss 0.166386 	Val Acc 0.620000

Epoch 26 / 50
Train Loss 0.499047 	 Train Acc 0.617333
Val Loss 0.166043 	Val Acc 0.620000

Epoch 27 / 50
Train Loss 0.499380 	 Train Acc 0.617333
Val Loss 0.166098 	Val Acc 0.620000

Epoch 28 / 50
Train Loss 0.500228 	 Train Acc 0.617333
Val Loss 0.166501 	Val Acc 0.620000

Epoch 29 / 50
Train Loss 0.500201 	 Train Acc 0.617333
Val Loss 0.166337 	Val Acc 0.620000

Epoch 30 / 50
Train Loss 0.499016 	 Train Acc 0.617333
Val Loss 0.166020 	Val Acc 0.620000

Epoch 31 / 50
Train Loss 0.499676 	 Train Acc 0.617333
Val Loss 0.166182 	Val Acc 0.620000

Epoch 32 / 50
Train Loss 0.499259 	 Train Acc 0.617333
Val Loss 0.166066 	Val Acc 0.620000

Epoch 33 / 50
Train Loss 0.499635 	 Train Acc 0.617333
Val Loss 0.166282 	Val Acc 0.620000

Epoch 34 / 50
Train Loss 0.499112 	 Train Acc 0.617333
Val Loss 0.166074 	Val Acc 0.620000

Epoch 35 / 50
Train Loss 0.499023 	 Train Acc 0.617333
Val Loss 0.166028 	Val Acc 0.620000

Epoch 36 / 50
Train Loss 0.499261 	 Train Acc 0.617333
Val Loss 0.166136 	Val Acc 0.620000

Epoch 37 / 50
Train Loss 0.506009 	 Train Acc 0.617333
Val Loss 0.168536 	Val Acc 0.620000

Epoch 38 / 50
Train Loss 0.503391 	 Train Acc 0.617333
Val Loss 0.167625 	Val Acc 0.620000

Epoch 39 / 50
Train Loss 0.499082 	 Train Acc 0.617333
Val Loss 0.166060 	Val Acc 0.620000

Epoch 40 / 50
Train Loss 0.499019 	 Train Acc 0.617333
Val Loss 0.166017 	Val Acc 0.620000

Epoch 41 / 50
Train Loss 0.499290 	 Train Acc 0.617333
Val Loss 0.166074 	Val Acc 0.620000

Epoch 42 / 50
Train Loss 0.499075 	 Train Acc 0.617333
Val Loss 0.166022 	Val Acc 0.620000

Epoch 43 / 50
Train Loss 0.499679 	 Train Acc 0.617333
Val Loss 0.166298 	Val Acc 0.620000

Epoch 44 / 50
Train Loss 0.499049 	 Train Acc 0.617333
Val Loss 0.166044 	Val Acc 0.620000

Epoch 45 / 50
Train Loss 0.499037 	 Train Acc 0.617333
Val Loss 0.166037 	Val Acc 0.620000

Epoch 46 / 50
Train Loss 0.500793 	 Train Acc 0.617333
Val Loss 0.166517 	Val Acc 0.620000

Epoch 47 / 50
Train Loss 0.499311 	 Train Acc 0.617333
Val Loss 0.166080 	Val Acc 0.620000

Epoch 48 / 50
Train Loss 0.499411 	 Train Acc 0.617333
Val Loss 0.166196 	Val Acc 0.620000

Epoch 49 / 50
Train Loss 0.499119 	 Train Acc 0.617333
Val Loss 0.166031 	Val Acc 0.620000

When I add a BatchNorm layer before the first dense layer, the behaviour changes completely.
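Concretely, this means uncommenting the two bn0 lines in VGGClf.py above (with the feature count matched to the 256-dim pooled vector they act on):

self.bn0 = nn.BatchNorm1d(256)  # in __init__
x = self.bn0(x)                 # in forward, just before self.dense0

With batch normalization enabled, the results are: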

Epoch 0 / 50
Train Loss 0.459918 Train Acc 0.641333
Val Loss 0.154426 Val Acc 0.632000

Epoch 1 / 50
Train Loss 0.525584 Train Acc 0.618667
Val Loss 0.172788 Val Acc 0.620000

Epoch 2 / 50
Train Loss 2.440474 Train Acc 0.382667
Val Loss 0.817006 Val Acc 0.380000

Epoch 3 / 50
Train Loss 1.514213 Train Acc 0.617333
Val Loss 0.500490 Val Acc 0.620000

Epoch 4 / 50
Train Loss 0.014679 Train Acc 1.000000
Val Loss 0.005718 Val Acc 1.000000

Epoch 5 / 50
Train Loss 0.009080 Train Acc 1.000000
Val Loss 0.004307 Val Acc 0.996000

Epoch 6 / 50
Train Loss 0.021494 Train Acc 1.000000
Val Loss 0.007247 Val Acc 1.000000

Epoch 7 / 50
Train Loss 0.782280 Train Acc 0.617333
Val Loss 0.249638 Val Acc 0.620000

Epoch 8 / 50
Train Loss 0.006897 Train Acc 1.000000
Val Loss 0.002757 Val Acc 1.000000

Epoch 9 / 50
Train Loss 0.004311 Train Acc 1.000000
Val Loss 0.005614 Val Acc 0.996000

Epoch 10 / 50
Train Loss 0.003321 Train Acc 1.000000
Val Loss 0.001180 Val Acc 1.000000

Epoch 11 / 50
Train Loss 0.055934 Train Acc 0.972000
Val Loss 0.032406 Val Acc 0.964000

Epoch 12 / 50
Train Loss 1.654088 Train Acc 0.382667
Val Loss 0.553744 Val Acc 0.380000

Epoch 13 / 50
Train Loss 0.483514 Train Acc 0.670667
Val Loss 0.162235 Val Acc 0.672000

Epoch 14 / 50
Train Loss 0.000435 Train Acc 1.000000
Val Loss 0.000191 Val Acc 1.000000

Epoch 15 / 50
Train Loss 0.096637 Train Acc 0.958667
Val Loss 0.033234 Val Acc 0.964000

Epoch 16 / 50
Train Loss 0.003012 Train Acc 1.000000
Val Loss 0.005062 Val Acc 0.996000

Epoch 17 / 50
Train Loss 0.003122 Train Acc 1.000000
Val Loss 0.001587 Val Acc 1.000000

Epoch 18 / 50
Train Loss 0.000384 Train Acc 1.000000
Val Loss 0.000329 Val Acc 1.000000

Epoch 19 / 50
Train Loss 0.000284 Train Acc 1.000000
Val Loss 0.000180 Val Acc 1.000000

Epoch 20 / 50
Train Loss 0.075182 Train Acc 0.961333
Val Loss 0.017617 Val Acc 0.968000

Epoch 21 / 50
Train Loss 0.341717 Train Acc 0.765333
Val Loss 0.103345 Val Acc 0.788000

Epoch 22 / 50
Train Loss 0.453600 Train Acc 0.796000
Val Loss 0.180800 Val Acc 0.808000

Epoch 23 / 50
Train Loss 0.000235 Train Acc 1.000000
Val Loss 0.000419 Val Acc 1.000000

Epoch 24 / 50
Train Loss 0.000198 Train Acc 1.000000
Val Loss 0.000231 Val Acc 1.000000

Epoch 25 / 50
Train Loss 0.033356 Train Acc 0.992000
Val Loss 0.011122 Val Acc 0.996000

Epoch 26 / 50
Train Loss 0.000121 Train Acc 1.000000
Val Loss 0.001847 Val Acc 0.996000

Epoch 27 / 50
Train Loss 0.023839 Train Acc 0.992000
Val Loss 0.008046 Val Acc 0.992000

Epoch 28 / 50
Train Loss 0.000063 Train Acc 1.000000
Val Loss 0.000259 Val Acc 1.000000

Epoch 29 / 50
Train Loss 0.000093 Train Acc 1.000000
Val Loss 0.001033 Val Acc 0.996000

Epoch 30 / 50
Train Loss 0.000053 Train Acc 1.000000
Val Loss 0.003057 Val Acc 0.996000

Epoch 31 / 50
Train Loss 0.000096 Train Acc 1.000000
Val Loss 0.003063 Val Acc 0.996000

Epoch 32 / 50
Train Loss 0.000192 Train Acc 1.000000
Val Loss 0.000290 Val Acc 1.000000

Epoch 33 / 50
Train Loss 0.000390 Train Acc 1.000000
Val Loss 0.000182 Val Acc 1.000000

Epoch 34 / 50
Train Loss 0.000096 Train Acc 1.000000
Val Loss 0.000080 Val Acc 1.000000

Epoch 35 / 50
Train Loss 0.032301 Train Acc 0.997333
Val Loss 0.010374 Val Acc 0.996000

Epoch 36 / 50
Train Loss 0.000052 Train Acc 1.000000
Val Loss 0.001623 Val Acc 0.996000

Epoch 37 / 50
Train Loss 1.621029 Train Acc 0.393333
Val Loss 0.573597 Val Acc 0.396000

Epoch 38 / 50
Train Loss 0.000514 Train Acc 1.000000
Val Loss 0.009409 Val Acc 0.988000

Epoch 39 / 50
Train Loss 1.140584 Train Acc 0.621333
Val Loss 0.367271 Val Acc 0.624000

Epoch 40 / 50
Train Loss 0.000029 Train Acc 1.000000
Val Loss 0.000040 Val Acc 1.000000

Epoch 41 / 50
Train Loss 0.016918 Train Acc 0.997333
Val Loss 0.006839 Val Acc 1.000000

Epoch 42 / 50
Train Loss 0.000394 Train Acc 1.000000
Val Loss 0.004514 Val Acc 0.996000

Epoch 43 / 50
Train Loss 0.016698 Train Acc 0.993333
Val Loss 0.005521 Val Acc 0.996000

Epoch 44 / 50
Train Loss 0.000007 Train Acc 1.000000
Val Loss 0.000224 Val Acc 1.000000

Epoch 45 / 50
Train Loss 0.000021 Train Acc 1.000000
Val Loss 0.000089 Val Acc 1.000000

Epoch 46 / 50
Train Loss 0.000033 Train Acc 1.000000
Val Loss 0.000218 Val Acc 1.000000

Epoch 47 / 50
Train Loss 0.000034 Train Acc 1.000000
Val Loss 0.000065 Val Acc 1.000000

Epoch 48 / 50
Train Loss 0.000260 Train Acc 1.000000
Val Loss 0.000155 Val Acc 1.000000

Epoch 49 / 50
Train Loss 0.000011 Train Acc 1.000000
Val Loss 0.000339 Val Acc 1.000000

Any help would be appreciated! Thanks in advance 🙂

Could you provide a link to (or post) your Keras model here, so that we can compare the two training pipelines, please?

Just skimming through the code, it seems the learning rate is quite high, but since I cannot compare it to your Keras implementation, that’s just a wild guess.

Hi! First of all, thanks for taking a look. Yeah, I used the same learning rate as in Keras, but maybe in PyTorch it should be lower.
Anyways, here is the link to the Keras code.

Hey, looks like I found the problem! When I was investigating further how to convert from Keras to PyTorch, I tried to keep things as similar as possible (initialisation schemes and such), so I also changed the Adadelta defaults to match the Keras ones. In Keras, eps = 1e-7, and it turns out the model learns as it did before if I instead keep the epsilon at PyTorch's default, which is 1e-6. Nevertheless, this is weird behaviour; if anyone has an explanation, please share.
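
For reference, the fix is the single optimizer line in train.py (everything else stays the same):

# before: eps matched to the Keras default; the model never learned
opt = optim.Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-7)

# after: PyTorch's default eps; the model converges as it did in Keras
opt = optim.Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-6)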
Thank you!