Loss decreasing when model runs on CPU, but loss is always zero when model runs on GPU

Hi,
I’m trying to train a simple model with cats and dogs data set. When I start training on CPU the loss decreased the way it should be, but when I switched to GPU mode LOSS is always zero, I moved model and tensors to GPU like the bellow code but still loss is zero. Any idea ?

import os
import os.path
import csv
import glob
import numpy as np
# import matplotlib.pyplot as plt
# from PIL import Image
#from sklearn.metrics import confusion_matrix



import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms


#some initial setup
np.set_printoptions(precision=2)
use_gpu = torch.cuda.is_available()
np.random.seed(1234)
#print(use_gpu)

DATA_DIR = "/scratch/amirzaei/pytorch/catvsdog/train/"
DATA_TST_DIR = "/scratch/amirzaei/pytorch/catvsdog/test/"

sz = 224
batch_size = 16


trn_dir = f'{DATA_DIR}'
tst_dir = f'{DATA_DIR}'


tfms = transforms.Compose([
    transforms.Resize((sz, sz)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

train_ds = datasets.ImageFolder(trn_dir, transform=tfms)
valid_ds = datasets.ImageFolder(tst_dir, transform=tfms)
test_ds  = datasets.ImageFolder(tst_dir, transform=tfms)


train_dl = torch.utils.data.DataLoader(train_ds, batch_size = batch_size, shuffle=True, num_workers=8)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size = batch_size, shuffle=True, num_workers=8)
test_dl  = torch.utils.data.DataLoader(test_ds, batch_size = 1, shuffle=False, num_workers=1)

class SimpleCNN(nn.Module):
    
    
    def __init__(self):
        super(SimpleCNN, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.fc = nn.Linear(56*56*32, 2)
        
    def forward(self, x):
        out = self.conv1(x)
        out = self.conv2(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out


model = SimpleCNN()
if use_gpu:
    print('yes gpu')
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    model = model.cuda()

criterion = nn.CrossEntropyLoss().cuda()
optimizer = optim.SGD(model.parameters(), lr=0.002, momentum=0.9)

num_epochs = 10
losses = []

for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(train_dl):
        model.train()
        optimizer.zero_grad()
        outputs = model(Variable(inputs.cuda()))
        
        loss = criterion(outputs.cuda(), Variable(targets.cuda()))
        losses += [loss.item()]
        
        loss.backward()
        
        optimizer.step()
        
        #report
        if ( i+1) % 50 == 0 :
            print( 'epoch [%d/%d], step [%d/%d], loss %f' %( epoch, num_epochs, i, len(train_ds) // batch_size, float(loss.item())))

			
torch.save(model.state_dict(), '/scratch/amirzaei/pytorch/catvsdog/train/SAVED_MODEL.pth')

this is the beginning of the output:

yes gpu
epoch [0/10], step [49/1562], loss 0.000000
epoch [0/10], step [99/1562], loss 0.000000
epoch [0/10], step [149/1562], loss 0.000000
epoch [0/10], step [199/1562], loss 0.000000
epoch [0/10], step [249/1562], loss 0.000000
epoch [0/10], step [299/1562], loss 0.000000
epoch [0/10], step [349/1562], loss 0.000000
epoch [0/10], step [399/1562], loss 0.000000
...

Your output should already be on the GPU.
Could you remove the .cuda() call on your output:

criterion(outputs.cuda(), ...)

Also, Variables are deprecated. You don’t have to warp your tensors in Variables anymore. :wink:

Honestly already checked that, and nothing changed

Yeah, I’ve checked it with a dummy example and it should work.
Could you print the output and target before calling the criterion?

Sorry, I cannot reply real time, because each time I run the code on University’s server and my tasks goes on queue, so I should wait for other to finish first :).

Sure, no problem.
I’ll take another look at the code.
The same code is returning a valid loss using the CPU or did you change something else?

I just removed the “.cuda()” parts and run it on the CPU mode

yes gpu
outputs:
tensor([[-0.7218,  0.0799],
        [-0.3777,  0.0066],
        [-0.0300,  0.1042],
        [-0.0631,  0.0776],
        [-0.2742,  0.1017],
        [-0.1016,  0.3217],
        [-0.4512,  0.1652],
        [-0.2501, -0.0158],
        [-0.1001,  0.0228],
        [-0.1450, -0.1840],
        [-0.5124, -0.0129],
        [-0.3069,  0.0862],
        [-0.4056,  0.0122],
        [-0.0393,  0.1312],
        [-0.1726, -0.0376],
        [-0.1504,  0.3080]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')
outputs:
tensor([[ -7.3676,   7.1940],
        [ -7.8105,   7.8614],
        [ -8.4740,   8.1551],
        [ -6.6943,   6.3968],
        [ -6.9419,   6.5977],
        [ -7.8175,   7.5915],
        [ -7.2899,   7.3079],
        [ -9.4329,   8.9957],
        [ -9.8488,   9.2562],
        [ -6.5058,   6.3480],
        [-10.5865,  10.7625],
        [ -6.3048,   6.3257],
        [ -7.0993,   6.6395],
        [ -5.7157,   5.3780],
        [ -4.9567,   4.7297],
        [ -7.7228,   7.1255]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')
outputs:
tensor([[-11.3238,  11.1269],
        [-14.0514,  14.0194],
        [ -9.9086,   9.7313],
        [-13.4018,  13.1804],
        [-15.4153,  15.5052],
        [-16.0751,  15.5695],
        [-13.1213,  12.6561],
        [-21.4735,  21.5242],
        [-10.5720,  10.1248],
        [-12.9845,  12.7423],
        [-16.3938,  15.6749],
        [-10.4105,  10.3380],
        [-15.3936,  15.1340],
        [-16.4181,  16.6434],
        [-15.4390,  15.3033],
        [-14.1930,  13.9824]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')
outputs:
tensor([[-19.4064,  19.3802],
        [-12.5685,  12.5996],
        [-21.5608,  21.6957],
        [-16.2454,  16.4091],
        [-23.8241,  23.5606],
        [-24.6136,  24.1047],
        [-19.3370,  19.2353],
        [-16.1822,  16.2280],
        [-22.7567,  22.6444],
        [-18.3765,  18.0064],
        [-20.7338,  20.7988],
        [-22.2366,  21.6640],
        [-15.4236,  14.9537],
        [-21.1272,  21.0056],
        [-16.7133,  16.6841],
        [-19.2318,  19.0395]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')
outputs:
tensor([[-25.9788,  25.7405],
        [-32.9146,  32.5946],
        [-17.9632,  17.9339],
        [-20.4449,  20.5898],
        [-24.3951,  24.6153],
        [-27.7594,  27.4260],
        [-20.1770,  19.8166],
        [-21.0899,  20.4372],
        [-29.1212,  28.3369],
        [-24.3874,  24.0906],
        [-24.2173,  24.1907],
        [-38.5717,  38.0031],
        [-32.8707,  32.0967],
        [-19.9623,  19.7503],
        [-16.4334,  16.2315],
        [-22.4122,  22.3104]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')Preformatted text

The first output tensor returns a valid loss, but as you can see the following have really hight logits for the target class (1), which returns a loss approx. 0.

If you use F.log_softmax or F.softmax on the outputs, you see that your model outputs a probability of ~1 for class1.

so you are saying the model is working properly ? I checked for 10 epoch and for all epochs loss is zero, it seems incorrect, isn’t it ? I have shuffle = True, so data set should fetch mixed of cats and dogs images, right ??

It’s hard to tell as I don’t know your complete use case.
From the output and target you’ve posted, the loss is already 0 in the 3rd iteration.

The targets look strange, as you have all ones in each iteration.
Your model might just learn to output class1, if it’s the majority class.
Could you validate your Dataset to see, how many instances there are of class0 and class1?

Currently you are also using the same directory for train, val and test, which should be unrelated to the current problem.

yes, you were right I did something terrible in the dataset, It will take sometime to correct it, I will put the result again. I have another problem about testing phase, may I asked it here?

Sure! If it’s a separate problem, which might need some discussion, it would probably be better to start a new thread and focus here on the original issue.

When we are using datasets of pytorch, is it necessary to have separated folder for each class ? In my testing phase, I have a folder that all cats and dogs pictures are there, unlike the training and dev folders that they were separated, so when I run the test phase it gives me this error RuntimeError: Found 0 files in subfolders of: /scratch/amirzaei/pytorch/catvsdog/test which apparently it looks for separated folder for cats and dogs. this is the my test module:

import os
import os.path
import csv
import glob
import numpy as np
# import matplotlib.pyplot as plt
# from PIL import Image
#from sklearn.metrics import confusion_matrix



import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms


#some initial setup
np.set_printoptions(precision=2)
use_gpu = torch.cuda.is_available()
np.random.seed(1234)
#print(use_gpu)

DATA_DIR = "/scratch/amirzaei/pytorch/catvsdog/train/"
DATA_TST_DIR = "/scratch/amirzaei/pytorch/catvsdog/test"

sz = 224
batch_size = 16


trn_dir = f'{DATA_DIR}'
tst_dir = f'{DATA_TST_DIR}'


tfms = transforms.Compose([
    transforms.Resize((sz, sz)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

test_ds  = datasets.ImageFolder(tst_dir, transform=tfms)
test_dl  = torch.utils.data.DataLoader(test_ds, batch_size = 1, shuffle=False, num_workers=1, pin_memory=False)

class SimpleCNN(nn.Module):
    
    
    def __init__(self):
        super(SimpleCNN, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.fc = nn.Linear(56*56*32, 2)
        
    def forward(self, x):
        out = self.conv1(x)
        out = self.conv2(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out


def test( model, test_loader):
    model.cuda()
    model.eval()
    csv_map = {}
    # switch to evaluate mode
    filepath = 1
    model.eval()
    for i, (images, _ ) in enumerate(test_loader):
        # pop extension, treat as id to map

        image_var = torch.autograd.Variable(images.cuda(), volatile=True)
        y_pred = model(image_var.cuda())
        # get the index of the max log-probability
        smax = nn.Softmax()
        smax_out = smax(y_pred)[0]
        cat_prob = smax_out.data[0]
        dog_prob = smax_out.data[1]
        prob = dog_prob
        if cat_prob > dog_prob:
            prob = 1 - cat_prob
        prob = np.around(prob, decimals=4)
        prob = np.clip(prob, .0001, .999)
        csv_map[filepath] = float(prob.data[0])
        filepath += 1
        # print("{},{}".format(filepath, prob[0]))

    with open('{}entry.csv'.format(DATA_TST_DIR),'w') as csvfile:
        fieldnames = ['id', 'label']
        csv_w = csv.writer(csvfile)
        csv_w.writerow(('id', 'label'))
        for row in sorted(csv_map.items()):
            csv_w.writerow(row)

    return	

model = SimpleCNN()
model.load_state_dict(torch.load('/scratch/amirzaei/pytorch/catvsdog/SAVED_MODEL.pth'))
test(model, test_dl)

If you use datasets.ImageFolder then yes, your images should be located in separate folders which represent the classes.
If you don’t want that, you can easily write your own Dataset and load the images using your own logic.
Here is a good tutorial explaining, how to use Dataset.
One way would be to get all image paths with their label, e.g. by using their name, split them into train and val, and pass them to your Dataset.

the problem is, the file name in test directory are numbers. I got this DataSet from Kaggle, and it is not easy to separate test data set into two category. I think the first solution you offered is more suitable in this case