Loss decreasing when model runs on CPU, but loss is always zero when model runs on GPU

Ali_Mirzaeyan · September 9, 2018, 10:21pm

Hi,
I’m trying to train a simple model on the cats-and-dogs dataset. When I train on the CPU the loss decreases the way it should, but when I switch to GPU mode the loss is always zero. I moved the model and the tensors to the GPU as in the code below, but the loss is still zero. Any ideas?

import os
import os.path
import csv
import glob
import numpy as np
# import matplotlib.pyplot as plt
# from PIL import Image
#from sklearn.metrics import confusion_matrix



import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms


# ---- global configuration ---------------------------------------------------
# Seed NumPy's RNG and print arrays with two decimals.
np.random.seed(1234)
np.set_printoptions(precision=2)

# True when PyTorch can see a CUDA device.
use_gpu = torch.cuda.is_available()

DATA_DIR = "/scratch/amirzaei/pytorch/catvsdog/train/"
DATA_TST_DIR = "/scratch/amirzaei/pytorch/catvsdog/test/"

sz = 224         # images are resized to sz x sz
batch_size = 16  # mini-batch size of the train/valid loaders

# Both roots resolve to DATA_DIR; DATA_TST_DIR is not used by the datasets below.
trn_dir = DATA_DIR
tst_dir = DATA_DIR

# ---- preprocessing ----------------------------------------------------------
# Resize, convert to a tensor, then normalise each channel.
tfms = transforms.Compose([
    transforms.Resize((sz, sz)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# ---- datasets (one sub-folder per class) ------------------------------------
train_ds = datasets.ImageFolder(root=trn_dir, transform=tfms)
valid_ds = datasets.ImageFolder(root=tst_dir, transform=tfms)
test_ds = datasets.ImageFolder(root=tst_dir, transform=tfms)

# ---- loaders ----------------------------------------------------------------
# Train and valid share the same settings; test yields one image at a time,
# in order.
_batched_kwargs = dict(batch_size=batch_size, shuffle=True, num_workers=8)
train_dl = torch.utils.data.DataLoader(train_ds, **_batched_kwargs)
valid_dl = torch.utils.data.DataLoader(valid_ds, **_batched_kwargs)
test_dl = torch.utils.data.DataLoader(
    test_ds,
    batch_size=1,
    shuffle=False,
    num_workers=1,
)

class SimpleCNN(nn.Module):
    """Two convolutional stages followed by a two-output linear classifier.

    Each stage is Conv2d(5x5, padding 2) -> BatchNorm2d -> ReLU -> MaxPool2d(2).
    The classifier takes 56*56*32 flattened features, which is what the two
    stages produce for a 3x224x224 image (each pooling halves 224 -> 112 -> 56).
    """

    @staticmethod
    def _stage(in_channels, out_channels):
        """Build one conv / batch-norm / ReLU / max-pool stage."""
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=5, padding=2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        return nn.Sequential(*layers)

    def __init__(self):
        super().__init__()
        self.conv1 = self._stage(3, 16)
        self.conv2 = self._stage(16, 32)
        self.fc = nn.Linear(56 * 56 * 32, 2)

    def forward(self, x):
        """Return one row of two class scores per image in the batch ``x``."""
        features = self.conv2(self.conv1(x))
        # Collapse everything except the batch dimension for the linear layer.
        flat = features.view(features.size(0), -1)
        return self.fc(flat)


# Build the network, train it with SGD on cross-entropy for num_epochs passes
# over train_dl, then save the learned weights.
model = SimpleCNN()
if use_gpu:
    print('yes gpu')
    # Switch the default tensor type to torch.cuda.FloatTensor and move the
    # model's parameters to the GPU.
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    model = model.cuda()

# NOTE: .cuda() is called here (and on inputs/targets below) regardless of
# use_gpu, so this section is not guarded for CPU-only runs.
criterion = nn.CrossEntropyLoss().cuda()
optimizer = optim.SGD(model.parameters(), lr=0.002, momentum=0.9)

num_epochs = 10
# Scalar loss of every iteration, appended in order.
losses = []

for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(train_dl):
        # Training mode is (re)selected on every iteration.
        model.train()
        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()
        # Forward pass on the batch after moving it to the GPU.
        outputs = model(Variable(inputs.cuda()))
        
        # Cross-entropy between the class scores and the integer targets.
        loss = criterion(outputs.cuda(), Variable(targets.cuda()))
        losses += [loss.item()]
        
        # Back-propagate, then apply the SGD update.
        loss.backward()
        
        optimizer.step()
        
        # Report every 50th iteration. epoch and i are the zero-based loop
        # indices; the step total is len(train_ds) // batch_size.
        if ( i+1) % 50 == 0 :
            print( 'epoch [%d/%d], step [%d/%d], loss %f' %( epoch, num_epochs, i, len(train_ds) // batch_size, float(loss.item())))

			
# Persist only the parameters (state_dict); the file lands inside the
# training-data directory.
torch.save(model.state_dict(), '/scratch/amirzaei/pytorch/catvsdog/train/SAVED_MODEL.pth')

this is the beginning of the output:

yes gpu
epoch [0/10], step [49/1562], loss 0.000000
epoch [0/10], step [99/1562], loss 0.000000
epoch [0/10], step [149/1562], loss 0.000000
epoch [0/10], step [199/1562], loss 0.000000
epoch [0/10], step [249/1562], loss 0.000000
epoch [0/10], step [299/1562], loss 0.000000
epoch [0/10], step [349/1562], loss 0.000000
epoch [0/10], step [399/1562], loss 0.000000
...

ptrblck · September 9, 2018, 10:28pm

Your output should already be on the GPU.
Could you remove the .cuda() call on your output:

criterion(outputs.cuda(), ...)

Also, Variables are deprecated. You don’t have to wrap your tensors in Variables anymore.

Ali_Mirzaeyan · September 9, 2018, 10:32pm

Honestly already checked that, and nothing changed

ptrblck · September 9, 2018, 10:38pm

Yeah, I’ve checked it with a dummy example and it should work.
Could you print the output and target before calling the criterion?

Ali_Mirzaeyan · September 9, 2018, 10:44pm

Sorry, I cannot reply in real time: each time I run the code it goes to the university’s server and my task is put in a queue, so I have to wait for the others to finish first :).

ptrblck · September 9, 2018, 10:45pm

Sure, no problem.
I’ll take another look at the code.
Is the same code returning a valid loss on the CPU, or did you change something else?

Ali_Mirzaeyan · September 9, 2018, 10:45pm

I just removed the “.cuda()” parts and ran it in CPU mode.

Ali_Mirzaeyan · September 10, 2018, 11:56am

yes gpu
outputs:
tensor([[-0.7218,  0.0799],
        [-0.3777,  0.0066],
        [-0.0300,  0.1042],
        [-0.0631,  0.0776],
        [-0.2742,  0.1017],
        [-0.1016,  0.3217],
        [-0.4512,  0.1652],
        [-0.2501, -0.0158],
        [-0.1001,  0.0228],
        [-0.1450, -0.1840],
        [-0.5124, -0.0129],
        [-0.3069,  0.0862],
        [-0.4056,  0.0122],
        [-0.0393,  0.1312],
        [-0.1726, -0.0376],
        [-0.1504,  0.3080]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')
outputs:
tensor([[ -7.3676,   7.1940],
        [ -7.8105,   7.8614],
        [ -8.4740,   8.1551],
        [ -6.6943,   6.3968],
        [ -6.9419,   6.5977],
        [ -7.8175,   7.5915],
        [ -7.2899,   7.3079],
        [ -9.4329,   8.9957],
        [ -9.8488,   9.2562],
        [ -6.5058,   6.3480],
        [-10.5865,  10.7625],
        [ -6.3048,   6.3257],
        [ -7.0993,   6.6395],
        [ -5.7157,   5.3780],
        [ -4.9567,   4.7297],
        [ -7.7228,   7.1255]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')
outputs:
tensor([[-11.3238,  11.1269],
        [-14.0514,  14.0194],
        [ -9.9086,   9.7313],
        [-13.4018,  13.1804],
        [-15.4153,  15.5052],
        [-16.0751,  15.5695],
        [-13.1213,  12.6561],
        [-21.4735,  21.5242],
        [-10.5720,  10.1248],
        [-12.9845,  12.7423],
        [-16.3938,  15.6749],
        [-10.4105,  10.3380],
        [-15.3936,  15.1340],
        [-16.4181,  16.6434],
        [-15.4390,  15.3033],
        [-14.1930,  13.9824]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')
outputs:
tensor([[-19.4064,  19.3802],
        [-12.5685,  12.5996],
        [-21.5608,  21.6957],
        [-16.2454,  16.4091],
        [-23.8241,  23.5606],
        [-24.6136,  24.1047],
        [-19.3370,  19.2353],
        [-16.1822,  16.2280],
        [-22.7567,  22.6444],
        [-18.3765,  18.0064],
        [-20.7338,  20.7988],
        [-22.2366,  21.6640],
        [-15.4236,  14.9537],
        [-21.1272,  21.0056],
        [-16.7133,  16.6841],
        [-19.2318,  19.0395]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')
outputs:
tensor([[-25.9788,  25.7405],
        [-32.9146,  32.5946],
        [-17.9632,  17.9339],
        [-20.4449,  20.5898],
        [-24.3951,  24.6153],
        [-27.7594,  27.4260],
        [-20.1770,  19.8166],
        [-21.0899,  20.4372],
        [-29.1212,  28.3369],
        [-24.3874,  24.0906],
        [-24.2173,  24.1907],
        [-38.5717,  38.0031],
        [-32.8707,  32.0967],
        [-19.9623,  19.7503],
        [-16.4334,  16.2315],
        [-22.4122,  22.3104]], grad_fn=<ThAddmmBackward>)
-------------------------
-------------------------
targets
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cpu')

ptrblck · September 10, 2018, 12:04pm

The first output tensor returns a valid loss, but as you can see the following ones have really high logits for the target class (1), which gives a loss of approx. 0.

If you use F.log_softmax or F.softmax on the outputs, you see that your model outputs a probability of ~1 for class1.

Ali_Mirzaeyan · September 10, 2018, 12:30pm

So you are saying the model is working properly? I checked for 10 epochs and the loss is zero in every epoch, which seems incorrect, doesn’t it? I have shuffle = True, so the dataset should yield a mix of cat and dog images, right?

ptrblck · September 10, 2018, 12:35pm

It’s hard to tell as I don’t know your complete use case.
From the output and target you’ve posted, the loss is already 0 in the 3rd iteration.

The targets look strange, as you have all ones in each iteration.
Your model might just learn to output class1, if it’s the majority class.
Could you validate your Dataset to see, how many instances there are of class0 and class1?

Currently you are also using the same directory for train, val and test, which should be unrelated to the current problem.

Ali_Mirzaeyan · September 10, 2018, 12:46pm

Yes, you were right, I did something terrible in the dataset. It will take some time to correct it; I will post the result again. I have another problem with the testing phase, may I ask it here?

ptrblck · September 10, 2018, 1:06pm

Sure! If it’s a separate problem, which might need some discussion, it would probably be better to start a new thread and focus here on the original issue.

Ali_Mirzaeyan · September 10, 2018, 1:16pm

When using PyTorch’s datasets, is it necessary to have a separate folder for each class? In my testing phase I have a single folder that contains all the cat and dog pictures, unlike the training and dev folders, where they are separated. When I run the test phase it gives me this error: RuntimeError: Found 0 files in subfolders of: /scratch/amirzaei/pytorch/catvsdog/test — apparently it looks for separate folders for cats and dogs. This is my test module:

import os
import os.path
import csv
import glob
import numpy as np
# import matplotlib.pyplot as plt
# from PIL import Image
#from sklearn.metrics import confusion_matrix



import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms


# ---- global configuration ---------------------------------------------------
# Seed NumPy's RNG and print arrays with two decimals.
np.random.seed(1234)
np.set_printoptions(precision=2)

# True when PyTorch can see a CUDA device.
use_gpu = torch.cuda.is_available()

DATA_DIR = "/scratch/amirzaei/pytorch/catvsdog/train/"
DATA_TST_DIR = "/scratch/amirzaei/pytorch/catvsdog/test"  # no trailing slash

sz = 224         # images are resized to sz x sz
batch_size = 16

trn_dir = DATA_DIR
tst_dir = DATA_TST_DIR

# ---- preprocessing ----------------------------------------------------------
# Resize, convert to a tensor, then normalise each channel.
tfms = transforms.Compose([
    transforms.Resize((sz, sz)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# ---- test data: one image per batch, in dataset order -----------------------
test_ds = datasets.ImageFolder(root=tst_dir, transform=tfms)
test_dl = torch.utils.data.DataLoader(
    test_ds,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    pin_memory=False,
)

class SimpleCNN(nn.Module):
    """Two convolutional stages followed by a two-output linear classifier.

    Each stage is Conv2d(5x5, padding 2) -> BatchNorm2d -> ReLU -> MaxPool2d(2).
    The classifier takes 56*56*32 flattened features, which is what the two
    stages produce for a 3x224x224 image (each pooling halves 224 -> 112 -> 56).
    """

    @staticmethod
    def _stage(in_channels, out_channels):
        """Build one conv / batch-norm / ReLU / max-pool stage."""
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=5, padding=2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        return nn.Sequential(*layers)

    def __init__(self):
        super().__init__()
        self.conv1 = self._stage(3, 16)
        self.conv2 = self._stage(16, 32)
        self.fc = nn.Linear(56 * 56 * 32, 2)

    def forward(self, x):
        """Return one row of two class scores per image in the batch ``x``."""
        features = self.conv2(self.conv1(x))
        # Collapse everything except the batch dimension for the linear layer.
        flat = features.view(features.size(0), -1)
        return self.fc(flat)


def test(model, test_loader):
    """Score every test image and write the predictions to a CSV file.

    For each batch yielded by ``test_loader``, the softmax probability of
    class index 1 (the "dog" probability in this script) is taken for the
    first sample, rounded to 4 decimals and clipped to [0.0001, 0.999].  It is
    stored under a running id that starts at 1.  The rows are written, sorted
    by id, to ``entry.csv`` inside ``DATA_TST_DIR`` under an ``id,label``
    header.

    Args:
        model: network returning one row of two class scores per sample.
        test_loader: iterable of ``(images, target)`` pairs; the target is
            ignored.  Only the first sample of each batch is scored, so the
            loader is expected to use ``batch_size=1``.

    Returns:
        None.
    """
    # Use the GPU when one is available instead of calling .cuda()
    # unconditionally, so the function also runs on a CPU-only host.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Built once; dim=1 normalises across the class scores of each sample.
    softmax = nn.Softmax(dim=1)
    csv_map = {}

    # Inference only: no_grad() replaces the removed Variable(volatile=True).
    with torch.no_grad():
        # NOTE(review): ids are positions in the loader (1, 2, ...), not file
        # names -- confirm this matches the ids the submission expects.
        for image_id, (images, _) in enumerate(test_loader, start=1):
            probs = softmax(model(images.to(device)))[0]
            # With two classes, 1 - cat_prob equals dog_prob, so the dog
            # probability is used directly.  .item() yields a Python float,
            # which NumPy can round and clip whatever device the tensor is on.
            dog_prob = probs[1].item()
            prob = np.clip(np.around(dog_prob, decimals=4), .0001, .999)
            csv_map[image_id] = float(prob)

    # Join with a path separator: DATA_TST_DIR has no trailing slash, so plain
    # string concatenation would produce ".../testentry.csv".
    out_path = os.path.join(DATA_TST_DIR, 'entry.csv')
    with open(out_path, 'w', newline='') as csvfile:
        csv_w = csv.writer(csvfile)
        csv_w.writerow(('id', 'label'))
        csv_w.writerows(sorted(csv_map.items()))

# Rebuild the network and restore the trained weights.  map_location='cpu'
# lets a checkpoint that was saved from CUDA tensors be loaded on a host
# without a GPU; test() moves the model to the device it runs on.
# NOTE(review): the training script saves to
# .../catvsdog/train/SAVED_MODEL.pth, while this path has no train/ component
# -- confirm the checkpoint was moved or adjust the path.
model = SimpleCNN()
state_dict = torch.load('/scratch/amirzaei/pytorch/catvsdog/SAVED_MODEL.pth', map_location='cpu')
model.load_state_dict(state_dict)
test(model, test_dl)

ptrblck · September 10, 2018, 1:19pm

If you use datasets.ImageFolder then yes, your images should be located in separate folders which represent the classes.
If you don’t want that, you can easily write your own Dataset and load the images using your own logic.
Here is a good tutorial explaining, how to use Dataset.
One way would be to get all image paths with their label, e.g. by using their name, split them into train and val, and pass them to your Dataset.

Ali_Mirzaeyan · September 10, 2018, 1:28pm

The problem is that the file names in the test directory are just numbers. I got this dataset from Kaggle, and it is not easy to separate the test set into two categories. I think the first solution you offered is more suitable in this case.