Hi,
I’m trying to train a simple model with cats and dogs data set. When I start training on CPU the loss decreased the way it should be, but when I switched to GPU mode LOSS is always zero, I moved model and tensors to GPU like the bellow code but still loss is zero. Any idea ?
import os
import os.path
import csv
import glob
import numpy as np
# import matplotlib.pyplot as plt
# from PIL import Image
#from sklearn.metrics import confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms
#some initial setup
np.set_printoptions(precision=2)
use_gpu = torch.cuda.is_available()
np.random.seed(1234)
#print(use_gpu)
DATA_DIR = "/scratch/amirzaei/pytorch/catvsdog/train/"
DATA_TST_DIR = "/scratch/amirzaei/pytorch/catvsdog/test/"
sz = 224
batch_size = 16
trn_dir = f'{DATA_DIR}'
tst_dir = f'{DATA_DIR}'
tfms = transforms.Compose([
transforms.Resize((sz, sz)),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
train_ds = datasets.ImageFolder(trn_dir, transform=tfms)
valid_ds = datasets.ImageFolder(tst_dir, transform=tfms)
test_ds = datasets.ImageFolder(tst_dir, transform=tfms)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size = batch_size, shuffle=True, num_workers=8)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size = batch_size, shuffle=True, num_workers=8)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size = 1, shuffle=False, num_workers=1)
class SimpleCNN(nn.Module):
def __init__(self):
super(SimpleCNN, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(3, 16, kernel_size=5, padding=2),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.MaxPool2d(2)
)
self.conv2 = nn.Sequential(
nn.Conv2d(16, 32, kernel_size=5, padding=2),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2)
)
self.fc = nn.Linear(56*56*32, 2)
def forward(self, x):
out = self.conv1(x)
out = self.conv2(out)
out = out.view(out.size(0), -1)
out = self.fc(out)
return out
model = SimpleCNN()
if use_gpu:
print('yes gpu')
torch.set_default_tensor_type('torch.cuda.FloatTensor')
model = model.cuda()
criterion = nn.CrossEntropyLoss().cuda()
optimizer = optim.SGD(model.parameters(), lr=0.002, momentum=0.9)
num_epochs = 10
losses = []
for epoch in range(num_epochs):
for i, (inputs, targets) in enumerate(train_dl):
model.train()
optimizer.zero_grad()
outputs = model(Variable(inputs.cuda()))
loss = criterion(outputs.cuda(), Variable(targets.cuda()))
losses += [loss.item()]
loss.backward()
optimizer.step()
#report
if ( i+1) % 50 == 0 :
print( 'epoch [%d/%d], step [%d/%d], loss %f' %( epoch, num_epochs, i, len(train_ds) // batch_size, float(loss.item())))
torch.save(model.state_dict(), '/scratch/amirzaei/pytorch/catvsdog/train/SAVED_MODEL.pth')
this is the beginning of the output:
yes gpu
epoch [0/10], step [49/1562], loss 0.000000
epoch [0/10], step [99/1562], loss 0.000000
epoch [0/10], step [149/1562], loss 0.000000
epoch [0/10], step [199/1562], loss 0.000000
epoch [0/10], step [249/1562], loss 0.000000
epoch [0/10], step [299/1562], loss 0.000000
epoch [0/10], step [349/1562], loss 0.000000
epoch [0/10], step [399/1562], loss 0.000000
...