GPU-based CNN training

Dear All,

I am new to PyTorch and I am training a model with a CNN and an attention mechanism. The model works fine when run on the CPU, but training and validation accuracy come out as 0 when the code runs on the GPU. I have tried to solve the problem but have been unable to do so. Here is my code:

#------------------------------------------------->Import all the required libraries-------------------------------------->
import torch # Import Torch
import torch.nn as nn # Import NN module from Torch
from torchvision.datasets import CIFAR10 # Import CIFAR10 dataset from torchvision
from torchvision.transforms import transforms # Import transforms module from torchvision
from torch.utils.data import DataLoader # Import dataloader from torch
from torch.optim import Adam # import optimizer module from torch
from torch.autograd import Variable # Import autograd from torch
import numpy as np # Import numpy module
import torchvision.datasets as datasets # Import datasets from torchvision
from torchvision import models # import pretrained models from torch
from Attention import PAM_Module # import position attention module
from Attention import CAM_Module # import channel attention module
from torch import optim, cuda # import optimizer
import os
import random
import torch.nn.functional as F
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda')

Training_Path="/home/mani/Desktop/Binary Attention Based ASC/DCASE 2018 Dataset/Training" # path to folder containing training images
Test_Path="/home/mani/Desktop/Binary Attention Based ASC/DCASE 2018 Dataset/Test" # path to folder containing test images
#----------------------------------------------> Define Training Transformation --------------------------------------->
train_transformations = transforms.Compose([
#transforms.RandomHorizontalFlip(),
#transforms.RandomCrop(32,padding=4),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
#----------------------------------------------> Define Test Transformation -------------------------------------------->
test_transformations = transforms.Compose([
#transforms.RandomHorizontalFlip(),
#transforms.RandomCrop(32,padding=4),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
Validation_transformations = transforms.Compose([
#transforms.RandomHorizontalFlip(),
#transforms.RandomCrop(32,padding=4),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
#----------------------------------------------> Define and Load Training and Test Set ---------------------------------->
#Load the training set
train_set =datasets.ImageFolder(root=Training_Path,transform=train_transformations)
validation_set =datasets.ImageFolder(root=Training_Path,transform=Validation_transformations)
#print(train_set) # prints details of training set along with number of images, type of transformation etc.
test_set =datasets.ImageFolder(root=Test_Path,transform=test_transformations)
#print(test_set) # prints details of test set along with number of images, type of transformation etc.
BATCH_SIZE=32 # Define the batch size
#----------------------------------------------> Use dataloader to create batches of data -------------------------------->
train_loader = DataLoader(train_set,batch_size=BATCH_SIZE,shuffle=True) # Create a loader for the training set
test_loader = DataLoader(test_set,batch_size=BATCH_SIZE,shuffle=True) # Create a loader for the test set
validation_loader = DataLoader(validation_set,batch_size=BATCH_SIZE,shuffle=True) # Create a loader for the validation set
#print(len(train_loader)) # Gives the total number of iterations required to complete one training epoch
#----------------------------------------------> Check for GPU and get the pretrained model ----------------------------------------->
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
#model = models.resnet18(pretrained=True) # Import the resnet model
#print(model)
num_classes=10

class BACNN(nn.Module):
    def __init__(self):
        super(BACNN, self).__init__()
        Pre_Trained_Layers = list(models.resnet18().children())[:-2] # all layers except the last two (avgpool and fc)
        print("pre-trained layers after removing the top layers---------------------->" + '\n')
        #print(Pre_Trained_Layers)
        #self.count=count
        self.features = nn.Sequential(*Pre_Trained_Layers)
        self.PAM = PAM_Module(512)
        self.CAM = CAM_Module(512)
        self.conv1 = nn.Conv2d(512, 10, 3, bias=True, padding=0)
        self.fc1 = nn.Linear(18, num_classes)
        for p in self.features.parameters():
            p.requires_grad = False
        #print(self.features)

    def forward(self, image):
        x = self.features(image)
        x = F.relu(x)
        x_1 = self.PAM(x)
        x_2 = self.CAM(x)
        x_3 = x_1 + x_2
        x = self.conv1(x_3)
        x = F.relu(x)
        x = self.fc1(x)
        x = x.view(x.size(0), -1)
        #print(x.size)
        #count=count+1
        #print(count)
        #print("Completed")
        return x

model=BACNN().to(device)
#optimizer = optim.Adam(model.parameters())
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Define the cross entropy loss function

criterion = nn.CrossEntropyLoss()
def calculate_accuracy(fx, y):
    preds = fx.max(1, keepdim=True)[1]
    correct = preds.eq(y.view_as(preds)).sum()
    acc = correct.float() / preds.shape[0]
    return acc
def train(model, device, iterator, optimizer, criterion):
    print("Training Starts")
    epoch_loss = 0
    epoch_acc = 0
    count = 0
    model.train()

    for (x, y) in iterator:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        count = count + 1
        Predicted_Train_Label = model(x)
        loss = criterion(Predicted_Train_Label, y)
        acc = calculate_accuracy(Predicted_Train_Label, y)
        print("Training Iteration Number=", count)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, device, iterator, criterion):
    print("Validation Starts")
    epoch_loss = 0
    epoch_acc = 0
    count = 0
    model.eval()

    with torch.no_grad():
        for (x, y) in iterator:
            x = x.to(device)
            y = y.to(device)
            count = count + 1
            Predicted_Label = model(x)
            loss = criterion(Predicted_Label, y)
            acc = calculate_accuracy(Predicted_Label, y)
            print("Validation Iteration Number=", count)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

EPOCHS = 250
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join("/home/mani/Desktop/Binary Attention Based ASC/", 'BACNN.pt')

best_valid_loss = float('inf')

#if not os.path.isdir(f'{SAVE_DIR}'):
#    os.makedirs(f'{SAVE_DIR}')
#print(model)
for epoch in range(EPOCHS):
    print("Start Training and Validation For Epoch Number=", epoch)
    train_loss, train_acc = train(model, device, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, device, validation_loader, criterion)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    print("Epoch Number=", epoch, "Train Loss=", train_loss, "Training Accuracy=", train_acc, '\n')
    print("Epoch Number=", epoch, "Validation Loss=", valid_loss, "Validation Accuracy=", valid_acc, '\n')

--------------------------------------------------------------------------------------------------------------------------------

Please take a look at the code and suggest modifications that would solve the problem.

Thanks
Achyut

  1. What was the training and validation accuracy when run on the CPU?
  2. Also, print out the loss on both the CPU and the GPU. Different behavior on the CPU and GPU is actually very interesting, given that most operations are the same.
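
Something along these lines (an untested sketch; loss_curve is just a throwaway helper name, reusing the objects you already define) would let you print the two loss curves side by side:

def loss_curve(device, epochs=3):
    # Re-seed so both runs start from the same initial weights, then train
    # for a few epochs on the given device and collect the per-epoch loss.
    torch.manual_seed(SEED)
    model = BACNN().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    losses = []
    for _ in range(epochs):
        epoch_loss, _ = train(model, device, train_loader, optimizer, criterion)
        losses.append(epoch_loss)
    return losses

print("CPU losses:", loss_curve(torch.device('cpu')))
print("GPU losses:", loss_curve(torch.device('cuda')))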

On the CPU, training and validation accuracy go like:
Epoch 1 (Training Accuracy=23.45, Validation Accuracy=24.38)
.
.
.
Epoch 20 (Training Accuracy=55.67, Validation Accuracy=57.29)

But on the GPU both values come out as 0 for each epoch.

Thanks
Achyut

What about the loss in both instances?

The loss decreases on the CPU but remains about the same on the GPU.

The following is an example of the loss on the GPU:
Epoch Number= 0 Train Loss= 7.179644451865667 Training Accuracy= 0.004746835443037975
Epoch Number= 0 Validation Loss= 7.178059632265115 Validation Accuracy= 0.0

Epoch Number= 1 Train Loss= 7.177682260923747 Training Accuracy= 0.0

Epoch Number= 1 Validation Loss= 7.176880444152446 Validation Accuracy= 0.0

Epoch Number= 2 Train Loss= 7.176005357428442 Training Accuracy= 0.0

Epoch Number= 2 Validation Loss= 7.17541601688047 Validation Accuracy= 0.0
.
.
.

Thanks
Achyut

Since you seem to have ensured determinism, try tracking the model outputs when run on either device for the first epoch. My hunch is that they should both be the same (for the first epoch only).
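
A minimal sketch of that check, reusing the model and loader defined above (the variable names here are just for illustration):

x, _ = next(iter(train_loader))                      # one fixed batch

model_cpu = BACNN().to('cpu').eval()
model_gpu = BACNN().to('cuda').eval()
model_gpu.load_state_dict(model_cpu.state_dict())    # identical weights on both devices

with torch.no_grad():
    out_cpu = model_cpu(x)
    out_gpu = model_gpu(x.to('cuda')).cpu()

# The two outputs should agree up to small floating-point differences.
print("max abs difference:", (out_cpu - out_gpu).abs().max().item())
print("outputs close:", torch.allclose(out_cpu, out_gpu, atol=1e-5))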

OK, I will check. Thanks!