Accuracy changes when training on GPU instead of CPU

I was training ResNeXt50 on the Kaggle melanoma dataset. When I trained it on a CPU, I ended up with a training accuracy of 0.81. After switching to GPU training, the accuracy oscillates around 0.5. Please let me know if anyone knows how to solve this.

Hey Aryaman,
Can you post a small code snippet showing the steps you follow to train the network? Also, are you setting a seed so that the model is initialized the same way every time you run it?

Hey Dipam, here’s a code snippet

import cv2
import torch.nn.functional as F
import torch.optim as optim
from torchvision.models import inception_v3, vgg16, resnet18, resnext50_32x4d

# Seed PyTorch's RNG so the freshly created classifier head starts from the
# same weights on every run.
torch.manual_seed(42)

# Use the GPU when one is visible, otherwise fall back to the CPU so the very
# same script can be used to compare both devices.
device = "cuda" if torch.cuda.is_available() else "cpu"

epochs = 5

# ImageNet-pretrained backbone used for fine-tuning.
model = resnext50_32x4d(pretrained=True)

# Freeze everything first ...
for param in model.parameters():
    param.requires_grad = False
# ... then unfreeze only the last residual stage.
for param in model.layer4.parameters():
    param.requires_grad = True
# NOTE: torchvision's ResNet uses an adaptive average pool here, which has no
# learnable parameters, so this loop does nothing; kept to mirror the intent.
for param in model.avgpool.parameters():
    param.requires_grad = True

# Replace the 1000-class ImageNet head with a single-logit binary head.
model.fc = nn.Linear(in_features=2048, out_features=1, bias=True)
for param in model.fc.parameters():
    # Set the flag on each parameter; assigning `requires_grad` on the module
    # itself only creates an unrelated attribute and changes nothing.
    param.requires_grad = True

# BCELoss expects probabilities, so the training loop must apply a sigmoid to
# the model output before calling it.
criterion = nn.BCELoss()

#model = torch.load("model.pt")

# Move the model to its device before building the optimizer so the optimizer
# is created against the parameters in their final location.
model.to(device)
adam = optim.Adam(model.parameters(), lr=1e-5)
# Fine-tune for `epochs` epochs; after every epoch evaluate on `val_loader`.
# Both loaders are expected to yield dicts with "images" and "labels" entries.
print("======== Training for ", epochs, "epochs=============")
for epoch in range(epochs):
    total_loss = 0
    # NOTE(review): train() also switches the BatchNorm layers inside the
    # frozen part of the backbone to training mode, so their running
    # statistics keep changing even though their weights do not — confirm
    # this is intended.
    model.train()
    print("Training.......")
    print("======== EPOCH #",epoch,"=================")
    tmp_acc = 0
    for i, batch in enumerate(train_loader):
        img, label = batch["images"], batch["labels"]
        img = img.to(device)
        # Flatten the labels so they line up with the flattened model output
        # whether the loader yields them as (batch,) or (batch, 1).
        label = label.float().reshape(-1).to(device)

        adam.zero_grad()

        # The head emits one logit per sample; turn it into a probability and
        # drop the trailing dimension so it matches `label` element-wise.
        op = torch.sigmoid(model(img)).reshape(-1)

        loss = criterion(op, label)
        loss.backward()
        adam.step()

        # Accumulate (not overwrite) so the averages printed below are real
        # running means over the batches seen so far.
        total_loss += loss.item()

        label_cpu = label.cpu().numpy()
        # Accuracy needs hard class decisions, not probabilities.
        preds = (op.detach().cpu().numpy() >= 0.5).astype(label_cpu.dtype)
        # Accuracy is symmetric in its two arguments, so the original
        # (prediction, label) order is kept.
        tmp_acc += accuracy_score(preds, label_cpu)

        if i % 10 == 0 and i > 0:
            batches_seen = i + 1  # `i` is zero-based
            print("STEP: ",i, "of steps ",len(train_loader))

            print("Current loss: ",total_loss/batches_seen)
            print("Training Accuracy ",tmp_acc/batches_seen)

    avg_loss = total_loss/len(train_loader)
    print("The loss after ",epoch," epochs is ",avg_loss)
    # Dump the last batch's probabilities and labels as a quick sanity check.
    print("OP",op)
    print("Label",label_cpu)

    model.eval()
    print("Validating.....")
    tmp_accuracy = 0
    # Correctly predicted samples per class ...
    z_count,o_count=0,0
    # ... and how many samples of each class the validation set contains.
    z_count_truth,o_count_truth = 0,0

    for batch in val_loader:
        img, label = batch["images"], batch["labels"]
        img = torch.as_tensor(img, dtype=torch.float32).to(device)
        with torch.no_grad():
            op = torch.sigmoid(model(img)).reshape(-1)

        label = label.numpy().reshape(-1)
        # Threshold the probabilities; comparing them to exactly 0 or 1 would
        # almost never match.
        preds = (op.cpu().numpy() >= 0.5).astype(label.dtype)

        tmp_accuracy += accuracy_score(preds, label)
        z_count += np.sum((preds == 0) & (label == 0))
        o_count += np.sum((preds == 1) & (label == 1))
        z_count_truth += np.sum(label == 0)
        o_count_truth += np.sum(label == 1)

    # Per-class fraction of correct predictions; undefined (NaN) when the
    # validation set holds no sample of that class.
    percent_correct_z = z_count/z_count_truth if z_count_truth else float("nan")
    percent_correct_o = o_count/o_count_truth if o_count_truth else float("nan")
    # Mean of the per-batch accuracies.
    accuracy = tmp_accuracy/len(val_loader)
    print("Accuracy: ", "is ",accuracy)
    #print("Percent of correct zero labels ",percent_correct_z)
    #print("Percent of correct one labels ",percent_correct_o)
    

I also reproduced the same issue with a custom network: on a CPU I got a training accuracy of 0.75, while on a GPU it kept oscillating around 0.5.