Newbie question: PyTorch not utilising the GPU even though I have CUDA correctly installed

Hi all, I have CUDA correctly installed on my desktop:

torch.cuda.is_available()

returns True.

But my code doesn't seem to be utilising the GPU at all (only 0.1% in Task Manager). I tested the same dataset and the same model in Keras: training is considerably faster there, and GPU usage is somewhere above 20%.
Here's my code. Can anyone spot the problem? Thanks.

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

class traindata(Dataset):
    """Training set served as ``(features, label)`` pairs read from a CSV file.

    The file is parsed once in ``__init__`` and split into NumPy arrays, so
    ``__getitem__`` is a plain array lookup instead of a per-sample
    ``DataFrame.iloc`` access, which is far slower when called once per row.

    Args:
        transform: Optional callable applied to each feature vector before it
            is returned.
        path: Location of the header-less CSV file. Every column except the
            last is treated as a feature; the last column is the label.
            Defaults to the originally hard-coded training file.
    """

    def __init__(self, transform=None,
                 path=r"C:\Users\Angus\Downloads\poker-hand-training-true.data"):
        # The DataFrame is kept on ``self.train`` for backward compatibility.
        self.train = pd.read_csv(path, header=None)
        self.transform = transform
        self.len = self.train.shape[0]
        # Convert once up front; indexing these arrays is what runs per sample.
        self._features = self.train.iloc[:, :-1].to_numpy()
        self._labels = self.train.iloc[:, -1].to_numpy()

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at row ``index``."""
        x = self._features[index]
        y = self._labels[index]
        if self.transform is not None:
            x = self.transform(x)
        return x, y

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return self.len

class testdata(Dataset):
    """Test set served as ``(features, label)`` pairs read from a CSV file.

    The file is parsed once in ``__init__`` and split into NumPy arrays, so
    ``__getitem__`` is a plain array lookup instead of a per-sample
    ``DataFrame.iloc`` access, which is far slower when called once per row.

    Args:
        transform: Optional callable applied to each feature vector before it
            is returned.
        path: Location of the header-less CSV file. Every column except the
            last is treated as a feature; the last column is the label.
            Defaults to the originally hard-coded testing file.
    """

    def __init__(self, transform=None,
                 path=r"C:\Users\Angus\Downloads\poker-hand-testing.data"):
        # The DataFrame is kept on ``self.test`` for backward compatibility.
        self.test = pd.read_csv(path, header=None)
        self.transform = transform
        self.len = self.test.shape[0]
        # Convert once up front; indexing these arrays is what runs per sample.
        self._features = self.test.iloc[:, :-1].to_numpy()
        self._labels = self.test.iloc[:, -1].to_numpy()

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at row ``index``."""
        x = self._features[index]
        y = self._labels[index]
        if self.transform is not None:
            x = self.transform(x)
        return x, y

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return self.len

class perceptronclassifier(nn.Module):
    """Fully connected classifier: 10 inputs -> 128 -> 64 -> 10 outputs.

    A ReLU follows each of the first two linear layers; the last layer's
    output is returned without any activation.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialisation consumes the random generator identically.
        self.fc1 = nn.Linear(in_features=10, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Run ``x`` through the three layers and return the final layer's output."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

def main():
    """Train ``perceptronclassifier`` on ``traindata`` and print per-epoch stats.

    The model runs on the GPU when CUDA is available and on the CPU otherwise.
    After every epoch the mean of the per-batch losses and the fraction of
    correctly classified samples are printed.

    Raises:
        RuntimeError: If the training loader produces no batches.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = 2000
    trainloader = DataLoader(
        dataset=traindata(),
        num_workers=4,
        batch_size=batch_size,
        shuffle=True,
        # Pinned host memory is what allows the non_blocking copies below.
        pin_memory=device.type == 'cuda',
    )
    if len(trainloader) == 0:
        raise RuntimeError('training set is empty; nothing to train on')

    model = perceptronclassifier().to(device)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    num_epochs = 2000
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)
        model.train(True)
        # Statistics are accumulated on the device and read back once per
        # epoch, so the host does not have to wait for the GPU on every batch.
        running_loss = torch.zeros((), device=device)
        running_corrects = torch.zeros((), dtype=torch.long, device=device)
        n_batches = 0
        n_samples = 0
        for x, y in trainloader:
            # The default collate function already yields tensors, so move and
            # cast them with .to() instead of copy-constructing new tensors.
            x = x.to(device, dtype=torch.float, non_blocking=True)
            y = y.to(device, dtype=torch.long, non_blocking=True)
            outputs = model(x)
            loss = criterion(outputs, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            preds = outputs.detach().argmax(dim=1)
            running_loss += loss.detach()
            running_corrects += (preds == y).sum()
            n_batches += 1
            # Count real samples: the last batch may hold fewer than batch_size.
            n_samples += y.size(0)
        epoch_loss = running_loss.item() / n_batches
        epoch_acc = running_corrects.item() / n_samples
        print('Training Loss:{:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))

# Entry-point guard: start training only when this file is executed as a
# script, not when it is imported as a module.
# NOTE(review): main() builds a DataLoader with num_workers=4; on Windows the
# worker processes import this module again, so this guard should stay in place.
if __name__ == '__main__':
    main()

I think you need to set the CUDA device for PyTorch by running

torch.cuda.set_device(device)

before setting the batch size in the main function.