Sending dataset to cuda breaks the dataloader iterator - TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu()

I am trying to speed up my pytorch training by following the advice from here:

So now I am sending my trainingdata.data and .targets to cuda before starting training. What I am confused about is how to then use the DataLoader built from the trainingdata in my train function: when I iterate it the way I did before (when I was sending individual batches to cuda), I get this error on the iterator of the dataloader:

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

It is erroring on the following line (loader is the dataloader here):

for data, target in loader:

What am I doing wrong? How can I still use the dataloader while also sending the full dataset over to the gpu?

Python version is 3.12, pytorch is 2.3.0+cu121

Also, the error is the same if I don't pass the pin_memory argument to the dataloader.

Full error trace:

Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2024.1.2\plugins\python-ce\helpers\pydev\pydevd.py", line 1537, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2024.1.2\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:\Users\me\PycharmProjects\NueralNetTests\TorchTests.py", line 181, in <module>
    main()
  File "C:\Users\me\PycharmProjects\NueralNetTests\TorchTests.py", line 169, in main
    train(epoch, model, loaders, device, optimizer, lossFN)
  File "C:\Users\me\PycharmProjects\NueralNetTests\TorchTests.py", line 41, in train
    for data, target in loaders['train']:
  File "C:\Users\me\PycharmProjects\NueralNetTests\venv\Lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "C:\Users\me\PycharmProjects\NueralNetTests\venv\Lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\me\PycharmProjects\NueralNetTests\venv\Lib\site-packages\torch\utils\data\_utils\fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "C:\Users\me\PycharmProjects\NueralNetTests\venv\Lib\site-packages\torchvision\datasets\mnist.py", line 143, in __getitem__
    img = Image.fromarray(img.numpy(), mode="L")
                          ^^^^^^^^^^^
TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

And the code:

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


    # Per-sample preprocessing: rotate by -90 degrees, flip horizontally, then
    # convert to a tensor.
    # NOTE(review): presumably this undoes the orientation EMNIST images are
    # stored in -- confirm against the dataset documentation.
    _transform = Compose([
        lambda img: rotate(img, -90),
        lambda img: hflip(img),
        ToTensor()
        ])

    # EMNIST "letters" split, downloaded into ./data on first use.
    trainData = datasets.EMNIST(
        root='data',
        train=True,
        transform=_transform,
        download=True,
        split='letters'
    )
    testData = datasets.EMNIST(
        root='data',
        train=False,
        transform=_transform,
        download=True,
        split='letters'
    )

    # Batches of 100 shuffled samples, with pinned-memory collation requested.
    trainLoader = DataLoader(trainData,
                             batch_size=100,
                             shuffle=True,
                             pin_memory=True
                             )

    testLoader = DataLoader(testData,
                            batch_size=100,
                            shuffle=True,
                            pin_memory=True
                            )

    # Move the datasets' backing tensors onto `device`.
    # NOTE: with a CUDA device this is what makes iterating the loaders fail.
    # The dataset's __getitem__ (torchvision/datasets/mnist.py) calls
    # img.numpy() on the stored sample to build a PIL image, and .numpy()
    # raises "TypeError: can't convert cuda:0 device type tensor to numpy"
    # for a tensor that lives on the GPU.
    trainData.data = trainData.data.to(device)
    trainData.targets = trainData.targets.to(device)
    testData.data = testData.data.to(device)
    testData.targets = testData.targets.to(device)

    # Model on the selected device, Adam optimiser, cross-entropy loss.
    model = CNN().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    lossFN = nn.CrossEntropyLoss()

    # range(1, 2) yields only epoch 1, i.e. a single train/test pass.
    for epoch in range(1, 2):
        train(model, trainLoader, device, optimizer, lossFN)
        test(model, testLoader, device, lossFN)

def train(model, loader, device, optimizer, lossFN):
    """Run one training pass over every batch produced by ``loader``.

    Args:
        model: Network to optimise; it is put into training mode here.
        loader: Iterable yielding ``(data, target)`` batches.
        device: Device each batch is moved to before the forward pass.
            NOTE(review): assumes ``model`` already lives on this device.
        optimizer: Optimiser stepped once per batch.
        lossFN: Callable ``lossFN(output, target)`` returning the loss tensor.

    Returns:
        None. The model parameters are updated in place.
    """
    model.train()
    for data, target in loader:
        # ``Tensor.to`` hands back the same tensor when it is already on
        # ``device``, so this costs nothing for pre-moved data and avoids a
        # device mismatch when the loader yields CPU batches.
        data = data.to(device)
        target = target.to(device)

        # Clear gradients accumulated by the previous batch.
        optimizer.zero_grad()
        output = model(data)
        loss = lossFN(output, target)
        loss.backward()
        optimizer.step()

class CNN(nn.Module):
    """Small two-stage convolutional classifier producing 27 class scores.

    The flatten size of 320 (20 channels x 4 x 4) implies a single-channel
    28x28 input: each 5x5 convolution trims 4 pixels per side length and each
    2x2 max-pool halves it (28 -> 24 -> 12 -> 8 -> 4).
    NOTE(review): 27 outputs presumably cover EMNIST "letters" labels 1..26
    plus an unused index 0 -- confirm against the dataset.
    """

    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        # Channel-wise dropout applied to the second convolution's output.
        self.conv2Drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 27)

    def forward(self, x):
        """Return raw class logits of shape ``(batch, 27)`` for ``x``.

        The logits are deliberately NOT passed through softmax:
        ``nn.CrossEntropyLoss`` applies log-softmax itself, so normalising
        here as well would squash the gradients. Apply
        ``F.softmax(logits, dim=1)`` at inference time if probabilities are
        needed; ``argmax`` predictions are the same either way.
        """
        x = F.leaky_relu(F.max_pool2d(self.conv1(x), 2))
        x = F.leaky_relu(F.max_pool2d(self.conv2Drop(self.conv2(x)), 2))
        # Flatten the 20 x 4 x 4 feature maps into one vector per sample.
        x = x.view(-1, 320)
        x = F.leaky_relu(self.fc1(x))
        # Functional dropout must be told explicitly whether we are training.
        x = F.dropout(x, training=self.training)
        return self.fc2(x)


I forgot to add - I tried adding `generator=torch.Generator(device='cuda')` to the dataloader parameters, but the error stays the same.

The __getitem__ method expects CPU tensors and fails in the img.numpy() call here. You could override __getitem__ with your own logic and transform the CUDA tensors directly (i.e. remove the Image.fromarray(img.numpy()) call).