Running on cuda device

Some of the simpler PyTorch examples I have tried, mostly involving torch.tensor, run fine when I place the tensors on a cuda device, e.g. the following code:

import torch
import numpy as np
np.set_printoptions(precision=3)

cuda = torch.device('cuda')

# creating tensors from a Python list and a NumPy array

a=[1,2,3]
b=np.array([4,5,6], dtype=np.int32)
t_a=torch.tensor(a, device=cuda)
t_b=torch.from_numpy(b)
print(t_a)
print(t_b)
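
For reference, checking the devices of those two tensors shows that torch.from_numpy always returns a CPU tensor, which has to be moved explicitly (a minimal sketch reusing the variables from the snippet above):

print(t_a.device)    # cuda:0 -- created directly on the GPU
print(t_b.device)    # cpu    -- from_numpy returns a CPU tensor
t_b = t_b.to(cuda)   # .to() returns a new tensor, so it must be reassigned
print(t_b.device)    # cuda:0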

But when I try a more advanced training example, where I also attempted to put the training data on the cuda device, I run into an error (code and error output below). Note that the same code runs just fine on the cpu once the cuda placement is removed:

EXAMPLE CODE:

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import time 
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
ENABLE_PLOT=0
gpu = torch.device('cuda')

torch.manual_seed(1)
np.random.seed(1)
x=np.random.uniform(low=-1, high=1, size=(200, 2))
y=np.ones(len(x))

y[x[:,0] * x[:, 1]<0]=0
n_train=100

'''
x_train=torch.tensor(x[:n_train, :], dtype=torch.float32)
y_train=torch.tensor(y[:n_train], dtype=torch.float32)
x_valid=torch.tensor(x[n_train:, :], dtype=torch.float32)
y_valid=torch.tensor(y[n_train:], dtype=torch.float32)

'''
x_train=torch.tensor(x[:n_train, :], dtype=torch.float32, device='cuda')
y_train=torch.tensor(y[:n_train], dtype=torch.float32, device='cuda')
x_valid=torch.tensor(x[n_train:, :], dtype=torch.float32, device='cuda')
y_valid=torch.tensor(y[n_train:], dtype=torch.float32, device='cuda')

fig=plt.figure(figsize=(6,6))

print(x[y==0, 0])
plt.plot(x[y==0, 0], x[y==0, 1], 'o', alpha=0.75, markersize=10)
plt.plot(x[y==1, 0], x[y==1, 1], '<', alpha=0.75, markersize=10)
plt.xlabel(r'$x_1$', size=15)
plt.ylabel(r'$x_2$', size=15)

if ENABLE_PLOT:
    plt.show()

model=nn.Sequential(nn.Linear(2, 1), nn.Sigmoid())
print(model)

loss_fn=nn.BCELoss()
optimizer=torch.optim.SGD(model.parameters(), lr=0.001)

train_ds=TensorDataset(x_train, y_train)
batch_size = 2
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
print("train_dl type: ", type(train_dl))
#time.sleep(3)
torch.manual_seed(1)
num_epochs=200

def train(model, num_epochs, train_dl, x_valid, y_valid):
    loss_hist_train = [0] * num_epochs
    accuracy_hist_train = [0] * num_epochs
    loss_hist_valid = [0] * num_epochs
    accuracy_hist_valid = [0] * num_epochs

    for epoch in range(num_epochs):
        for x_batch, y_batch in train_dl:
            x_batch = x_batch.to('cuda')
            y_batch = y_batch.to('cuda')
            print("x_batch/device, y_batch type/device: ", type(x_batch), x_batch.device, type(y_batch), y_batch.device)
#           time.sleep(3)
            pred=model(x_batch)[:, 0]
            loss=loss_fn(pred, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_hist_train[epoch] += loss.item()
            is_correct = ((pred>=0.5).float() == y_batch).float()
            accuracy_hist_train[epoch] += is_correct.mean()

        loss_hist_train[epoch] /= n_train
        accuracy_hist_train[epoch] /= n_train/batch_size
        pred=model(x_valid)[:, 0]
        loss = loss_fn(pred, y_valid)
        loss_hist_valid[epoch] = loss.item()
        is_correct = ((pred>=0.5).float() == y_valid).float()
        accuracy_hist_valid[epoch] += is_correct.mean()

    return loss_hist_train, loss_hist_valid, accuracy_hist_train, accuracy_hist_valid

history=train(model, num_epochs, train_dl, x_valid, y_valid)
for i in history:
    print("len: ", len(i))
fig = plt.figure(figsize=(16,4))

ax = fig.add_subplot(1,2,1)
ax.plot(history[0], lw=4)
ax.plot(history[1], lw=4)
ax.legend(['Train loss', 'Validation'], fontsize=15)
ax.set_xlabel('Epochs', size=15)


ax=fig.add_subplot(1,2,2)
ax.plot(history[2], lw=4)
ax.plot(history[3], lw=4)
ax.legend(['Train acc', 'Validation acc'], fontsize=15)
ax.set_xlabel('Epochs', size=15)
plt.show()
plt.plot

ERROR OUTPUT:


p419-xor-with-nn-sequential.py 
[-0.16595599 -0.20646505 -0.16161097 -0.5910955  -0.94522481 -0.1653904
 -0.37315164 -0.66033916  0.38375423 -0.96342345 -0.43911202  0.81719101
 -0.96126608  0.14823521  0.38880032 -0.90009308  0.80680383 -0.72144731
  0.85501716  0.50188487 -0.46014422 -0.14381762 -0.77050805 -0.10017573
 -0.99425935 -0.3467102   0.7718842  -0.96835751 -0.86799965  0.42304952
  0.12206044  0.60126535  0.61421039  0.11248047 -0.54858132  0.11943396
 -0.85605144  0.13620092 -0.49534851 -0.60914104 -0.36527518  0.15949044
  0.33846579  0.25943501  0.50551111 -0.4793698  -0.61313143 -0.21024877
 -0.72984168 -0.95695039  0.65423094 -0.73800631 -0.31052669 -0.08023947
  0.59720718 -0.01949295 -0.96893345 -0.1326473  -0.36951039  0.15571443
 -0.99919595 -0.24683937 -0.42884744  0.72895886  0.34157758  0.24383874
 -0.31330752  0.32543962 -0.4952666   0.13771598  0.96563423  0.18502374
  0.45717134 -0.50393288 -0.16830256  0.03171403 -0.69465671 -0.71090892
 -0.55590172  0.57059206 -0.35127508 -0.8025132   0.59923052  0.26460663
  0.59317736  0.05285187 -0.67920964  0.38188428 -0.11619144  0.97950342
  0.50034483 -0.88614112 -0.57607967 -0.23796775]
Sequential(
  (0): Linear(in_features=2, out_features=1, bias=True)
  (1): Sigmoid()
)
train_dl type:  <class 'torch.utils.data.dataloader.DataLoader'>
x_batch/device, y_batch type/device:  <class 'torch.Tensor'> cuda:0 <class 'torch.Tensor'> cuda:0
Traceback (most recent call last):
  File "/git/codelab/gpu/ml/pt/ml-with-pt-and-sk/p419-xor-with-nn-sequential.py", line 89, in <module>
    history=train(model, num_epochs, train_dl, x_valid, y_valid)
  File "/git/codelab/gpu/ml/pt/ml-with-pt-and-sk/p419-xor-with-nn-sequential.py", line 70, in train
    pred=model(x_batch)[:, 0]
  File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)

It appears the gpu placement of x_train, y_train, x_valid and y_valid is fine, but when the computation involving
train_dl = DataLoader(train_ds, batch_size, shuffle=True) runs, it complains about one tensor being on the cpu and the other on the gpu. I am not sure how I can put train_dl on the gpu.

I also tried the following inside the loop, to put each batch on the gpu (probably not the best way either, moving chunks of data during the computation, but I wanted to see if it works), and it still complains:


        for x_batch, y_batch in train_dl:
            x_batch = x_batch.to('cuda')
            y_batch = y_batch.to('cuda')

You have to move the model to the GPU as well via model.to('cuda'), and it should work.
The error points out that the forward computation fails due to a device mismatch, which is caused by your input tensors being on the GPU while the model's parameters are still on the CPU.
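
Applied to the example above, that is a one-line change before the optimizer is built (a minimal sketch; everything else stays the same):

model = nn.Sequential(nn.Linear(2, 1), nn.Sigmoid())
model = model.to('cuda')   # moves the Linear layer's weight and bias to the GPU

loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# the per-batch x_batch.to('cuda') / y_batch.to('cuda') calls in the loop stay as they are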

Hi, I am rejuvenating this old thread. I put the model on the gpu, however nvidia-smi shows no memory load, meaning it must still have been running on the cpu. I am still trying to figure out how to put the data returned by the dataloader on the gpu.

Actually, the example above worked once the data was initialized as torch tensors.
However, the next example I have downloads torchvision data (mnist_dataset / <class 'torchvision.datasets.mnist.MNIST'>) from the web and converts it into a dataloader (i.e. train_dl / <class 'torch.utils.data.dataloader.DataLoader'>):

import torch
import torchvision
from torchvision import transforms
from torch.utils.data import Subset

transform = transforms.Compose([transforms.ToTensor()])
mnist_dataset = torchvision.datasets.MNIST(root=image_path, train=True, transform=transform, download=True)
mnist_valid_dataset = Subset(mnist_dataset, torch.arange(10000))
mnist_train_dataset = Subset(mnist_dataset, torch.arange(10000, len(mnist_dataset)))
mnist_test_dataset = torchvision.datasets.MNIST(root=image_path, train=False, transform=transform, download=False)
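
The dataloader construction is not shown above; for context, it is built from these datasets roughly like this (a minimal sketch, and the batch size here is just a placeholder, not a value from the original code):

from torch.utils.data import DataLoader

batch_size = 64   # placeholder value
torch.manual_seed(1)
train_dl = DataLoader(mnist_train_dataset, batch_size, shuffle=True)
valid_dl = DataLoader(mnist_valid_dataset, batch_size, shuffle=False)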

I am not sure how to place that on the gpu. When it runs, it is definitely running on the cpu, as nvidia-smi shows no load.

It seems the following did the trick: move each batch to the gpu during batch iteration.

for epoch in range(num_epochs):
    print("\nEpoch: ", epoch, ": ", end=" ")
    counter=0
    for x_batch, y_batch in train_dl:
        x_batch = x_batch.cuda()   # .cuda() returns a new tensor; it must be reassigned
        y_batch = y_batch.cuda()

        if DEBUG:
            print(f'training batch {counter}...')
        print(".", end="")
        pred=model(x_batch)
        loss=loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_hist_train[epoch] += loss.item()*y_batch.size(0)
        is_correct = (torch.argmax(pred, dim=1) == y_batch).float()
        accuracy_hist_train[epoch] += is_correct.sum()
        counter+=1
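
A quick way to confirm the placement, independent of nvidia-smi, is to print the devices once inside the loop (a minimal sketch, reusing the names from the loop above):

        if DEBUG:
            print("model on:", next(model.parameters()).device)   # expected: cuda:0
            print("batch on:", x_batch.device, y_batch.device)    # expected: cuda:0 after reassignment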

[root@localhost ch14-cnn]# nvidia-smi
Sat Dec 31 23:36:16 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| 41%   34C    P2    39W / 215W |    324MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A      2063      G   /usr/libexec/Xorg                  99MiB |
|    0   N/A  N/A      2168      G   /usr/bin/gnome-shell               29MiB |
|    0   N/A  N/A      6150      G   ...329416941451411583,131072       11MiB |
|    0   N/A  N/A     15820      C   python3                           178MiB |
+-----------------------------------------------------------------------------+