Hi, I am running this script in Google Colab. I have two code sections, and when I run them one after the other I get this error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_addmm)
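From what I understand, this error means that a layer's weights and its input tensor live on different devices. As a sanity check I wrote this tiny snippet (just an illustration, it is not part of my notebook), and I believe it raises the same error:

import torch
import torch.nn as nn

layer = nn.Linear(4, 2).to('cuda')  # weights moved to the GPU
x = torch.randn(1, 4)               # input tensor still on the CPU
layer(x)                            # should raise the same RuntimeError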
This is the first code section:
import torch
import torch.nn as nn
import torchvision.datasets as datasets
from torchvision import transforms
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Training on {device}')
transform = transforms.Compose([
    transforms.ToTensor(),
])

training_data = datasets.FashionMNIST(
    root='data',
    download=True,
    train=True,
    transform=transform,
)
test_data = datasets.FashionMNIST(
    root='data',
    download=True,
    train=False,
    transform=transform,
)

trainDataLoader = torch.utils.data.DataLoader(training_data, batch_size=64)
testDataLoader = torch.utils.data.DataLoader(test_data, batch_size=64)
class FashionNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.moduleList = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),  # raw logits; CrossEntropyLoss applies log-softmax itself
        )

    def forward(self, x):
        transformedInput = self.flatten(x)  # [N, 1, 28, 28] -> [N, 784]
        logits = self.moduleList(transformedInput)
        return logits
model = FashionNet()
model.to(device)
print(model)

lossFunction = nn.CrossEntropyLoss()
lr = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
def train(dataloader, model, lossFunction, optimizer):
    size = len(dataloader.dataset)
    model.train()  # switch back to training mode (test() leaves the model in eval mode)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)  # move each batch to the same device as the model
        prediction = model(X)
        loss = lossFunction(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f'loss: {loss}, [{current}/{size}]')
def test(dataloader, model, lossFunction):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            prediction = model(X)
            loss = lossFunction(prediction, y)
            test_loss += loss.item()
            correct += (prediction.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
epochs = 5
for epoch in range(epochs):
    print(f'Epoch: {epoch} \n---------------------------')
    train(trainDataLoader, model, lossFunction, optimizer)
    test(testDataLoader, model, lossFunction)
    print('End of Epoch')
print('Finished training and testing')
print('State Dict:')
print(model.state_dict())
torch.save(model.state_dict(), 'fashionClas.pt')
This first section works fine: I am running on the CUDA GPU, it raises no errors, and training and testing complete.
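To double-check, I believe printing the device of any parameter (a one-line sanity check, not originally in my notebook) would confirm the model really sits on the GPU:

print(next(model.parameters()).device)  # I expect this to print cuda:0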
Here is the second section:
classes = [
"T-shirt/top",
"Trouser",
"Pullover",
"Dress",
"Coat",
"Sandal",
"Shirt",
"Sneaker",
"Bag",
"Ankle boot",
]
network = FashionNet()
network.to(device)
network.load_state_dict(torch.load('fashionClas.pt'))
network.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    pred = network(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')
As you can see, I have explicitly moved the network to the GPU, but the error is still raised when I run this second section.
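To narrow it down, I imagine printing the device of the input and of the network's parameters (a hypothetical debugging step, I have not added it to the notebook yet) would show exactly where the mismatch is:

print(x.device)                           # device of the input tensor
print(next(network.parameters()).device)  # device of the network's weights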
If anyone knows a solution, please post it below.