Hi, I am running this script in Google Colab. I have two code sections, and when I run them one after the other I get this error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_addmm)
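From what I understand, this error means that a layer's weights and its input tensor live on different devices. As a sanity check I wrote this tiny snippet (just an illustration, it is not part of my notebook), and I believe it raises the same error:

import torch
import torch.nn as nn

layer = nn.Linear(4, 2).to('cuda')  # weights moved to the GPU
x = torch.randn(1, 4)               # input tensor still on the CPU
layer(x)                            # should raise the same RuntimeError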
This is the first code section:
import torch
import torch.nn as nn
import torchvision.datasets as datasets
from torchvision import transforms
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Training on {device}')
transform = transforms.Compose([
    transforms.ToTensor(),
])

training_data = datasets.FashionMNIST(
    root='data',
    download=True,
    train=True,
    transform=transform,
)
test_data = datasets.FashionMNIST(
    root='data',
    download=True,
    train=False,
    transform=transform,
)

trainDataLoader = torch.utils.data.DataLoader(training_data, batch_size=64)
testDataLoader = torch.utils.data.DataLoader(test_data, batch_size=64)
class FashionNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.moduleList = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),  # raw logits; CrossEntropyLoss applies log-softmax itself
        )

    def forward(self, x):
        transformedInput = self.flatten(x)  # [N, 1, 28, 28] -> [N, 784]
        logits = self.moduleList(transformedInput)
        return logits
model = FashionNet()
model.to(device)
print(model)

lossFunction = nn.CrossEntropyLoss()
lr = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
def train(dataloader, model, lossFunction, optimizer):
    size = len(dataloader.dataset)
    model.train()  # switch back to training mode (test() leaves the model in eval mode)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)  # move each batch to the same device as the model
        prediction = model(X)
        loss = lossFunction(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f'loss: {loss}, [{current}/{size}]')
def test(dataloader, model, lossFunction):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            prediction = model(X)
            loss = lossFunction(prediction, y)
            test_loss += loss.item()
            correct += (prediction.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
epochs = 5
for epoch in range(epochs):
    print(f'Epoch: {epoch} \n---------------------------')
    train(trainDataLoader, model, lossFunction, optimizer)
    test(testDataLoader, model, lossFunction)
    print('End of Epoch')
print('Finished training and testing')
print('State Dict:')
print(model.state_dict())
torch.save(model.state_dict(), 'fashionClas.pt')
This first section works fine: I am running on the CUDA GPU, it raises no errors, and training and testing complete.
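To double-check, I believe printing the device of any parameter (a one-line sanity check, not originally in my notebook) would confirm the model really sits on the GPU:

print(next(model.parameters()).device)  # I expect this to print cuda:0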
Here is the second section:
classes = [
"T-shirt/top",
"Trouser",
"Pullover",
"Dress",
"Coat",
"Sandal",
"Shirt",
"Sneaker",
"Bag",
"Ankle boot",
]
network = FashionNet()
network.to(device)
network.load_state_dict(torch.load('fashionClas.pt'))
network.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    pred = network(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')
As you can see, I have explicitly moved the network to the GPU, but the error is still raised when I run this second section.
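To narrow it down, I imagine printing the device of the input and of the network's parameters (a hypothetical debugging step, I have not added it to the notebook yet) would show exactly where the mismatch is:

print(x.device)                           # device of the input tensor
print(next(network.parameters()).device)  # device of the network's weights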
If anyone knows a solution, please post it below.