RuntimeError: expected backend CUDA and dtype Float but got backend CPU and dtype Float

I want to use a custom filter in a CNN. The filter has size 5×5 and each entry is a function of three variables: theta, Lambda, psi. There are two convolutional layers followed by two fully connected layers. I tested my filter on the MNIST dataset, but when I run it on the GPU I encounter the error: RuntimeError: expected backend CUDA and dtype Float but got backend CPU and dtype Float. I suspect it is caused by how I generate the filter box, but I cannot find exactly where I made the mistake. I started from this example code and only modified the network structure to use my custom filter; training and testing remained unchanged. I attach my code here. Thank you!

from __future__ import print_function
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms


class Net(nn.Module):
    def __init__(self, kernel_size, in_channels, channel1, channel2):
        super(Net, self).__init__()
        self.theta1, self.Lambda1, self.psi1, self.bias1 = self.generate_parameters(channel1, in_channels)
        self.filter1 = self.whole_filter(in_channels, channel1, kernel_size, self.theta1, self.Lambda1, self.psi1).cuda()

        self.theta2, self.Lambda2, self.psi2, self.bias2 = self.generate_parameters(channel2, channel1)
        self.filter2 = self.whole_filter(channel1, channel2, kernel_size, self.theta2, self.Lambda2, self.psi2).cuda()
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)


    def forward(self, x):
        x = F.conv2d(x, self.filter1, bias=self.bias1)
        x = F.max_pool2d(x, 2, 2)
        x = F.conv2d(x, self.filter2, bias=self.bias2)
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


    def generate_parameters(self, dim_out, dim_in):
        theta = nn.Parameter(torch.randn(dim_out, dim_in))
        Lambda = nn.Parameter(torch.randn(dim_out, dim_in))
        psi = nn.Parameter(torch.randn(dim_out, dim_in))
        bias = nn.Parameter(torch.randn(dim_out))
        return theta, Lambda, psi, bias


    def whole_filter(self, in_channels, out_channels, kernel_size, theta_column, Lambda_column, psi_column):
        result = torch.zeros(out_channels, in_channels, kernel_size, kernel_size)  # shape: (out_channels, in_channels / groups, kH, kW)
        for i in range(out_channels):
            result[i] = self.one_filter(in_channels, kernel_size, theta_column[i], Lambda_column[i], psi_column[i])
        return result


    def one_filter(self, in_channels, kernel_size, theta, Lambda, psi):
        result = torch.zeros(in_channels, kernel_size, kernel_size)
        for i in range(in_channels):
            result[i] = self.filter_fn(theta[i], Lambda[i], psi[i], kernel_size)
        return result


    def filter_fn(self, theta, Lambda, psi, kernel_size):
        # Bounding box
        half_size = (kernel_size - 1) // 2
        ymin, xmin = -half_size, -half_size
        ymax, xmax = half_size, half_size
        (y, x) = np.meshgrid(np.arange(ymin, ymax + 1), np.arange(xmin, xmax + 1))
        y = torch.FloatTensor(y)
        x = torch.FloatTensor(x)

        # Rotation
        x_theta = x * torch.cos(theta) + y * torch.sin(theta)
        y_theta = -x * torch.sin(theta) + y * torch.cos(theta)

        box = torch.cos(y_theta) * torch.sin(2 * np.pi / Lambda * x_theta + psi)
        return box



def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward(retain_graph=True)
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(args, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)


    model = Net(5, 1, 20, 50).to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for param in model.parameters():
        print(type(param.data), param.size())

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)

    if (args.save_model):
        torch.save(model.state_dict(),"mnist_cnn.pt")

if __name__ == '__main__':
    main()

Try to use nn.Parameter for your return values in whole_filter and one_filter, as this will properly register these filters as internal parameters and will thus also push them to the GPU in the model.to(device) call.
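A minimal sketch of the difference (hypothetical Demo module, not your actual network): a plain tensor attribute is ignored by model.to(device), while an nn.Parameter attribute is registered and moved:

import torch
import torch.nn as nn

class Demo(nn.Module):
    def __init__(self):
        super(Demo, self).__init__()
        self.plain = torch.randn(3, 3)                # plain tensor attribute: ignored by .to()/.cuda()
        self.param = nn.Parameter(torch.randn(3, 3))  # registered: moved by .to()/.cuda() and listed in .parameters()

if torch.cuda.is_available():
    m = Demo().to('cuda')
    print(m.plain.device)   # cpu
    print(m.param.device)   # cuda:0
    print([name for name, _ in m.named_parameters()])  # ['param']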


Hi Ptrblck, I added nn.Parameter and it works! Thank you very much! A further question: when I deleted .cuda() from self.filter1 and self.filter2, the parameters have these sizes:

<class 'torch.Tensor'> torch.Size([20, 1])
<class 'torch.Tensor'> torch.Size([20, 1])
<class 'torch.Tensor'> torch.Size([20, 1])
<class 'torch.Tensor'> torch.Size([20])
<class 'torch.Tensor'> torch.Size([20, 1, 5, 5])
<class 'torch.Tensor'> torch.Size([50, 20])
<class 'torch.Tensor'> torch.Size([50, 20])
<class 'torch.Tensor'> torch.Size([50, 20])
<class 'torch.Tensor'> torch.Size([50])
<class 'torch.Tensor'> torch.Size([50, 20, 5, 5])
<class 'torch.Tensor'> torch.Size([500, 800])
<class 'torch.Tensor'> torch.Size([500])
<class 'torch.Tensor'> torch.Size([10, 500])
<class 'torch.Tensor'> torch.Size([10])

That is wrong; I should not have parameters with size ([20, 1, 5, 5]) and ([50, 20, 5, 5]), since I am not using a conventional convolution kernel here. Moreover, the loss is nan during training. But when I keep .cuda() there, the sizes are correct and training is fine:

<class 'torch.Tensor'> torch.Size([20, 1])
<class 'torch.Tensor'> torch.Size([20, 1])
<class 'torch.Tensor'> torch.Size([20, 1])
<class 'torch.Tensor'> torch.Size([20])
<class 'torch.Tensor'> torch.Size([50, 20])
<class 'torch.Tensor'> torch.Size([50, 20])
<class 'torch.Tensor'> torch.Size([50, 20])
<class 'torch.Tensor'> torch.Size([50])
<class 'torch.Tensor'> torch.Size([500, 800])
<class 'torch.Tensor'> torch.Size([500])
<class 'torch.Tensor'> torch.Size([10, 500])
<class 'torch.Tensor'> torch.Size([10])

So I am curious why I cannot delete .cuda(), since you mentioned that nn.Parameter already puts the filters on the GPU. Thanks a lot!

The .cuda() call shouldn’t change the shape of a tensor.
Could you point me to the line of code or tensor name that now has the shape [20, 1, 5, 5]?

That happens when I drop .cuda() on lines 16 and 19:

self.filter1 = self.whole_filter(in_channels, channel1, kernel_size, self.theta1, self.Lambda1, self.psi1).cuda()
self.filter2 = self.whole_filter(channel1, channel2, kernel_size, self.theta2, self.Lambda2, self.psi2).cuda()

My guess is that after I drop .cuda(), self.filter1 is not on the GPU when I call x = F.conv2d(x, self.filter1, bias=self.bias1) on line 25, so PyTorch generates a filter of the corresponding size and does the convolution.

That shouldn’t be the case, as PyTorch luckily raises a loud RuntimeError in that situation:

x = torch.randn(1, 1, 4, 4, device='cuda')
filt = torch.randn(6, 1, 3, 3, device='cuda')
output = F.conv2d(x, filt)  # works

filt = filt.cpu()
output = F.conv2d(x, filt)
> RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

Thank you for your reply. I found out that when I drop .cuda(), self.filter1 and self.filter2 appear in model.parameters(); that's where those extra shapes come from. Furthermore, in this case self.bias1 and self.bias2 have the value nan, although self.theta1, self.Lambda1, and self.psi1 have their values. This makes me quite confused.

filter1 and filter2 shouldn’t be returned by model.parameters() regardless of the cuda() call, as they are defined as tensors, not nn.Parameters.
If you want to register them as parameters, wrap them in nn.Parameter.

I removed the cuda() call, and changed both filters to be parameters:

    def whole_filter(self, in_channels, out_channels, kernel_size, theta_column, Lambda_column, psi_column):
        result = torch.zeros(out_channels, in_channels, kernel_size, kernel_size)  # shape: (out_channels, in_channels / groups, kH, kW)
        for i in range(out_channels):
            result[i] = self.one_filter(in_channels, kernel_size, theta_column[i], Lambda_column[i], psi_column[i])
        return nn.Parameter(result)

It seems the model is working for at least one epoch:

...
Train Epoch: 1 [59520/60000 (99%)]      Loss: 0.054967

Test set: Average loss: 0.1043, Accuracy: 9678/10000 (97%)

I think I did the same thing as you, but the result is different. Could you run the following code to have a look? (I added nn.Parameter to both one_filter and whole_filter and removed cuda().) Thank you very much for your time!

from __future__ import print_function
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms


class Net(nn.Module):
    def __init__(self, kernel_size, in_channels, channel1, channel2):
        super(Net, self).__init__()
        self.theta1, self.Lambda1, self.psi1, self.bias1 = self.generate_parameters(channel1, in_channels)
        self.filter1 = self.whole_filter(in_channels, channel1, kernel_size, self.theta1, self.Lambda1, self.psi1)

        self.theta2, self.Lambda2, self.psi2, self.bias2 = self.generate_parameters(channel2, channel1)
        self.filter2 = self.whole_filter(channel1, channel2, kernel_size, self.theta2, self.Lambda2, self.psi2)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)


    def forward(self, x):
        x = F.conv2d(x, self.filter1, bias=self.bias1)
        x = F.max_pool2d(x, 2, 2)
        x = F.conv2d(x, self.filter2, bias=self.bias2)
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


    def generate_parameters(self, dim_out, dim_in):
        theta = nn.Parameter(torch.randn(dim_out, dim_in))
        Lambda = nn.Parameter(torch.randn(dim_out, dim_in))
        psi = nn.Parameter(torch.randn(dim_out, dim_in))
        bias = nn.Parameter(torch.randn(dim_out))
        return theta, Lambda, psi, bias


    def whole_filter(self, in_channels, out_channels, kernel_size, theta_column, Lambda_column, psi_column):
        result = torch.zeros(out_channels, in_channels, kernel_size, kernel_size)  # shape: (out_channels, in_channels / groups, kH, kW)
        for i in range(out_channels):
            result[i] = self.one_filter(in_channels, kernel_size, theta_column[i], Lambda_column[i], psi_column[i])
        return nn.Parameter(result)


    def one_filter(self, in_channels, kernel_size, theta, Lambda, psi):
        result = torch.zeros(in_channels, kernel_size, kernel_size)
        for i in range(in_channels):
            result[i] = self.filter_fn(theta[i], Lambda[i], psi[i], kernel_size)
        return nn.Parameter(result)


    def filter_fn(self, theta, Lambda, psi, kernel_size):
        # Bounding box
        half_size = (kernel_size - 1) // 2
        ymin, xmin = -half_size, -half_size
        ymax, xmax = half_size, half_size
        (y, x) = np.meshgrid(np.arange(ymin, ymax + 1), np.arange(xmin, xmax + 1))
        y = torch.FloatTensor(y)
        x = torch.FloatTensor(x)

        # Rotation
        x_theta = x * torch.cos(theta) + y * torch.sin(theta)
        y_theta = -x * torch.sin(theta) + y * torch.cos(theta)

        box = torch.cos(y_theta) * torch.sin(2 * np.pi / Lambda * x_theta + psi)
        return box



def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward(retain_graph=True)
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(args, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)


    model = Net(5, 1, 20, 50).to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for param in model.parameters():
        print(type(param.data), param.size())

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
        # for param in model.parameters():
        #     print(param.size(), param.data)
        # print(model.state_dict())
        for name, param in model.state_dict().items():
            print(name, param)

    if (args.save_model):
        torch.save(model.state_dict(),"mnist_cnn.pt")

if __name__ == '__main__':
    main()

You are right! It seems I was just lucky to have used another seed (I was in fact running the code line by line a few times), and your parameter initialization might be quite sensitive.
If you set the seed to 2809, you should get valid results.
However, I would recommend trying to stabilize the training.
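One guess (not verified here): the 2 * np.pi / Lambda term in filter_fn can explode when torch.randn draws a Lambda close to zero. A minimal sketch of a more conservative generate_parameters, with the ranges picked arbitrarily for illustration:

    def generate_parameters(self, dim_out, dim_in):
        # Illustration only: bound Lambda away from zero so 2*pi/Lambda stays moderate,
        # keep theta/psi in [0, 2*pi), and start the bias at zero.
        theta = nn.Parameter(torch.rand(dim_out, dim_in) * 2 * np.pi)
        Lambda = nn.Parameter(torch.rand(dim_out, dim_in) * 4.0 + 2.0)  # wavelength in [2, 6)
        psi = nn.Parameter(torch.rand(dim_out, dim_in) * 2 * np.pi)
        bias = nn.Parameter(torch.zeros(dim_out))
        return theta, Lambda, psi, bias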

Thank you! I set the seed and it works! But the old question still confuses me: why do filter1 and filter2 appear in model.parameters() when I remove cuda()? (I printed out the parameters in the code above.)

They won’t. If I remove cuda() and the nn.Parameter() wrapping, I’ll get:

theta1
Lambda1
psi1
bias1
theta2
Lambda2
psi2
bias2
fc1.weight
fc1.bias
fc2.weight
fc2.bias

Wrapping them in an nn.Parameter will make them appear in model.parameters().
Could you check if you were calling nn.Parameter or self.register_parameter somewhere else in the code?

I added nn.Parameter to the return values of one_filter and whole_filter as you mentioned, and I didn't use self.register_parameter. In my code, filter1 and filter2 are the return values of whole_filter; does this make them appear in model.parameters()? But when I add cuda(), they do not appear. You may run my code above to see it. Thank you again!

I might have misunderstood your use case.
If you call .cuda() on an nn.Parameter, it won't be a leaf variable anymore, which is explained here, and thus won't appear in .parameters().
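A minimal sketch of this behavior (hypothetical Demo module, just to illustrate the difference):

import torch
import torch.nn as nn

class Demo(nn.Module):
    def __init__(self):
        super(Demo, self).__init__()
        # Registered: a plain nn.Parameter assigned to self is a leaf tensor
        # and shows up in .parameters().
        self.registered = nn.Parameter(torch.randn(2, 2))
        # Not registered: .cuda() returns a new (non-leaf, non-Parameter) tensor,
        # so assigning it to self does not register anything.
        self.not_registered = nn.Parameter(torch.randn(2, 2)).cuda()

if torch.cuda.is_available():
    m = Demo()
    print([name for name, _ in m.named_parameters()])  # ['registered']
    print(m.registered.is_leaf)      # True
    print(m.not_registered.is_leaf)  # False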

I see. Thank you very much for your explanation!