Trying to run just a ConvTranspose2d() on GPU, all other operations on CPU

I am profiling different parts of an application and need to run just a single layer (a ConvTranspose2d) on the GPU. Basically, I want to see which CUDA kernels get launched for ConvTranspose2d alone (I don't want batch norm or ReLU to run there).
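For context, this is roughly how I plan to look at the launched kernels once the layer is isolated (a minimal sketch using torch.autograd.profiler; the layer and input shapes here are just placeholders):

import torch
import torch.nn as nn

cuda = torch.device('cuda')

# standalone ConvTranspose2d on the GPU, nothing else
deconv = nn.ConvTranspose2d(in_channels=10, out_channels=32, kernel_size=(4,4), stride=2, padding=0).to(device=cuda)
x = torch.rand(1, 10, 1, 1, device=cuda)

with torch.autograd.profiler.profile(use_cuda=True) as prof:
    deconv(x)

# every CUDA kernel launched by the transposed convolution shows up as a row
print(prof.key_averages().table(sort_by="cuda_time_total"))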

However, when I try to run the following code, I get a "RuntimeError: Tensor for argument #2 'weight' is on CPU, but expected it to be on GPU (while checking arguments for cudnn_batch_norm)".

import pdb

import torch
import torch.nn as nn
import torch.nn.functional as F

cuda = torch.device('cuda')

def main():

    net = Generator()
    input = torch.rand(10,1,1).unsqueeze(0)
    image = net(input.to(device=cuda))

    print(image.reshape(28,28))


class Generator(nn.Module):

    def __init__(self, ):
        super(Generator, self).__init__()
        self.z_dim = 10
        self.x_dim = 784
        self.name = 'mnist/dcgan/g_net'

        self.layer1 = nn.Sequential(
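            # only the ConvTranspose2d is moved to the GPU; BatchNorm2d and ReLU stay on the CPU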

            nn.ConvTranspose2d(in_channels=10, out_channels=32, kernel_size=(4,4), stride=2, padding=0).to(device=cuda),
            
            nn.BatchNorm2d(32),
            
            nn.ReLU()
            
        )
        self.layer2 = nn.Sequential(
            
            nn.ConvTranspose2d(in_channels=32, out_channels=32, kernel_size=(6,6),stride=2, padding=0),
            
            nn.BatchNorm2d(32),
            
            nn.ReLU()
            
        )

        self.layer3 = nn.Sequential(

            nn.ConvTranspose2d(in_channels=32, out_channels=1, kernel_size=(6,6), stride=2, padding=0),
            
            nn.Sigmoid()

        )

        for m in self.modules():
            if isinstance(m, nn.ConvTranspose2d):
                nn.init.normal_(m.weight).to(device=cuda)


    def forward(self, x):

        out1 = self.layer1(x)
        pdb.set_trace()
        out2 = self.layer2(out1)
        pdb.set_trace()
        out3 = self.layer3(out2)

        return(out3)

if __name__ == '__main__':

    main()

Full traceback:

rabbit@nano-dev:~/Documents/PRL/FPGA_DeconvAcc/mnist/tensorflow$ python example.py
Traceback (most recent call last):
  File "example.py", line 68, in <module>
    main()
  File "example.py", line 11, in main
    image = net(input.to(device=cuda))
  File "/home/rabbit/Documents/PRL/FPGA_DeconvAcc/mnist/tensorflow/venv/tensorflow/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "example.py", line 58, in forward
    out1 = self.layer1(x)
  File "/home/rabbit/Documents/PRL/FPGA_DeconvAcc/mnist/tensorflow/venv/tensorflow/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/rabbit/Documents/PRL/FPGA_DeconvAcc/mnist/tensorflow/venv/tensorflow/lib/python3.6/site-packages/torch/nn/modules/container.py", line 100, in forward
    input = module(input)
  File "/home/rabbit/Documents/PRL/FPGA_DeconvAcc/mnist/tensorflow/venv/tensorflow/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/rabbit/Documents/PRL/FPGA_DeconvAcc/mnist/tensorflow/venv/tensorflow/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py", line 107, in forward
    exponential_average_factor, self.eps)
  File "/home/rabbit/Documents/PRL/FPGA_DeconvAcc/mnist/tensorflow/venv/tensorflow/lib/python3.6/site-packages/torch/nn/functional.py", line 1670, in batch_norm
    training, momentum, eps, torch.backends.cudnn.enabled
RuntimeError: Tensor for argument #2 'weight' is on CPU, but expected it to be on GPU (while checking arguments for cudnn_batch_norm)

Found a solution:
I broke the Sequential blocks up into individual modules, moved only the ConvTranspose2d I want to profile onto the GPU, and moved the data to and from CUDA around that layer in the forward pass. (The original error came from the GPU output of the ConvTranspose2d being fed straight into the BatchNorm2d in the same Sequential, whose weights were still on the CPU.)

class Generator(nn.Module):

    def __init__(self, ):
        super(Generator, self).__init__()
        self.z_dim = 10
        self.x_dim = 784
        self.name = 'mnist/dcgan/g_net'



        # only the ConvTranspose2d being profiled is moved to the GPU;
        # BatchNorm2d, ReLU, Sigmoid and the other deconvolutions stay on the CPU
        self.layer1 = nn.ConvTranspose2d(in_channels=10, out_channels=32, kernel_size=(4,4), stride=2, padding=0).to(device=cuda)

        self.activation1 = nn.Sequential(nn.BatchNorm2d(32), nn.ReLU())

        self.layer2 = nn.ConvTranspose2d(in_channels=32, out_channels=32, kernel_size=(6,6), stride=2, padding=0)

        self.activation2 = nn.Sequential(nn.BatchNorm2d(32), nn.ReLU())

        self.layer3 = nn.ConvTranspose2d(in_channels=32, out_channels=1, kernel_size=(6,6), stride=2, padding=0)

        self.activation3 = nn.Sigmoid()

        for m in self.modules():
            if isinstance(m, nn.ConvTranspose2d):
                nn.init.normal_(m.weight)


    def forward(self, x):

        # run only the first ConvTranspose2d on the GPU
        out1_ = self.layer1(x.to(device=cuda))
        # bring the result back to the CPU for everything else
        out1 = self.activation1(out1_.cpu())
        out2_ = self.layer2(out1)
        out2 = self.activation2(out2_)
        out3_ = self.layer3(out2)
        out3 = self.activation3(out3_)

        return(out3)
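
With the layers split up like this, I can check which kernels actually hit the GPU using the autograd profiler (a rough sketch; the driver below just mirrors the original main()):

def main():
    net = Generator()
    input = torch.rand(10, 1, 1).unsqueeze(0)

    with torch.autograd.profiler.profile(use_cuda=True) as prof:
        image = net(input)

    # only the ConvTranspose2d in layer1 should report any CUDA time
    print(prof.key_averages().table(sort_by="cuda_time_total"))
    print(image.reshape(28, 28))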