Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)

Hey,

I'm trying to split ptsemseg's frrn model across two GPUs (model parallel), but in doing so I get the following error message:

NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/functional.py:2941: UserWarning: nn.functional.upsample is deprecated. Use nn.functional.interpolate instead.
  warnings.warn("nn.functional.upsample is deprecated. Use nn.functional.interpolate instead.")
Traceback (most recent call last):
  File "/NeuronalNetwork/train/train.py", line 239, in <module>
    train(cfg, writer, logger)
  File "/NeuronalNetwork/train/train.py", line 129, in train
    outputs = model(images.to('cuda:0'))
  File "/NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/NeuronalNetwork/train/ptsemseg/models/MP_frrn.py", line 86, in forward
    y, z = getattr(self, key)(y_upsampled, z)
  File "/NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/NeuronalNetwork/train/ptsemseg/models/utils.py", line 151, in forward
    y_prime = self.conv1(x)
  File "/NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/NeuronalNetwork/train/ptsemseg/models/utils.py", line 75, in forward
    outputs = self.cbr_unit(inputs)
  File "/NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/modules/container.py", line 117, in forward
    input = module(input)
  File "/NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 419, in forward
    return self._conv_forward(input, self.weight)
  File "/NeuronalNetwork/env/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 416, in _conv_forward
    self.padding, self.dilation, self.groups)
RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)

My class (mostly just copied from the existing model, with some edits):

import torch
import torch.nn as nn
from ptsemseg.models.frrn import frrn
import torch.nn.functional as F

class ModelParallelfrrn(frrn):

    def __init__(self, *args, **kwargs):
        super(ModelParallelfrrn, self).__init__(
            n_classes=21, model_type="B", group_norm=False, n_groups=16)  # default init values
        self.conv1.to("cuda:0")
        self.up_residual_units.to("cuda:0")
        self.split_conv.to("cuda:0")
        self.merge_conv.to("cuda:1")
        self.down_residual_units.to("cuda:1")
        self.classif_conv.to("cuda:1")

    def forward(self, x):
        # pass to initial conv
        x = self.conv1(x.to("cuda:0"))

        # pass through residual units
        for i in range(3):
            x = self.up_residual_units[i](x)

        # divide stream
        y = x                   # full image resolution stream
        z = self.split_conv(x)  # processed image stream

        prev_channels = 48

        # encoding
        for n_blocks, channels, scale in self.encoder_frru_specs:
            # maxpool bigger feature map
            y_pooled = F.max_pool2d(y, stride=2, kernel_size=2, padding=0)
            # pass through encoding FRRUs
            for block in range(n_blocks):
                key = "_".join(map(str, ["encoding_frru", n_blocks, channels, scale, block]))
                y, z = getattr(self, key)(y_pooled, z)
            prev_channels = channels

        # move both streams to GPU 1
        y = y.to("cuda:1")
        z = z.to("cuda:1")

        # decoding
        for n_blocks, channels, scale in self.decoder_frru_specs:
            # bilinear upsample smaller feature map
            upsample_size = torch.Size([_s * 2 for _s in y.size()[-2:]])
            y_upsampled = F.upsample(y, size=upsample_size, mode="bilinear", align_corners=True).to("cuda:1")
            # pass through decoding FRRUs
            for block in range(n_blocks):
                key = "_".join(map(str, ["decoding_frru", n_blocks, channels, scale, block]))
                # print("Incoming FRRU Size: ", key, y_upsampled.shape, z.shape)
                y, z = getattr(self, key)(y_upsampled, z)
                # print("Outgoing FRRU Size: ", key, y.shape, z.shape)
            prev_channels = channels

        # merge streams
        x = torch.cat(
            [F.upsample(y, scale_factor=2, mode="bilinear", align_corners=True), z], dim=1
        ).to("cuda:1")
        x = self.merge_conv(x)

        # pass through residual units
        for i in range(3):
            x = self.down_residual_units[i](x)

        # final 1x1 conv to get classification
        x = self.classif_conv(x)

        return x

My model is on GPU 0:

model.train()
labels = labels.to('cuda:0')

# forward pass
optimizer.zero_grad()
outputs = model(images.to('cuda:0'))

# backward pass
labels = labels.to(outputs.device)

loss = loss_fn(input=outputs, target=labels)

Does anyone have an idea what the problem might be?

The error seems to be raised in self.conv1, which should already be on cuda:0, as should its input x.
Could you check the device of all internal parameters of this layer, and also of self.cbr_unit, as this module is undefined in your code?

Also, I assume you are not wrapping your model in nn.DataParallel, since you are using a model parallel approach? If you are, this could explain the error, since the model would be replicated onto each device while you are forcing the input to be on GPU0 in the forward pass.
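
For illustration, here is a minimal sketch of that failure mode (the ForcedDevice module is made up for this example, and it assumes two visible GPUs):

import torch
import torch.nn as nn

class ForcedDevice(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, 3)

    def forward(self, x):
        # forcing the input to GPU0 breaks the replica that
        # nn.DataParallel placed on GPU1: its weights live on cuda:1
        # while its chunk of the input was just moved to cuda:0
        return self.conv(x.to('cuda:0'))

net = nn.DataParallel(ForcedDevice().to('cuda:0'))
out = net(torch.randn(4, 3, 32, 32).to('cuda:0'))  # raises a device-mismatch error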

If you mean the x in my forward pass, then yes, it is in fact on GPU 0, but I can't find any information on where self.conv1 is located (in fact, none of my modules have any device information, only my tensors?).
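
(The closest thing I found is reading the device off one of a module's parameters, e.g.:

print(next(model.conv1.parameters()).device)

assuming the module has at least one parameter.)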

No, I don't use DataParallel.

I thought the problem occurred in my forward pass at line 57, when calling y, z = getattr(self, key)(y_upsampled, z), since my debugger only threw the error once I stepped over that call, or when I dug deeper and ended up in the FRRU module's forward pass.

Based on the stack trace, the parameters causing this error are used in:

self._conv_forward(input, self.weight)

Yes, that is correct, since a module can have parameters on different devices.
You could thus check all parameters and buffers via e.g.:

for name, param in model.conv1.named_parameters():
    print(name, param.device)

for all modules in question.
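
If checking each module by hand gets tedious, another (hypothetical) debugging sketch would be to register a forward pre-hook on every conv layer and print any input/weight device mismatch right where it occurs; check_devices here is just made up for illustration:

import torch.nn as nn

def check_devices(module, inputs):
    # inputs is the tuple of positional arguments passed to forward()
    if inputs[0].device != module.weight.device:
        print(module, inputs[0].device, module.weight.device)

for m in model.modules():
    if isinstance(m, nn.Conv2d):
        m.register_forward_pre_hook(check_devices)

The first line printed during the next forward pass should point at the offending layer.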

for name, param in model.conv1.named_parameters():
    print(name, param.device)

results in:

cbr_unit.0.weight cuda:0
cbr_unit.0.bias cuda:0
cbr_unit.1.weight cuda:0
cbr_unit.1.bias cuda:0

I also looped over all named parameters of my model, and they all seem to be on GPU 0.

That seems to be wrong, since some of your modules should be on GPU1.
As I'm currently unsure what's causing this issue, could you post an executable code snippet so that we can reproduce it?
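
As a quick sanity check in the meantime (a minimal sketch, assuming your model instance is called model), counting parameters per device makes an everything-on-one-GPU model obvious at a glance:

from collections import Counter

print(Counter(str(p.device) for p in model.parameters()))
# a correct split should report both cuda:0 and cuda:1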

Took some time to boil it down:

import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data
from ptsemseg.models.frrn import frrn
from ptsemseg.loss import cross_entropy2d

class ModelParallelfrrn(frrn):

    def __init__(self, *args, **kwargs):
        super(ModelParallelfrrn, self).__init__(
            n_classes=21, model_type="B", group_norm=False, n_groups=16)  # default init values
        self.conv1.to("cuda:0")
        self.up_residual_units.to("cuda:0")
        self.split_conv.to("cuda:0")
        self.merge_conv.to("cuda:1")
        self.down_residual_units.to("cuda:1")
        self.classif_conv.to("cuda:1")

    def forward(self, x):
        # pass to initial conv
        x = self.conv1(x.to("cuda:0"))

        # pass through residual units
        for i in range(3):
            x = self.up_residual_units[i](x)

        # divide stream
        y = x                   # full image resolution stream
        z = self.split_conv(x)  # processed image stream

        prev_channels = 48

        # encoding
        for n_blocks, channels, scale in self.encoder_frru_specs:
            # maxpool bigger feature map
            y_pooled = F.max_pool2d(y, stride=2, kernel_size=2, padding=0)
            # pass through encoding FRRUs
            for block in range(n_blocks):
                key = "_".join(map(str, ["encoding_frru", n_blocks, channels, scale, block]))
                y, z = getattr(self, key)(y_pooled, z)
            prev_channels = channels

        # move both streams to GPU 1
        y = y.to("cuda:1")
        z = z.to("cuda:1")

        print("\nInside\n")
        for name, param in self.named_parameters():
            print(name, param.device)

        # decoding
        for n_blocks, channels, scale in self.decoder_frru_specs:
            # bilinear upsample smaller feature map
            upsample_size = torch.Size([_s * 2 for _s in y.size()[-2:]])
            y_upsampled = F.upsample(y, size=upsample_size, mode="bilinear", align_corners=True).to("cuda:1")
            # pass through decoding FRRUs
            for block in range(n_blocks):
                key = "_".join(map(str, ["decoding_frru", n_blocks, channels, scale, block]))
                y, z = getattr(self, key)(y_upsampled, z)
            prev_channels = channels

        # merge streams
        x = torch.cat(
            [F.upsample(y, scale_factor=2, mode="bilinear", align_corners=True), z], dim=1
        ).to("cuda:1")
        x = self.merge_conv(x)

        # pass through residual units
        for i in range(3):
            x = self.down_residual_units[i](x)

        # final 1x1 conv to get classification
        x = self.classif_conv(x)

        return x

def train():
    print("Available GPUs: {}".format(torch.cuda.device_count()))

    num_classes = 1000
    num_batches = 3
    batch_size = 20
    image_w = 128
    image_h = 128

    # Setup Model
    model = ModelParallelfrrn().to('cuda:0')
    for name, param in model.named_parameters():
        print(name, param.device)

    optimizer = optim.SGD(model.parameters(), 1.0e-10, weight_decay=0.0005, momentum=0.99)

    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    for _ in range(num_batches):
        # generate random inputs and labels
        inputs = torch.randn(batch_size, 3, image_w, image_h)
        labels = torch.zeros(batch_size, num_classes) \
                      .scatter_(1, one_hot_indices, 1)

        # run forward pass
        optimizer.zero_grad()
        outputs = model(inputs.to('cuda:0'))

        # run backward pass
        labels = labels.to(outputs.device)
        loss = cross_entropy2d(outputs, labels, size_average=False)
        loss.backward()
        optimizer.step()

if __name__ == "__main__":
    train()