CUDA device error on DataParallel

PyTorch 0.4.0 / CUDA 9.0 / cuDNN 7.1 / V100 x 4

Traceback (most recent call last):
 File "main.py", line 135, in <module>
   model.train(train_loader, valid_loader)
 File "/home/rplab/workspace/DW/nucleus/nuclues_segmentation/trainers/CNNTrainer.py", line 55, in train
   output_ = self.G(input_)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/modules/module.py", line 491, in __call__
   result = self.forward(*input, **kwargs)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 114, in forward
   outputs = self.parallel_apply(replicas, inputs, kwargs)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 124, in parallel_apply
   return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/parallel/parallel_apply.py", line 65, in parallel_apply
   raise output
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/parallel/parallel_apply.py", line 41, in _worker
   output = module(*input, **kwargs)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/modules/module.py", line 491, in __call__
   result = self.forward(*input, **kwargs)
 File "/home/rplab/workspace/DW/nucleus/nuclues_segmentation/models/unet_nonlocal.py", line 55, in forward
   nonlocal1 = self.nonlocal1(maxpool1)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/modules/module.py", line 491, in __call__
   result = self.forward(*input, **kwargs)
 File "/home/rplab/workspace/DW/nucleus/nuclues_segmentation/models/nonlocal_layer.py", line 111, in forward
   output = self.operation_function(x)
 File "/home/rplab/workspace/DW/nucleus/nuclues_segmentation/models/nonlocal_layer.py", line 119, in _embedded_gaussian
   g_x = self.g(x).view(batch_size, self.inter_channels, -1)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/modules/module.py", line 491, in __call__
   result = self.forward(*input, **kwargs)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/modules/container.py", line 91, in forward
   input = module(input)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/modules/module.py", line 491, in __call__
   result = self.forward(*input, **kwargs)
 File "/home/rplab/workspace/DW/nucleus/pytorch040/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 301, in forward
   self.padding, self.dilation, self.groups)
RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)

This error is raised by the code below:

class Foo(nn.Module):
    def __init__(self, .....):
        .....
        # Define the operation
        if mode == 'concatenation':
            self.operation_function = self._concatenation
        elif mode == 'concatenation_debug':
            self.operation_function = self._concatenation_debug
        elif mode == 'concatenation_residual':
            self.operation_function = self._concatenation_residual
        else:
            raise NotImplementedError('Unknown operation function.')


    def forward(self, x, g):
        output = self.operation_function(x, g)
        return output

self.operation_function is assigned in the __init__ block using an if condition.

But if I call the method directly in forward, e.g. output = self._concatenation(x, g), the error does not occur.

Why does choosing the function via an if condition in __init__ cause this RuntimeError under nn.DataParallel?
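My guess is that the bound method cached in __init__ still points at the original module after nn.DataParallel replicates it, so a replica on device 1 ends up calling convolutions whose weights live on device 0. Below is a minimal sketch of the alternative I am describing, dispatching on a stored mode string inside forward instead of caching the bound method; the layer sizes and the _concatenation body are placeholders, not the real model:

import torch
import torch.nn as nn

class Foo(nn.Module):
    def __init__(self, mode='concatenation'):
        super(Foo, self).__init__()
        self.mode = mode                          # a plain string replicates safely
        self.g = nn.Conv2d(4, 4, kernel_size=1)   # placeholder sub-module

    def _concatenation(self, x, g):
        return self.g(x) + g                      # placeholder body

    def forward(self, x, g):
        # Resolve the method on *this* replica at call time, so `self`
        # (and hence self.g's weights) is on the same GPU as the input.
        if self.mode == 'concatenation':
            return self._concatenation(x, g)
        raise NotImplementedError('Unknown operation function.')

model = nn.DataParallel(Foo().cuda())
x = torch.randn(8, 4, 16, 16).cuda()
out = model(x, x)                                 # no device-mismatch error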
