torch.cat(list(x.values()), dim=dim), nn.DataParallel: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

Hello!
I am testing the following code from this blog:
https://github.com/dthiagarajan/technical_blog/blob/master/_notebooks/2020-03-18-Dynamic-UNet-and-PyTorch-Hooks.ipynb

It runs fine when using a single GPU. However, when using multiple GPUs with "model = nn.DataParallel(model)", it gives this error:
"
in forward
return torch.cat(list(x.values()), dim=dim)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking arugment for argument tensors in method wrapper__cat)
"
I guess it may be related to "torch.cat(list(x.values()), dim=dim)", which uses a Python list instead of nn.ModuleList; some calls may also use a Python dict instead of nn.ModuleDict.

Simply changing list() to nn.ModuleList doesn't work.

Could anyone help with how to solve this problem?

Could you check the .device attribute of all tensors in x before this torch.cat operation is performed?
The linked notebook fails to render for me for some reason, so I'm unsure how this layer and its operations are used.
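
If it helps, a temporary debug print inside the layer would show which input ends up on the wrong device (a minimal sketch, assuming ConcatLayer receives a dict of tensors as in the error trace):

class ConcatLayer(nn.Module):
    def forward(self, x, dim=1):
        # Temporary debugging: print each tensor's device before the
        # concatenation to see which input lands on the wrong GPU.
        for key, tensor in x.items():
            print(f"key={key}, device={tensor.device}, shape={tuple(tensor.shape)}")
        return torch.cat(list(x.values()), dim=dim)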

Generally, we also recommend using DistributedDataParallel for better performance and since nn.DataParallel is in maintenance mode.
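
For reference, a minimal DistributedDataParallel sketch (my assumptions: a single node, one process per GPU, launched via torchrun; build_model() is a hypothetical stand-in for your model construction):

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main():
    # torchrun sets LOCAL_RANK for every process it spawns
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")

    # build_model: placeholder for your own model setup
    model = build_model().to(f"cuda:{local_rank}")
    model = DDP(model, device_ids=[local_rank])

    inputs = torch.randn(8, 3, 256, 256, device=f"cuda:{local_rank}")
    out = model(inputs)

    dist.destroy_process_group()

if __name__ == "__main__":
    main()

This could then be launched with, e.g., torchrun --nproc_per_node=2 script.py.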

Thanks! Would it be possible for you to run the following code?

# source: https://github.com/dthiagarajan/technical_blog/blob/master/_notebooks/2020-03-18-Dynamic-UNet-and-PyTorch-Hooks.ipynb
import matplotlib.pyplot as plt
import numpy as np
import os
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
import torch.utils.model_zoo as model_zoo
import torchvision.transforms.functional as tf
from tqdm.notebook import tqdm

model_urls = {
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth'
}

#collapse_hide
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)

def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class ResNetEncoder(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        super(ResNetEncoder, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer0 = nn.Sequential(self.conv1, self.bn1, self.relu)
        self.layer1 = nn.Sequential(self.maxpool, self._make_layer(block, 64, layers[0]))
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.out_dim = 512 * block.expansion

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        return x

#collapse_show
def resnet18(pretrained=True, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNetEncoder(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False)
    return model

def resnet34(pretrained=True, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNetEncoder(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False)
    return model

def resnet50(pretrained=True, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNetEncoder(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False)
    return model

def resnet101(pretrained=True, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNetEncoder(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False)
    return model

def resnet152(pretrained=True, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNetEncoder(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']), strict=False)
    return model

class ConvLayer(nn.Module):
    def __init__(self, num_inputs, num_filters, bn=True, kernel_size=3, stride=1,
                 padding=None, transpose=False, dilation=1):
        super(ConvLayer, self).__init__()
        if padding is None:
            padding = (kernel_size - 1) // 2 if transpose is not None else 0
        if transpose:
            self.layer = nn.ConvTranspose2d(num_inputs, num_filters, kernel_size=kernel_size,
                                            stride=stride, padding=padding, dilation=dilation)
        else:
            self.layer = nn.Conv2d(num_inputs, num_filters, kernel_size=kernel_size,
                                   stride=stride, padding=padding)
        nn.init.kaiming_uniform_(self.layer.weight, a=np.sqrt(5))
        self.bn_layer = nn.BatchNorm2d(num_filters) if bn else None

    def forward(self, x):
        out = self.layer(x)
        out = F.relu(out)
        return out if self.bn_layer is None else self.bn_layer(out)

class ConcatLayer(nn.Module):
    def forward(self, x, dim=1):
        return torch.cat(list(x.values()), dim=dim)

class LambdaLayer(nn.Module):
    def __init__(self, f):
        super(LambdaLayer, self).__init__()
        self.f = f

    def forward(self, x):
        return self.f(x)

def upconv2x2(inplanes, outplanes, size=None, stride=1):
    if size is not None:
        return [
            ConvLayer(inplanes, outplanes, kernel_size=2, dilation=2, stride=stride),
            nn.Upsample(size=size, mode='bilinear', align_corners=True)
        ]
    else:
        return [
            ConvLayer(inplanes, outplanes, kernel_size=2, dilation=2, stride=stride),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        ]

#collapse_show
class DecoderConnect(nn.Module):
    def __init__(self, inplanes, output_size):
        super(DecoderConnect, self).__init__()
        self.bottom_process = nn.Sequential(
            ConvLayer(inplanes, inplanes * 2, kernel_size=3),
            ConvLayer(inplanes * 2, inplanes * 2, kernel_size=3),
            *upconv2x2(inplanes * 2, inplanes, size=output_size)
        )
        self.concat_process = nn.Sequential(
            ConcatLayer(),
            ConvLayer(inplanes * 2, inplanes * 2, kernel_size=1),
            ConvLayer(inplanes * 2, inplanes, kernel_size=3),
            ConvLayer(inplanes, inplanes, kernel_size=3)
        )

    def forward(self, x):
        decoder_input = self.bottom_process(x)
        return self.concat_process({0: x, 1: decoder_input})

#collapse_show
class DynamicUNet(nn.Module):
    def __init__(self, encoder, input_size=(224, 224), num_output_channels=None, verbose=0):
        super(DynamicUNet, self).__init__()
        self.encoder = encoder
        self.verbose = verbose
        self.input_size = input_size
        self.num_input_channels = 3  # This must be 3 because we're using a ResNet encoder
        self.num_output_channels = num_output_channels

        self.decoder = self.setup_decoder()

    def forward(self, x):
        encoder_outputs = []

        def encoder_output_hook(self, input, output):
            encoder_outputs.append(output)

        handles = [
            child.register_forward_hook(encoder_output_hook) for name, child in self.encoder.named_children()
            if name.startswith('layer')
        ]

        try:
            self.encoder(x)
        finally:
            if self.verbose >= 1:
                print("Removing all forward handles")
            for handle in handles:
                handle.remove()

        prev_output = None
        for reo, rdl in zip(reversed(encoder_outputs), self.decoder):
            if prev_output is not None:
                prev_output = rdl({0: reo, 1: prev_output})
            else:
                prev_output = rdl(reo)
        return prev_output

    def setup_decoder(self):
        input_sizes = []
        output_sizes = []

        def shape_hook(self, input, output):
            input_sizes.append(input[0].shape)
            output_sizes.append(output.shape)

        handles = [
            child.register_forward_hook(shape_hook) for name, child in self.encoder.named_children()
            if name.startswith('layer')
        ]

        self.encoder.eval()
        test_input = torch.randn(1, self.num_input_channels, *self.input_size)
        try:
            self.encoder(test_input)
        finally:
            if self.verbose >= 1:
                print("Removing all shape hook handles")
            for handle in handles:
                handle.remove()
        decoder = self.construct_decoder(input_sizes, output_sizes, num_output_channels=self.num_output_channels)
        return decoder

    def construct_decoder(self, input_sizes, output_sizes, num_output_channels=None):
        decoder_layers = []
        for layer_index, (input_size, output_size) in enumerate(zip(input_sizes, output_sizes)):
            upsampling_size_factor = int(input_size[-1] / output_size[-1])
            upsampling_channel_factor = input_size[-3] / output_size[-3]
            next_layer = []
            bs, c, h, w = input_size
            ops = []
            if layer_index == len(input_sizes) - 1:
                last_layer_ops = DecoderConnect(output_size[-3], output_size[2:])
                last_layer_ops_input = torch.randn(*output_size)
                last_layer_concat_ops_output = last_layer_ops(last_layer_ops_input)
                next_layer.extend([last_layer_ops])
                if upsampling_size_factor > 1 or upsampling_channel_factor != 1:
                    last_layer_concat_upconv_op = upconv2x2(output_size[-3], input_size[-3], size=input_size[2:])
                    last_layer_concat_upconv_op_output = nn.Sequential(*last_layer_concat_upconv_op)(
                        last_layer_concat_ops_output
                    )
                    next_layer.extend(last_layer_concat_upconv_op)
            elif layer_index == 0:
                first_layer_concat_ops = [
                    ConcatLayer(),
                    ConvLayer(output_size[-3] * 2, output_size[-3] * 2, kernel_size=1),
                    *upconv2x2(
                        output_size[-3] * 2,
                        output_size[-3],
                        size=[dim * upsampling_size_factor for dim in output_size[2:]]
                    ),
                    ConvLayer(output_size[-3], output_size[-3], kernel_size=3),
                    ConvLayer(
                        output_size[-3],
                        input_size[-3] if self.num_output_channels is None else self.num_output_channels,
                        kernel_size=1
                    ),
                ]
                first_layer_concat_ops_output = nn.Sequential(*first_layer_concat_ops)(
                    {0: torch.randn(*output_size), 1: torch.randn(*output_size)}
                )
                next_layer.extend(first_layer_concat_ops)
            else:
                middle_layer_concat_ops = [
                    ConcatLayer(),
                    ConvLayer(output_size[-3] * 2, output_size[-3] * 2, kernel_size=1),
                    ConvLayer(output_size[-3] * 2, output_size[-3], kernel_size=3),
                    ConvLayer(output_size[-3], output_size[-3], kernel_size=3)
                ]
                middle_layer_concat_ops_output = nn.Sequential(*middle_layer_concat_ops)(
                    {0: torch.randn(*output_size), 1: torch.randn(*output_size)}
                )
                next_layer.extend(middle_layer_concat_ops)
                if upsampling_size_factor > 1 or upsampling_channel_factor != 1:
                    middle_layer_concat_upconv_op = upconv2x2(output_size[-3], input_size[-3], size=input_size[2:])
                    middle_layer_concat_upconv_op_output = nn.Sequential(*middle_layer_concat_upconv_op)(
                        middle_layer_concat_ops_output
                    )
                    next_layer.extend(middle_layer_concat_upconv_op)
            decoder_layers.append(nn.Sequential(*next_layer))
        return nn.ModuleList(reversed(decoder_layers))

model = DynamicUNet(resnet34(), num_output_channels=3, input_size=(256, 256))
#if torch.cuda.is_available():

model = model.cuda()

#decoder_parameters = [item for module in model.decoder for item in module.parameters()]
#optimizer = optim.AdamW(decoder_parameters)  # Only training the decoder for now

model.train()
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = nn.DataParallel(model)
    model = model.cuda()
inputs = torch.randn(8, 3, 256, 256).cuda()

out = model(inputs)