nn.Parameter causes a two-device error

For a particular purpose, I defined layers that perform elementwise multiplication. I implemented this in the following way:

import torch
import torch.nn as nn

class VectorLinear(nn.Module):
    def __init__(self, N, keep_bias=True):
        super(VectorLinear, self).__init__()
        self.keep_bias = keep_bias
        self.weight = nn.Parameter(torch.randn([1, N]))  # initialize weight
        if self.keep_bias:
            self.bias = nn.Parameter(torch.randn([1, N]))  # initialize bias
        self.reset_parameters()  # self-defined initialization

    def forward(self, input):
        if self.keep_bias:
            return input * self.weight + self.bias
        else:
            return input * self.weight

    def reset_parameters(self):
        for p in self.parameters():  # same init for weights and biases
            nn.init.normal_(p, std=0.01)
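
On its own, the layer behaves as expected: the [1, N] weight broadcasts across the batch dimension. For example (the sizes here are arbitrary, just for illustration):

layer = VectorLinear(4)           # N = 4, chosen just for illustration
out = layer(torch.randn(8, 4))    # broadcasts: out[b] = input[b] * weight + bias
print(out.shape)                  # torch.Size([8, 4])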

Because I want the code to define many such layers iteratively, I use a for loop in the following way:

import numpy as np  # np.log is used below for the per-layer loss weights

class MyNet(nn.Module):
    def __init__(self, in_dim, num_subcarriers, num_layer, Loss_scalar=10, Residule=False, Keep_Bias=False, BN=True, training_method='unsupervised', device=MainDevice):
        super(MyNet, self).__init__()
        self.in_dim = in_dim
        self.training_method = training_method
        self.device = device
        self.Rsdl = Residule
        self.dobn = BN
        self.layers_x = []
        self.layers_KL = []
        self.bns = []
        self.num_layer = num_layer
        self.scalar = Loss_scalar
        self.num_subcarriers = num_subcarriers

        for i in range(num_layer):  # define all layers
            layer = VectorLinear(in_dim, keep_bias=Keep_Bias)
            self.layers_x.append(layer)
            setattr(self, 'layer_x_%i' % i, layer)
            for k in range(num_subcarriers):
                layerk = VectorLinear(in_dim, keep_bias=Keep_Bias)
                layer_id = str(i) + str(k)
                setattr(self, 'layer_bzx_' + layer_id, layerk)  # another method is to use nn.ModuleList
                self.layers_KL.append(layerk)

            if self.dobn:
                bn_layer = nn.BatchNorm1d(self.in_dim, momentum=0.2)
                setattr(self, 'bn_layers%i' % i, bn_layer)
                self.bns.append(bn_layer)


    def forward(self, BB, zB, x, z, B):
        # batch_size = zB.size()[0]
        LOSS = []
        x_est = torch.randn_like(x, requires_grad=True)

        for l in range(self.num_layer):
            out_x = self.layers_x[l](x_est)
            # Bzx_sum = torch.zeros_like(x, device=self.device)
            for k in range(self.num_subcarriers):
                index = l*self.num_subcarriers + k
                aux_term = torch.bmm(x_est.unsqueeze(1), BB[:, :, :, k]).squeeze() - zB[:, :, k]
                out_x += self.layers_KL[index](aux_term)

            x_est = out_x
            if self.dobn:
                x_est = self.bns[l](x_est)

            if l < self.num_layer - 1:
                x_est = torch.nn.functional.relu(x_est)
            else:
                x_est = torch.tanh(x_est)

            if self.training_method == 'supervised':
                dis = torch.mean(torch.square(x - x_est))
            else:
                dis = 0
                for k in range(self.num_subcarriers):
                    diff = z[:, :, k] - torch.matmul(x_est.unsqueeze(1), B[:, :, :, k]).squeeze()
                    dis += torch.mean(torch.square(diff))

            # note: np.log(l + 1) is 0 for l == 0, so the first layer's term is zeroed out
            LOSS.append(self.scalar * np.log(l + 1) * dis)

        return x_est, LOSS

Now, there is a strange problem. The above code works well for some values of (num_subcarriers, num_layer), but for other values it raises the two-device error:

Traceback (most recent call last):
  File "/scratch/project_2005641/THz_DNN/THz_Huge.py", line 384, in <module>
    Loss_cache, Lr_list = train_model()
  File "/scratch/project_2005641/THz_DNN/THz_Huge.py", line 94, in train_model
    x_est, loss_list = myModel(batch_BB.to(MainDevice), batch_Bz.to(MainDevice), batch_X.to(MainDevice),
  File "/usr/local/lib64/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/project_2005641/THz_DNN/FuncLbr.py", line 890, in forward
    out_x += self.layers_KL[index](aux_term)
  File "/usr/local/lib64/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/project_2005641/THz_DNN/FuncLbr.py", line 445, in forward
    return input*self.weight + self.bias
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

I can’t figure out why this happens :sob:. I only changed the values of (num_subcarriers, num_layer). Has anyone experienced this before? I thought it might be because of the use of nn.Parameter.

You would need to register the modules into an nn.ModuleList instead of a plain Python list. Your setattr calls do register submodules, but the names built from str(i) + str(k) can collide: for example, i=1, k=11 and i=11, k=1 both yield 'layer_bzx_111', so the later setattr silently replaces the earlier submodule in the module registry. The replaced layer then survives only in the plain Python list, which nn.Module does not track, so model.to(device) never moves it and it stays on the CPU. That is why the error shows up only for certain (num_subcarriers, num_layer) values. The problematic containers are:

        self.layers_x = []
        self.layers_KL = []
        self.bns = []

Change these objects to nn.ModuleList and it should work.
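
A minimal sketch of the change, keeping the rest of your __init__ and the forward pass as they are (the setattr calls are then unnecessary):

        self.layers_x = nn.ModuleList()
        self.layers_KL = nn.ModuleList()
        self.bns = nn.ModuleList()

        for i in range(num_layer):
            self.layers_x.append(VectorLinear(in_dim, keep_bias=Keep_Bias))
            for k in range(num_subcarriers):
                self.layers_KL.append(VectorLinear(in_dim, keep_bias=Keep_Bias))
            if self.dobn:
                self.bns.append(nn.BatchNorm1d(self.in_dim, momentum=0.2))

nn.ModuleList registers every element as a submodule regardless of its attribute name, so model.to(device) moves all of them, and the indexing in forward (e.g. self.layers_KL[l * self.num_subcarriers + k]) keeps working unchanged.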

Thanks, it works now :smile: