Module not passed to the GPU properly in `nn.DataParallel`

I have a model, one of whose modules is:

import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as functional


class LinearAttentionPriorBlock(nn.Module):
    def __init__(self, pathology, normalize_attention=True):
        super(LinearAttentionPriorBlock, self).__init__()
        self.normalize_attn = normalize_attention
        self.score_dir = "AvgAttentionScores"
        self.score = self.read_score(pathology)  # .to('cuda' if torch.cuda.is_available() else 'cpu')
        self.score.requires_grad = False

    def read_score(self, pathology):
        arr = np.load(os.path.join(self.score_dir, pathology, "{:}.npy".format(pathology)))
        return torch.from_numpy(arr)

    def forward(self, global_f, score_shape):
        b_s, n_c, width, height = score_shape
        # broadcast the fixed attention score map over the batch and channel dims
        score = self.score.unsqueeze(0).expand(b_s, n_c, width, height)
        normalized_scores = torch.sigmoid(score)

        global_f = torch.mul(normalized_scores, global_f)
        if self.normalize_attn:
            global_f = global_f.view(b_s, n_c, -1).sum(dim=2)  # batch_size X C
        else:
            global_f = functional.adaptive_avg_pool2d(global_f, (1, 1)).view(b_s, n_c)
        return global_f
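
One thing I'm unsure about is that the score is kept as a plain tensor attribute. I don't know whether this is related, but a variant I haven't verified yet would be to register it as a buffer so that `model.to(...)` moves it together with the rest of the module; a minimal sketch of that `__init__`, assuming `read_score` stays unchanged:

class LinearAttentionPriorBlock(nn.Module):
    def __init__(self, pathology, normalize_attention=True):
        super().__init__()
        self.normalize_attn = normalize_attention
        self.score_dir = "AvgAttentionScores"
        # a buffer is part of the module's state (like a parameter, but not trainable),
        # so .to(device) and state_dict() handle it automatically
        self.register_buffer("score", self.read_score(pathology))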

In the `__init__` of the model I do the following:

    def __init__(self, num_classes, use_attention=True, normalize_attention=True, is_pretrained=True, use_prior=False):
        super().__init__()

        self.use_prior = use_prior
        self.att_status = "w" if use_prior else "wo"
        self.use_attention = use_attention
        self.normalize_attention = normalize_attention
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.resnet = models.resnet50(pretrained=is_pretrained)
        fc_num_features = self.resnet.fc.in_features

        if not self.use_prior:
            self.projector_1 = ProjectorBlock(256, 2048)
            self.projector_2 = ProjectorBlock(512, 2048)
            self.projector_3 = ProjectorBlock(1024, 2048)

            self.attention_1 = LinearAttentionBlock(in_f=fc_num_features, normalize_attention=normalize_attention)
            self.attention_2 = LinearAttentionBlock(in_f=fc_num_features, normalize_attention=normalize_attention)
            self.attention_3 = LinearAttentionBlock(in_f=fc_num_features, normalize_attention=normalize_attention)
            self.classifier = nn.Linear(in_features=fc_num_features * 3, out_features=num_classes, bias=True)
        else:
            self.projector_3 = ProjectorBlock(1024, 2048)
            self.attention_3 = LinearAttentionBlock(in_f=fc_num_features, normalize_attention=normalize_attention)
            self.prior_layers = nn.ModuleList([LinearAttentionPriorBlock(LABELS[i],
                                                                         normalize_attention=normalize_attention)
                                               for i in range(len(LABELS))])
            self.classifiers = nn.ModuleList([nn.Linear(in_features=fc_num_features,
                                                        out_features=1,
                                                        bias=True).to(self.device) for _ in range(len(LABELS))])

This is supposed to move everything to the GPU properly, so when I wrap the model in `nn.DataParallel(model)` I expect it to work, but I get the following error message:

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

My guess is that I'm doing something wrong in the `__init__` of my custom module, but I can't figure out what the issue is. Because of this I can only train it on a single GPU.
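
If I understand `nn.DataParallel` correctly, it only broadcasts registered parameters and buffers to the replicas, so a plain tensor attribute that was moved to the GPU in `__init__` stays on cuda:0 while the inputs get scattered to the other devices. A minimal sketch of that failure mode (a hypothetical `ScoreHolder` module, not my actual code):

import torch
import torch.nn as nn

class ScoreHolder(nn.Module):
    def __init__(self):
        super().__init__()
        # plain attribute, manually moved to the default GPU;
        # nn.DataParallel will not replicate it to the other devices
        self.score = torch.rand(8, 8).to('cuda:0')

    def forward(self, x):
        # on the replica running on cuda:1, x lives on cuda:1 but self.score on cuda:0
        return x * self.score

model = nn.DataParallel(ScoreHolder())
out = model(torch.rand(4, 8, 8).to('cuda:0'))  # raises the same device-mismatch error on a multi-GPU machine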

Would appreciate some help

Could you remove the `self.device` usage from the `__init__` method of your model and rerun the code?
I'm not sure if this is the root cause, but it would be an easy check.
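
I.e. something along these lines (just the relevant lines; I'm assuming the rest of your `__init__` stays the same):

# inside __init__: no self.device and no manual .to() calls on submodules
self.classifiers = nn.ModuleList([nn.Linear(in_features=fc_num_features,
                                            out_features=1,
                                            bias=True) for _ in range(len(LABELS))])

# outside the model: move it once and wrap it afterwards
model = nn.DataParallel(model.to('cuda:0'))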

I have the same problem. Did you fix it? If so, please share the solution, thank you!