Using 'DistributedDataParallel' causes a weird error

shaoming20798 · January 13, 2021, 12:22pm

When I use ‘DataParallel’, the code works fine. But when I switch to ‘DistributedDataParallel’, something weird happens:

File “/home/20798/HTCN/lib/model/faster_rcnn/vgg16_HTCN.py”, line 145, in forward
x2_bn = self.bn2(x2_fc)

The code at File "/home/20798/HTCN/lib/model/faster_rcnn/vgg16_HTCN.py", line 145, in forward, mentioned above, is as follows:

class netD_da(nn.Module):
    """Three-layer fully connected classifier with batch norm and dropout.

    Layout: Linear(feat_d, 100) -> BatchNorm1d -> ReLU -> Dropout
            -> Linear(100, 100) -> BatchNorm1d -> ReLU -> Dropout
            -> Linear(100, 2).

    Args:
        feat_d: Number of input features expected by the first linear layer.
    """

    def __init__(self, feat_d):
        super(netD_da, self).__init__()
        self.fc1 = nn.Linear(feat_d, 100)
        self.bn1 = nn.BatchNorm1d(100)
        self.fc2 = nn.Linear(100, 100)
        self.bn2 = nn.BatchNorm1d(100)
        self.fc3 = nn.Linear(100, 2)

    def forward(self, x):
        """Run the classifier on ``x`` and return two raw scores per sample.

        Args:
            x: Input tensor whose last dimension is ``feat_d``
                (presumably shaped ``(batch, feat_d)`` -- confirm with caller).

        Returns:
            The output of ``fc3``; the original author notes a shape of
            ``[256, 2]`` for their batch.
        """
        # F.dropout defaults to training=True, so the module's mode must be
        # passed explicitly; otherwise dropout stays active after .eval().
        x1_fc = self.fc1(x)
        x1_bn = self.bn1(x1_fc)
        x1_relu = F.relu(x1_bn)
        x1 = F.dropout(x1_relu, training=self.training)

        x2_fc = self.fc2(x1)
        x2_bn = self.bn2(x2_fc)  # line 145 of vgg16_HTCN.py (traceback site)
        x2_relu = F.relu(x2_bn)
        x2 = F.dropout(x2_relu, training=self.training)

        ret = self.fc3(x2)
        return ret

pritamdamania87 · January 15, 2021, 2:50am

@shaoming20798 Can you provide a minimal repro for this issue?