Order of relu and cat causes different training results

Hi all,

When I train a network with the following structure:

x1 = F.relu(BN1(Conv2d(x)))
x1_connect = F.relu(BN1_connect(Conv2d_connect(x)))
x2 = torch.cat((x1, x1_connect), dim=1)

the above structure should be equivalent to:

x1 = BN1(Conv2d(x))
x1_connect = BN1_connect(Conv2d_connect(x))
x2 = torch.cat((x1, x1_connect), dim=1)
x2 = F.relu(x2)
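
Since ReLU is applied elementwise, it should commute with concatenation along the channel dimension. As a quick standalone check (the tensor shapes below are just picked for illustration), the following should print True:

import torch
import torch.nn.functional as F

a = torch.randn(2, 3, 8, 8)
b = torch.randn(2, 5, 8, 8)

# relu(cat(a, b)) equals cat(relu(a), relu(b)), because ReLU acts on every
# element independently of which branch it came from.
print(torch.equal(F.relu(torch.cat((a, b), dim=1)),
                  torch.cat((F.relu(a), F.relu(b)), dim=1)))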

But the two versions give totally different training results.
I printed the gradients after the first iteration, with the same initialization, the same input, and all other settings identical, and the gradients differ between the two structures.
What could cause such a problem?
Thank you!

That’s strange. Could you post the code you’ve used to compare the parameters and gradients?
This code seems to be working fine:

import torch
import torch.nn as nn
import torch.nn.functional as F


class MyModel1(nn.Module):
    def __init__(self):
        super(MyModel1, self).__init__()
        # Two parallel conv + BN branches whose outputs will be concatenated
        self.conv1 = nn.Conv2d(1, 3, 3, 1, 1)
        self.bn1 = nn.BatchNorm2d(3)
        self.conv2 = nn.Conv2d(1, 3, 3, 1, 1)
        self.bn2 = nn.BatchNorm2d(3)

    def forward(self, x):
        # ReLU applied to each branch before the concatenation
        x1 = F.relu(self.bn1(self.conv1(x)))
        x2 = F.relu(self.bn2(self.conv2(x)))
        x = torch.cat((x1, x2), 1)
        return x


class MyModel2(nn.Module):
    def __init__(self):
        super(MyModel2, self).__init__()
        self.conv1 = nn.Conv2d(1, 3, 3, 1, 1)
        self.bn1 = nn.BatchNorm2d(3)
        self.conv2 = nn.Conv2d(1, 3, 3, 1, 1)
        self.bn2 = nn.BatchNorm2d(3)

    def forward(self, x):
        # ReLU applied once, after the concatenation
        x1 = self.bn1(self.conv1(x))
        x2 = self.bn2(self.conv2(x))
        x = F.relu(torch.cat((x1, x2), 1))
        return x


model1 = MyModel1()
model2 = MyModel2()
# Copy model1's parameters into model2 so both models start identically
model2.load_state_dict(model1.state_dict())

# Check all parameters
for param1, param2 in zip(model1.parameters(), model2.parameters()):
    if (param1 != param2).any():
        print('Parameters different! {}, {}'.format(
            param1, param2))

x = torch.randn(5, 1, 24, 24)
output1 = model1(x)
output1.mean().backward()
output2 = model2(x)
output2.mean().backward()

# Check gradients
for param1, param2 in zip(model1.parameters(), model2.parameters()):
    if (param1.grad != param2.grad).any():
        print('Gradient different! {}, {}'.format(
            param1.grad, param2.grad))
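
As one more sanity check (not in the snippet above), you could also compare the forward outputs directly; with identical parameters and the same input they should agree up to floating point precision:

# Compare the forward activations of the two models.
print(torch.allclose(output1, output2))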

Sorry, I have found that the problem was in our SyncBN layer.
Thank you!
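
Side note, in case it is useful to others hitting a similar issue: the built-in synchronized batch norm that ships with recent PyTorch versions can serve as a reference to compare a custom SyncBN layer against. A rough sketch of the conversion (the model here is just a placeholder; the converted model still needs an initialized process group, e.g. via DDP, to actually run):

import torch.nn as nn

# Placeholder model with ordinary BatchNorm2d layers.
model = nn.Sequential(nn.Conv2d(1, 3, 3, 1, 1), nn.BatchNorm2d(3), nn.ReLU())

# Replace every BatchNorm*d module with torch.nn.SyncBatchNorm for comparison.
sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)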