Binary classification: loss stuck at 0.69

Please help me understand why the loss of the first code (MTNet) cannot be reduced.

With the first code (MTNet, which shares the first two convolutional blocks between the two branches), the loss does not drop; it stays at 0.69 (≈ ln 2, i.e., chance level for two classes). With the second code (LBNet, which shares only the first convolutional block), the loss drops to around 0.0x.

This is a method from a published paper: binary classification trained with CrossEntropyLoss, structured like a Siamese network. The paper reports that MTNet performs better.

import torch
import torch.nn as nn


class MTNet(nn.Module):
    def __init__(self):
        super(MTNet, self).__init__()
        self.convolutions = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.LocalResponseNorm(5, 0.0001, 0.75, 2),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(16, 64, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.LocalResponseNorm(5, 0.0001, 0.75, 2),
            nn.MaxPool2d(kernel_size=2, stride=2)

        )
        self.c3 = nn.Sequential(
            # In LBNet this second conv block sits here (unshared); in MTNet it
            # has been moved up into the shared trunk above:
            # nn.Conv2d(16, 64, kernel_size=7, stride=1),
            # nn.ReLU(),
            # nn.LocalResponseNorm(5, 0.0001, 0.75, 2),
            # nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 256, kernel_size=7, stride=1),
        )
        self.Linears = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(21 * 21 * 256, 2),
            nn.LogSoftmax(dim=1),  # explicit dim; the implicit default is deprecated
        )

    def forward_once(self, x):
        # Shared trunk, applied separately to each branch.
        return self.convolutions(x)

    def forward(self, x):
        # Split the 2-channel input into two single-channel branch inputs.
        input1 = x[:, 0, :, :].unsqueeze(1)
        input2 = x[:, 1, :, :].unsqueeze(1)

        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        # Fuse the branches by element-wise addition, then classify.
        outputf = self.c3(output1 + output2)

        outputf = outputf.view(-1, 21 * 21 * 256)
        outputf = self.Linears(outputf)
        return outputf
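
For reference, the 21*21*256 flatten only works out at one specific input resolution. Here is a minimal shape check, assuming 128x128 single-channel image pairs (my assumption; 128x128 is the size that yields 21x21 feature maps with this stack of layers):

import torch

# Sanity check under the 128x128 assumption:
# 128 -> conv7 -> 122 -> pool2 -> 61 -> conv7 -> 55 -> pool2 -> 27 -> conv7 -> 21
model = MTNet()
dummy = torch.randn(4, 2, 128, 128)  # a batch of 4 image pairs
out = model(dummy)
print(out.shape)  # torch.Size([4, 2])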

CODE 2 (LBNet)

class LBNet(nn.Module):
    def __init__(self):
        super(LBNet, self).__init__()
        self.convolutions = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.LocalResponseNorm(5, 0.0001, 0.75, 2),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # In MTNet this second conv block is part of the shared trunk; in
            # LBNet it has been moved down into the unshared head c3:
            # nn.Conv2d(16, 64, kernel_size=7, stride=1),
            # nn.ReLU(),
            # nn.LocalResponseNorm(5, 0.0001, 0.75, 2),
            # nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.c3 = nn.Sequential(
            nn.Conv2d(16, 64, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.LocalResponseNorm(5, 0.0001, 0.75, 2),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 256, kernel_size=7, stride=1),
        )
        self.Linears = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(21 * 21 * 256, 2),
            nn.LogSoftmax(dim=1),  # explicit dim; the implicit default is deprecated
        )

    def forward_once(self, x):
        # Shared trunk, applied separately to each branch.
        return self.convolutions(x)

    def forward(self, x):
        # Split the 2-channel input into two single-channel branch inputs.
        input1 = x[:, 0, :, :].unsqueeze(1)
        input2 = x[:, 1, :, :].unsqueeze(1)

        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        # Fuse the branches by element-wise addition, then classify.
        outputf = self.c3(output1 + output2)

        outputf = outputf.view(-1, 21 * 21 * 256)
        outputf = self.Linears(outputf)
        return outputf
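
For completeness, this is roughly how I train both models; a minimal sketch in which random tensors stand in for my real dataset (the batch size, optimizer, and learning rate here are placeholders, not the paper's settings):

import torch
import torch.nn as nn

model = MTNet()  # or LBNet()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for step in range(100):
    x = torch.randn(8, 2, 128, 128)  # 8 image pairs (placeholder data)
    y = torch.randint(0, 2, (8,))    # binary labels
    optimizer.zero_grad()
    out = model(x)                   # note: the model already ends in LogSoftmax
    loss = criterion(out, y)
    loss.backward()
    optimizer.step()
    if step % 10 == 0:
        print(step, loss.item())     # with my real data, MTNet stays at ~0.69 (= ln 2)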


What is the question?

Why the loss of the first code (MTNet) cannot be reduced, while LBNet trains fine.
