PyTorch Siamese network for text similarity: problem with learning

I am trying to create an LSTM Siamese network for text similarity classification, but the network doesn't learn correctly. What could be the problem?

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class LSTMEncoder(nn.Module):
    def __init__(self, embed_size, batch_size, hidden_size, num_layers, embed_matrix, bidir=True):
        super(LSTMEncoder, self).__init__()
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.hidden_size_1 = hidden_size
        self.hidden_size_2 = hidden_size
        self.num_layers = num_layers
        self.bidir = bidir
        if self.bidir:
            self.direction = 2
        else:
            self.direction = 1
        self.dropout = 0.35

        # pretrained (frozen) embedding layer passed in from outside
        self.embedding = embed_matrix
        self.lstm_1 = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size_1, dropout=self.dropout,
                              num_layers=self.num_layers, bidirectional=self.bidir).to(device)
        # second LSTM (defined here but not used in forward)
        self.lstm_2 = nn.LSTM(input_size=self.hidden_size_1, hidden_size=self.hidden_size_2, dropout=self.dropout,
                              num_layers=self.num_layers, bidirectional=self.bidir).to(device)

    def initHiddenCell(self):
        # random initial hidden and cell states, shape (num_directions * num_layers, batch, hidden)
        rand_hidden = torch.randn(self.direction * self.num_layers, self.batch_size, self.hidden_size_1).to(device)
        rand_cell = torch.randn(self.direction * self.num_layers, self.batch_size, self.hidden_size_1).to(device)
        return rand_hidden, rand_cell

    def forward(self, input, hidden, cell):
        # embed a single time step: (batch,) token indices -> (1, batch, embed_size)
        input = self.embedding(input).view(1, self.batch_size, -1)
        output, (hidden, cell) = self.lstm_1(input, (hidden, cell))
        return output, hidden, cell

class Siamese_lstm(nn.Module):
    def __init__(self, embed_size, batch_size, hidden_size, num_layers, embed_matrix, bidir=True):
        super(Siamese_lstm, self).__init__()

        self.encoder = LSTMEncoder(embed_size, batch_size, hidden_size, num_layers, embed_matrix, bidir=bidir).to(device)
        # 5 concatenated feature blocks, each of size (direction * hidden_size)
        self.input_dim = 5 * self.encoder.direction * hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(self.input_dim, self.input_dim // 2).to(device),
            nn.Linear(self.input_dim // 2, 2).to(device)
        )
#         self.classifier = nn.Linear(self.input_dim, 2).to(device)

    def forward(self, s1, s2):
        # init hidden and cell states for both branches
        h1, c1 = self.encoder.initHiddenCell()
        h2, c2 = self.encoder.initHiddenCell()

        # feed the tokens one by one through the shared encoder
        for i in range(len(s1)):
            v1, h1, c1 = self.encoder(s1[i], h1, c1)

        for j in range(len(s2)):
            v2, h2, c2 = self.encoder(s2[j], h2, c2)

        # v1 / v2 are the encoder outputs at the last time step of each sentence;
        # combine them into a single feature vector
        features = torch.cat((v1, torch.abs(v1 - v2), v2, v1 * v2, (v1 + v2) / 2), 2)
        # features = v1-v2
        output = self.classifier(features)
        output = torch.sigmoid(output)
        return output
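
To make the expected input shapes explicit, here is a quick sanity check with dummy tensors (the vocabulary size, dimensions and batch size below are just placeholders, not my real data):

# dummy embedding and small model, only to check that the shapes line up
dummy_embed = nn.Embedding(1000, 300)
toy_model = Siamese_lstm(embed_size=300, batch_size=4, hidden_size=32,
                         num_layers=2, embed_matrix=dummy_embed, bidir=True)

# inputs are (seq_len, batch_size) LongTensors of token indices
s1 = torch.randint(0, 1000, (12, 4)).to(device)
s2 = torch.randint(0, 1000, (15, 4)).to(device)

out = toy_model(s1, s2)
print(out.shape)   # torch.Size([1, 4, 2])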

embedding:
TEXT.build_vocab(trn, min_freq=1, vectors="fasttext.en.300d")
embedding_matrix = nn.Embedding.from_pretrained(torch.FloatTensor(TEXT.vocab.vectors))
embedding_matrix.weight.requires_grad = False
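
For context, TEXT and trn come from a torchtext pipeline roughly like the following (the column names, file path and tokenization are placeholders rather than my exact code; on newer torchtext versions these classes live under torchtext.legacy.data):

from torchtext import data

TEXT = data.Field(sequential=True, lower=True)
LABEL = data.LabelField(dtype=torch.long)

# each row: question1, question2, label
trn = data.TabularDataset(path="train.csv", format="csv", skip_header=True,
                          fields=[("q1", TEXT), ("q2", TEXT), ("label", LABEL)])

# TEXT.build_vocab(...) is then called as shown above
LABEL.build_vocab(trn)

# batches come out as (seq_len, batch_size) LongTensors, which is what forward() iterates over
train_iter = data.BucketIterator(trn, batch_size=64, sort_key=lambda x: len(x.q1),
                                 shuffle=True, device=device)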

hyperparams:
model = Siamese_lstm(embed_size=300, batch_size=64, hidden_size=512, num_layers=4, embed_matrix=embedding_matrix, bidir=True)
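
The training loop is roughly as follows (the optimizer, learning rate and batch field names are placeholders, not my exact code):

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for batch in train_iter:
    optimizer.zero_grad()
    s1, s2, labels = batch.q1, batch.q2, batch.label
    output = model(s1, s2).squeeze(0)   # (batch_size, 2)
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()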

Example of output:
tensor([[0.4977, 0.5058],
        [0.4980, 0.5057],
        [0.4976, 0.5062],
        [0.4980, 0.5060],
        [0.4981, 0.5058],
        [0.4982, 0.5061],
        [0.4981, 0.5057],
        [0.4979, 0.5061],
        [0.4978, 0.5056],
        [0.4976, 0.5056],

The loss (CrossEntropyLoss) does not decrease during training, and the outputs stay close to 0.5 for every pair.