String matching with LSTM not converging

I am trying to classify string similarity using an LSTM. Unfortunately, it is not converging, although it is able to overfit a small amount of data.

For example, ':P3:zOHZl' and ':K3:z(OHZl' are similar, while 'DHV))!Hk6I' and 'tldUi2Rz' are dissimilar.
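
For context, here is a minimal sketch of how such pairs could be turned into the (seq_len, batch) index tensors the model below expects. The character vocabulary, the encode_batch helper, and the padding value 80 (matching padding_idx in the encoder) are assumptions for illustration, not the actual preprocessing from the notebook.

import torch

def encode_batch(strings, char2idx, pad_idx=80):
    # Pad to the longest string in the batch; shape is (seq_len, batch)
    # because the LSTM below is not batch_first.
    lengths = torch.tensor([len(s) for s in strings])
    batch = torch.full((lengths.max().item(), len(strings)), pad_idx, dtype=torch.long)
    for i, s in enumerate(strings):
        batch[:len(s), i] = torch.tensor([char2idx[c] for c in s])
    return batch, lengths

pairs = [(':P3:zOHZl', ':K3:z(OHZl', 1.0),   # similar
         ('DHV))!Hk6I', 'tldUi2Rz', 0.0)]    # dissimilar
chars = sorted({c for a, b, _ in pairs for c in a + b})
char2idx = {c: i for i, c in enumerate(chars)}  # indices stay well below pad_idx=80

s1, s1_lengths = encode_batch([a for a, _, _ in pairs], char2idx)
s2, s2_lengths = encode_batch([b for _, b, _ in pairs], char2idx)
labels = torch.tensor([[y] for _, _, y in pairs])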

import torch
import torch.nn as nn


class LSTMEncoder(nn.Module):
    def __init__(self, opt):
        super(LSTMEncoder, self).__init__()
        self.embed_size = opt.embedding_dims
        self.hidden_size = opt.hidden_dims
        self.num_layers = opt.num_layers
        self.bidir = opt.lstm_bidir
        self.direction = 2 if self.bidir else 1
        self.dropout = opt.lstm_dropout

        self.embedding = nn.Embedding(num_embeddings=opt.vocab_size, embedding_dim=opt.embedding_dims,
                                      padding_idx=80, max_norm=None, scale_grad_by_freq=False, sparse=False)
        self.lstm = nn.LSTM(input_size=opt.embedding_dims, hidden_size=self.hidden_size, dropout=self.dropout,
                            num_layers=self.num_layers, bidirectional=self.bidir)

    def initHiddenCell(self, batch_size):
        # Zero-initialized hidden and cell states (Variable is deprecated; plain tensors work).
        hidden = torch.zeros(self.direction * self.num_layers, batch_size, self.hidden_size)
        cell = torch.zeros(self.direction * self.num_layers, batch_size, self.hidden_size)
        return hidden, cell

    def forward(self, input1, hidden, cell, input_lengths):
        input1 = self.embedding(input1)
        # Pack the padded batch so the LSTM skips the padding positions.
        input1 = torch.nn.utils.rnn.pack_padded_sequence(input1, input_lengths, batch_first=False, enforce_sorted=False)
        output, (hidden, cell) = self.lstm(input1, (hidden, cell))
        return output, hidden, cell

class Siamese_lstm(nn.Module):
    def __init__(self, opt):
        super(Siamese_lstm, self).__init__()

        self.encoder = LSTMEncoder(opt)

        # The two encoder outputs, each of size direction * hidden_size, are concatenated.
        self.input_dim = int(2 * self.encoder.direction * self.encoder.hidden_size)

        self.classifier = nn.Sequential(
            nn.Linear(self.input_dim, int(self.input_dim/2)),
            nn.ReLU(),
            nn.Linear(int(self.input_dim/2), int(self.input_dim/4)),
            nn.ReLU(),
            nn.Linear(int(self.input_dim/4), 1),
            nn.Sigmoid()
        )

    def forward(self, s1, s2, s1_lengths, s2_lengths):
        batch_size = s1.size()[1]
        # print(batch_size, s1[0])
        max_length = torch.cat((s1_lengths, s2_lengths)).max().item()

        # init hidden, cell
        h1, c1 = self.encoder.initHiddenCell(batch_size)
        h2, c2 = self.encoder.initHiddenCell(batch_size)

        v1, h1, c1 = self.encoder(s1, h1, c1, s1_lengths)
        v2, h2, c2 = self.encoder(s2, h2, c2, s2_lengths)

        v1, l1 = torch.nn.utils.rnn.pad_packed_sequence(v1, batch_first=False, total_length=max_length)
        v2, l2 = torch.nn.utils.rnn.pad_packed_sequence(v2, batch_first=False, total_length=max_length)
        # print(v1)
        # Gather the encoder output at the last valid (non-padded) time step of each sequence.
        batch_indices = torch.arange(batch_size)
        v1 = v1[l1 - 1, batch_indices, :]
        v2 = v2[l2 - 1, batch_indices, :]
        features = torch.cat((v1,v2), 1)
        output = self.classifier(features)

        return output

BCELoss is the loss function.
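
For reference, here is a minimal training-loop sketch around BCELoss. The opt values, the use of SimpleNamespace, the Adam optimizer, the learning rate, and the random toy batch are all assumptions for illustration; the toy batch only checks shapes, and the notebook's actual setup may differ.

import torch
import torch.nn as nn
from types import SimpleNamespace

# Hypothetical hyperparameters mirroring the opt fields used above.
opt = SimpleNamespace(vocab_size=128, embedding_dims=32, hidden_dims=64,
                      num_layers=1, lstm_bidir=True, lstm_dropout=0.0)

model = Siamese_lstm(opt)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# A toy batch of two pairs, shaped (seq_len, batch) to match the encoder;
# real training would iterate over batches of encoded string pairs instead.
s1 = torch.randint(0, 79, (10, 2))
s2 = torch.randint(0, 79, (10, 2))
s1_lengths = torch.tensor([10, 7])
s2_lengths = torch.tensor([10, 8])
labels = torch.tensor([[1.0], [0.0]])  # 1 = similar, 0 = dissimilar

for epoch in range(100):
    optimizer.zero_grad()
    preds = model(s1, s2, s1_lengths, s2_lengths)  # sigmoid outputs, shape (batch, 1)
    loss = criterion(preds, labels)
    loss.backward()
    optimizer.step()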

Here is a link to a Google Colab notebook with the code, if you are interested.
https://colab.research.google.com/drive/12v3NUp0o9VfG6r3GGb-rfFfPBcfGYvD5

Update:
It is training fine; I just didn't train it for long enough. However, it crashes for some unknown reason when I test it on GPU; there are no issues on CPU.
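
One guess at the GPU crash, since no traceback is shown: initHiddenCell creates the zero hidden/cell states on the CPU by default, which would mismatch a model and inputs that have been moved to the GPU. Below is a sketch of a device-aware replacement for that method; this is an assumption, not a confirmed fix.

def initHiddenCell(self, batch_size):
    # Create the zero states on whatever device the encoder's weights live on,
    # so the same code works on both CPU and GPU.
    device = next(self.parameters()).device
    hidden = torch.zeros(self.direction * self.num_layers, batch_size, self.hidden_size, device=device)
    cell = torch.zeros(self.direction * self.num_layers, batch_size, self.hidden_size, device=device)
    return hidden, cell

The model and the input tensors would also need to be moved with .to(device) before the forward pass.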