Siamese LSTM not training


I am currently training a Siamese neural network built on an LSTM, using input tensors of size [100, 70, 42] (batch, seq, feature), for a binary classification problem. I want to predict the class label (similar (1), dissimilar (0)) based on the difference between the last hidden states of the two inputs. The architecture seems to make sense, but the accuracy I get after training the network is 50% (basically a random guess).

This is how the Siamese Neural Network is defined:

class SNN(nn.Module):
    """Siamese recurrent network: one shared GRU/LSTM encoder for both inputs.

    Each input sequence is encoded by the same recurrent layer; the final
    hidden state of the last layer is projected by a linear layer to
    ``output_size`` values.  The two projections are returned so that a
    pairwise loss can compare them.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        output_size: size of the projection produced for each input.
        cell_type: ``'GRU'`` or ``'LSTM'`` (case-insensitive).

    Raises:
        ValueError: if ``cell_type`` is neither ``'GRU'`` nor ``'LSTM'``.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, cell_type):
        super(SNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.cell_type = cell_type.lower()
        # Project to output_size (previously hard-coded to 1, which silently
        # ignored the constructor argument).
        self.fc = nn.Linear(hidden_size, output_size)
        # Defining the RNN layers
        if self.cell_type == 'gru':
            self.rnn = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        elif self.cell_type == 'lstm':
            self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        else:
            raise ValueError("Value of the parameter should be 'GRU' or 'LSTM'")

    def forward_once(self, x, hidden):
        """Encode one input and return the projection of its final hidden state.

        Args:
            x: padded batch-first tensor or a ``PackedSequence``.
            hidden: initial ``(h_0, c_0)`` tuple for the LSTM; ignored for the
                GRU, which always starts from its default zero state.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        if self.cell_type == 'gru':
            _, h_n = self.rnn(x)
        elif self.cell_type == 'lstm':
            # The LSTM returns (h_n, c_n).  Indexing that tuple with [-1]
            # would select the *cell* state of every layer; we want the
            # hidden state, and only of the last layer.
            _, (h_n, _) = self.rnn(x, hidden)
        else:
            raise ValueError("Value of the parameter should be 'GRU' or 'LSTM'")
        # h_n is (num_layers, batch, hidden_size); [-1] picks the top layer.
        return self.fc(h_n[-1])

    def forward(self, input1, input2, h1, h2):
        """Run both inputs through the shared encoder and return both outputs."""
        output1 = self.forward_once(input1, h1)
        output2 = self.forward_once(input2, h2)
        return output1, output2

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(h_0, c_0)`` for a batch of ``batch_size``.

        Both tensors have shape ``(num_layers, batch_size, hidden_size)`` and
        are created with the dtype and device of the model's parameters, so no
        separate GPU flag is needed.
        """
        weight = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

The loss function used in this instance is the contrastive loss (I have also tried MSE, and it did not work either):

class ContrastiveLoss(torch.nn.Module):
    """Contrastive loss function for pairs of embeddings.

    Label convention (matching the data described for this model):
    ``1`` marks a *similar* pair and ``0`` a *dissimilar* pair.  Similar pairs
    are penalised by their squared distance; dissimilar pairs are penalised
    only while they are closer than ``margin``.

    Args:
        margin: distance beyond which dissimilar pairs contribute no loss.
    """

    def __init__(self, margin=2.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the mean contrastive loss over the batch.

        Args:
            output1: embeddings of the first inputs, last dim = embedding dim.
            output2: embeddings of the second inputs, same shape as output1.
            label: one value per pair, 1 for similar and 0 for dissimilar.
        """
        squared_distance = (output1 - output2).pow(2).sum(dim=-1)
        # sqrt has an infinite derivative at 0, which turns into NaN gradients
        # when both outputs coincide; clamping keeps the backward pass finite
        # without changing any distance larger than 1e-6.
        distance = torch.sqrt(squared_distance.clamp_min(1e-12))
        # Flatten instead of reshaping to a hard-coded batch size so that a
        # smaller final batch works too.
        similar = label.reshape(-1).to(squared_distance.dtype)
        dissimilar = 1 - similar
        hinge = torch.clamp(self.margin - distance, min=0.0)
        loss_contrastive = torch.mean(similar * squared_distance +
                                      dissimilar * hinge.pow(2))
        return loss_contrastive

Since the instances have been padded, I have tried packing the inputs so that they are easier for the SNN to process. The training of the SNN proceeds as follows:

def train(SNN, dataloader, epochs, batch_size, optimiser, clip):
    """Train a siamese network with the contrastive loss.

    Args:
        SNN: the model instance to train (the parameter name shadows the
            class name; it is kept for compatibility with existing callers).
        dataloader: iterable yielding
            ``(inputs1, inputs2, labels, inputs1_lens, inputs2_lens)``.
        epochs: number of passes over ``dataloader``.
        batch_size: nominal batch size.  Kept for compatibility; the hidden
            state is sized from each actual batch instead, so a smaller final
            batch does not break the forward pass.
        optimiser: optimiser already constructed over ``SNN.parameters()``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
    """
    if on_gpu:
        SNN.cuda()
    # Follow the model's device rather than re-checking the flag per batch.
    device = next(SNN.parameters()).device
    criterion = ContrastiveLoss()
    # Train for a given number of epochs
    for e in range(epochs):
        # batch loop
        for inputs1, inputs2, labels, inputs1_lens, inputs2_lens in dataloader:
            inputs1 = inputs1.to(device)
            inputs2 = inputs2.to(device)
            labels = labels.to(device)
            # The model never returns its updated state, so there is nothing
            # to carry between batches: start every batch from fresh zeros,
            # sized to the batch actually received.
            current_batch = inputs1.size(0)
            h1 = SNN.init_hidden(current_batch)
            h2 = SNN.init_hidden(current_batch)
            # zero accumulated gradients
            optimiser.zero_grad()
            # Packing sequences (the lengths must live on the CPU)
            inputs1 = pack_padded_sequence(
                inputs1, torch.as_tensor(inputs1_lens).cpu(),
                batch_first=True, enforce_sorted=False)
            inputs2 = pack_padded_sequence(
                inputs2, torch.as_tensor(inputs2_lens).cpu(),
                batch_first=True, enforce_sorted=False)
            # get the output from the model
            output1, output2 = SNN(inputs1, inputs2, h1, h2)
            # calculate the loss and perform backprop
            loss = criterion(output1, output2, labels)
            loss.backward()
            # `clip grad norm` helps prevent the exploding gradient problem in LSTMs
            nn.utils.clip_grad_norm_(SNN.parameters(), clip)
            # Optimiser: without this step the weights never change
            optimiser.step()
        print('Epoch: {}/{}'.format(e+1,epochs))

The hyperparameters used are these:

# Model and training hyperparameters.
input_size = 42    # features per time step
hidden_size = 256
num_layers = 1
output_size = 1
clip = 5           # maximum gradient norm
lr = 0.001
epochs = 20        # the earlier `epochs = 10` was immediately overwritten

# The network has to exist before the optimiser can be given its parameters.
# SNN.__init__ takes five arguments; the stray 0.5 (no matching parameter)
# is dropped so the call matches the constructor.
test_net = SNN(input_size, hidden_size, num_layers, output_size, 'lstm')
optimiser = torch.optim.Adam(test_net.parameters(), lr=lr)

I would appreciate any ideas/help to solve this issue.
Thanks in advance!

1 Like