Hi,
I am currently training a siamese neural network with LSTM with tensors of Size [100,70,42] (batch, seq, feature) for a classification problem. I want to predict the class label (similar (1), dissimilar(0)) based on the difference between the last hidden states of each input. The architecture seems to make sense, but the accuracy I get after training the network is at 50% (basically a random guess).
This is how the Siamese Neural Network is defined:
class SNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, output_size, cell_type):
super(SNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.output_size = output_size
self.cell_type = cell_type.lower()
self.fc = nn.Linear(hidden_size,1)
# Defining the RNN layers
if self.cell_type == 'gru':
self.rnn = nn.GRU(input_size, hidden_size, num_layers, batch_first = True)
elif self.cell_type == 'lstm':
self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True)
else:
raise ValueError("Value of the parameter should be 'GRU' or 'LSTM'")
def forward_once(self, x, hidden):
# Passing the input and the hidden state into the model and obtaining outputs
if self.cell_type == 'gru':
output, hidden = self.rnn(x)
elif self.cell_type == 'lstm':
output, hidden = self.rnn(x, hidden)
else:
raise ValueError("Value of the parameter should be 'GRU' or 'LSTM'")
output = self.fc(hidden[-1])
return output
def forward(self, input1, input2, h1, h2):
# This method performs the forward pass in parallel for each stream of RNN
output1 = self.forward_once(input1, h1)
output2 = self.forward_once(input2, h2)
return output1, output2
def init_hidden(self, batch_size):
# Creation of two new tensors with sizes n_layers x batch_size x n_hidden,
# initialised to zero, for hidden state and cell state of LSTM
weight = next(self.parameters()).data
if (on_gpu):
hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda(),
weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda())
else:
hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
return hidden
the loss function used in this instance is the contrastive loss (I have tried using a MSE and did not work either):
class ContrastiveLoss(torch.nn.Module):
"""
Contrastive loss function.
Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
"""
def __init__(self ,margin = 2.0):
super(ContrastiveLoss, self).__init__()
self.margin = margin
#self.batch_size = batch_size
#self.hidden_size = hidden_size
def forward(self, output1, output2, label):
euclidean_distance = torch.sqrt((output1 - output2)**2)
label = label.reshape(100,1) # 100 is the batch_size, to be changed if it works
loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
(label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min = 0.0),2))
return loss_contrastive
Since the instances have been padded, I have tried to pack the input, so that it is easier for the SNN to process. The training of the SNN proceeds as follows:
def train(SNN, dataloader, epochs, batch_size, optimiser, clip):
if (on_gpu):
SNN.cuda()
SNN.train()
optimiser = optimiser
criterion = ContrastiveLoss()
# Train for a given number of epochs
for e in range(epochs):
# Initialisation of hidden state
h1 = SNN.init_hidden(batch_size)
h2 = h1
# batch loop
for inputs1, inputs2, labels, inputs1_lens, inputs2_lens in dataloader:
if(on_gpu):
inputs1, inputs2, labels = inputs1.cuda(), inputs2.cuda(), labels.cuda()
# Creation of new variables for the hidden state, otherwise we would backpropagate through the entire
# training history
h1 = tuple([each.data for each in h1])
h2 = tuple([each.data for each in h2])
# zero accumulated gradients
SNN.zero_grad()
# Packing sequences
inputs1 = pack_padded_sequence(inputs1, inputs1_lens, batch_first = True, enforce_sorted = False)
inputs2 = pack_padded_sequence(inputs2, inputs2_lens, batch_first = True, enforce_sorted = False)
# get the output from the model
output1, output2 = SNN(inputs1, inputs2, h1, h2)
# calculate the loss and perform backprop
loss = criterion(output1, output2, labels)
loss.backward()
#`clip grad norm` helps prevent the exploding gradient problem in LSTMs
nn.utils.clip_grad_norm_(SNN.parameters(), clip)
# Optimiser
optimiser.step()
print('Epoch: {}/{}'.format(e+1,epochs))
print('Loss:{:.6f}...'.format(loss.item()))
print('-'*20)
The hyperparameters used are these:
input_size = 42
hidden_size = 256
num_layers = 1
output_size = 1
clip = 5
epochs = 10
lr = 0.001
optimiser = torch.optim.Adam(test_net.parameters())
epochs = 20
test_net = SNN(input_size, hidden_size, num_layers, output_size,0.5,'lstm')
I would appreciate any ideas/help to solve this issue.
Thanks in advance!