I am using a Siamese network for string similarity; its encoder is a 2-layer LSTM with dropout=0.5.
‘ni1iNN:6VM’, ‘ni1iNN:6OM’ are similar.
‘ut/,TG*Xr3e’, ‘X+/(IZ)!.h’ are dissimilar.
For both training and testing I randomly generate the similar and dissimilar string pairs on the fly, so the model cannot simply overfit to a fixed training set.
The only difference between the two loss computations is whether the model is set to training mode or evaluation mode. In training mode I get almost perfect results, but in evaluation mode the loss is far too high (the accuracy is probably around 65–80%; I haven’t calculated it).
What is the reason for this?
class LSTMEncoder(nn.Module):
    """LSTM sequence encoder configured from an options object.

    The following attributes are read from ``opt``: ``embedding_dims``,
    ``hidden_dims``, ``num_layers``, ``lstm_bidir``, ``padding_idx`` and
    ``lstm_dropout``.
    """

    def __init__(self, opt):
        super(LSTMEncoder, self).__init__()
        self.embed_size = opt.embedding_dims
        self.hidden_size = opt.hidden_dims
        self.num_layers = opt.num_layers
        self.bidir = opt.lstm_bidir
        self.padding_idx = opt.padding_idx
        # Number of directions: 2 for a bidirectional LSTM, otherwise 1.
        self.direction = 2 if self.bidir else 1
        self.dropout = opt.lstm_dropout
        self.lstm = nn.LSTM(
            input_size=opt.embedding_dims,
            hidden_size=self.hidden_size,
            # nn.LSTM only applies dropout between stacked layers; with a
            # single layer a non-zero value has no effect and just emits a
            # warning, so pass 0 in that case.
            dropout=self.dropout if self.num_layers > 1 else 0.0,
            num_layers=self.num_layers,
            bidirectional=self.bidir,
        )

    def initHiddenCell(self, batch_size, device=None):
        """Return zero-filled initial ``(hidden, cell)`` states.

        Args:
            batch_size: Number of sequences in the batch.
            device: Optional device on which to allocate the states. When
                omitted, the default device is used and the caller is
                responsible for moving the tensors.

        Returns:
            A ``(hidden, cell)`` tuple, each of shape
            ``(direction * num_layers, batch_size, hidden_size)``.
        """
        shape = (self.direction * self.num_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, device=device)
        cell = torch.zeros(shape, device=device)
        return hidden, cell

    def forward(self, input1, hidden, cell):
        """Run the LSTM over ``input1`` starting from ``(hidden, cell)``.

        ``input1`` is forwarded to ``nn.LSTM`` unchanged, so it may be either
        a padded tensor or a ``PackedSequence``.

        Returns:
            ``(output, hidden, cell)`` exactly as produced by ``nn.LSTM``.
        """
        output, (hidden, cell) = self.lstm(input1, (hidden, cell))
        return output, hidden, cell
class Siamese_lstm(nn.Module):
    """Siamese LSTM classifier for pairs of token-index sequences.

    Both sequences go through the same embedding and the same
    ``LSTMEncoder``. The element-wise absolute difference of the two
    encodings is fed to an MLP that ends in a sigmoid.
    """

    def __init__(self, opt):
        super(Siamese_lstm, self).__init__()
        self.encoder = LSTMEncoder(opt)
        # Width of one encoded sequence: hidden size times number of directions.
        self.input_dim = int(1 * self.encoder.direction * self.encoder.hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(self.input_dim, int(self.input_dim / 2)),
            nn.ReLU(),
            nn.Linear(int(self.input_dim / 2), int(self.input_dim / 4)),
            nn.ReLU(),
            nn.Linear(int(self.input_dim / 4), 1),
            nn.Sigmoid(),
        )
        self.embedding = nn.Embedding(
            num_embeddings=opt.vocab_size,
            embedding_dim=opt.embedding_dims,
            padding_idx=opt.padding_idx,
            max_norm=None,
            scale_grad_by_freq=False,
            sparse=False,
        )

    def _encode_last_step(self, tokens, lengths, batch_size, max_length):
        """Encode one batch and return the LSTM output at each last valid step."""
        # pack_padded_sequence expects the lengths as a CPU int64 tensor.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        embedded = self.embedding(tokens)
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=False, enforce_sorted=False
        )

        # Put the initial states next to the data instead of relying on a
        # module-level ``device`` global.
        hidden, cell = self.encoder.initHiddenCell(batch_size)
        hidden = hidden.to(device=embedded.device, dtype=embedded.dtype)
        cell = cell.to(device=embedded.device, dtype=embedded.dtype)

        outputs, _, _ = self.encoder(packed, hidden, cell)
        padded, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(
            outputs, batch_first=False, total_length=max_length
        )

        # Pick, for every batch element, the output at its own last time step.
        # NOTE(review): with a bidirectional encoder the backward half of this
        # vector has presumably only processed a single step; using the final
        # hidden states may be preferable. Left as is to keep outputs unchanged.
        last_step = (out_lengths - 1).to(padded.device)
        batch_indices = torch.arange(batch_size, device=padded.device)
        return padded[last_step, batch_indices, :]

    def forward(self, s1, s2, s1_lengths, s2_lengths):
        """Score a batch of sequence pairs.

        Args:
            s1: Token indices of the first sequences; the batch is read from
                dimension 1 (sequence-first layout, ``batch_first=False``).
            s2: Token indices of the second sequences, same layout as ``s1``.
            s1_lengths: Unpadded length of every sequence in ``s1``.
            s2_lengths: Unpadded length of every sequence in ``s2``.

        Returns:
            The sigmoid output of the classifier, one value per pair.
        """
        batch_size = s1.size(1)
        # Longest sequence across both inputs; both outputs are padded to it.
        max_length = int(
            max(
                torch.as_tensor(s1_lengths).max(),
                torch.as_tensor(s2_lengths).max(),
            )
        )

        v1 = self._encode_last_step(s1, s1_lengths, batch_size, max_length)
        v2 = self._encode_last_step(s2, s2_lengths, batch_size, max_length)

        features = torch.abs(v1 - v2)
        return self.classifier(features)