I am trying to build a Siamese model that compares two time series and learns whether they are similar or not, using the cross-entropy loss. I have the following model:
class SiameseNetwork(nn.Module):
    """Siamese CNN that scores a pair of inputs with a two-way linear layer.

    Both inputs go through the same convolutional stack (shared weights).
    The two flattened feature vectors are concatenated and mapped to two
    raw scores (logits) by a single linear layer.

    Args:
        in_shape: Shape of the dummy tensor used to probe the CNN output
            size. ``in_shape[0]`` is also used as the number of input
            channels of the first convolution.
            NOTE(review): this looks like it expects a 4-D shape whose
            leading (batch) entry equals the channel count, e.g.
            ``(1, 1, 6, 128)`` -- confirm against the caller.
    """

    def __init__(self, in_shape):
        super(SiameseNetwork, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(in_shape[0], 32, kernel_size=(1, 9), stride=(1, 2), padding=(0, 4)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2)),
            nn.Conv2d(32, 64, kernel_size=(1, 3), stride=1, padding=(0, 1)),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=(1, 3), stride=1, padding=(0, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2)),
            # Kernel height 6 with no padding: reduces a height-6 input to
            # height 1, mixing all rows into the channel dimension.
            nn.Conv2d(64, 128, kernel_size=(6, 1), stride=1, padding=(0, 0)),
            nn.ReLU(),
        )
        # Probe the CNN with a dummy tensor to learn the flattened feature
        # size for the linear layer. no_grad() avoids building an autograd
        # graph for this throwaway pass; the deprecated Variable wrapper is
        # not needed because plain tensors do not require grad by default.
        with torch.no_grad():
            probe = self.cnn(torch.rand(in_shape))
        n = self.num_flat_features(probe)
        # Two branches are concatenated, hence 2 * n input features.
        self.classifier = nn.Sequential(
            nn.Linear(2 * n, 2),
        )

    def forward_one(self, x):
        """Run one branch: apply the CNN and flatten everything but dim 0."""
        out_x = self.cnn(x)
        out_x = out_x.view(out_x.size(0), -1)
        return out_x

    def forward(self, x1, x2):
        """Return the two-class scores for the pair ``(x1, x2)``.

        Both inputs are encoded with the same CNN, the features are
        concatenated along dim 1 and passed to the linear classifier.
        """
        out_x1 = self.forward_one(x1)
        out_x2 = self.forward_one(x2)
        out_conc = torch.cat((out_x1, out_x2), 1)
        out = self.classifier(out_conc)
        return out

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first.

        The result is converted to a built-in ``int`` so that layer sizes
        derived from it are plain Python integers, not NumPy scalars.
        """
        size = x.size()[1:]  # all dimensions except the batch dimension
        return int(np.prod(size))
Then I try to train it using the following code:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return averaged metrics.

    Args:
        model: Network called as ``model(x1, x2)`` and returning one score
            per class for each sample.
        loader: Iterable of ``(x1_batch, x2_batch, y_batch)`` tuples that
            also supports ``len()``.
        criterion: Loss called as ``criterion(output, labels)``.
        device: Device the batches are moved to before the forward pass.
        optimizer: If given, the model is put in training mode and updated
            after every batch; if ``None`` the pass is evaluation-only.

    Returns:
        ``(mean_loss, mean_accuracy)``: the per-batch loss and the per-batch
        accuracy (in percent), each averaged over the number of batches.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum = 0.0
    accuracy_sum = 0.0
    # Gradients are only tracked when an optimizer step will follow.
    with torch.set_grad_enabled(training):
        for x1_batch, x2_batch, y_batch in loader:
            x1_batch = x1_batch.to(device=device, dtype=torch.float32)
            x2_batch = x2_batch.to(device=device, dtype=torch.float32)
            # CrossEntropyLoss takes integer class indices of shape (N,).
            # The previous CPU path produced float labels of shape (N, 1),
            # which breaks the loss and broadcasts the accuracy comparison.
            y_batch = y_batch.to(device=device).long().reshape(-1)

            if training:
                optimizer.zero_grad()
            output = model(x1_batch, x2_batch)
            loss = criterion(output, y_batch)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            predicted = torch.max(output, 1)[1]
            # .item() keeps the running total a Python float rather than a
            # tensor living on the compute device.
            accuracy_sum += (predicted == y_batch).float().mean().item() * 100
    return loss_sum / len(loader), accuracy_sum / len(loader)


device = torch.device("cuda" if GPU and torch.cuda.is_available() else "cpu")
siamese_net = SiameseNetwork(input_shape).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(siamese_net.parameters(), lr=0.0025)

for epoch in range(number_epochs):
    torch.cuda.empty_cache()

    # training
    train_loss, train_accuracy = _run_epoch(
        siamese_net, train_loader, criterion, device, optimizer
    )
    # validating
    valid_loss, valid_accuracy = _run_epoch(
        siamese_net, valid_loader, criterion, device
    )

    # print results
    train_loss_history.append(train_loss)
    print("===> Epoch {} Complete: Avg. training Loss: {:.4f}, Train Accuracy: {:.4f}".format(epoch, train_loss, train_accuracy))
    valid_loss_history.append(valid_loss)
    print("===> Epoch {} Complete: Avg. valid Loss: {:.4f}, Valid Accuracy: {:.4f}".format(epoch, valid_loss, valid_accuracy))
x1 and x2 are arrays of size 6x128, and y is 0 or 1 depending on whether x1 and x2 are similar or not. I split the data into training and validation sets, but the validation loss increases after every epoch while the training loss decreases. To check for overfitting, I tried dropout, fewer layers, and fewer nodes, but the issue persists. I am new to PyTorch, so I would like to ask whether my implementation is correct. I also tried using random x1 and x2, and the network did not learn anything (it had the same loss after each epoch), which I think is the expected behaviour. I would appreciate any suggestions.