Network always outputs the same vector when evaluating, but not during training

Hi everyone, I hope someone can help me with this.

I have constructed a Siamese network whose branches share the Longformer as implemented in Hugging Face. The dataset I am using for finetuning and testing is a parallel corpus with a binary target variable that labels two sequences as similar if they are from the same speaker and about the same topic.

I am training the network with a custom contrastive loss function based on the cosine similarity of the encoded sequences (see below).
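Per pair, this is the standard contrastive form

loss(y1, y2, d) = (1 - d) * D^2 + d * max(0, m - D)^2, with D = 1 - cos(y1, y2),

where similar pairs (d = 0) are pulled together and dissimilar pairs (d = 1) are pushed apart until they clear the margin m.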

The issue I am facing is that after finetuning (so far only on a very small sample of ~200 sequence pairs), the network outputs the same vector for all speeches. The odd thing is that this does not happen every time, even though I always use the same data for finetuning.

What is also odd is that when I check the similarity of the encodings during finetuning, the outputs show some variance. However, when I evaluate the model on the same data used for finetuning, the similarity is always 1 for every pair.
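To narrow this down, one check is to compare the same pair in both modes, since dropout is active in train() mode but disabled in eval() mode. A minimal sketch (sequences1/sequences2 stand for one tokenized pair, built as in the evaluation loop below):

# compare the same pair in eval and train mode
siamese_network.eval()   # dropout off -> deterministic forward pass
with T.no_grad():
    e1, e2, _, _ = siamese_network(sequences1, sequences2)
    print('eval mode:', F.cosine_similarity(e1, e2, dim=1))

siamese_network.train()  # dropout on -> stochastic forward pass
with T.no_grad():
    e1, e2, _, _ = siamese_network(sequences1, sequences2)
    print('train mode:', F.cosine_similarity(e1, e2, dim=1))

If the eval-mode similarity is ~1 while the train-mode similarity varies between runs, the variance I see during finetuning may just be dropout noise on top of collapsed embeddings.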

import torch as T
import torch.nn.functional as F
import torch.optim as optim
import tqdm as tq
from datetime import datetime
from transformers import LongformerModel, LongformerTokenizer

class SiameseNetwork(T.nn.Module):
    def __init__(self, embedding_model):
        super(SiameseNetwork, self).__init__()
        self.embedding_model = embedding_model  # shared Longformer branch
        self.fc = T.nn.Linear(embedding_model.config.hidden_size, 256)  # projection head

    def forward(self, input1, input2):
        # use the first token (<s>/CLS) embedding as the sequence representation
        output1 = self.embedding_model(**input1).last_hidden_state[:, 0, :]
        output2 = self.embedding_model(**input2).last_hidden_state[:, 0, :]
        output1_fc = self.fc(output1)
        output2_fc = self.fc(output2)
        return output1_fc, output2_fc, output1, output2

# loss function
class ContrastiveLoss(T.nn.Module):
    def __init__(self, m):
        super(ContrastiveLoss, self).__init__()
        self.m = m  # margin or radius

    def forward(self, output1_fc, output2_fc, output1, output2, d):
        # d = 0 means y1 and y2 are supposed to be the same
        # d = 1 means y1 and y2 are supposed to be different

        # cosine distance for each pair in the batch (not a Euclidean distance)
        cos_dist = 1 - F.cosine_similarity(output1_fc, output2_fc, dim=1)

        # similar pairs (d == 0): penalize the squared distance;
        # dissimilar pairs (d == 1): penalize only distances inside the margin m.
        # computing both terms vectorized also handles batches that mix both labels
        delta = T.clamp(self.m - cos_dist, min=0.0)
        loss = (1 - d) * T.pow(cos_dist, 2) + d * T.pow(delta, 2)
        return T.mean(loss)
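As a quick sanity check, the loss can be called on dummy embeddings in isolation (illustrative values only; the third and fourth arguments are unused by the loss):

loss_fn = ContrastiveLoss(m=0.5)
a = T.randn(4, 256)  # stand-ins for the 256-dim projected embeddings
b = T.randn(4, 256)
d = T.tensor([0, 0, 1, 1])  # first two pairs similar, last two dissimilar
print(loss_fn(a, b, None, None, d))  # scalar tensor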

# training loop
def train_siamese_network(siamese_network, contrastive_loss, dataloader, optimizer, num_epochs=10, print_interval=3):
    for epoch in range(num_epochs):
        total_loss = 0.0
        print(f'Epoch {epoch}. Training start at: {datetime.now().time()}')

        batch_count = 0
        for batch in dataloader:
            batch_count += 1
            optimizer.zero_grad()

            input_ids1_batch, attention_mask1_batch, input_ids2_batch, attention_mask2_batch, labels = batch
            # move the batch to the same device as the model
            input_dict1 = {'input_ids': input_ids1_batch.to(device), 'attention_mask': attention_mask1_batch.to(device)}
            input_dict2 = {'input_ids': input_ids2_batch.to(device), 'attention_mask': attention_mask2_batch.to(device)}
            labels = labels.to(device)

            output1_fc, output2_fc, output1, output2 = siamese_network(input_dict1, input_dict2)

            print(f'cosine similarity: {F.cosine_similarity(output1_fc, output2_fc, dim=1)}')
            loss = contrastive_loss(output1_fc, output2_fc, output1, output2, labels)

            # check progress (fraction of batches seen this epoch)
            progress = batch_count / len(dataloader) * 100
            if batch_count % print_interval == 0:
                print(f'progress in epoch {epoch}: {progress}%; Time: {datetime.now().time()}; Loss current batch: {loss}.....')

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch} mean loss: {total_loss / batch_count}')

# initialize
device = T.device('cuda' if T.cuda.is_available() else 'cpu')
model_name = 'allenai/longformer-base-4096'  # example checkpoint
tokenizer = LongformerTokenizer.from_pretrained(model_name)
embedding_model = LongformerModel.from_pretrained(model_name).to(device)
siamese_network = SiameseNetwork(embedding_model).to(device)
siamese_network.train()
contrastive_loss = ContrastiveLoss(m=0.5).to(device)
optimizer = optim.Adam(siamese_network.parameters(), lr=0.0001)

# train
train_siamese_network(siamese_network, contrastive_loss, tokenized_dataloader_test, optimizer, num_epochs=2)

# evaluate
siamese_network.eval()
max_len = 4096

embedding1 = []
embedding2 = []
similarity = []

for index, values in tq.tqdm(train_data[["speech1", "speech2"]].iterrows()):
    input1, input2 = values
    
    # Tokenize the sequences for input mapping, also apply padding and truncation

    tokenized_input1 = tokenizer(input1, return_tensors="pt", padding='max_length', truncation=True, max_length=max_len)
    tokenized_input2 = tokenizer(input2, return_tensors="pt", padding='max_length', truncation=True, max_length=max_len)
    
    sequences1 = {key: value.to(device) for key, value in tokenized_input1.items()}
    sequences2 = {key: value.to(device) for key, value in tokenized_input2.items()}
    
    with T.no_grad():
        output1_fc, output2_fc, output1, output2 = siamese_network(sequences1, sequences2)
        
    cosine_similarity = F.cosine_similarity(output1_fc, output2_fc, dim=1)
    embedding1.append(output1_fc)
    embedding2.append(output2_fc)
    similarity.append(cosine_similarity)
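
Afterwards I stack the per-pair results for inspection (assuming at least one pair was processed):

all_sims = T.cat(similarity).cpu()  # one similarity value per pair
print(all_sims.min(), all_sims.max())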

### example output from the training loop
Epoch 1. Training start at: 20:59:32.611909
cosine similarity: tensor([0.7706], device='cuda:0', grad_fn=<...>)
cosine similarity: tensor([0.8191], device='cuda:0', grad_fn=<...>)
cosine similarity: tensor([0.7640], device='cuda:0', grad_fn=<...>)
progress in epoch 1: 1.4492753623188406%; Time: 20:59:35.079073; Loss current batch: 0.06967173516750336.....

### example output when comparing a pair from the training data with the finetuned model
tensor([1.0000], device='cuda:0')

So far I have adjusted the learning rate, the number of epochs, and the dimensionality of the representations, but the problem persists.

Thanks in advance for any help!