Semantic Search Fine-Tune

e.g. pre-trained BERT results for sentence cosine similarity

======================

Query: milk with chocolate flavor

Top 10 most similar sentences in corpus:
Milka milk chocolate 100 g (Score: 0.8672)
Alpro, Chocolate soy drink 1 ltr (Score: 0.6821)
Danone, HiPRO 25g Protein chocolate flavor 330 ml (Score: 0.6692)

In the above example I am searching for milk, so milk-related products should come first, but a chocolate product is returned in first place. How do I fine-tune the model so the similarity ranking reflects that?

I googled it but could not find a proper solution. Please help.

Code:

    import scipy
    import numpy as np
    from sentence_transformers import models, SentenceTransformer
    model = SentenceTransformer('distilbert-base-multilingual-cased')
    
    corpus = [
              "Alpro, Chocolate soy drink 1 ltr",
              "Milka milk chocolate 100 g",
              "Danone, HiPRO 25g Protein chocolate flavor 330 ml"
             ]
    corpus_embeddings = model.encode(corpus)
    
    queries = [
                'milk with chocolate flavor',
              ]
    query_embeddings = model.encode(queries)
    
    # Calculate the cosine similarity of the query against each corpus sentence
    closest_n = 10
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]
    
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
    
        print("\n======================\n")
        print("Query:", query)
        print("\nTop 10 most similar sentences in corpus:")
    
        for idx, distance in results[0:closest_n]:
            print(corpus[idx].strip(), "(Score: %.4f)" % (1-distance))
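For comparison, here is the same ranking written with the util helpers that ship with sentence-transformers instead of scipy. This is just a sketch; I am assuming util.semantic_search ranks by the same cosine score as the loop above:

    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer('distilbert-base-multilingual-cased')

    corpus = [
        "Alpro, Chocolate soy drink 1 ltr",
        "Milka milk chocolate 100 g",
        "Danone, HiPRO 25g Protein chocolate flavor 330 ml",
    ]
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode("milk with chocolate flavor", convert_to_tensor=True)

    # semantic_search sorts the corpus by cosine similarity to the query
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=10)[0]
    for hit in hits:
        print(corpus[hit['corpus_id']], "(Score: %.4f)" % hit['score'])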

Fine-tuning attempt:

!pip install sentence-transformers
import scipy
import numpy as np
from sentence_transformers import models, SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased') # works with Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish


#Fine-Tuning 
import pandas as pd
df = pd.DataFrame({
    "message":[
          "latte al cioccolato"  ,
          "Alpro, Cioccolato bevanda a base di soia 1 ltr ", #Alpro, Chocolate soy drink 1 ltr
          "Milka  cioccolato al latte 100 g", #Milka milk chocolate 100 g
          "Danone, HiPRO 25g Proteine gusto cioccolato 330 ml", #Danone, HiPRO 25g Protein chocolate flavor 330 ml
         ],
    "lbl":["liquid","liquid","chocolate","liquid"]
})
df


X=list(df['message'])
y=list(df['lbl'])


y=list(pd.get_dummies(y,drop_first=True)['liquid'])
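# get_dummies with drop_first=True keeps only the 'liquid' column, so y becomes [1, 1, 0, 1]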


from transformers import AutoTokenizer, AutoModel
  
tokenizer = AutoTokenizer.from_pretrained("kiri-ai/distiluse-base-multilingual-cased-et")
encodings = tokenizer(X, truncation=True, padding=True)


import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(encodings),
    y
))
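# each dataset element pairs the tokenized message (input_ids, attention_mask) with its 0/1 label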



from transformers import TFAutoModelForSequenceClassification, TFTrainer, TFTrainingArguments

training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)



with training_args.strategy.scope():
    # TFTrainer needs a TensorFlow model with a classification head;
    # AutoModel.from_pretrained returns a plain PyTorch model, so load the
    # TF sequence-classification class instead (from_pt=True converts the
    # PyTorch checkpoint in case no TF weights are published)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "kiri-ai/distiluse-base-multilingual-cased-et", num_labels=2, from_pt=True
    )

trainer = TFTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset         # training dataset
)

trainer.train()
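
Is something along these lines the right direction instead? This is only a sketch of the sentence-transformers fine-tuning API as I understand it (InputExample pairs trained with CosineSimilarityLoss); the pairs and the similarity labels below are invented for illustration, not real training data:

    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer('distiluse-base-multilingual-cased')

    # Invented query/product pairs; the label is the cosine similarity I want the model to learn
    train_examples = [
        InputExample(texts=["milk with chocolate flavor",
                            "Alpro, Chocolate soy drink 1 ltr"], label=0.9),
        InputExample(texts=["milk with chocolate flavor",
                            "Milka milk chocolate 100 g"], label=0.3),
        InputExample(texts=["milk with chocolate flavor",
                            "Danone, HiPRO 25g Protein chocolate flavor 330 ml"], label=0.8),
    ]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
    train_loss = losses.CosineSimilarityLoss(model)

    # fit() nudges the embeddings so that each pair's cosine similarity approaches its label
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=2,
              warmup_steps=10)

My hope is that after model.fit the query "milk with chocolate flavor" ranks drinkable milk products above solid chocolate, but I am not sure how to build the pairs and labels at scale.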