e.g. pre-trained BERT result for sentence cosine similarity
======================
Query: milk with chocolate flavor
Top 10 most similar sentences in corpus:
Milka milk chocolate 100 g (Score: 0.8672)
Alpro, Chocolate soy drink 1 ltr (Score: 0.6821)
Danone, HiPRO 25g Protein chocolate flavor 330 ml (Score: 0.6692)
In the above example I am searching for "milk", so milk-related products should rank first, but the top result is a chocolate product instead. How do I fine-tune the model so that similarity ranking improves?
I searched online but did not find a proper solution — please help.
Code:
# Rank every corpus sentence by cosine similarity to each query and print
# the closest matches, best first.
#
# BUG FIX: a bare `import scipy` does not reliably expose the
# `scipy.spatial.distance` subpackage -- it must be imported explicitly,
# otherwise `scipy.spatial.distance.cdist` can raise AttributeError.
import scipy.spatial
import numpy as np
from sentence_transformers import models, SentenceTransformer

# NOTE(review): this checkpoint is a general masked-LM encoder, not a model
# trained for sentence similarity; a checkpoint fine-tuned on similarity data
# would likely produce better rankings -- confirm before relying on scores.
model = SentenceTransformer('distilbert-base-multilingual-cased')

corpus = [
    "Alpro, Chocolate soy drink 1 ltr",
    "Milka milk chocolate 100 g",
    "Danone, HiPRO 25g Protein chocolate flavor 330 ml"
]
corpus_embeddings = model.encode(corpus)  # one embedding vector per sentence

queries = [
    'milk with chocolate flavor',
]
query_embeddings = model.encode(queries)

# Calculate cosine similarity of each query against every corpus sentence.
closest_n = 10  # print at most this many matches per query
for query, query_embedding in zip(queries, query_embeddings):
    # cdist returns cosine *distance* (1 - similarity): smaller is closer.
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Pair each corpus index with its distance and sort ascending (best first).
    results = sorted(zip(range(len(distances)), distances), key=lambda x: x[1])

    print("\n======================\n")
    print("Query:", query)
    print("\nTop 10 most similar sentences in corpus:")
    for idx, distance in results[0:closest_n]:
        # Convert distance back to a similarity score for display.
        print(corpus[idx].strip(), "(Score: %.4f)" % (1-distance))
My fine-tuning attempt:
# Fine-tuning attempt: train a binary sentence classifier ("liquid" vs. not)
# on a tiny labelled dataset using the TensorFlow Trainer.
#
# BUG FIX: `!pip install ...` is notebook shell magic and a syntax error in a
# plain .py file -- run it once in the notebook/shell instead:
# !pip install sentence-transformers

import scipy
import numpy as np
from sentence_transformers import models, SentenceTransformer

# Works with Arabic, Chinese, Dutch, English, French, German, Italian, Korean,
# Polish, Portuguese, Russian, Spanish, Turkish.
model = SentenceTransformer('distiluse-base-multilingual-cased')

# --- Fine-tuning data (Italian product descriptions) ------------------------
import pandas as pd

df = pd.DataFrame({
    "message": [
        "latte al cioccolato",
        "Alpro, Cioccolato bevanda a base di soia 1 ltr ",  # Alpro, Chocolate soy drink 1 ltr
        "Milka cioccolato al latte 100 g",  # Milka milk chocolate 100 g
        "Danone, HiPRO 25g Proteine gusto cioccolato 330 ml",  # Danone, HiPRO 25g Protein chocolate flavor 330 ml
    ],
    "lbl": ["liquid", "liquid", "chocolate", "liquid"]
})
df

X = list(df['message'])
y = list(df['lbl'])
# One-hot encode the labels and keep the 'liquid' column -> binary 0/1 targets.
y = list(pd.get_dummies(y, drop_first=True)['liquid'])

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("kiri-ai/distiluse-base-multilingual-cased-et")
encodings = tokenizer(X, truncation=True, padding=True)

import tensorflow as tf
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(encodings),
    y
))

from transformers import TFAutoModelForSequenceClassification, TFTrainer, TFTrainingArguments

training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

with training_args.strategy.scope():
    # BUG FIX: `AutoModel.from_pretrained` returns a *PyTorch* backbone with no
    # classification head, so TFTrainer cannot train it against the labels.
    # Load the TensorFlow sequence-classification variant instead; from_pt=True
    # converts the PyTorch checkpoint weights, num_labels=2 adds a binary head.
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "kiri-ai/distiluse-base-multilingual-cased-et",
        num_labels=2,
        from_pt=True,
    )

trainer = TFTrainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
)
trainer.train()