I am training a kind of equation model. For example, given the input "A plus B should be 0", the output of the model should be: A+B=0. I'm doing transfer learning from a MarianMT model, with code that was generated by ChatGPT (GPT-3), but I don't know why the output is nowhere near what I expect. This is my training code:
import pandas as pd
import torch
from transformers import MarianTokenizer, MarianMTModel, MarianConfig
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer
from tqdm import tqdm
# Define a custom dataset class for your translation data
class CustomTranslationDataset(Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, max_length):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_text = self.source_texts[idx]
        target_text = self.target_texts[idx]
        encoding = self.tokenizer(
            source_text,
            target_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": encoding["input_ids"].squeeze(),
        }
# Define the paths to your CSV files
train_csv_path = 'train_dataset.csv' # Replace with the path to your training data CSV
validation_csv_path = 'val_dataset.csv' # Replace with the path to your validation data CSV
# Define the column names containing the source and target texts
source_column_name = 'english_statement' # Replace with the name of the source text column
target_column_name = 'eq_representation' # Replace with the name of the target text column
# Initialize the MarianMT model and tokenizer for transfer learning
pretrained_model_name = "Helsinki-NLP/opus-mt-en-es" # Replace with a pre-trained model for a related language pair
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
config = MarianConfig.from_pretrained(pretrained_model_name)
model = MarianMTModel.from_pretrained(pretrained_model_name, config=config)
# Prepare your training dataset and dataloader
train_data = pd.read_csv(train_csv_path)
train_source_texts = train_data[source_column_name].tolist()
train_target_texts = train_data[target_column_name].tolist()
max_seq_length = 128 # Maximum sequence length for training data
train_dataset = CustomTranslationDataset(train_source_texts, train_target_texts, tokenizer, max_seq_length)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Initialize the optimizer and learning rate scheduler
num_epochs = 500 # Adjust as needed
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {average_loss:.4f}")
# Save the fine-tuned model
model.save_pretrained("fine_tuned_marianmt_model")
# Save the tokenizer
tokenizer.save_pretrained("fine_tuned_marianmt_model/tokenizer")
And this is my evaluation code:
from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer
# Load the saved tokenizer using AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_marianmt_model/tokenizer")
# Initialize the MarianMT model for inference
model_name = "fine_tuned_marianmt_model" # Replace with the path to your fine-tuned model
model = MarianMTModel.from_pretrained(model_name)
# Define the source text to be translated
source_text = "when signal1 rise, signal2 should fell"
# Tokenize the source text
inputs = tokenizer(source_text, return_tensors="pt")
decoded_tokens = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
# Split the decoded tokens into individual tokens
tokens = decoded_tokens.split()
# Print out the individual tokens
print("Individual Tokens:")
for token in tokens:
    print(token)
# Translate the source text to the target language
translated_ids = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
# Print the translated text
print("Translated Text:", translated_text)
The output I'm getting from the evaluation code looks like this:
Individual Tokens:
▁when▁signal1▁rise,▁signal2▁should▁fell
Translated Text: cuando▁signal1▁rise,▁signal2▁should▁fell
(I know this example is not about equations, but it is related to the real application; the expected output should contain symbols, like an equation.)
I'm not sure why the output is being partly translated into Spanish. It looks as if the fine-tuning on my training data is not being applied, and I have no idea what the cause could be; neither does ChatGPT.
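To rule out the possibility that the base checkpoint is simply being reloaded at inference time, here is a quick check I could run (just a sketch, reusing the same model paths as in my scripts): it counts how many parameter tensors of the fine-tuned model differ from the original pre-trained one.
import torch
from transformers import MarianMTModel

# Load the original base checkpoint and my fine-tuned checkpoint side by side
base = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-es")
tuned = MarianMTModel.from_pretrained("fine_tuned_marianmt_model")

# If fine-tuning actually changed the weights, at least some tensors should differ
changed = sum(
    not torch.equal(p_base, p_tuned)
    for (_, p_base), (_, p_tuned) in zip(base.named_parameters(), tuned.named_parameters())
)
total = sum(1 for _ in base.named_parameters())
print(f"{changed}/{total} parameter tensors differ from the base model")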
My training data consists of 20 base examples, which I augmented by substituting synonyms, rephrasing each sentence, and changing the numeric values, so in the end I have more than 10K training pairs.
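Roughly, the augmentation works like this (a simplified sketch; the synonym table, the base pair, and the number range are only illustrative placeholders, not my real lists):
import random

# Illustrative synonym map; my real lists are larger and application specific
SYNONYMS = {
    "plus": ["plus", "added to", "summed with"],
    "should be": ["should be", "must equal", "has to be"],
}

def augment(sentence, equation, n_variants=5):
    """Generate paraphrased copies of one (english_statement, eq_representation) pair."""
    variants = []
    for _ in range(n_variants):
        new_sentence, new_equation = sentence, equation
        # Swap phrases for random synonyms to vary the wording
        for phrase, options in SYNONYMS.items():
            new_sentence = new_sentence.replace(phrase, random.choice(options))
        # Change the numeric value consistently in both the sentence and the equation
        new_value = str(random.randint(0, 99))
        new_sentence = new_sentence.replace("0", new_value)
        new_equation = new_equation.replace("0", new_value)
        variants.append((new_sentence, new_equation))
    return variants

print(augment("A plus B should be 0", "A+B=0"))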
I also tried different models, like a BERT transformer, but the results were not good either: the generated text consisted of random symbols unrelated to my training data.
The loss seems to be near 0 during training.
Any help would be appreciated.