hello please am working on a text generation model that generates job descriptions based on the input and am trying to achieve this by finetuning the gpt2 model on my dataset of job descriptions.
but i have encounterd this problem that has held me for more than a week
import pandas as pd
import numpy as np
data = pd.read_csv('jobs.csv', error_bad_lines=False, engine='python')
data = data.drop(columns=['company_id','context', 'date', 'sal_high','sal_low','salary','post_id','post_url'])
data = data.dropna()
median = data['months_experience'].median()
data['months_experience'] = data['months_experience'].fillna(median)
data['title'] = data['title'].replace('/','or',regex=True)
data['months_experience'] = round(data['months_experience']/12)
data.head(3)
!pip install -q transformers
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')
# Preprocessing function
def preprocess_text(text):
# Tokenization
tokens = word_tokenize(text)
# Lowercasing
tokens = [token.lower() for token in tokens]
# Removing punctuation
table = str.maketrans('', '', string.punctuation)
tokens = [token.translate(table) for token in tokens]
# Handling stopwords
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]
# Join the tokens back into a single string
processed_text = ' '.join(tokens)
return processed_text
data['description'] = data['description'].apply(preprocess_text)
from sklearn.model_selection import train_test_split
# Split the dataset into training, validation, and test sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_data, test_size=0.5, random_state=42)
# Print the shapes of the resulting sets
print("Train set shape:", train_data.shape)
print("Validation set shape:", val_data.shape)
print("Test set shape:", test_data.shape)
# train_data.head()
input_texts = []
target_texts = []
# Iterate through the training data
for index, row in train_data.iterrows():
prompt = f"Job Category: {row['Industries']} | Seniority Level: {row['Seniority level']} | Location: {row['location']} | title: {row['title']} | education: {row['education']} | months_experience: {row['months_experience']} | Employment type: {row['Employment type']}"
description = row['description']
# Add the prompt and description to the input-output pairs
input_texts.append(prompt)
target_texts.append(description)
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Tokenize and encode the input and target texts
input_ids = tokenizer.batch_encode_plus(
input_texts,
padding=True,
truncation=True,
max_length=1024, # Adjust as per your requirements
return_tensors='pt'
)['input_ids']
target_ids = tokenizer.batch_encode_plus(
target_texts,
padding=True,
truncation=True,
max_length=1024, # Adjust as per your requirements
return_tensors='pt'
)['input_ids']
print(len(input_ids))
print(len(target_ids))
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
# Load the GPT-2 model and tokenizer
model_name = 'gpt2' # You can choose different GPT-2 variations, e.g., 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
# Set the device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Convert the input and target IDs to tensors and move to the device
input_ids = input_ids.to(device)
target_ids = target_ids.to(device)
# Set the training parameters
epochs = 5
batch_size = 8
learning_rate = 1e-4
# Create the data loader
data = torch.utils.data.TensorDataset(input_ids, target_ids)
data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True)
# Set the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()
# Training loop
model.train()
for epoch in range(epochs):
total_loss = 0
for batch in data_loader:
optimizer.zero_grad()
input_batch, target_batch = batch
# Ensure the input and target tensors have the same shape
input_batch = input_batch.view(-1, input_batch.size(-1))
# target_batch = target_batch.view(-1, target_batch.size(-1))
target_batch = target_batch[:input_batch.size(0)]
# input_batch = F.pad(input_batch, (0, target_batch.size(0) - input_batch.size(0)))
outputs = model(input_ids=input_batch, labels=target_batch)
loss = outputs.loss
loss.backward()
optimizer.step()
total_loss += loss.item()
# Calculate and print the average loss for the epoch
average_loss = total_loss / len(data_loader)
print(f'Epoch: {epoch+1} | Average Loss: {average_loss:.4f}')
print(len(target_batch))
print(len(input_batch))
above is my entire code.
i thought my target batch and input batch had deferent dimentions but that was not the case