Getting "IndexError: index out of range in self" trying to set finetuning parameters for gpt2 transformer

Nkem_Diran · May 15, 2023, 3:12am

hello please am working on a text generation model that generates job descriptions based on the input and am trying to achieve this by finetuning the gpt2 model on my dataset of job descriptions.
but i have encounterd this problem that has held me for more than a week



import pandas as pd
import numpy as np

data = pd.read_csv('jobs.csv', error_bad_lines=False, engine='python')

data = data.drop(columns=['company_id','context', 'date', 'sal_high','sal_low','salary','post_id','post_url'])

data = data.dropna()

median = data['months_experience'].median()
data['months_experience'] = data['months_experience'].fillna(median)

data['title'] = data['title'].replace('/','or',regex=True)
data['months_experience'] = round(data['months_experience']/12)

data.head(3)

!pip install -q transformers

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')

# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Removing punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(table) for token in tokens]

    # Handling stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a single string
    processed_text = ' '.join(tokens)

    return processed_text

data['description'] = data['description'].apply(preprocess_text)

from sklearn.model_selection import train_test_split

# Split the dataset into training, validation, and test sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_data, test_size=0.5, random_state=42)

# Print the shapes of the resulting sets
print("Train set shape:", train_data.shape)
print("Validation set shape:", val_data.shape)
print("Test set shape:", test_data.shape)

# train_data.head()

input_texts = []
target_texts = []

# Iterate through the training data
for index, row in train_data.iterrows():
    prompt = f"Job Category: {row['Industries']} | Seniority Level: {row['Seniority level']} | Location: {row['location']} | title: {row['title']} | education: {row['education']} | months_experience: {row['months_experience']} | Employment type: {row['Employment type']}"
    description = row['description']

    # Add the prompt and description to the input-output pairs
    input_texts.append(prompt)
    target_texts.append(description)



from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Tokenize and encode the input and target texts
input_ids = tokenizer.batch_encode_plus(
    input_texts,
    padding=True,
    truncation=True,
    max_length=1024,  # Adjust as per your requirements
    return_tensors='pt'
)['input_ids']

target_ids = tokenizer.batch_encode_plus(
    target_texts,
    padding=True,
    truncation=True,
    max_length=1024,  # Adjust as per your requirements
    return_tensors='pt'
)['input_ids']
print(len(input_ids))
print(len(target_ids))

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

# Load the GPT-2 model and tokenizer
model_name = 'gpt2'  # You can choose different GPT-2 variations, e.g., 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# Set the device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Convert the input and target IDs to tensors and move to the device
input_ids = input_ids.to(device)
target_ids = target_ids.to(device)

# Set the training parameters
epochs = 5
batch_size = 8
learning_rate = 1e-4

# Create the data loader
data = torch.utils.data.TensorDataset(input_ids, target_ids)
data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True)

# Set the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_batch, target_batch = batch
        
        # Ensure the input and target tensors have the same shape
        input_batch = input_batch.view(-1, input_batch.size(-1))
        # target_batch = target_batch.view(-1, target_batch.size(-1))
        target_batch = target_batch[:input_batch.size(0)]
        # input_batch = F.pad(input_batch, (0, target_batch.size(0) - input_batch.size(0)))
        outputs = model(input_ids=input_batch, labels=target_batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Calculate and print the average loss for the epoch
    average_loss = total_loss / len(data_loader)
    print(f'Epoch: {epoch+1} | Average Loss: {average_loss:.4f}')

print(len(target_batch))
print(len(input_batch))

above is my entire code.
i thought my target batch and input batch had deferent dimentions but that was not the case