Still cant solve this (expected scalar type Long but found Int)

EthanlOLOO · May 21, 2024, 2:48pm

Before posting a query, check the FAQs - it might already be answered!

Apologies, I know there are several topics about this but i cant resolve the error

import pandas as pd 
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

device = torch.device('cuda:0'if torch.cuda.is_available()else 'cpu')
print(device)

df = pd.read_csv('Dataset.csv',sep='\t')

df.head()

#전처리
null_idx = df[df.lable.isnull()].index
df.loc[null_idx,"content"]

df.loc[null_idx, "lable"] = df.loc[null_idx, "content"].apply(lambda x: x[-1])
df.loc[null_idx, "content"] = df.loc[null_idx, "content"].apply(lambda x: x[-2])
df = df.astype({"lable":int})

df.info()


#분리

train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

print('train data before removing',format(len(train_data)))
print('test data before removing',format(len(test_data)))

#중복 제거

train_data.drop_duplicates(subset=["content"],inplace=True)
test_data.drop_duplicates(subset=["content"],inplace=True)

print('train data after removing',format(len(train_data)))
print('test data after removing',format(len(test_data)))

MODEL_NAME = "beomi/KcELECTRA-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenized_train_sentences = tokenizer(
    list(train_data["content"]),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

tokenized_test_sentences = tokenizer(
    list(train_data["content"]),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

class cursedataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    
train_label = train_data["lable"].values
test_label = test_data["lable"].values

train_data = train_data.astype({"lable": float})
test_data = test_data.astype({"lable": float})

train_dataset = cursedataset(tokenized_train_sentences, train_label)
test_dataset = cursedataset(tokenized_test_sentences, test_label)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

training_args = TrainingArguments(
    output_dir='./',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=2
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ =precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return{
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



trainer = Trainer(
    model=model,
    args = training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

error appear in ‘trainer.train()’

AlphaBetaGamma96 · May 21, 2024, 3:41pm

Hi @EthanlOLOO,

Can you share the full stack trace of your error?

EthanlOLOO · May 22, 2024, 4:04am

Exception has occurred: RuntimeError
expected scalar type Long but found Int
  File "C:\Users\ethan\Desktop\CoDe\train.py", line 121, in <module>
    trainer.train()
RuntimeError: expected scalar type Long but found Int

AlphaBetaGamma96 · May 22, 2024, 8:37am

Is that the full stacktrace or just the last part of the stacktrace?

You could check the types returned by the compute_metrics function and see if any are of type int?