Before posting a query, check the FAQs - it might already be answered!
Apologies, I know there are several topics about this, but I can't resolve the error.
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# Prefer the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Read the tab-separated dataset into a DataFrame.
df = pd.read_csv('Dataset.csv', sep='\t')
# Bare expression: the preview is computed but its result is discarded here.
df.head()
# Preprocessing
# Rows with a missing label are repaired from their "content" text: the final
# character is taken as the label.
# NOTE(review): assumes those rows end in "<separator><label>" — confirm
# against the raw file.
null_idx = df[df.lable.isnull()].index
raw_content = df.loc[null_idx, "content"]
# Convert to int before assigning so no strings are written into the label column.
df.loc[null_idx, "lable"] = raw_content.str[-1].astype(int)
# Strip the trailing two characters and keep the rest of the text. Indexing
# with [-2] would keep only a single character instead of the sentence.
df.loc[null_idx, "content"] = raw_content.str[:-2]
df = df.astype({"lable": int})
df.info()
# Split: an 80% random sample (fixed seed) is used for training and the
# remaining rows for testing.
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)
print('train data before removing', len(train_data))
print('test data before removing', len(test_data))

# Remove duplicates: drop rows whose "content" repeats within each split.
train_data = train_data.drop_duplicates(subset=["content"])
test_data = test_data.drop_duplicates(subset=["content"])
print('train data after removing', len(train_data))
print('test data after removing', len(test_data))
MODEL_NAME = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def _encode(texts):
    """Tokenize *texts* into PyTorch tensors.

    Sequences are truncated to at most 128 tokens and padded, with special
    tokens added.
    """
    return tokenizer(
        list(texts),
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )


tokenized_train_sentences = _encode(train_data["content"])
# The evaluation encodings must come from the test split so that they line up
# one-to-one with the test labels.
tokenized_test_sentences = _encode(test_data["content"])
class cursedataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from field name to an indexable collection
            (tensor, array or list) holding one entry per example.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example *idx* as a dict of tensors including a "labels" entry."""
        # as_tensor accepts tensors, arrays and lists alike and avoids the
        # copy-construct warning torch.tensor() emits for existing tensors.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        # Force int64: classification losses expect long class indices, while
        # the incoming label array may be int32 depending on the platform.
        item["labels"] = torch.as_tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)
# Extract the labels as explicit int64 arrays. A plain `int` dtype can be
# 32-bit on some platforms, and classification losses require 64-bit class
# indices. Casting the DataFrame column to float after this point would not
# change the extracted arrays, so no such cast is performed.
train_label = train_data["lable"].to_numpy(dtype="int64")
test_label = test_data["lable"].to_numpy(dtype="int64")

# Pair each split's encodings with its labels.
train_dataset = cursedataset(tokenized_train_sentences, train_label)
test_dataset = cursedataset(tokenized_test_sentences, test_label)
# Load the pretrained checkpoint configured for two output labels and move it
# to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)
model.to(device)

# Training configuration: checkpoints go to the current directory, logs to
# ./logs, and at most two checkpoints are kept.
training_args = TrainingArguments(
    output_dir='./',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=2,
)
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores whose argmax over the last axis is the
            predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element returned (support) is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf_scores[2]
    metrics['precision'] = prf_scores[0]
    metrics['recall'] = prf_scores[1]
    return metrics
# Bundle the model, training configuration, datasets and metric callback,
# then start fine-tuning.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)
trainer.train()
The error appears in `trainer.train()`.