Hello everyone!
I have the same issue as the topic "Errors when fine-tuning T5" (Beginners - Hugging Face Forums), but I was not able to solve it using the solution given there.
I’m fine-tuning the model using BertForSequenceClassification.from_pretrained(model_name, num_labels=1).
Here is the dataset class:
import re

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class MyDataset(Dataset):
    def __init__(self, df, tokenizer_name=MODEL_name, max_length=1024):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.seqs, self.labels = self.load_dataset(df)
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def load_dataset(self, df):
        seq = list(df['sequence'])
        label = list(df['score'])
        assert len(seq) == len(label)
        return seq, label

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # strip whitespace, then put a space between every character
        seq = " ".join("".join(self.seqs[idx].split()))
        seq = re.sub(r"[UZOB]", "X", seq)  # replace U/Z/O/B with X
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length',
                                 max_length=self.max_length, return_tensors='pt').to(device)
        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
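A quick sanity check on a single item already hints at the shape issue I describe below (train_dataset_clean is my pandas DataFrame with 'sequence' and 'score' columns; device is defined earlier in my script):

ds = MyDataset(df=train_dataset_clean)
item = ds[0]
for key, val in item.items():
    print(key, val.shape)
# input_ids and attention_mask come out as torch.Size([1, 1024]),
# i.e. each single sample already carries a leading dimension of 1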
I then wrap this dataset in a DataLoader for batching:
from torch.utils.data import DataLoader

train_seqs_encodings_dataset = MyDataset(df=train_dataset_clean)
train_loader = DataLoader(
    train_seqs_encodings_dataset,
    batch_size=64,
    shuffle=True,
)
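In the training loop I unpack each batch roughly like this (simplified sketch of my loop; the keys match what __getitem__ returns):

for batch in train_loader:
    b_input_ids = batch['input_ids'].to(device)
    b_token_type_ids = batch['token_type_ids'].to(device)
    b_input_mask = batch['attention_mask'].to(device)
    b_labels = batch['labels'].to(device)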
But when I call the model inside that loop:
loss, predictions = model(b_input_ids,
                          token_type_ids=b_token_type_ids,
                          attention_mask=b_input_mask,
                          labels=b_labels)
I got the following error:
    923 elif input_ids is not None:
    924     input_shape = input_ids.size()
    925     batch_size, seq_length = input_shape
    926 elif inputs_embeds is not None:
    927     input_shape = inputs_embeds.size()[:-1]

ValueError: too many values to unpack (expected 2)
When I print input_ids.size() for one batch I get torch.Size([64, 1, 1024]), but I believe the model expects a shape of (64, 1024). So the question is: why does each batch come out with this extra dimension of size 1?
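My current suspicion, as a minimal sketch in case it helps: because __getitem__ calls the tokenizer with return_tensors='pt', each individual sample already has a leading batch dimension of 1, and the DataLoader's default collate function then stacks 64 such tensors into (64, 1, 1024):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(MODEL_name, do_lower_case=False)
out = tok("A B C", truncation=True, padding='max_length',
          max_length=1024, return_tensors='pt')
print(out['input_ids'].shape)  # torch.Size([1, 1024]) -- note the leading 1

# If that is the cause, squeezing that dimension in __getitem__ should give (64, 1024):
# sample = {key: val.squeeze(0) for key, val in seq_ids.items()}

Does that explanation sound right, or is something else going on?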
Thank you for your help