I am doing sentence-pair classification with BERT. First, I encode the sentence pairs as
train_encode = tokenizer(train1, train2, padding="max_length", truncation=True)
test_encode = tokenizer(test1, test2, padding="max_length", truncation=True)
where train1 and train2 are lists holding the first and second sentence of each pair (and test1, test2 likewise for the test set).
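For context, the inputs have roughly this shape (the sentences and the checkpoint name below are made-up placeholders, not my real data):

from transformers import BertTokenizer

checkpoint = "bert-base-uncased"   # placeholder; my real checkpoint may differ
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# train1 holds the first sentence of each pair, train2 the matching second sentence
train1 = ["A man is playing a guitar.", "The committee approved the budget."]
train2 = ["Someone is making music.", "The budget was rejected."]

train_encode = tokenizer(train1, train2, padding="max_length", truncation=True)
print(list(train_encode.keys()))   # ['input_ids', 'token_type_ids', 'attention_mask']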
Then I did:
train_seq = torch.tensor(train_encode['input_ids'])
train_mask = torch.tensor(train_encode['attention_mask'])
train_token = torch.tensor(train_encode['token_type_ids'])
train_y = torch.tensor(y_train.tolist())
And created the dataset as
train_data = TensorDataset(train_seq, train_mask, train_token, train_y)
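The DataLoader itself is built from that dataset roughly like this (a reconstruction; the RandomSampler and the batch size of 8 are assumptions, the batch size matching the batches printed further down):

from torch.utils.data import DataLoader, RandomSampler

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=8)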
The model is defined as
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=5)
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 5)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attn_masks, token_type_ids):
        _, cls_hs = self.bert(input_ids, attn_masks, token_type_ids)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
And wrapped the pretrained model as
model1 = BERT_Arch(model)
optimizer = AdamW(model1.parameters(), lr=0.01)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model1.to(device)
I created the training loop as
EPOCHS = 5
criterion = nn.CrossEntropyLoss()
total_loss, total_accuracy = 0, 0
total_preds = []

for epoch in range(EPOCHS):
    model1.train()
    total_train_loss = 0
    total_train_acc = 0
    for step, batch in enumerate(train_dataloader):
        batch = [r.to(device) for r in batch]
        input_id, attention_mask, token_type_id, y = batch
        optimizer.zero_grad()
        model1.zero_grad()
        prediction = model1(input_id, attention_mask, token_type_id)
        loss = criterion(prediction, y)
        total_loss = total_loss + loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model1.parameters(), 1.0)
        optimizer.step()
        preds = prediction.detach().cpu().numpy()
        total_preds.append(preds)
    avg_loss = total_loss / len(train_dataloader)
    print(avg_loss)

total_preds = np.concatenate(total_preds, axis=0)
I get the following error: ValueError: not enough values to unpack (expected 2, got 1)
I am not sure what I am doing wrong here. Any suggestions?
The output of
for step, batch in enumerate(train_dataloader):
    batch = [r.to(device) for r in batch]
    input_id, attention_mask, token_type_id, y = batch
is
tensor([[ 101, 3191, 1999, ..., 0, 0, 0],
[ 101, 2023, 11204, ..., 0, 0, 0],
[ 101, 6140, 1996, ..., 0, 0, 0],
...,
[ 101, 2023, 11204, ..., 0, 0, 0],
[ 101, 2275, 2039, ..., 0, 0, 0],
[ 101, 2023, 2240, ..., 0, 0, 0]], device='cuda:0')
tensor([[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
...,
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0]], device='cuda:0')
tensor([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], device='cuda:0')
tensor([3, 4, 1, 3, 1, 4, 2, 1], device='cuda:0')
If I don’t create a custom BERT model and just use
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=5)
and use the same training loop as above, I get the error TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not SequenceClassifierOutput
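For reference, here is a minimal, self-contained sketch of that second setup (placeholder checkpoint and a made-up sentence pair; it should raise the same TypeError, since the model returns a SequenceClassifierOutput object rather than a Tensor):

import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer

checkpoint = "bert-base-uncased"   # placeholder
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=5)

enc = tokenizer(["A man is playing a guitar."], ["Someone is making music."],
                padding="max_length", truncation=True, return_tensors="pt")
labels = torch.tensor([2])

output = model(enc["input_ids"], enc["attention_mask"], enc["token_type_ids"])
print(type(output))                       # SequenceClassifierOutput, not a Tensor

criterion = nn.CrossEntropyLoss()
loss = criterion(output, labels)          # raises the TypeError reported above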