Hey, I’m trying to fine-tune BER, using BoolQ dataset from SuperGLUE
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
df = pd.read_json(file_train, lines=True)
train_df, dev_df = train_test_split(df, test_size=0.1, random_state=42)
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize and format the data
train_encodings = tokenizer(train_df['question'].tolist(),
train_df['passage'].tolist(),
padding=True,
truncation=True,
max_length=512,
add_special_tokens=True, # Add [CLS] and [SEP] tokens.
return_tensors='pt')
dev_encodings = tokenizer(dev_df['question'].tolist(),
dev_df['passage'].tolist(),
padding=True,
truncation=True,
max_length=512,
add_special_tokens=True, # Add [CLS] and [SEP] tokens.
return_tensors='pt')
import torch
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
# Set up the device (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define the hyperparameters
epochs = 3
batch_size = 3
learning_rate = 2e-5
eps = 1e-8
# Initialize the model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2,
output_attentions = False,
output_hidden_states = False)
model.to(device)
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'],
train_encodings['attention_mask'],
torch.tensor(train_df['label'].tolist()).float().unsqueeze(1))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataset = torch.utils.data.TensorDataset(dev_encodings['input_ids'],
dev_encodings['attention_mask'],
torch.tensor(dev_df['label'].tolist()).float().unsqueeze(1))
dev_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
total_steps = len(train_dataset) * epochs // batch_size
warmup_steps = total_steps // 10 # 10% of the total number of steps. Usually good.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
loss_fn = torch.nn.CrossEntropyLoss().to(device)
# Define the training loop
def train_eval_loop(train_loader, dev_loader, model, loss_fn, optimizer, scheduler, device):
model.train()
train_loss_meter = AverageMeter()
total_loss = 0.0
for batch in train_loader:
# Unpack the batch
input_ids, attention_mask, labels = batch
input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
print(labels)
print(input_ids)
print(input_ids.shape)
print(attention_mask.shape)
# Zero gradients
optimizer.zero_grad()
outputs, logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
loss = loss_fn(outputs.logits, labels)
loss.backward()
# Clip the gradients to avoid exploding gradients.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update the weights
optimizer.step()
scheduler.step()
train_loss_meter.update(loss.item())
dev_loss_meter = AverageMeter()
model.eval()
for batch in dev_loader:
# batch_inputs = batch_inputs.to(device)
# batch_targets = batch_targets.to(device)
input_ids, attention_mask, labels = batch
input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
with torch.no_grad():
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
outputs.to(device)
loss = loss_fn(outputs.logits, labels)
dev_loss_meter.update(loss.item())
print(f'\t Train loss:{train_loss_meter.avg}')
print(f'\t Val(dev) loss:{dev_loss_meter.avg}')
But when I train
# Train and eval the model
for epoch in range(epochs):
train_loss = train_eval_loop(train_loader, dev_loader, model, loss_fn, optimizer, scheduler, device)
print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')
I’m having ValueError: Target size (torch.Size([3, 1])) must be the same as input size (torch.Size([3, 2]))
Any ideas, what may be wrong?
my attention_mask and input_ids are [3, 512], and labels are [3, 1]