I have been trying to fine-tune a BERT model with a custom classification head, first on a single GPU and later with distributed training across multiple GPUs. However, I can't figure out the error below.
import numpy as np
import torch
from collections import defaultdict
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", max_length=512)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS, id2label=id2label, label2id=label2id, return_dict=False)

class Classifier(torch.nn.Module):
    def __init__(self, bert_model):
        super(Classifier, self).__init__()
        # Reuse the embedding layer, encoder stack and classification head of the pretrained model
        self.embedding = bert_model.bert.embeddings
        self.encoder = bert_model.bert.encoder
        self.classifier = bert_model.classifier

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        emb_out = self.embedding(input_ids)
        print(f'Shape after Embedding: {emb_out.shape}')
        enc_out = self.encoder(emb_out)
        #print(f'Shape after encoder: {enc_out.shape}')
        classifier_out = self.classifier(enc_out[0])
        print(f'Shape after classifier: {classifier_out.shape}')
        return classifier_out

bert = Classifier(model)
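For context, the train_data_loader and val_data_loader used below are built roughly like this (a sketch of my setup; the TextDataset wrapper, the 'text'/'label' column names, and the batch size of 2 are simplifications rather than the exact code):

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    # Simplified dataset: one text column and one integer label column per row
    def __init__(self, df, tokenizer):
        self.encodings = tokenizer(list(df['text']), truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        self.labels = torch.tensor(df['label'].values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {'input_ids': self.encodings['input_ids'][idx],
                'attention_mask': self.encodings['attention_mask'][idx],
                'labels': self.labels[idx]}

train_data_loader = DataLoader(TextDataset(df_train, tokenizer), batch_size=2, shuffle=True)
val_data_loader = DataLoader(TextDataset(df_val, tokenizer), batch_size=2)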
EPOCHS = 2
LR = 1e-5

optimizer = torch.optim.AdamW(bert.parameters(), lr=LR)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
loss_fn = torch.nn.CrossEntropyLoss()
def train_model(model, data_loader, loss_fn, optimizer, scheduler, n_examples):
    model = model.train()  # Explicitly setting model to train state
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d['input_ids']
        attention_mask = d['attention_mask']
        # Reshaping attention mask as per the input shape of tensor in the forward pass
        reshaped_attention_mask = attention_mask.reshape(d['attention_mask'].shape[0], 1, 1, d['attention_mask'].shape[1])
        targets = d['labels']
        print(f'Target shape:{targets.shape}')

        outputs = model(input_ids=input_ids, attention_mask=reshaped_attention_mask)
        print(f'Output shape:{outputs.shape}')

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the gradients of the model to prevent exploding gradients using clip_grad_norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)
def eval_model(model, data_loader, loss_fn, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids']
            attention_mask = d['attention_mask']
            # Reshaping attention mask as per the input shape of tensor in the forward pass
            reshaped_attention_mask = attention_mask.reshape(d['attention_mask'].shape[0], 1, 1, d['attention_mask'].shape[1])
            targets = d['labels']

            outputs = model(input_ids=input_ids, attention_mask=reshaped_attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_model(bert, train_data_loader, loss_fn, optimizer, scheduler, len(df_train))
    print(f'Train Loss: {train_loss} ; Train Accuracy: {train_acc}')

    val_acc, val_loss = eval_model(bert, val_data_loader, loss_fn, len(df_val))
    print(f'Val Loss: {val_loss} ; Val Accuracy: {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(bert.state_dict(), 'multi_gpu_bert_best_model_state.bin')
        best_accuracy = val_acc
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
File <timed exec>:8
Cell In[60], line 18, in train_model(model, data_loader, loss_fn, optimizer, scheduler, n_examples)
15 _, preds = torch.max(outputs, dim = 1)
---> 18 loss = loss_fn(outputs, targets)
20 correct_predictions += torch.sum(preds == targets)
21 losses.append(loss.item())
File /opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/nn/modules/loss.py:1174, in CrossEntropyLoss.forward(self, input, target)
1173 def forward(self, input: Tensor, target: Tensor) -> Tensor:
-> 1174 return F.cross_entropy(input, target, weight=self.weight,
1175 ignore_index=self.ignore_index, reduction=self.reduction,
1176 label_smoothing=self.label_smoothing)
File /opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/nn/functional.py:3026, in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
3024 if size_average is not None or reduce is not None:
3025 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3026 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
RuntimeError: Expected target size [2, 800], got [2]
These are the shapes printed during the forward pass that crashed:

Target shape:torch.Size([2])
Shape after Embedding: torch.Size([2, 512, 768])
Shape after classifier: torch.Size([2, 512, 800])
Output shape:torch.Size([2, 512, 800])
My data has two columns: the first contains the text and the second the label. I also tried to adapt this code for distributed data parallel training on multiple GPUs, but I haven't been able to get it working either; a rough sketch of what I was attempting is below. Do you have any suggestions? Thanks in advance!
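For reference, this is roughly the direction I was taking for the multi-GPU part (a minimal sketch, not working code; the process-group setup, sampler, and device handling are just my reading of the PyTorch DistributedDataParallel tutorial, launched with torchrun, and train_dataset stands for the Dataset behind train_data_loader):

import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler

# One process per GPU; torchrun sets LOCAL_RANK for each process
dist.init_process_group(backend='nccl')
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)

# Wrap the custom classifier and shard the training data across processes
ddp_model = DDP(bert.to(local_rank), device_ids=[local_rank])
train_sampler = DistributedSampler(train_dataset)
train_data_loader = DataLoader(train_dataset, batch_size=2, sampler=train_sampler)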