Can you help me figure out whether there are any issues specific to my implementation?
Model:
class BertClassifier(nn.Module):
    """
    Classifier with a BERT encoder followed by a single fully connected classification layer.
    """
    def __init__(self, dropout=0.5, num_labels=24):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_labels)
        self.relu = nn.ReLU()
        self.best_score = 0

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        output = self.relu(self.linear(self.dropout(pooled_output)))
        return output
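For reference, here is roughly how the inputs reach forward(). This is an illustrative sketch, not my exact pipeline: the texts list and max_length value are placeholders (my real data comes from a DataFrame), but the tokenizer call matches what I use.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertClassifier()
texts = ["first example sentence", "second example sentence"]  # placeholder batch
inputs = tokenizer(texts, padding='max_length', max_length=128,
                   truncation=True, return_tensors='pt')
# forward() takes the token ids and the attention mask as separate arguments
logits = model(inputs['input_ids'], inputs['attention_mask'])  # shape: (batch, num_labels)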
Helper objects:
device = torch.device("cuda" if use_cuda else "cpu")
criterion = nn.CrossEntropyLoss().cuda() if use_cuda else nn.CrossEntropyLoss()
# Set eps to 1e-04 so the epsilon stays representable in float16
optimizer = Adam(model.parameters(), lr=learning_rate, eps=1e-04)
# Use a GradScaler for mixed-precision training (float16 and float32)
scaler = torch.cuda.amp.GradScaler()
# Use a scheduler to reduce the learning rate gradually
scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
if use_cuda:
    # Cast the model to float16 to reduce GPU memory load
    model = model.cuda().to(dtype=torch.float16)
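For comparison, the setup in the official torch.cuda.amp examples keeps the model parameters in float32 and lets autocast pick the per-op dtype during the forward pass; the only place I deviate is that final cast of the whole model to float16. A minimal sketch of the documented pattern, reusing the same variable names as above:

# Reference AMP setup (pattern from the PyTorch AMP docs), for comparison:
model = BertClassifier().cuda()        # parameters stay in float32
optimizer = Adam(model.parameters(), lr=learning_rate, eps=1e-04)
scaler = torch.cuda.amp.GradScaler()   # scales the loss so float16 grads don't underflow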
Training steps:
def forward_pass(auxiliaries, inputs, label):
    device, criterion, optimizer, scaler, model, _ = auxiliaries
    label = label.to(device)
    mask = inputs['attention_mask'].to(device)
    input_id = inputs['input_ids'].squeeze(1).to(device)
    with torch.cuda.amp.autocast():
        output = model(input_id, mask)
        loss = criterion(output, label)
    return loss

def backward_pass(auxiliaries, batch_loss):
    _, _, optimizer, scaler, model, _ = auxiliaries
    scaler.scale(batch_loss).backward()
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()

def train_loop(auxiliaries, train_dataloader):
    for train_input, train_label in tqdm(train_dataloader):
        batch_loss = forward_pass(auxiliaries, train_input, train_label)
        backward_pass(auxiliaries, batch_loss)
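For reference, the per-batch ordering in the official GradScaler examples looks like the sketch below (reference_step is a name I made up for illustration); my version above differs mainly in calling optimizer.zero_grad() after scaler.update() rather than before backward().

def reference_step(model, criterion, optimizer, scaler, input_id, mask, label):
    optimizer.zero_grad()             # clear gradients before the new batch
    with torch.cuda.amp.autocast():   # run the forward pass under mixed precision
        output = model(input_id, mask)
        loss = criterion(output, label)
    scaler.scale(loss).backward()     # backprop through the scaled loss
    scaler.step(optimizer)            # unscales gradients, skips the step on inf/nan
    scaler.update()                   # adjust the scale factor for the next batch
    return loss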
Error trace:
Traceback (most recent call last):
  File "/bert_extraction/bert_extraction_main.py", line 27, in <module>
    ranked_train = train(model, df_train, df_val, ENV, label_converter)
  File "/bert_extraction/train_test.py", line 221, in train
    train_results = train_loop(auxiliaries, train_dataloader)
  File "/bert_extraction/train_test.py", line 140, in train_loop
    backward_pass(auxiliaries, batch_loss)
  File "/bert_extraction/train_test.py", line 119, in backward_pass
    scaler.step(optimizer)
  File "/anaconda/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py", line 334, in step
    self.unscale_(optimizer)
  File "/anaconda/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py", line 279, in unscale_
    optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
  File "/anaconda/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py", line 207, in _unscale_grads_
    raise ValueError("Attempting to unscale FP16 gradients.")
ValueError: Attempting to unscale FP16 gradients.