Hi guys, I am trying to fine-tune BERT with PyTorch, and I use torch.nn.DataParallel to train the model on 8 GPUs. After the evaluation I delete the model and call torch.cuda.empty_cache().
The most interesting part is that while the script is running, my server is fine. But once I click the “Interrupt the kernel” button, my server crashes. Could you tell me why this happens?
The reason I want to clear the GPU cache is that I want my results to be reproducible. In my notebook, I evaluate ‘bert-base-uncased’, ‘bert-base-cased’, ‘bert-large-uncased’ and ‘bert-large-cased’ using a for loop. But something weird happens. If I restart the kernel and run the loop over the models in the order A, B, C, D, the results are always the same. But if I fine-tune only model B, its result is different from the result of model B when it is trained as part of the ABCD loop. I’m wondering whether the model reuses some parameters from the previous model, so I want to clear the whole GPU cache to check. But then the crash happened!
environment:
centos7, pytorch1.1, cuda 9.0, jupyter lab latest
some code snippets:
# Fixed seed, re-applied before every model so that each fine-tuning run starts
# from the same RNG state no matter which models were trained before it.
SEED = 42


def _predict_labels(model, dataloader, device):
    """Run `model` over `dataloader` in eval mode and collect its predictions.

    Each batch is expected to unpack into (input ids, attention mask, labels).
    Returns a pair of flat lists: the argmax class per example and the
    corresponding labels taken from the batches.
    """
    model.eval()
    predictions, true_labels = [], []
    for batch in dataloader:
        # Add batch to the target device and unpack it.
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        # No gradients are needed for inference; this saves memory and time.
        with torch.no_grad():
            # token_type_ids is for the segment ids, but we only have a single sentence here.
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        # Move logits and labels to CPU before storing them.
        logits = outputs[0].detach().cpu().numpy()
        predictions.extend(np.argmax(logits, axis=1))
        true_labels.extend(b_labels.to('cpu').numpy())
    return predictions, true_labels


for name, tokenizer in tokenizers.items():
    # Reset every random number generator first. Without this the RNG state
    # left behind by the previous model leaks into this one (presumably via
    # weight initialisation of new layers, data shuffling and dropout), so
    # training model B alone gives different numbers than training it after A.
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)

    # imagine we create datasets here

    # initialize model
    num_labels = 2 if task == 'A' else 4
    # NOTE(review): only the 'BERT' branch is visible in this snippet; other
    # model types must define `model`, `epochs` and `optimizer` elsewhere.
    if model_type == 'BERT':
        model = BertForSequenceClassification.from_pretrained(name, num_labels=num_labels)
    # Tell pytorch to run this model on the GPU.
    if torch.cuda.device_count() > 1:
        # data parallelism
        model = torch.nn.DataParallel(model)
        print("Let's use", torch.cuda.device_count(), "GPUs!")
    model.to(device)

    # initialize optimizer
    if model_type == 'BERT':
        epochs = CONFIG['bert_epochs']
        optimizer = AdamW(
            model.parameters(),
            lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
            eps=1e-8,  # args.adam_epsilon - default is 1e-8.
        )
        # Total number of training steps is number of batches * number of epochs.
        total_steps = len(train_dataloader) * epochs
        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,  # Default value in run_glue.py
            num_training_steps=total_steps,
        )

    # training
    # Store the average loss after each epoch so we can plot them.
    loss_values = []
    model.zero_grad()
    for epoch_i in range(epochs):
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.
        if verbose:
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')
        # Measure how long the training epoch takes.
        t0 = time()
        # Reset the total loss for this epoch.
        total_loss = 0
        # Set our model to training mode (as opposed to evaluation mode)
        model.train()
        # This training code is based on the `run_glue.py` script here:
        # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if verbose and step % 40 == 0 and step != 0:
                elapsed = format_time(time() - t0)
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            # Copy each of them to the GPU as we unpack.
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            # Forward pass (evaluate the model on this training batch)
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            # Reduce with mean() once: when the model is wrapped in
            # DataParallel the loss may hold one value per replica; for an
            # already-scalar loss mean() leaves the value unchanged.
            loss = outputs[0].mean()
            # Accumulate the loss as a plain Python number.
            total_loss += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()
            # Clip the norm of the gradients to 1.0.
            if model_type == 'BERT':
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            # Update the learning rate.
            if model_type == 'BERT':
                scheduler.step()
            # Clear out the gradients (by default they accumulate)
            model.zero_grad()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)
        loss_values.append(avg_train_loss)
        if verbose:
            print("")
            print(" Average training loss: {0:.2f}".format(avg_train_loss))
            # Formatted like every other timing printed by this script.
            print(" Training epcoh took: {:}".format(format_time(time() - t0)))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.
        print("")
        print("Running Validation...")
        t0 = time()
        preds, labels = _predict_labels(model, validation_dataloader, device)
        # Report the final accuracy for this validation run.
        if verbose:
            print_score(preds, labels, task)
            print(" Validation took: {:}".format(format_time(time() - t0)))

    # Prediction on test set
    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
    t0 = time()
    predictions, true_labels = _predict_labels(model, prediction_dataloader, device)
    print("The {} model's result for task {} is:".format(name, task))
    print_score(predictions, true_labels, task)
    if verbose:
        print(" Prediction took: {:}".format(format_time(time() - t0)))
    print(' DONE.')

    # Release GPU memory before the next model is loaded. Dropping only `model`
    # is not enough: the optimizer (and through it the scheduler) still holds
    # the parameters and its own per-parameter state, and empty_cache() can
    # only return memory that no live tensor occupies.
    model = optimizer = scheduler = None
    torch.cuda.empty_cache()
Everything was OK before I added:
del model
torch.cuda.empty_cache()
Please help me!!!