Hello,
I am using my university's HPC cluster, which enforces a per-job time limit, so I resume training by calling the train method of the Trainer class with resume_from_checkpoint=MODEL. The code for resuming is below. To prevent CUDA out-of-memory errors, I set param.requires_grad = False on parts of the model before resuming, just as in the original run.
training_args = TrainingArguments(
    logging_steps=500,
    save_steps=500,
    eval_steps=500,
    output_dir='/home/groups/group_a/yuspqr/result/'+NAME+'.global',
    num_train_epochs=3,                     # total number of training epochs
    adam_epsilon=1e-6,
    per_device_train_batch_size=REAL_BS,    # batch size per device during training
    per_device_eval_batch_size=REAL_BS,     # batch size for evaluation
    warmup_steps=500,                       # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,                      # strength of weight decay
    gradient_accumulation_steps=ACCUM_NUM,  # steps to accumulate gradients before each optimizer update
    #gradient_checkpointing=True,
    dataloader_num_workers=4,
    fp16=True,
    learning_rate=RL,
    do_train=True,
    do_eval=True,
    logging_dir='/home/users/yuspqr/runs/'+NAME+'/',  # directory for storing logs
    report_to='tensorboard',
    evaluation_strategy='steps',
    logging_strategy='steps',
    save_strategy='steps',
)
import glob

def latest_checkpoint(path):
    # Glob everything under the output directory (expects only checkpoint-* subdirectories)
    cl = glob.glob(path)
    # The longest name has the most digits in its step number...
    max_len = max(len(i) for i in cl)
    # ...and among names of that length, the lexicographic maximum is the highest step
    return max(i for i in cl if len(i) == max_len)

MODEL = latest_checkpoint('/home/groups/group_a/yuspqr/result/'+NAME+'.global/*')
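(Incidentally, I believe transformers ships a helper that does the same thing; a minimal sketch, assuming the output directory contains only checkpoint-* subfolders:)
from transformers.trainer_utils import get_last_checkpoint

# Returns the path of the checkpoint-* folder with the highest step, or None if there is none
MODEL = get_last_checkpoint('/home/groups/group_a/yuspqr/result/'+NAME+'.global')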
model = RobertaLongForSequenceClassification.from_pretrained(MODEL)
# Freeze the embeddings and the first 8 encoder layers to reduce GPU memory use
for param in model.roberta.embeddings.parameters():
    param.requires_grad = False
for layer in model.roberta.encoder.layer[:8]:
    for param in layer.parameters():
        param.requires_grad = False
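(As a sanity check that the freeze actually took effect before resuming, I count the parameters that still require gradients; this is just a quick diagnostic:)
# Quick diagnostic: how many parameters will still receive gradients after freezing
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f'trainable: {trainable:,} / {total:,} parameters')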
trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,     # evaluation dataset
)
trainer.train(resume_from_checkpoint=MODEL)
Here I got the following error. It occurred after the trainer had finished skipping the batches already seen before the checkpoint.
Traceback (most recent call last):
File "longformer_processing_final2.py", line 209, in <module>
trainer.train(resume_from_checkpoint=MODEL)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/trainer.py", line 1332, in train
tr_loss_step = self.training_step(model, inputs)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/trainer.py", line 1891, in training_step
loss = self.compute_loss(model, inputs)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/trainer.py", line 1923, in compute_loss
outputs = model(**inputs)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 1212, in forward
return_dict=return_dict,
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 861, in forward
return_dict=return_dict,
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 533, in forward
output_attentions,
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 417, in forward
past_key_value=self_attn_past_key_value,
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 346, in forward
output_attentions,
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "longformer_processing_final2.py", line 132, in forward
output_attentions=output_attentions)
File "/home/users/yuspqr/.local/lib/python3.6/site-packages/transformers/models/longformer/modeling_longformer.py", line 632, in forward
attn_probs = torch.masked_fill(attn_probs, is_index_masked[:, :, None, None], 0.0)
RuntimeError: CUDA out of memory. Tried to allocate 1.84 GiB (GPU 0; 31.75 GiB total capacity; 29.37 GiB already allocated; 616.19 MiB free; 29.95 GiB reserved in total by PyTorch)
0%| | 0/21072 [01:04<?, ?it/s]
My research has stalled because of this, and I am really in trouble. Any help would be sincerely appreciated.