from tokenizers import ByteLevelBPETokenizer
from transformers import BartConfig, BartTokenizerFast, AutoModelForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import pipeline, Trainer, TrainingArguments
import os
os.environ["NCCL_DEBUG"] = "INFO"
os.environ['TORCH_DISTRIBUTED_DEBUG']= "INFO"
os.environ['TORCH_SHOW_CPP_STACKTRACES']= "1"
# Use the Shakespeare dataset.
paths = ['/data/temp/shakespeare.txt']
# byte-level byte-pair encoding tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=paths, vocab_size=50000, min_frequency=2, special_tokens=[
    "<s>",
    "</s>",
    "<pad>",
])
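# Note: the masked-LM collator used later needs a mask token, and BartTokenizerFast also
# expects <unk>; any special tokens it has to add on top of this vocab would likely get ids
# at or above vocab_size=50000. A sketch of training with BART's full special-token set instead:
# tokenizer.train(files=paths, vocab_size=50000, min_frequency=2,
#                 special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])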
# Save the tokenizer model in a folder (it contains 2 files: vocab.json and merges.txt)
!mkdir shakespere_BART
tokenizer.save_model("shakespere_BART")
# Just for testing whether the tokenizer is working or not
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
tokenizer = ByteLevelBPETokenizer(
    "./shakespere_BART/vocab.json",
    "./shakespere_BART/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
# print(torch.cuda.is_available()) GPU P100
print(tokenizer.encode('Et tu').tokens)
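# A quick round-trip check (sketch; uses the tokenizers encode/decode API):
enc = tokenizer.encode('Et tu')
print(enc.ids)                     # the token ids
print(tokenizer.decode(enc.ids))   # should come back as (roughly) 'Et tu'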
# Trying the BART model
# Set the config
config = BartConfig(
    vocab_size=50000,
    max_position_embeddings=32,
    num_attention_heads=2,
    num_hidden_layers=1,
    type_vocab_size=1,
)
# The available config parameters are documented here: https://huggingface.co/transformers/v4.5.1/model_doc/bart.html
tokenizer = BartTokenizerFast.from_pretrained("./shakespere_BART", max_len=64)  # This loads the trained tokenizer
model = AutoModelForMaskedLM.from_config(config=config)  # AutoModel picks the right architecture from the config
print(model.num_parameters()) # Don't go too heavy, you might break the Kaggle platform !!!
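# If the parameter count above looks larger than expected: BART's size is controlled by
# its own config names (encoder_layers, decoder_layers, d_model, ...), and I am not sure
# the generic num_hidden_layers / num_attention_heads keys above map onto all of them.
# A sketch of an explicitly tiny config (values are only illustrative):
small_config = BartConfig(
    vocab_size=50000,
    max_position_embeddings=32,
    d_model=256,
    encoder_layers=1,
    decoder_layers=1,
    encoder_attention_heads=2,
    decoder_attention_heads=2,
    encoder_ffn_dim=512,
    decoder_ffn_dim=512,
)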
# Building our dataset - this cell exceeds the runtime limits of the free tier.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='/data/temp/shakespeare.txt',
    block_size=16,
)
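# LineByLineTextDataset is deprecated in recent transformers releases; a roughly
# equivalent dataset built with the datasets library (a sketch, not what I ran here):
from datasets import load_dataset
hf_dataset = load_dataset("text", data_files={"train": "/data/temp/shakespeare.txt"})["train"]
hf_dataset = hf_dataset.map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=16),
    batched=True,
    remove_columns=["text"],
)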
# Batching data
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
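# Sanity check on the collator (sketch): labels are -100 except at the ~15% of positions
# selected for masking, where they keep the original token id.
example_batch = data_collator([tokenizer("To be, or not to be"), tokenizer("that is the question")])
print(example_batch["input_ids"])
print(example_batch["labels"])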
training_args = TrainingArguments(
    output_dir='shakespere_BART',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,  # per_gpu_train_batch_size is deprecated in favour of this
    save_steps=5000,
    save_total_limit=2,
    prediction_loss_only=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
Executing trainer.train() produces the errors below:
MED-ARC-GPU01:264800:264800 [0] NCCL INFO cudaDriverVersion 12020
MED-ARC-GPU01:264800:264800 [0] NCCL INFO Bootstrap : Using main:10.8.26.23<0>
MED-ARC-GPU01:264800:264800 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
NCCL version 2.19.3+cuda11.0
MED-ARC-GPU01:264800:265554 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB main:10.8.26.23<0>
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Using non-device net plugin version 0
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Using network IB
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Using non-device net plugin version 0
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Using network IB
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Using non-device net plugin version 0
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Using network IB
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Using non-device net plugin version 0
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Using network IB
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Using non-device net plugin version 0
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Using network IB
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Using non-device net plugin version 0
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Using network IB
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Using non-device net plugin version 0
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Using network IB
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Using non-device net plugin version 0
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Using network IB
MED-ARC-GPU01:264800:265559 [5] NCCL INFO comm 0x55a100a7e810 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId 89000 commId 0x9f5f1bea8934d34a - Init START
MED-ARC-GPU01:264800:265554 [0] NCCL INFO comm 0x55a100a61f30 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 1a000 commId 0x9f5f1bea8934d34a - Init START
MED-ARC-GPU01:264800:265555 [1] NCCL INFO comm 0x55a100a66e90 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 1b000 commId 0x9f5f1bea8934d34a - Init START
MED-ARC-GPU01:264800:265556 [2] NCCL INFO comm 0x55a100a6ccf0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 3d000 commId 0x9f5f1bea8934d34a - Init START
MED-ARC-GPU01:264800:265561 [7] NCCL INFO comm 0x55a100a8a4d0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId b3000 commId 0x9f5f1bea8934d34a - Init START
MED-ARC-GPU01:264800:265557 [3] NCCL INFO comm 0x55a100a72b50 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 3e000 commId 0x9f5f1bea8934d34a - Init START
MED-ARC-GPU01:264800:265560 [6] NCCL INFO comm 0x55a100a84670 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId b2000 commId 0x9f5f1bea8934d34a - Init START
MED-ARC-GPU01:264800:265558 [4] NCCL INFO comm 0x55a100a789b0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId 88000 commId 0x9f5f1bea8934d34a - Init START
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00ffffff
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Setting affinity for GPU 5 to ffffff00,0000ffff,ff000000
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Setting affinity for GPU 7 to ffffff00,0000ffff,ff000000
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffff0000,00ffffff
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Setting affinity for GPU 4 to ffffff00,0000ffff,ff000000
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffff0000,00ffffff
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffff0000,00ffffff
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Setting affinity for GPU 6 to ffffff00,0000ffff,ff000000
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Trees [0] 2/-1/-1->3->0 [1] 2/-1/-1->3->0 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] 7/-1/-1->3->1 [5] 1/-1/-1->3->7 [6] 2/-1/-1->3->0 [7] 2/-1/-1->3->0 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] 7/-1/-1->3->1 [11] 1/-1/-1->3->7
MED-ARC-GPU01:264800:265557 [3] NCCL INFO P2P Chunksize set to 524288
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Trees [0] 1/-1/-1->2->3 [1] 1/-1/-1->2->3 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] -1/-1/-1->2->6 [5] 6/-1/-1->2->0 [6] 1/-1/-1->2->3 [7] 1/-1/-1->2->3 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] -1/-1/-1->2->6 [11] 6/-1/-1->2->0
MED-ARC-GPU01:264800:265556 [2] NCCL INFO P2P Chunksize set to 524288
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Trees [0] 5/-1/-1->1->2 [1] 5/-1/-1->1->2 [2] 2/-1/-1->1->5 [3] 2/-1/-1->1->5 [4] 3/-1/-1->1->0 [5] -1/-1/-1->1->3 [6] 5/-1/-1->1->2 [7] 5/-1/-1->1->2 [8] 2/-1/-1->1->5 [9] 2/-1/-1->1->5 [10] 3/-1/-1->1->0 [11] -1/-1/-1->1->3
MED-ARC-GPU01:264800:265555 [1] NCCL INFO P2P Chunksize set to 524288
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Trees [0] -1/-1/-1->4->7 [1] -1/-1/-1->4->7 [2] 7/-1/-1->4->0 [3] 7/-1/-1->4->0 [4] 6/-1/-1->4->5 [5] 5/-1/-1->4->6 [6] -1/-1/-1->4->7 [7] -1/-1/-1->4->7 [8] 7/-1/-1->4->0 [9] 7/-1/-1->4->0 [10] 6/-1/-1->4->5 [11] 5/-1/-1->4->6
MED-ARC-GPU01:264800:265558 [4] NCCL INFO P2P Chunksize set to 524288
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Trees [0] 6/-1/-1->5->1 [1] 6/-1/-1->5->1 [2] 1/-1/-1->5->6 [3] 1/-1/-1->5->6 [4] 4/-1/-1->5->7 [5] 7/-1/-1->5->4 [6] 6/-1/-1->5->1 [7] 6/-1/-1->5->1 [8] 1/-1/-1->5->6 [9] 1/-1/-1->5->6 [10] 4/-1/-1->5->7 [11] 7/-1/-1->5->4
MED-ARC-GPU01:264800:265559 [5] NCCL INFO P2P Chunksize set to 524288
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 00/12 : 0 3 2 1 5 6 7 4
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 01/12 : 0 3 2 1 5 6 7 4
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 5/-1/-1->6->7 [3] 5/-1/-1->6->7 [4] 2/-1/-1->6->4 [5] 4/-1/-1->6->2 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 5/-1/-1->6->7 [9] 5/-1/-1->6->7 [10] 2/-1/-1->6->4 [11] 4/-1/-1->6->2
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 02/12 : 0 4 7 6 5 1 2 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO P2P Chunksize set to 524288
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 03/12 : 0 4 7 6 5 1 2 3
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Trees [0] 4/-1/-1->7->6 [1] 4/-1/-1->7->6 [2] 6/-1/-1->7->4 [3] 6/-1/-1->7->4 [4] 5/-1/-1->7->3 [5] 3/-1/-1->7->5 [6] 4/-1/-1->7->6 [7] 4/-1/-1->7->6 [8] 6/-1/-1->7->4 [9] 6/-1/-1->7->4 [10] 5/-1/-1->7->3 [11] 3/-1/-1->7->5
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 04/12 : 0 1 3 7 5 4 6 2
MED-ARC-GPU01:264800:265561 [7] NCCL INFO P2P Chunksize set to 524288
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 05/12 : 0 2 6 4 5 7 3 1
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 06/12 : 0 3 2 1 5 6 7 4
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 07/12 : 0 3 2 1 5 6 7 4
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 08/12 : 0 4 7 6 5 1 2 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 09/12 : 0 4 7 6 5 1 2 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 10/12 : 0 1 3 7 5 4 6 2
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 11/12 : 0 2 6 4 5 7 3 1
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Trees [0] 3/-1/-1->0->-1 [1] 3/-1/-1->0->-1 [2] 4/-1/-1->0->-1 [3] 4/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 2/-1/-1->0->-1 [6] 3/-1/-1->0->-1 [7] 3/-1/-1->0->-1 [8] 4/-1/-1->0->-1 [9] 4/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 2/-1/-1->0->-1
MED-ARC-GPU01:264800:265554 [0] NCCL INFO P2P Chunksize set to 524288
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/direct pointer
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/direct pointer
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 05/0 : 0[0] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 11/0 : 0[0] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 04/0 : 1[1] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 10/0 : 1[1] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 00/0 : 1[1] -> 5[5] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 01/0 : 1[1] -> 5[5] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 06/0 : 1[1] -> 5[5] via P2P/direct pointer
MED-ARC-GPU01:264800:265555 [1] NCCL INFO Channel 07/0 : 1[1] -> 5[5] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 00/0 : 0[0] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 04/0 : 4[4] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 05/0 : 5[5] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 01/0 : 0[0] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 10/0 : 4[4] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 11/0 : 5[5] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 06/0 : 0[0] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 02/0 : 4[4] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 07/0 : 0[0] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 03/0 : 4[4] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Channel 04/0 : 6[6] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 08/0 : 4[4] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Channel 05/0 : 2[2] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Channel 10/0 : 6[6] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 09/0 : 4[4] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Channel 11/0 : 2[2] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 02/0 : 5[5] -> 1[1] via P2P/direct pointer
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Channel 05/0 : 6[6] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Channel 04/0 : 2[2] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 03/0 : 5[5] -> 1[1] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 04/0 : 3[3] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265560 [6] NCCL INFO Channel 11/0 : 6[6] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 05/0 : 7[7] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 10/0 : 3[3] -> 7[7] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 11/0 : 7[7] -> 3[3] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 00/0 : 4[4] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 08/0 : 5[5] -> 1[1] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 01/0 : 4[4] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 00/0 : 7[7] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265559 [5] NCCL INFO Channel 09/0 : 5[5] -> 1[1] via P2P/direct pointer
MED-ARC-GPU01:264800:265556 [2] NCCL INFO Channel 10/0 : 2[2] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 06/0 : 4[4] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 01/0 : 7[7] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265558 [4] NCCL INFO Channel 07/0 : 4[4] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 06/0 : 7[7] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 07/0 : 7[7] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 02/0 : 0[0] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 03/0 : 0[0] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265554 [0] NCCL INFO Channel 09/0 : 0[0] -> 4[4] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 05/0 : 3[3] -> 1[1] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 11/0 : 3[3] -> 1[1] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 04/0 : 7[7] -> 5[5] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 10/0 : 7[7] -> 5[5] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265561 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265557 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/direct pointer
MED-ARC-GPU01:264800:265581 [5] include/alloc.h:178 NCCL WARN Cuda failure 'out of memory'
MED-ARC-GPU01:264800:265581 [5] include/alloc.h:185 NCCL WARN Failed to CUDA calloc 10485760 bytes
MED-ARC-GPU01:264800:265581 [5] NCCL INFO transport/p2p.cc:208 -> 1
MED-ARC-GPU01:264800:265581 [5] NCCL INFO transport/p2p.cc:612 -> 1
MED-ARC-GPU01:264800:265559 [5] NCCL INFO transport/p2p.cc:443 -> 1
MED-ARC-GPU01:264800:265559 [5] NCCL INFO transport.cc:33 -> 1
MED-ARC-GPU01:264800:265559 [5] NCCL INFO transport.cc:97 -> 1
MED-ARC-GPU01:264800:265559 [5] NCCL INFO init.cc:1117 -> 1
MED-ARC-GPU01:264800:265559 [5] NCCL INFO init.cc:1396 -> 1
MED-ARC-GPU01:264800:265559 [5] NCCL INFO group.cc:64 -> 1 [Async thread]
MED-ARC-GPU01:264800:265576 [6] NCCL INFO misc/socket.cc:47 -> 3
MED-ARC-GPU01:264800:265576 [6] NCCL INFO misc/socket.cc:58 -> 3
MED-ARC-GPU01:264800:265576 [6] NCCL INFO misc/socket.cc:787 -> 3
MED-ARC-GPU01:264800:265576 [6] NCCL INFO proxy.cc:1398 -> 3
MED-ARC-GPU01:264800:265576 [6] proxy.cc:1557 NCCL WARN [Proxy Service 6] Failed to execute operation Init from rank 6, retcode 3
MED-ARC-GPU01:264800:265561 [7] NCCL INFO bootstrap.cc:550 -> 3
MED-ARC-GPU01:264800:265561 [7] NCCL INFO transport.cc:123 -> 3
MED-ARC-GPU01:264800:265561 [7] NCCL INFO init.cc:1117 -> 3
MED-ARC-GPU01:264800:265561 [7] NCCL INFO init.cc:1396 -> 3
MED-ARC-GPU01:264800:265561 [7] NCCL INFO group.cc:64 -> 3 [Async thread]
MED-ARC-GPU01:264800:265560 [6] NCCL INFO misc/socket.cc:47 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO misc/socket.cc:58 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO misc/socket.cc:773 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO proxy.cc:1137 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO proxy.cc:1218 -> 3
MED-ARC-GPU01:264800:265577 [2] NCCL INFO misc/socket.cc:47 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO proxy.cc:1076 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO transport/p2p.cc:442 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO transport.cc:33 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO transport.cc:97 -> 3
MED-ARC-GPU01:264800:265577 [2] NCCL INFO misc/socket.cc:58 -> 3
MED-ARC-GPU01:264800:265580 [1] NCCL INFO misc/socket.cc:47 -> 3
MED-ARC-GPU01:264800:265577 [2] NCCL INFO misc/socket.cc:787 -> 3
MED-ARC-GPU01:264800:265577 [2] NCCL INFO proxy.cc:1398 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO init.cc:1117 -> 3
MED-ARC-GPU01:264800:265577 [2] proxy.cc:1557 NCCL WARN [Proxy Service 2] Failed to execute operation Init from rank 2, retcode 3
MED-ARC-GPU01:264800:265580 [1] NCCL INFO misc/socket.cc:58 -> 3
MED-ARC-GPU01:264800:265580 [1] NCCL INFO misc/socket.cc:773 -> 3
MED-ARC-GPU01:264800:265580 [1] NCCL INFO proxy.cc:1374 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO init.cc:1396 -> 3
MED-ARC-GPU01:264800:265580 [1] NCCL INFO proxy.cc:1415 -> 3
MED-ARC-GPU01:264800:265560 [6] NCCL INFO group.cc:64 -> 3 [Async thread]
MED-ARC-GPU01:264800:265580 [1] proxy.cc:1557 NCCL WARN [Proxy Service 1] Failed to execute operation Setup from rank 1, retcode 3
MED-ARC-GPU01:264800:265582 [0] NCCL INFO misc/socket.cc:47 -> 3
MED-ARC-GPU01:264800:265582 [0] NCCL INFO misc/socket.cc:58 -> 3
MED-ARC-GPU01:264800:265582 [0] NCCL INFO misc/socket.cc:787 -> 3
MED-ARC-GPU01:264800:265558 [4] NCCL INFO bootstrap.cc:550 -> 3
MED-ARC-GPU01:264800:265582 [0] NCCL INFO proxy.cc:1398 -> 3
MED-ARC-GPU01:264800:265582 [0] proxy.cc:1557 NCCL WARN [Proxy Service 0] Failed to execute operation Setup from rank 0, retcode 3
MED-ARC-GPU01:264800:265558 [4] NCCL INFO transport.cc:124 -> 3
MED-ARC-GPU01:264800:265558 [4] NCCL INFO init.cc:1117 -> 3
MED-ARC-GPU01:264800:265558 [4] NCCL INFO init.cc:1396 -> 3
MED-ARC-GPU01:264800:265558 [4] NCCL INFO group.cc:64 -> 3 [Async thread]
MED-ARC-GPU01:264800:265556 [2] NCCL INFO misc/socket.cc:47 -> 3
MED-ARC-GPU01:264800:265557 [3] NCCL INFO bootstrap.cc:550 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO misc/socket.cc:58 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO misc/socket.cc:773 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO proxy.cc:1137 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO proxy.cc:1218 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO proxy.cc:1076 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO transport/p2p.cc:442 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO transport.cc:33 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO transport.cc:97 -> 3
MED-ARC-GPU01:264800:265557 [3] NCCL INFO transport.cc:123 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO misc/socket.cc:47 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO init.cc:1117 -> 3
MED-ARC-GPU01:264800:265557 [3] NCCL INFO init.cc:1117 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO init.cc:1396 -> 3
MED-ARC-GPU01:264800:265556 [2] NCCL INFO group.cc:64 -> 3 [Async thread]
MED-ARC-GPU01:264800:265557 [3] NCCL INFO init.cc:1396 -> 3
MED-ARC-GPU01:264800:265557 [3] NCCL INFO group.cc:64 -> 3 [Async thread]
MED-ARC-GPU01:264800:265554 [0] NCCL INFO misc/socket.cc:58 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO misc/socket.cc:773 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO proxy.cc:1137 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO proxy.cc:1218 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO transport/p2p.cc:443 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO transport.cc:33 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO transport.cc:97 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO init.cc:1117 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO init.cc:1396 -> 3
MED-ARC-GPU01:264800:265554 [0] NCCL INFO group.cc:64 -> 3 [Async thread]
MED-ARC-GPU01:264800:265555 [1] NCCL INFO misc/socket.cc:47 -> 3
MED-ARC-GPU01:264800:265555 [1] NCCL INFO misc/socket.cc:750 -> 3
MED-ARC-GPU01:264800
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[10], line 1
----> 1 trainer.train()
File ~/.../transformers/src/transformers/trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1883 hf_hub_utils.enable_progress_bars()
1884 else:
-> 1885 return inner_training_loop(
1886 args=args,
1887 resume_from_checkpoint=resume_from_checkpoint,
1888 trial=trial,
1889 ignore_keys_for_eval=ignore_keys_for_eval,
1890 )
File ~/.../transformers/src/transformers/trainer.py:2216, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2213 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
2215 with self.accelerator.accumulate(model):
-> 2216 tr_loss_step = self.training_step(model, inputs)
2218 if (
2219 args.logging_nan_inf_filter
2220 and not is_torch_xla_available()
2221 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2222 ):
2223 # if loss is nan or inf simply add the average of previous logged losses
2224 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File ~/.../transformers/src/transformers/trainer.py:3216, in Trainer.training_step(self, model, inputs)
3213 return loss_mb.reduce_mean().detach().to(self.args.device)
3215 with self.compute_loss_context_manager():
-> 3216 loss = self.compute_loss(model, inputs)
3218 del inputs
3219 torch.cuda.empty_cache()
File ~/.../transformers/src/transformers/trainer.py:3242, in Trainer.compute_loss(self, model, inputs, return_outputs)
3240 else:
3241 labels = None
-> 3242 outputs = model(**inputs)
3243 # Save past state if it exists
3244 # TODO: this needs to be fixed and made cleaner later.
3245 if self.args.past_index >= 0:
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:184, in DataParallel.forward(self, *inputs, **kwargs)
182 if len(self.device_ids) == 1:
183 return self.module(*inputs[0], **module_kwargs[0])
--> 184 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
185 outputs = self.parallel_apply(replicas, inputs, module_kwargs)
186 return self.gather(outputs, self.output_device)
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:189, in DataParallel.replicate(self, module, device_ids)
188 def replicate(self, module: T, device_ids: Sequence[Union[int, torch.device]]) -> List[T]:
--> 189 return replicate(module, device_ids, not torch.is_grad_enabled())
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/nn/parallel/replicate.py:110, in replicate(network, devices, detach)
108 params = list(network.parameters())
109 param_indices = {param: idx for idx, param in enumerate(params)}
--> 110 param_copies = _broadcast_coalesced_reshape(params, devices, detach)
112 buffers = list(network.buffers())
113 buffers_rg: List[torch.Tensor] = []
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/nn/parallel/replicate.py:83, in _broadcast_coalesced_reshape(tensors, devices, detach)
80 else:
81 # Use the autograd function to broadcast if not detach
82 if len(tensors) > 0:
---> 83 tensor_copies = Broadcast.apply(devices, *tensors)
84 return [tensor_copies[i:i + len(tensors)]
85 for i in range(0, len(tensor_copies), len(tensors))]
86 else:
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/autograd/function.py:598, in Function.apply(cls, *args, **kwargs)
595 if not torch._C._are_functorch_transforms_active():
596 # See NOTE: [functorch vjp and autograd interaction]
597 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 598 return super().apply(*args, **kwargs) # type: ignore[misc]
600 if not is_setup_ctx_defined:
601 raise RuntimeError(
602 "In order to use an autograd.Function with functorch transforms "
603 "(vmap, grad, jvp, jacrev, ...), it must override the setup_context "
604 "staticmethod. For more details, please see "
605 "https://pytorch.org/docs/master/notes/extending.func.html"
606 )
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:23, in Broadcast.forward(ctx, target_gpus, *inputs)
21 ctx.num_inputs = len(inputs)
22 ctx.input_device = inputs[0].get_device()
---> 23 outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
24 non_differentiables = []
25 for idx, input_requires_grad in enumerate(ctx.needs_input_grad[1:]):
File ~/anaconda3/envs/huggingfaceNew/lib/python3.8/site-packages/torch/nn/parallel/comm.py:57, in broadcast_coalesced(tensors, devices, buffer_size)
55 devices = [_get_device_index(d) for d in devices]
56 tensors = [_handle_complex(t) for t in tensors]
---> 57 return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: NCCL Error 1: unhandled cuda error (run with NCCL_DEBUG=INFO for details)
I have already tried different versions of PyTorch and CUDA (1.9.1+111, 2.2.3+121, and others), but nothing works.
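One thing I have not ruled out: the traceback shows the Trainer wrapping the model in torch DataParallel across all 8 GPUs on this node, and the first NCCL warning looks like a plain CUDA out-of-memory while allocating a P2P buffer on GPU 5. Would restricting the job to a single free GPU before torch is imported be the right way to test that? Something like this (untested sketch):
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only GPU 0; must be set before importing torch
import torch
print(torch.cuda.device_count())  # should now report 1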
Could anyone please help?