Hi, I got the following error when using torch.distributed to train a model on a single machine with multiple GPUs.
File "run_classifier_bertgcn.py", line 364, in <module>
main()
File "run_classifier_bertgcn.py", line 319, in main
train(train_dataset, model, mlb, G, args.batch_sz, args.num_epochs, criterion, device, optimizer, lr_scheduler)
File "run_classifier_bertgcn.py", line 136, in train
output = model(input_ids, attention_mask)
File "/home/xdwang/ml4h/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/xdwang/ml4h/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 511, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/xdwang/ml4h/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/lustre03/project/6001103/xdwang/MeSH_Indexing_RGCN/model.py", line 349, in forward
output, _ = self.bert(src_input_ids, src_attention_mask)
File "/home/xdwang/ml4h/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/xdwang/ml4h/lib/python3.7/site-packages/transformers/modeling_bert.py", line 838, in forward
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
File "/home/xdwang/ml4h/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/xdwang/ml4h/lib/python3.7/site-packages/transformers/modeling_bert.py", line 197, in forward
inputs_embeds = self.word_embeddings(input_ids)
File "/home/xdwang/ml4h/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/xdwang/ml4h/lib/python3.7/site-packages/torch/nn/modules/sparse.py", line 126, in forward
self.norm_type, self.scale_grad_by_freq, self.sparse)
File "/home/xdwang/ml4h/lib/python3.7/site-packages/torch/nn/functional.py", line 1814, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: arguments are located on different GPUs at /pytorch/aten/src/THC/generic/THCTensorIndex.cu:403
My model is a simple BERT model (built with the Hugging Face transformers library) with a sigmoid activation applied to the BERT output.