I am trying to run a model built with a Capsule Network on multiple GPUs using PyTorch's DataParallel, but I am getting an error. The stack trace and the relevant code are below.
I don't understand why the model is not being replicated across all the GPUs. Please suggest where I am going wrong.
cuda
Let's use 4 GPUs!
Devices: [0, 1, 2, 3]
Traceback (most recent call last):
  File "main.py", line 245, in <module>
    train(epoch)
  File "main.py", line 161, in train
    output = model(data)  # forward.
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/_utils.py", line 369, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "/scratch/engn8536/project_data/u6724013/caer/capsule_network.py", line 43, in forward
    h = self.primary_caps(h)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "/scratch/engn8536/project_data/u6724013/caer/primary_caps.py", line 56, in forward
    u_i = self.conv_units[i](x)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "/scratch/engn8536/project_data/u6724013/caer/primary_caps.py", line 26, in forward
    h = self.conv(x)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 343, in forward
    return self.conv2d_forward(input, self.weight)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 340, in conv2d_forward
    self.padding, self.dilation, self.groups)
RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)
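For context on where the replicas diverge: the failing frame is u_i = self.conv_units[i](x) in primary_caps.py. One common cause of this exact device mismatch is storing the capsule conv layers in a plain Python list, which DataParallel cannot see and therefore does not replicate onto devices 1-3 (those layers stay on device 0 while the input is scattered). A minimal sketch of the difference, with hypothetical layer shapes since primary_caps.py is not shown:

import torch.nn as nn

class PrimaryCaps(nn.Module):
    def __init__(self, num_units=8):
        super().__init__()
        # Broken under DataParallel: a plain list hides the layers from the
        # module tree, so replicate() leaves them on device 0.
        # self.conv_units = [nn.Conv2d(256, 32, kernel_size=9, stride=2)
        #                    for _ in range(num_units)]

        # Registered properly: nn.ModuleList makes each conv a submodule, so
        # .to(device) and DataParallel replication both reach them.
        # (256 -> 32, kernel 9, stride 2 are assumed placeholder values.)
        self.conv_units = nn.ModuleList(
            nn.Conv2d(256, 32, kernel_size=9, stride=2) for _ in range(num_units)
        )

    def forward(self, x):
        # Each conv now lives on the same device as its replica's input.
        return [conv(x) for conv in self.conv_units]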
main.py
parser.add_argument('--gpu', help="GPU_ID", type=str, default="0,1,2,3")
args = parser.parse_args()

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda:" + re.split(r",", args.gpu)[0] if USE_CUDA else "cpu")
print(device)
gpu_id = list(map(int, re.split(r",", args.gpu)))

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model, device_ids=gpu_id).to(device)
    print('Devices:', model.device_ids)
    Use_Dataparallel = True
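For comparison, a minimal sketch of the canonical DataParallel pattern; the Net class, tensor shapes, and batch here are hypothetical stand-ins, not the actual capsule model:

import torch
import torch.nn as nn

# Hypothetical stand-in model; the real code would use the capsule network.
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 16, kernel_size=3)

    def forward(self, x):
        return self.conv(x)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Net().to(device)              # move the base model to the primary device
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)    # defaults to all visible GPUs

data = torch.randn(8, 3, 32, 32).to(device)  # inputs go to the primary device;
output = model(data)                          # DataParallel scatters them across replicas
print(output.device)                          # results are gathered back onto cuda:0

Note that DataParallel can only replicate submodules that are registered on the base model, so this pattern works only if every layer is reachable through the module tree.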