I am trying to use multiple GPUs on AWS via DataParallel. This is on AWS SageMaker with 4 GPUs, PyTorch 1.8 (GPU Optimized) and Python 3.6.
I have searched the forum and read through the data parallel tutorial, but I have not found a minimal working example like this one or an explanation of the error.
Does anyone know what is wrong?
import torch
x = torch.rand(300, 400, 500).cuda()
model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
model = torch.nn.DataParallel(model, device_ids=[0,1])
y = model(x)
I get an error:
> ---------------------------------------------------------------------------
> RuntimeError Traceback (most recent call last)
> in <module>
> 4 model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
> 5 model = torch.nn.DataParallel(model, device_ids=[0,1])
> ----> 6 y = model(x)
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
> 916 result = self._slow_forward(*input, **kwargs)
> 917 else:
> --> 918 result = self.forward(*input, **kwargs)
> 919 for hook in itertools.chain(
> 920 _global_forward_hooks.values(),
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
> 153 raise RuntimeError("module must have its parameters and buffers "
> 154 "on device {} (device_ids[0]) but found one of "
> --> 155 "them on device: {}".format(self.src_device_obj, t.device))
> 156
> 157 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
>
> RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cpu
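If I read the error correctly, DataParallel expects the wrapped module's parameters to already be on cuda:0 (device_ids[0]) before the forward pass, and in this first attempt they are indeed still on the CPU. A quick check along these lines (a sketch of what I would inspect in the same session; model is the DataParallel wrapper from the snippet above) confirms where the parameters live:

import torch
# Where do the wrapped module's parameters actually live?
print(next(model.module.parameters()).device)  # "cpu" in the failing attempt above
# Sanity check on GPU visibility inside the notebook
print(torch.cuda.is_available(), torch.cuda.device_count())

Moving the model to the GPU should clear that particular error, but as some of the variants below show, it just trades it for an NCCL error.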
I have also tried the following variants; each fails with the error shown after it.
Keeping both the input and the model on the CPU:

import torch
x = torch.rand(300, 400, 500)
model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
model = torch.nn.DataParallel(model, device_ids=[0,1])
y = model(x)
> ---------------------------------------------------------------------------
> RuntimeError Traceback (most recent call last)
> <ipython-input-133-2ed596eb6192> in <module>
> 4 model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
> 5 model = torch.nn.DataParallel(model, device_ids=[0,1])
> ----> 6 y = model(x)
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
> 916 result = self._slow_forward(*input, **kwargs)
> 917 else:
> --> 918 result = self.forward(*input, **kwargs)
> 919 for hook in itertools.chain(
> 920 _global_forward_hooks.values(),
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
> 153 raise RuntimeError("module must have its parameters and buffers "
> 154 "on device {} (device_ids[0]) but found one of "
> --> 155 "them on device: {}".format(self.src_device_obj, t.device))
> 156
> 157 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
>
> RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cpu
Leaving the input on the CPU but moving the DataParallel wrapper to the GPU:

import torch
x = torch.rand(300, 400, 500)
model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
model = torch.nn.DataParallel(model, device_ids=[0,1]).cuda()
y = model(x)
> ---------------------------------------------------------------------------
> RuntimeError Traceback (most recent call last)
> <ipython-input-134-16dea105c595> in <module>
> 4 model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
> 5 model = torch.nn.DataParallel(model, device_ids=[0,1]).cuda()
> ----> 6 y = model(x)
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
> 916 result = self._slow_forward(*input, **kwargs)
> 917 else:
> --> 918 result = self.forward(*input, **kwargs)
> 919 for hook in itertools.chain(
> 920 _global_forward_hooks.values(),
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
> 164 if len(self.device_ids) == 1:
> 165 return self.module(*inputs[0], **kwargs[0])
> --> 166 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
> 167 outputs = self.parallel_apply(replicas, inputs, kwargs)
> 168 return self.gather(outputs, self.output_device)
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in replicate(self, module, device_ids)
> 169
> 170 def replicate(self, module, device_ids):
> --> 171 return replicate(module, device_ids, not torch.is_grad_enabled())
> 172
> 173 def scatter(self, inputs, kwargs, device_ids):
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/replicate.py in replicate(network, devices, detach)
> 89 params = list(network.parameters())
> 90 param_indices = {param: idx for idx, param in enumerate(params)}
> ---> 91 param_copies = _broadcast_coalesced_reshape(params, devices, detach)
> 92
> 93 buffers = list(network.buffers())
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/replicate.py in _broadcast_coalesced_reshape(tensors, devices, detach)
> 69 # Use the autograd function to broadcast if not detach
> 70 if len(tensors) > 0:
> ---> 71 tensor_copies = Broadcast.apply(devices, *tensors)
> 72 return [tensor_copies[i:i + len(tensors)]
> 73 for i in range(0, len(tensor_copies), len(tensors))]
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/_functions.py in forward(ctx, target_gpus, *inputs)
> 21 ctx.num_inputs = len(inputs)
> 22 ctx.input_device = inputs[0].get_device()
> ---> 23 outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
> 24 non_differentiables = []
> 25 for idx, input_requires_grad in enumerate(ctx.needs_input_grad[1:]):
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/comm.py in broadcast_coalesced(tensors, devices, buffer_size)
> 56 devices = [_get_device_index(d) for d in devices]
> 57 tensors = [_handle_complex(t) for t in tensors]
> ---> 58 return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
> 59
> 60
>
> RuntimeError: NCCL Error 2: unhandled system error
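The NCCL error itself does not point at anything in my code, so my next step is to re-run with NCCL's debug output enabled and to print the version information for the report (a sketch of what I plan to check; NCCL_DEBUG and NCCL_DEBUG_SUBSYS are standard NCCL environment variables and have to be set before NCCL is first used in the process):

import os
import torch

# Enable verbose NCCL logging; this must happen before the first
# CUDA/NCCL call in the process, e.g. at the top of a fresh kernel.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "ALL"

# Version information
print(torch.__version__, torch.version.cuda)
print("GPUs visible:", torch.cuda.device_count())
print("NCCL version:", torch.cuda.nccl.version())

The remaining variants below fail with the same NCCL error.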
Moving both the input and the DataParallel wrapper to the GPU:

import torch
x = torch.rand(300, 400, 500).cuda()
model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
model = torch.nn.DataParallel(model, device_ids=[0,1]).cuda()
y = model(x)
> ---------------------------------------------------------------------------
> RuntimeError Traceback (most recent call last)
> <ipython-input-136-439fd34aeaf9> in <module>
> 4 model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
> 5 model = torch.nn.DataParallel(model, device_ids=[0,1]).cuda()
> ----> 6 y = model(x)
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
> 916 result = self._slow_forward(*input, **kwargs)
> 917 else:
> --> 918 result = self.forward(*input, **kwargs)
> 919 for hook in itertools.chain(
> 920 _global_forward_hooks.values(),
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
> 164 if len(self.device_ids) == 1:
> 165 return self.module(*inputs[0], **kwargs[0])
> --> 166 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
> 167 outputs = self.parallel_apply(replicas, inputs, kwargs)
> 168 return self.gather(outputs, self.output_device)
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in replicate(self, module, device_ids)
> 169
> 170 def replicate(self, module, device_ids):
> --> 171 return replicate(module, device_ids, not torch.is_grad_enabled())
> 172
> 173 def scatter(self, inputs, kwargs, device_ids):
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/replicate.py in replicate(network, devices, detach)
> 89 params = list(network.parameters())
> 90 param_indices = {param: idx for idx, param in enumerate(params)}
> ---> 91 param_copies = _broadcast_coalesced_reshape(params, devices, detach)
> 92
> 93 buffers = list(network.buffers())
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/replicate.py in _broadcast_coalesced_reshape(tensors, devices, detach)
> 69 # Use the autograd function to broadcast if not detach
> 70 if len(tensors) > 0:
> ---> 71 tensor_copies = Broadcast.apply(devices, *tensors)
> 72 return [tensor_copies[i:i + len(tensors)]
> 73 for i in range(0, len(tensor_copies), len(tensors))]
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/_functions.py in forward(ctx, target_gpus, *inputs)
> 21 ctx.num_inputs = len(inputs)
> 22 ctx.input_device = inputs[0].get_device()
> ---> 23 outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
> 24 non_differentiables = []
> 25 for idx, input_requires_grad in enumerate(ctx.needs_input_grad[1:]):
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/comm.py in broadcast_coalesced(tensors, devices, buffer_size)
> 56 devices = [_get_device_index(d) for d in devices]
> 57 tensors = [_handle_complex(t) for t in tensors]
> ---> 58 return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
> 59
> 60
>
> RuntimeError: NCCL Error 2: unhandled system error
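To narrow down whether the failure is specific to DataParallel or to the underlying GPU-to-GPU transfer, I can also call the broadcast primitive directly and compare it with a plain device-to-device copy that does not go through NCCL (again just a sketch; torch.cuda.comm.broadcast is the non-coalesced sibling of the broadcast_coalesced call at the bottom of the traceback):

import torch

# Broadcast a small tensor from GPU 0 to GPUs 0 and 1 using the same
# family of collectives that DataParallel's replicate() relies on.
t = torch.ones(10, device="cuda:0")
copies = torch.cuda.comm.broadcast(t, devices=[0, 1])
print([c.device for c in copies])

# A plain peer-to-peer copy for comparison; this uses a CUDA memcpy, not NCCL.
print(t.to("cuda:1").device)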
Moving the inner model to the GPU before wrapping it in DataParallel:

import torch
x = torch.rand(300, 400, 500).cuda()
model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
model = torch.nn.DataParallel(model.cuda(), device_ids=[0,1])
y = model(x)
> ---------------------------------------------------------------------------
> RuntimeError Traceback (most recent call last)
> <ipython-input-137-a1695a6de8c1> in <module>
> 4 model = torch.nn.Sequential(torch.nn.Linear(500, 900), torch.nn.Linear(900, 1))
> 5 model = torch.nn.DataParallel(model.cuda(), device_ids=[0,1])
> ----> 6 y = model(x)
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
> 916 result = self._slow_forward(*input, **kwargs)
> 917 else:
> --> 918 result = self.forward(*input, **kwargs)
> 919 for hook in itertools.chain(
> 920 _global_forward_hooks.values(),
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
> 164 if len(self.device_ids) == 1:
> 165 return self.module(*inputs[0], **kwargs[0])
> --> 166 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
> 167 outputs = self.parallel_apply(replicas, inputs, kwargs)
> 168 return self.gather(outputs, self.output_device)
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in replicate(self, module, device_ids)
> 169
> 170 def replicate(self, module, device_ids):
> --> 171 return replicate(module, device_ids, not torch.is_grad_enabled())
> 172
> 173 def scatter(self, inputs, kwargs, device_ids):
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/replicate.py in replicate(network, devices, detach)
> 89 params = list(network.parameters())
> 90 param_indices = {param: idx for idx, param in enumerate(params)}
> ---> 91 param_copies = _broadcast_coalesced_reshape(params, devices, detach)
> 92
> 93 buffers = list(network.buffers())
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/replicate.py in _broadcast_coalesced_reshape(tensors, devices, detach)
> 69 # Use the autograd function to broadcast if not detach
> 70 if len(tensors) > 0:
> ---> 71 tensor_copies = Broadcast.apply(devices, *tensors)
> 72 return [tensor_copies[i:i + len(tensors)]
> 73 for i in range(0, len(tensor_copies), len(tensors))]
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/_functions.py in forward(ctx, target_gpus, *inputs)
> 21 ctx.num_inputs = len(inputs)
> 22 ctx.input_device = inputs[0].get_device()
> ---> 23 outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
> 24 non_differentiables = []
> 25 for idx, input_requires_grad in enumerate(ctx.needs_input_grad[1:]):
>
> /opt/conda/lib/python3.6/site-packages/torch/nn/parallel/comm.py in broadcast_coalesced(tensors, devices, buffer_size)
> 56 devices = [_get_device_index(d) for d in devices]
> 57 tensors = [_handle_complex(t) for t in tensors]
> ---> 58 return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
> 59
> 60
>
> RuntimeError: NCCL Error 2: unhandled system error
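In case the environment matters, this is how I would gather the standard PyTorch environment report to attach here (equivalent to running python -m torch.utils.collect_env from a shell):

from torch.utils.collect_env import get_pretty_env_info

# Prints PyTorch/CUDA/NCCL/driver versions and the detected GPUs.
print(get_pretty_env_info())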