I’ve been having a lot of problems with PyTorch’s DataParallel. I’ve tried the simplest possible use of DataParallel I can think of — wrapping a single nn.Linear layer — and it still raises a RuntimeError on the forward call (full traceback below). Any help or advice would be greatly appreciated! This is running on a server with two P100 GPUs.
In [4]: mlp = nn.DataParallel(nn.Linear(100, 200))
In [5]: mlp(torch.zeros((32, 100)))
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~/code/projects/mve_sac/core.py in <module>
----> 1 mlp(torch.zeros((32, 100)))
/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
/opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
144 raise RuntimeError("module must have its parameters and buffers "
145 "on device {} (device_ids[0]) but found one of "
--> 146 "them on device: {}".format(self.src_device_obj, t.device))
147
148 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cpu