First run on multiple GPUs is throwing an error

Hello.

I have access to multiple GPUs for the first time and I'm getting an error.

Here is the code block that instantiates the model. I'm not sure if I'm doing the DataParallel call right:

import torch
from torch.optim import lr_scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

torch.manual_seed(101)
combined_model = Image_Embedd(embedding_size=train_categorical_embedding_sizes)
criterion = torch.nn.NLLLoss().cuda()
optimizer = torch.optim.Adam(combined_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=4, verbose=True, min_lr=1e-8)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
combined_model = combined_model.cuda()                   # move the model to GPU first
combined_model = torch.nn.DataParallel(combined_model)   # then wrap it for multi-GPU
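
Side note: since CUDA kernel launches are asynchronous, the traceback below may point at the wrong op. To get the exact failing line I can rerun with synchronous launches; a minimal sketch (the env var has to be set before the first CUDA call, e.g. at the top of the notebook):

import os

# Force synchronous kernel launches so the Python traceback points at the
# op that actually raised the device-side assert.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"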

Here is the full traceback. Does anyone see the mistake I'm making?

RuntimeError                              Traceback (most recent call last)
<ipython-input-28-74392e3c8a7d> in <module>
     18             break
     19 
---> 20         y_pred = combined_model(image, numerical_data, categorical_data)
     21         single_loss = criterion(y_pred, label)
     22 

~/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
    150             return self.module(*inputs[0], **kwargs[0])
    151         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 152         outputs = self.parallel_apply(replicas, inputs, kwargs)
    153         return self.gather(outputs, self.output_device)
    154 

~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
    160 
    161     def parallel_apply(self, replicas, inputs, kwargs):
--> 162         return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
    163 
    164     def gather(self, outputs, output_device):

~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
     83         output = results[i]
     84         if isinstance(output, ExceptionWrapper):
---> 85             output.reraise()
     86         outputs.append(output)
     87     return outputs

~/miniconda3/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
    392             # (https://bugs.python.org/issue2651), so we work around it.
    393             msg = KeyErrorMessage(msg)
--> 394         raise self.exc_type(msg)

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "<ipython-input-24-86287e73cc1f>", line 34, in forward
    x = torch.cat(embeddings, 1)
RuntimeError: cuda runtime error (710) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCGeneral.cpp:313
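
From what I've read, error 710 is a device-side assert, and with embeddings that usually means an index that is out of range for one of the nn.Embedding tables; the assert fires during the lookup and only gets reported at torch.cat because of the asynchronous launches. Here is a quick sanity check I'm planning to run on a batch before the forward pass, assuming train_categorical_embedding_sizes is a list of (num_categories, embedding_dim) pairs with one entry per column of categorical_data (those are my names from the code above, not a library API):

# Every categorical index must be strictly less than its table size,
# otherwise the embedding lookup triggers a device-side assert.
for col, (num_categories, _) in enumerate(train_categorical_embedding_sizes):
    max_idx = int(categorical_data[:, col].max())
    if max_idx >= num_categories:
        print(f"column {col}: max index {max_idx} >= table size {num_categories}")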