Hello.
I have access to multiple GPUs for the first time and I"m getting an error.
Here is the code block to instantiate the model. I’m not sure if I’m doing the parrallel
command right…
Here is the code:
torch.manual_seed(101)
combined_model = Image_Embedd(embedding_size=train_categorical_embedding_sizes)
criterion = torch.nn.NLLLoss().cuda()
optimizer = torch.optim.Adam(combined_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience = 4, verbose = True, min_lr = .00000001)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
combined_model = combined_model.cuda()
combined_model = torch.nn.DataParallel(combined_model)
Here is the full trace back. Does anyone see the mistake I’m making?
RuntimeError Traceback (most recent call last)
<ipython-input-28-74392e3c8a7d> in <module>
18 break
19
---> 20 y_pred = combined_model(image, numerical_data, categorical_data)
21 single_loss = criterion(y_pred, label)
22
~/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
150 return self.module(*inputs[0], **kwargs[0])
151 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 152 outputs = self.parallel_apply(replicas, inputs, kwargs)
153 return self.gather(outputs, self.output_device)
154
~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
160
161 def parallel_apply(self, replicas, inputs, kwargs):
--> 162 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
163
164 def gather(self, outputs, output_device):
~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
83 output = results[i]
84 if isinstance(output, ExceptionWrapper):
---> 85 output.reraise()
86 outputs.append(output)
87 return outputs
~/miniconda3/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
392 # (https://bugs.python.org/issue2651), so we work around it.
393 msg = KeyErrorMessage(msg)
--> 394 raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "<ipython-input-24-86287e73cc1f>", line 34, in forward
x = torch.cat(embeddings, 1)
RuntimeError: cuda runtime error (710) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCGeneral.cpp:313