Hello,
I’m trying to move a model that trains fine on a single GPU over to a machine with 4 GPUs, and I’m on a tight timeline to get it running on that machine.
I’m getting the following error:
RuntimeError Traceback (most recent call last)
<ipython-input-28-4b69b40dcdef> in <module>
18 break
19
---> 20 y_pred = combined_model(image, numerical_data, categorical_data)
21 single_loss = criterion(y_pred, label)
22
~/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
150 return self.module(*inputs[0], **kwargs[0])
151 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 152 outputs = self.parallel_apply(replicas, inputs, kwargs)
153 return self.gather(outputs, self.output_device)
154
~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
160
161 def parallel_apply(self, replicas, inputs, kwargs):
--> 162 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
163
164 def gather(self, outputs, output_device):
~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
83 output = results[i]
84 if isinstance(output, ExceptionWrapper):
---> 85 output.reraise()
86 outputs.append(output)
87 return outputs
~/miniconda3/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
392 # (https://bugs.python.org/issue2651), so we work around it.
393 msg = KeyErrorMessage(msg)
--> 394 raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "<ipython-input-25-86287e73cc1f>", line 34, in forward
x = torch.cat(embeddings, 1)
RuntimeError: cuda runtime error (710) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCGeneral.cpp:313
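From searching around, my understanding (not confirmed) is that a device-side assert raised around an embedding lookup usually means an index is outside the embedding table’s range, so I sketched the CPU-side check below. It assumes categorical_data has shape (batch, n_categorical_cols) and that train_categorical_embedding_sizes is the same list of (num_categories, emb_dim) pairs I build the model with; setting CUDA_LAUNCH_BLOCKING=1 in the environment should also make the failing kernel line up with the Python line that launched it.

# Sketch of a CPU-side check for out-of-range categorical indices
# (my guess at what the assert means, not confirmed).
for b, (image, label, policy, numerical_data, categorical_data) in enumerate(train_loader):
    for i, (num_categories, _) in enumerate(train_categorical_embedding_sizes):
        col = categorical_data[:, i]
        if col.min().item() < 0 or col.max().item() >= num_categories:
            print(f"batch {b}, column {i}: indices span "
                  f"[{col.min().item()}, {col.max().item()}] "
                  f"but the embedding table only has {num_categories} rows")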
Here is my model:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

class Image_Embedd(nn.Module):
    def __init__(self, embedding_size):
        '''
        Args
        ---------------------------
        embedding_size: list of (num_categories, embedding_dim) pairs,
            one per categorical column
        '''
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p=.04)
        self.cnn = models.resnet50(pretrained=False).cuda()
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, 1000)
        self.fc1 = nn.Linear(1000, 1077)  # note: defined but not used in forward
        self.fc2 = nn.Linear(1077, 128)
        self.fc3 = nn.Linear(128, 2)

    # define the forward method
    def forward(self, image, x_numerical, x_categorical):
        # look up each categorical column in its embedding table
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:, i]))
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)
        x1 = self.cnn(image)
        x2 = x_numerical
        x3 = torch.cat((x1, x2), dim=1)
        x4 = torch.cat((x, x3), dim=1)
        x4 = F.relu(self.fc2(x4))
        x4 = self.fc3(x4)
        x4 = F.log_softmax(x4, dim=1)
        return x4
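One thing I believe should hold (sketch below, using names from my own setup): fc2 expects 1077 input features, and the width that forward actually builds is the sum of the embedding dims, plus 1000 from the resnet, plus the number of numerical columns, so those have to line up:

# Quick width check for the concatenation feeding fc2 (1077 in_features).
# Assumes numerical_data comes from one batch of my train_loader.
emb_total = sum(nf for _, nf in train_categorical_embedding_sizes)
n_numerical = numerical_data.shape[1]
print(emb_total + 1000 + n_numerical)  # should print 1077 to match fc2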
from torch.optim import lr_scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(101)

combined_model = Image_Embedd(embedding_size=train_categorical_embedding_sizes)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(combined_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=4, verbose=True, min_lr=.00000001)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    combined_model = nn.DataParallel(combined_model)

combined_model.to(device)
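Since DataParallel re-raises exceptions from its worker threads (that’s the “Caught RuntimeError in replica 0” wrapper in the traceback above), my plan for getting a cleaner stack trace is to push one batch through the unwrapped module on a single GPU, roughly like this (combined_model.module is the underlying model when the DataParallel wrapper is in use):

# Sketch: bypass DataParallel for one batch to localize the assert.
single = combined_model.module if isinstance(combined_model, nn.DataParallel) else combined_model
y_pred = single(image.to(device), numerical_data.to(device), categorical_data.to(device))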
epochs = 5000
aggregated_losses = []
max_trn_batch = 11053

for i in range(epochs):
    for b, (image, label, policy, numerical_data, categorical_data) in enumerate(train_loader):
        image = image.to(device)
        label = label.to(device)
        numerical_data = numerical_data.to(device)
        categorical_data = categorical_data.to(device)

        # count batches
        b += 1

        # throttle the batches
        if b == max_trn_batch:
            break

        y_pred = combined_model(image, numerical_data, categorical_data)
        single_loss = criterion(y_pred, label)

        # statistics
        print(f'epoch: {i:3}, batch: {b:3}, loss: {single_loss.item():10.8f}')

        optimizer.zero_grad()
        single_loss.backward()
        optimizer.step()

    aggregated_losses.append(single_loss.cpu().data.numpy())
    scheduler.step(single_loss)
    print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')
I’m not sure what I’m doing wrong. I tried to follow the tutorial here:
https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html