Multiple GPU model getting RuntimeError: Caught RuntimeError in replica 0 on device 0

Hello,

I’m trying to move a single GPU model to a machine with 4 GPUs, only I’m on a timeline to use this machine.

I’m getting the following error:

RuntimeError                              Traceback (most recent call last)
<ipython-input-28-4b69b40dcdef> in <module>
     18             break
     19 
---> 20         y_pred = combined_model(image, numerical_data, categorical_data)
     21         single_loss = criterion(y_pred, label)
     22 

~/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
    150             return self.module(*inputs[0], **kwargs[0])
    151         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 152         outputs = self.parallel_apply(replicas, inputs, kwargs)
    153         return self.gather(outputs, self.output_device)
    154 

~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
    160 
    161     def parallel_apply(self, replicas, inputs, kwargs):
--> 162         return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
    163 
    164     def gather(self, outputs, output_device):

~/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
     83         output = results[i]
     84         if isinstance(output, ExceptionWrapper):
---> 85             output.reraise()
     86         outputs.append(output)
     87     return outputs

~/miniconda3/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
    392             # (https://bugs.python.org/issue2651), so we work around it.
    393             msg = KeyErrorMessage(msg)
--> 394         raise self.exc_type(msg)

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "<ipython-input-25-86287e73cc1f>", line 34, in forward
    x = torch.cat(embeddings, 1)
RuntimeError: cuda runtime error (710) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCGeneral.cpp:313

Here is my model:

class Image_Embedd(nn.Module):

    def __init__(self, embedding_size):
        '''
        Args
        ---------------------------
        embedding_size: Contains the embedding size for the categorical columns
        num_numerical_cols: Stores the total number of numerical columns
        output_size: The size of the output layer or the number of possible outputs.
        layers: List which contains number of neurons for all the layers.
        p: Dropout with the default value of 0.5
        
        '''
        super().__init__()    
        
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p = .04)
        
        self.cnn = models.resnet50(pretrained=False).cuda()
        
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, 1000)
        self.fc1 = nn.Linear(1000, 1077)
        self.fc2 = nn.Linear(1077, 128)
        self.fc3 = nn.Linear(128, 2)
        
        
    #define the foward method
    def forward(self, image, x_numerical, x_categorical):
        
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
            
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)
        x1 = self.cnn(image)
        x2 = x_numerical
        
        x3 = torch.cat((x1, x2), dim = 1)
        x4 = torch.cat((x, x3), dim = 1)
        x4 = F.relu(self.fc2(x4))
        x4 = self.fc3(x4)
        x4 = F.log_softmax(x4)
        return x4

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(101)
combined_model = Image_Embedd(embedding_size=train_categorical_embedding_sizes)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(combined_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience = 4, verbose = True, min_lr = .00000001)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    combined_model = nn.DataParallel(combined_model)
combined_model.to(device)

epochs = 5000
aggregated_losses = []

max_trn_batch = 11053

for i in range(epochs):
    for b, (image, label, policy, numerical_data, categorical_data) in enumerate(train_loader):
        image = image.to(device)
        label = label.to(device)
        numerical_data = numerical_data.to(device)
        categorical_data = categorical_data.to(device)
        
        #count batches
        b += 1
        
        #throttle teh batches
        if b == max_trn_batch:
            break
        
        y_pred = combined_model(image, numerical_data, categorical_data)
        single_loss = criterion(y_pred, label)
        
        # statistics
        print(f'epoch: {i:3}, batch: {b:3}, loss: {single_loss.item():10.8f}')

        optimizer.zero_grad()
        single_loss.backward()
        optimizer.step()
    
    aggregated_losses.append(single_loss.cpu().data.numpy())
    scheduler.step(single_loss)


print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

I’m not sure what I’m doing wrong. I followed or try to follow the tutorial here:
https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html

Looks like some layers of the model lives on GPU and others live on CPU. Is this intentional? DataParallel does not support mixed CPU-GPU model, all layers of the same model need to live on the same GPU.

If you have multi-GPU model, e.g., some layers live on cuda:0 and others live on cuda:1, you can try DistributedDataParallel. Check out this.

nope. i’m reworking it now. Thank you for pointing that out.

How did you solve the issue??