nn.DataParallel RuntimeError: error in LoadLibraryA

Hi, I am trying to apply the nn.DataParallel wrapper to the following model. I believe I have followed this tutorial correctly: https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html. Any idea what the problem may be?

If I use 'logits = Bert.module(token_ids, masks)' instead of 'logits = Bert(token_ids, masks)', the runtime error does not occur, but the system only uses one of the two GPUs.
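I understand that 'Bert.module' is the underlying BertFineGrain instance, so calling it directly skips DataParallel's replicate/scatter/gather path, which is presumably why only one GPU is used and why the broadcast that fails below is never reached. A toy illustration of the two call paths (a stand-in nn.Linear on an assumed two-GPU machine, not my actual model):

import torch
import torch.nn as nn

# Stand-in module wrapped the same way as the BERT model below.
model = nn.DataParallel(nn.Linear(8, 2).to('cuda:0'), device_ids=[0, 1])
x = torch.randn(4, 8, device='cuda:0')

y_parallel = model(x)       # DataParallel.forward: replicate onto GPUs 0 and 1, scatter x, gather outputs
y_single = model.module(x)  # bypasses DataParallel and runs the plain module on its own device only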

import torch
import torch.nn as nn
from torch.optim import Adam, lr_scheduler
from torch.nn.utils import clip_grad_norm_
from pytorch_pretrained_bert import BertModel  # assumed source of BertModel, given output_all_encoded_layers below

class BertFineGrain(nn.Module):
    def __init__(self, bins, dropout=0.1):
        super(BertFineGrain, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, bins)
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda:0,1" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Bert = nn.DataParallel(BertFineGrain(bins).to(device))
# Bert = nn.DataParallel(BertFineGrain(bins).to(device), device_ids=[1,0])
Bert = nn.DataParallel(BertFineGrain(bins).to(device), device_ids=[0,1])
print('Model:', type(Bert))
print('Devices:', Bert.device_ids)

Model: <class 'torch.nn.parallel.data_parallel.DataParallel'> 
Devices: [0, 1]
optimizer = Adam(Bert.parameters(), lr=2e-5)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
loss_func = nn.CrossEntropyLoss(weight=None)
EPOCHS = 4
for epoch_num in range(EPOCHS):

    Bert.train()
    train_loss = 0

    # iterate through the training data
    for step_num, batch_data in enumerate(train_loader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        # https://github.com/huggingface/transformers/issues/2952
        token_ids = token_ids.type(torch.LongTensor)
        labels = labels.type(torch.LongTensor)
        token_ids = token_ids.to(device)
        labels = labels.to(device)
        # end bug fix

        Bert.zero_grad()
        logits = Bert(token_ids, masks) 
        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()      
        batch_loss.backward()
        clip_grad_norm_(parameters=Bert.parameters(), max_norm=1.0)
        optimizer.step()
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-...> in <module>
     21         print('t1')
     22         Bert.zero_grad()
---> 23         logits = Bert(token_ids, masks)
     24         batch_loss = loss_func(logits, labels)
     25         train_loss += batch_loss.item()

~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\nn\parallel\data_parallel.py in forward(self, *inputs, **kwargs)
    149         if len(self.device_ids) == 1:
    150             return self.module(*inputs[0], **kwargs[0])
--> 151         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
    152         outputs = self.parallel_apply(replicas, inputs, kwargs)
    153         return self.gather(outputs, self.output_device)

~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\nn\parallel\data_parallel.py in replicate(self, module, device_ids)
    154 
    155     def replicate(self, module, device_ids):
--> 156         return replicate(module, device_ids, not torch.is_grad_enabled())
    157 
    158     def scatter(self, inputs, kwargs, device_ids):

~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\nn\parallel\replicate.py in replicate(network, devices, detach)
     86     params = list(network.parameters())
     87     param_indices = {param: idx for idx, param in enumerate(params)}
---> 88     param_copies = _broadcast_coalesced_reshape(params, devices, detach)
     89 
     90     buffers = list(network.buffers())

~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\nn\parallel\replicate.py in _broadcast_coalesced_reshape(tensors, devices, detach)
     69         # Use the autograd function to broadcast if not detach
     70         if len(tensors) > 0:
---> 71             tensor_copies = Broadcast.apply(devices, *tensors)
     72             return [tensor_copies[i:i + len(tensors)]
     73                     for i in range(0, len(tensor_copies), len(tensors))]

~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\nn\parallel\_functions.py in forward(ctx, target_gpus, *inputs)
     19         ctx.num_inputs = len(inputs)
     20         ctx.input_device = inputs[0].get_device()
---> 21         outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
     22         non_differentiables = []
     23         for idx, input_requires_grad in enumerate(ctx.needs_input_grad[1:]):

~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\cuda\comm.py in broadcast_coalesced(tensors, devices, buffer_size)
     37         corresponding to indices from ``devices``.
     38     """
---> 39     return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
     40 
     41 

RuntimeError: error in LoadLibraryA (DynamicLibrary at ..\aten\src\ATen\DynamicLibrary.cpp:52)
(no backtrace available)
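The failing call is torch._C._broadcast_coalesced, which DataParallel uses to copy the model's parameters onto each GPU before the parallel forward pass. Here is a minimal check that exercises the same broadcast without the model or training loop (a sketch; it assumes both GPUs are visible as devices 0 and 1):

import torch
from torch.cuda import comm

# Broadcast a small tensor from GPU 0 to GPUs 0 and 1, the same kind of copy
# DataParallel.replicate performs. If this also raises 'error in LoadLibraryA',
# the failure is in loading a CUDA-related DLL on Windows, independent of the model.
t = torch.randn(10, device='cuda:0')
copies = comm.broadcast(t, devices=[0, 1])
print([c.device for c in copies])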

Hi,

Could you give some information about how you installed PyTorch, which version of Python you're using, and which version of CUDA, please?

@peterjc123 any idea what could be causing this?
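One quick way to collect those details (a sketch; the built-in 'python -m torch.utils.collect_env' script prints a fuller report):

import sys
import torch

print(sys.version)         # Python version
print(torch.__version__)   # PyTorch version
print(torch.version.cuda)  # CUDA version the installed PyTorch binary was built with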

Sorry, I don’t have a PC with multiple GPUs so I could not reproduce the issue. But I’m improving the experience with https://github.com/pytorch/pytorch/pull/40365.


Hi,

I have PyTorch version 1.4.0, installed through pip.

Here are my CUDA details:

C:\Users\harki>nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Oct_23_19:32:27_Pacific_Daylight_Time_2019
Cuda compilation tools, release 10.2, V10.2.89

Also, I am using two RTX 2070 Super GPUs.
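As a further check (a sketch, not something from the reply above): 'nvcc --version' reports the locally installed CUDA toolkit (10.2 here), which is not necessarily the CUDA version the 1.4.0 pip wheel was built against ('torch.version.cuda' reports that), so it may also help to confirm that both RTX 2070 Super cards are visible to the installed build:

import torch

# Enumerate the GPUs visible to the installed PyTorch build.
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i), torch.cuda.get_device_capability(i))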