CNN using BCELoss causes CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

Hi everyone,

I am trying to train a CNN to recognize age, race, and gender using the UTKFace dataset. I managed to train the model for gender with MSE loss, but I didn't like the results (regression for a classification problem). The thing is, when I switch my loss function to BCELoss, I get this error, which I really don't understand even after going through this post and this other one. I'll paste my code below; any sort of guidance is really appreciated :-)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.cuda()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 94 * 94, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 4)  # 4 because batch size is 4

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 94 * 94)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
net.to(torch.device("cuda:0"))
#loss_function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters())

#Training the network
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, _, labels = data

        #getting the gender info from labels
        genders = [train_map[label] for label in labels]

        # zero the parameter gradients
        optimizer.zero_grad()
        labels = torch.Tensor(genders)
        labels = torch.reshape(labels, (4,))

        # forward + backward + optimize
        outputs = net(inputs.cuda())
        outputs = torch.reshape(outputs, (4, ))

        loss = criterion(outputs, labels.cuda())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 500 == 0:    # print every 500 mini-batches
            print('[%d, %5d] loss: %.8f' %
                  (epoch + 1, i + 1, running_loss / 500))
            running_loss = 0.0

print('Finished Training')

RuntimeError                              Traceback (most recent call last)
<ipython-input-12-e3c4d5fc0ef2> in <module>
     20 
     21         loss = criterion(outputs, labels.cuda())
---> 22         loss.backward()
     23         optimizer.step()
     24 



~/.local/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    219                 retain_graph=retain_graph,
    220                 create_graph=create_graph)
--> 221         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    222 
    223     def register_hook(self, hook):

~/.local/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    128         retain_graph = create_graph
    129 
--> 130     Variable._execution_engine.run_backward(
    131         tensors, grad_tensors_, retain_graph, create_graph,
    132         allow_unreachable=True)  # allow_unreachable flag

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

This error might be raised if you are running out of memory and cuBLAS isn't able to create its handle.
Could you reduce the batch size and rerun the script?
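For reference, the batch size lives in the DataLoader, so the change would look roughly like this (a minimal sketch; trainset and testset stand in for whatever Dataset objects were already built):

from torch.utils.data import DataLoader

trainloader = DataLoader(trainset, batch_size=2, shuffle=True)   # smaller batches -> less GPU memory per step
testloader = DataLoader(testset, batch_size=2, shuffle=False)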

I reduced the batch size to two in both dataloaders (train and test), and when I went back into the Jupyter notebook to rerun this cell

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.cuda()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 94 * 94, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)  # batch size 2, but the error is different :(

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 94 * 94)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
net = Net()
net.to(torch.device("cuda:0"))
#loss_function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters())

I get this error

RuntimeError                              Traceback (most recent call last)
<ipython-input-16-c8b64f3361eb> in <module>
      1 net = Net()
----> 2 net.to(torch.device("cuda:0"))
      3 #loss_function and optimizer
      4 criterion = nn.BCELoss()
      5 optimizer = optim.Adam(net.parameters())

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in to(self, *args, **kwargs)
    610             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    611 
--> 612         return self._apply(convert)
    613 
    614     def register_backward_hook(

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    357     def _apply(self, fn):
    358         for module in self.children():
--> 359             module._apply(fn)
    360 
    361         def compute_should_use_set_data(tensor, tensor_applied):

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    379                 # `with torch.no_grad():`
    380                 with torch.no_grad():
--> 381                     param_applied = fn(param)
    382                 should_use_set_data = compute_should_use_set_data(param, param_applied)
    383                 if should_use_set_data:

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in convert(t)
    608             if convert_to_format is not None and t.dim() == 4:
    609                 return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format)
--> 610             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    611 
    612         return self._apply(convert)

RuntimeError: CUDA error: device-side assert triggered

Try to restart the Python kernel in the Jupyter notebook or execute the Python script through the terminal directly. If you are rerunning cells after an assert is triggered, the error will just be raised again, since the CUDA context is corrupted.
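If you do run it from the terminal, one common trick (not specific to this thread, just a general debugging aid) is to make CUDA kernel launches synchronous so the traceback points at the operation that actually failed; a minimal sketch, with the environment variable set before any CUDA work happens:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"   # synchronous kernel launches -> accurate stack traces

import torch
# ... build the model, move it to cuda:0, and run the training loop as before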

Well, the error is different now:

RuntimeError                              Traceback (most recent call last)
<ipython-input-11-d2b63f822ee5> in <module>
     16 
     17         # forward + backward + optimize
---> 18         outputs = net(inputs.cuda())
     19         outputs = torch.reshape(outputs, (2, ))
     20 

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

<ipython-input-7-518392d92557> in forward(self, x)
     13         x = self.pool(F.relu(self.conv1(x)))
     14         x = self.pool(F.relu(self.conv2(x)))
---> 15         x = x.view(-1, 16 * 94 * 94)
     16         x = F.relu(self.fc1(x))
     17         x = F.relu(self.fc2(x))

RuntimeError: shape '[-1, 141376]' is invalid for input of size 70688

This has to do with how I call x.view, right? I believe there's another forum post on that one.

This new error is raised because the view operation creates a tensor whose shape doesn't match the expected in_features of the next linear layer.
Try to use x = x.view(x.size(0), -1), print the shape, and make sure self.fc1 has the expected in_features set.

I fixed the x.view statement, but I'm still confused about the math to find the in_features (still pretty new to ML).

You can print the shape of the flattened x tensor after the view operation and set the in_features of self.fc1 to this value:

x = x.view(x.size(0), -1)
print(x.shape)

as suggested before, or alternatively you could manually calculate the output shapes of the activations using the shape formulas for the corresponding layers in the docs.
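As a concrete sketch of that calculation (assuming the 200x200 UTKFace crops, which is consistent with 70688 = 2 * 16 * 47 * 47 in the error above): each conv with kernel size 5 shrinks the spatial size by 4, and each pooling layer halves it, so you can verify the flattened size with a dummy input:

import torch
import torch.nn as nn
import torch.nn.functional as F

conv1 = nn.Conv2d(3, 6, 5)
conv2 = nn.Conv2d(6, 16, 5)
pool = nn.MaxPool2d(2, 2)

x = torch.randn(1, 3, 200, 200)         # one dummy 200x200 RGB image
x = pool(F.relu(conv1(x)))              # conv: 200 - 5 + 1 = 196, pool: 196 / 2 = 98
x = pool(F.relu(conv2(x)))              # conv: 98 - 5 + 1 = 94,  pool: 94 / 2 = 47
print(x.shape)                          # torch.Size([1, 16, 47, 47])
print(x.view(x.size(0), -1).size(1))    # 16 * 47 * 47 = 35344 -> in_features for self.fc1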