AlexNet RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

I tried the following code with AlexNet. I’m using it to classify images into 16 different categories; the labels are numerical, from 1 to 16.

iterations = 30
trainLoss = []
testAcc = []

start = time.time()
for epoch in range(iterations):
    epochStart = time.time()
    runningLoss = 0    
    net.train(True) # For training
    for data in trainLoader:
        inputs,labels = data
        # Wrap them in Variable
        if use_gpu:
            inputs, labels = Variable(inputs.float().cuda()), \
                Variable(labels.long().cuda())
        else:
            inputs, labels = Variable(inputs), Variable(labels.long()) 
        inputs = inputs/1600
        # Initialize gradients to zero
        optimizer.zero_grad()
        # Feed-forward input data through the network
        outputs = net(inputs)
        # Compute loss/error
        loss = criterion(outputs, labels)        
        # Backpropagate loss and compute gradients
        loss.backward()
        # Update the network parameters
        optimizer.step()
        # Accumulate loss per batch
        runningLoss += loss.detach()    
    avgTrainLoss = runningLoss/1200
    trainLoss.append(avgTrainLoss)
    # Evaluating performance on test set for each epoch
    net.train(False) # For testing
    inputs = TestImages/1600
    if use_gpu:
        inputs = Variable(inputs.cuda())
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        predicted = predicted.cpu()
    else:
        inputs = Variable(inputs)
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
    correct = 0
    total = 0
    total += TestLabels.size(0)
    correct += (predicted == TestLabels).sum()
    avgTestAcc = correct/400
    testAcc.append(avgTestAcc)
        
    # Plotting Loss vs Epochs
    fig1 = plt.figure(1)        
    plt.plot(range(epoch+1),trainLoss,'r--',label='train')        
    if epoch==0:
        plt.legend(loc='upper left')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')    
    # Plotting testing accuracy vs Epochs
    fig2 = plt.figure(2)        
    plt.plot(range(epoch+1),testAcc,'g-',label='test')        
    if epoch==0:
        plt.legend(loc='upper left')
        plt.xlabel('Epochs')
        plt.ylabel('Testing accuracy')    
    epochEnd = time.time()-epochStart
    print('At Iteration: {:.0f} /{:.0f}  ;  Training Loss: {:.6f} ; Testing Acc: {:.3f} ; Time consumed: {:.0f}m {:.0f}s '\
          .format(epoch + 1,iterations,avgTrainLoss,avgTestAcc*100,epochEnd//60,epochEnd%60))
end = time.time()-start
print('Training completed in {:.0f}m {:.0f}s'.format(end//60,end%60))

However, I receive the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-13-bd2bf36f7806> in <module>
     24         loss = criterion(outputs, labels)
     25         # Backpropagate loss and compute gradients
---> 26         loss.backward()
     27         # Update the network parameters
     28         optimizer.step()

~\anaconda3\envs\pytorch_env\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
    219                 retain_graph=retain_graph,
    220                 create_graph=create_graph)
--> 221         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    222 
    223     def register_hook(self, hook):

~\anaconda3\envs\pytorch_env\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    128         retain_graph = create_graph
    129 
--> 130     Variable._execution_engine.run_backward(
    131         tensors, grad_tensors_, retain_graph, create_graph,
    132         allow_unreachable=True)  # allow_unreachable flag

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

Could anyone let me know what is wrong here, please?
Thanks in advance.

Found a good Stack Overflow post on this here. Your output size might not match the size of your variables, which may be causing the error; you can try some of the fixes at that link.

I’m new here, so could you let me know how the sizes would change? All of them seem to be equal, right? I know it’s a noob question, but how do I check the sizes of my outputs and inputs?

Just print the shapes of your model’s outputs and your labels like this:

print(outputs.shape)
print(labels.shape)

The second line prints the labels’ shape. Also, what loss are you using?

I am using nn.NLLLoss() (negative log-likelihood).
The network is as shown below:

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (fc): Linear(in_features=4096, out_features=2, bias=True)
    (softmax): LogSoftmax(dim=None)
  )
)

If you are using 16 categories, the output features of your fc layer need to be 16, not 2.
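
For example, here is a minimal sketch of the fix, assuming net is the AlexNet printed above with the named fc module inside net.classifier:

import torch.nn as nn

num_classes = 16  # one output per category

# Swap the 2-way head for a 16-way one; in_features stays 4096
# to match the output of the preceding Linear layer.
net.classifier.fc = nn.Linear(4096, num_classes)

# Note: nn.NLLLoss expects class indices in [0, num_classes - 1],
# so labels running from 1 to 16 need to be shifted down by one
# (labels = labels - 1) before computing the loss.

Out-of-range targets on the GPU often surface as cryptic CUDA/cuBLAS errors like this one rather than a clean index error, so the label range is worth checking alongside the layer size.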

Oh OK, I’ll change that and see if anything changes.
Thanks a lot!

Hey Dwight, I have a 1650 and have run the code now; it is returning a different error. Since I am new here, must I put it in a new topic, or is it fine if I ask it here? If it is fine, the error I am facing is:

CUDA out of memory. Tried to allocate 88.00 MiB (GPU 0; 4.00 GiB total capacity; 2.18 GiB already allocated; 42.45 MiB free; 2.33 GiB reserved in total by PyTorch)

Yes, that is just because your GPU ran out of memory. What is your batch size? It is fine, you can just keep it here.

Batch size is 32:

# Creating the PyTorch datasets
trainDataset = TensorDataset(TrainImages, TrainLabels)
testDataset = TensorDataset(TestImages, TestLabels)
# Creating the dataloaders
BatchSize = 32
trainLoader = DataLoader(trainDataset, batch_size=BatchSize, shuffle=True, num_workers=4, pin_memory=True)
testLoader = DataLoader(testDataset, batch_size=BatchSize, shuffle=True, num_workers=4, pin_memory=True)

Yeah, that is probably too large for your GPU memory. Try halving it to 16.

Yes, I’ve tried it again but got the following error. Must I decrease it more?

RuntimeError: CUDA out of memory. Tried to allocate 88.00 MiB (GPU 0; 4.00 GiB total capacity; 2.27 GiB already allocated; 10.45 MiB free; 2.36 GiB reserved in total by PyTorch)

OK, you’re just going to have to keep lowering it until you don’t run out of memory, so just keep cutting it in half.
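
One more thing worth checking (a sketch, not something taken from your trace): your test pass runs all 400 test images through the network in a single forward pass, and it does so without disabling autograd. Evaluating batch by batch from the testLoader you already built, under torch.no_grad(), usually cuts evaluation memory a lot:

import torch

# Hypothetical batched evaluation using the existing testLoader
net.train(False)  # evaluation mode
correct = 0
total = 0
with torch.no_grad():  # no autograd graph, so activations are freed right away
    for inputs, labels in testLoader:
        if use_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()
        inputs = inputs / 1600  # same scaling as in training
        outputs = net(inputs)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
avgTestAcc = correct / total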

OK, sure. Thanks a lot for your help!

I saw you created a good post; that is probably a good idea. One more question: what is the size of your images? If you don’t know, you can print out images.shape in your training loop to find out.

Hey, the sizes of my images are as follows:

TrainImages = torch.FloatTensor(1200,3,256, 256)
TrainLabels = torch.LongTensor(1200)
TestImages = torch.FloatTensor(400,3,256,256)
TestLabels = torch.LongTensor(400)

Hope this helps!

Edit: I tried printing it out and it gave the following output:

torch.Size([1200, 3, 256, 256])
torch.Size([1200])
torch.Size([400, 3, 256, 256])
torch.Size([400])

Wait, what does the first number mean? I’m confused. Why is the shape (1200, 3, 256, 256); what is 1200 doing there? The shape’s four dimensions are supposed to be batch size, channels, x, and y dims.

Actually, I’m using 1200 images for training and 400 images for testing, so I created these tensors and saved the NumPy array data of the 1200 images into the train tensor and the 400 images into the test tensor.

OK, so when you print out the shapes in the training loop, is the shape still (1200, 3, 256, 256), or is it (batch_size, 3, 256, 256)?

I’m sorry, but this is my loop for training and testing. If you don’t mind telling me where I can check the sizes, I will let you know the results. Again, I’m really sorry for taking up your time like this due to my lack of knowledge.

iterations = 10
trainLoss = []
testAcc = []

start = time.time()
for epoch in range(iterations):
    epochStart = time.time()
    runningLoss = 0    
    net.train(True) # For training
    for data in trainLoader:
        inputs,labels = data
        # Wrap them in Variable
        if use_gpu:
            inputs, labels = Variable(inputs.float().cuda()), \
                Variable(labels.long().cuda())
        else:
            inputs, labels = Variable(inputs), Variable(labels.long()) 
        inputs = inputs/1600
        # Initialize gradients to zero
        optimizer.zero_grad()
        # Feed-forward input data through the network
        outputs = net(inputs)
        # Compute loss/error
        loss = criterion(outputs, labels)        
        # Backpropagate loss and compute gradients
        loss.backward()
        # Update the network parameters
        optimizer.step()
        # Accumulate loss per batch
        runningLoss += loss.detach()    
    avgTrainLoss = runningLoss/1200
    trainLoss.append(avgTrainLoss)
    # Evaluating performance on test set for each epoch
    net.train(False) # For testing
    inputs = TestImages/1600
    if use_gpu:
        inputs = Variable(inputs.cuda())
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        predicted = predicted.cpu()
    else:
        inputs = Variable(inputs)
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
    correct = 0
    total = 0
    total += TestLabels.size(0)
    correct += (predicted == TestLabels).sum()
    avgTestAcc = correct/400
    testAcc.append(avgTestAcc)
        
    # Plotting Loss vs Epochs
    fig1 = plt.figure(1)        
    plt.plot(range(epoch+1),trainLoss,'r--',label='train')        
    if epoch==0:
        plt.legend(loc='upper left')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')    
    # Plotting testing accuracy vs Epochs
    fig2 = plt.figure(2)        
    plt.plot(range(epoch+1),testAcc,'g-',label='test')        
    if epoch==0:
        plt.legend(loc='upper left')
        plt.xlabel('Epochs')
        plt.ylabel('Testing accuracy')    
    epochEnd = time.time()-epochStart
    print('At Iteration: {:.0f} /{:.0f}  ;  Training Loss: {:.6f} ; Testing Acc: {:.3f} ; Time consumed: {:.0f}m {:.0f}s '\
          .format(epoch + 1,iterations,avgTrainLoss,avgTestAcc*100,epochEnd//60,epochEnd%60))
end = time.time()-start
print('Training completed in {:.0f}m {:.0f}s'.format(end//60,end%60))
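
The place to check is right at the top of the inner for loop, before you wrap or rescale anything; a minimal sketch using the names from your code:

for data in trainLoader:
    inputs, labels = data
    # Inside the loop these should print (BatchSize, 3, 256, 256) and
    # (BatchSize,), because the DataLoader yields one batch at a time;
    # only the full TrainImages tensor has 1200 in front.
    print(inputs.shape)
    print(labels.shape)
    break  # one batch is enough for a sanity check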