CNN: RuntimeError: CUDA error: device-side assert triggered

Well guys, I am getting the error described in the topic's title. I added CUDA_LAUNCH_BLOCKING=1 to the code to get a better description of the error.
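(As far as I understand, this flag only takes effect if it is set as an environment variable before CUDA is initialized, so I set it at the very top of the script, before importing torch:)

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # forces synchronous kernel launches so the failing call is reported where it happens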

When I run the code on the CPU, I get a different error.

This is the code I am using:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters 
num_epochs = 1
batch_size = 1
learning_rate = 0.001

# The dataset has PIL images in the range [0, 1].
# We resize/crop them and normalize with the ImageNet mean and std.
transform = transforms.Compose([
    transforms.Resize(131),
    transforms.CenterCrop(130),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])



train_dataset = torchvision.datasets.ImageFolder(root='C:/Users/leo_f/OneDrive/Imagens/MV/', transform=transform)

test_dataset = torchvision.datasets.ImageFolder(root='C:/Users/leo_f/OneDrive/Imagens/MV/', transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                          shuffle=True)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
                                         shuffle=False)

classes = ('Cob', 'Broken', 'Rotten', 'Good')


def imshow(img):
    # undo the ImageNet normalization used in the transform before displaying
    mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    npimg = np.clip(img.numpy() * std + mean, 0, 1)
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()


# get some random training images
dataiter = iter(train_loader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.fc1 = nn.Linear(64 * 29 * 29, 2048)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 4)

    def forward(self, x):
        # -> n, 3, 130, 130
        x = self.pool(F.relu(self.conv1(x)))  # -> n, 32, 63, 63
        x = self.pool(F.relu(self.conv2(x)))  # -> n, 64, 29, 29
        x = x.view(-1, 64 * 29 * 29)          # -> n, 53824
        x = F.relu(self.fc1(x))               # -> n, 2048
        x = F.relu(self.fc2(x))               # -> n, 512
        x = self.fc3(x)                       # -> n, 4
        return x


model = ConvNet().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # move the batch to the selected device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 2000 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

print('Finished Training')
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)

with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(len(classes))]
    n_class_samples = [0 for i in range(len(classes))]
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # max returns (value ,index)
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()
        
        for i in range(batch_size):
            label = labels[i]
            pred = predicted[i]
            if (label == pred):
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network: {acc} %')

    for i in range(len(classes)):
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {classes[i]}: {acc} %')

This is the error I am getting when running the code on the GPU:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-5-6a25cd48fbbd> in <module>
     76 
     77 
---> 78 model = ConvNet().to(device)
     79 
     80 criterion = nn.CrossEntropyLoss()

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in to(self, *args, **kwargs)
    605             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    606 
--> 607         return self._apply(convert)
    608 
    609     def register_backward_hook(

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _apply(self, fn)
    352     def _apply(self, fn):
    353         for module in self.children():
--> 354             module._apply(fn)
    355 
    356         def compute_should_use_set_data(tensor, tensor_applied):

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _apply(self, fn)
    374                 # `with torch.no_grad():`
    375                 with torch.no_grad():
--> 376                     param_applied = fn(param)
    377                 should_use_set_data = compute_should_use_set_data(param, param_applied)
    378                 if should_use_set_data:

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in convert(t)
    603             if convert_to_format is not None and t.dim() == 4:
    604                 return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format)
--> 605             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    606 
    607         return self._apply(convert)

RuntimeError: CUDA error: device-side assert triggered

And this is the error I get when running the code on the CPU (I just changed the device configuration to device = torch.device('cpu')):

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-1-30043d5846ff> in <module>
     90 
     91         # Forward pass
---> 92         outputs = model(images)
     93         loss = criterion(outputs, labels)
     94 

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

<ipython-input-1-30043d5846ff> in forward(self, x)
     71         x = x.view(-1, 64 * 29 * 29)            # -> n, 400
     72         x = F.relu(self.fc1(x))               # -> n, 120
---> 73         x = F.relu(self.fc2(x))               # -> n, 84
     74         x = self.fc3(x)                       # -> n, 10
     75         return x

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~\anaconda3\lib\site-packages\torch\nn\modules\linear.py in forward(self, input)
     89 
     90     def forward(self, input: Tensor) -> Tensor:
---> 91         return F.linear(input, self.weight, self.bias)
     92 
     93     def extra_repr(self) -> str:

~\anaconda3\lib\site-packages\torch\nn\functional.py in linear(input, weight, bias)
   1672     if input.dim() == 2 and bias is not None:
   1673         # fused op is marginally faster
-> 1674         ret = torch.addmm(bias, input, weight.t())
   1675     else:
   1676         output = input.matmul(weight.t())

RuntimeError: size mismatch, m1: [1 x 2048], m2: [1024 x 512] at ..\aten\src\TH/generic/THTensorMath.cpp:41

Where are you getting F.relu from? I don't see it declared in the initialization part.
Also, it seems your layers don't have the right dimensions.
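fc1 outputs 2048 features, but fc2 is built as nn.Linear(1024, 512), so it expects 1024; that is exactly what the CPU size-mismatch traceback is complaining about. As a rough sketch (keeping your 130x130 inputs and the two conv/pool stages, not tested on your data), the linear layers should chain like this:

import torch.nn as nn
import torch.nn.functional as F

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 5)
        # 130 -> conv1 -> 126 -> pool -> 63 -> conv2 -> 59 -> pool -> 29
        self.fc1 = nn.Linear(64 * 29 * 29, 2048)
        self.fc2 = nn.Linear(2048, 512)  # in_features must match fc1's out_features
        self.fc3 = nn.Linear(512, 4)     # 4 output classes

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)        # flatten to (n, 64*29*29)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

The CUDA assert in your first traceback fires before anything in this script has actually run on the GPU, so it may just be a stale error left over from an earlier launch in the same notebook session; I would fix the shapes, restart the kernel, and then try the GPU again.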