Getting a blue screen error "system thread exception not handled"

I am trying to run the following code on WSL2 (Ubuntu 20.04), CUDA version 12.

import os

import torch
from PIL import Image
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

class CustomDataSet(Dataset):

    def __init__(self, data_dir, transform=transforms.ToTensor()):
        self.images = []
        self.data_dir = data_dir
        labels = os.listdir(self.data_dir)
        labels.sort()
        self.transform = transform

        # Index every image path together with the id of its label folder
        for i, label in enumerate(labels):
            label_dir = os.path.join(data_dir, label)
            for image_name in os.listdir(label_dir):
                image_path = os.path.join(label_dir, image_name)
                self.images.append((image_path, i))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path, label = self.images[idx]
        image = Image.open(img_path).convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image, label

class CustomTestDataSet(Dataset):

    def __init__(self, data_dir, transform=transforms.ToTensor()):
        self.images = []
        self.data_dir = data_dir
        labels = os.listdir(self.data_dir)
        labels.sort()
        self.transform = transform

        for i, label in enumerate(labels):
            label_dir = os.path.join(data_dir, label)
            for image_name in os.listdir(label_dir):
                image_path = os.path.join(label_dir, image_name)
                self.images.append((image_path, i))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path, label = self.images[idx]
        image = Image.open(img_path).convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        # Unlike the training set, return the file path instead of the label
        return image, img_path
train_set = CustomDataSet(data_dir=os.path.join(datapath, "train_set"))
test_set = CustomTestDataSet(data_dir=os.path.join(datapath, "test_set"))

train_set, valid_set = torch.utils.data.random_split(
    train_set, [0.8, 0.2], generator=torch.Generator().manual_seed(0))
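Side note: passing fractions like [0.8, 0.2] to random_split only works on reasonably recent PyTorch releases; older builds expect integer lengths. A minimal fallback sketch, using the same train_set and seed as above:

# Fallback for older PyTorch builds whose random_split
# only accepts integer lengths rather than fractions.
n_total = len(train_set)
n_train = int(0.8 * n_total)
train_set, valid_set = torch.utils.data.random_split(
    train_set, [n_train, n_total - n_train],
    generator=torch.Generator().manual_seed(0))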

cuda_device_id = 0

train_loader = DataLoader(train_set,
                          batch_size=64,
                          shuffle=True,
                          num_workers=2,
                          pin_memory=True,
                          pin_memory_device="cuda:%i" % cuda_device_id)

test_loader = DataLoader(test_set,
                         batch_size=64,
                         shuffle=False,
                         num_workers=2,
                         pin_memory=True,
                         pin_memory_device="cuda:%i" % cuda_device_id)

valid_loader = DataLoader(valid_set,
                          batch_size=64,
                          shuffle=True,
                          num_workers=2,
                          pin_memory=True,
                          pin_memory_device="cuda:%i" % cuda_device_id)

All of this works fine, but when I try to run the following code

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 6, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=5, stride=2),
            nn.Conv2d(6, 12, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(12, 36, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=5, stride=2)
        )

        self.fc_layers = nn.Sequential(
            # 36 channels of 4x4 feature maps, given 64x64 input images
            nn.Linear(36 * 4 * 4, 120),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, 30)
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # flatten to (batch, 36*4*4)
        x = self.fc_layers(x)
        return x


cnn = CNN()
cnn.to('cuda')
# Define the loss function
loss_function_cnn = nn.CrossEntropyLoss()

# Define the optimizer
optimizer_cnn = optim.SGD(cnn.parameters(), lr=0.01, momentum=0.9)
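For reference, the training loop that would follow is the standard pattern (a sketch only, since the crash below happens before it is ever reached):

# One epoch wired to the objects defined above (sketch, not the
# original loop): move each batch to the GPU, then the usual
# zero_grad / forward / loss / backward / step sequence.
for images, labels in train_loader:
    images = images.to('cuda', non_blocking=True)
    labels = labels.to('cuda', non_blocking=True)

    optimizer_cnn.zero_grad()
    outputs = cnn(images)
    loss = loss_function_cnn(outputs, labels)
    loss.backward()
    optimizer_cnn.step()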

Within a few seconds I get a blue screen of death ("System thread exception not handled", with nvlddmkm.sys reported as the failed module). I am using images from TinyImageNet30 to train the model.

I have narrowed it down to the line

cnn.to('cuda')
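A minimal way to check whether the bare host-to-device copy is what triggers it (stock PyTorch calls only, nothing from my pipeline):

# Isolation check: exercises only the host-to-device transfer,
# which is the same driver path cnn.to('cuda') goes through.
import torch

print(torch.cuda.is_available())  # the runtime should see the GPU
x = torch.randn(4, 3, 64, 64)
x = x.to('cuda')                  # bare device copy
print(x.device)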

Found the solution: the “latest” Nvidia drivers I had installed had a bug, and downloading a hotfix from the Nvidia forums fixed the issue.

Hi, I’m also encountering the same blue screen error when attempting to run a PyTorch program on my Windows 10 system with an Nvidia Quadro P500 GPU (driver version 531.4, CUDA version 12.1). I suspect the issue may also be related to the CUDA toolkit version I’m using.

Here are the specifics of my setup:

  • Python Version: 3.11 (installed via Microsoft Store)
  • PyTorch Version: 2.1.1+cu121 (installed via Pip)
  • Associated Libraries: torchaudio 2.1.1+cu121, torchvision 0.16.1+cu121

Could you share the hotfix from the Nvidia forum?
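In case it helps to compare setups, these stock PyTorch calls report the versions actually in use:

# Version report, useful when comparing setups across machines:
import torch

print(torch.__version__)                    # e.g. 2.1.1+cu121
print(torch.version.cuda)                   # CUDA runtime PyTorch was built against
print(torch.cuda.get_device_name(0))        # GPU as seen by the driver
print(torch.cuda.get_device_properties(0))  # capability, total memory, etc.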


@StoeckOverflow @Karan_Nautiyal Did you come up with any solutions here? Training starts, but it kills my PC after an hour or so:

Windows 10, 32 GB RAM
RTX 4800, 32 GB
Python 3.11.8
torch 2.2.1+cu121
torchaudio 2.2.1+cu121
torchvision 0.17.1

Nvidia drivers 551.61 (release date: 02/22/2024)

Minidump info below:
PROCESS_NAME: System

STACK_TEXT:
ffffe0032671e028 fffff80463e03ad0 : 0000000000000119 0000000000000005 ffffbc84caeaa000 ffffbc84caf30620 : nt!KeBugCheckEx
ffffe0032671e030 fffff8047cae029d : ffffbc84cafa6000 ffffbc84caeaa000 000000000000ffff fffff804712a434c : watchdog!WdLogEvent5_WdCriticalError+0xe0
ffffe0032671e070 fffff8047caced0d : ffffbc84cae78000 ffffbc84caeaa000 ffffe0032671e2c0 fffff8047cacef8e : dxgmms2!VidSchiProcessIsrFaultedPacket+0x26d
ffffe0032671e0f0 fffff8047cabe231 : ffffbc840001e794 fffff80471d4d690 ffffbc84cae78000 00000000ffffffff : dxgmms2!VidSchDdiNotifyInterruptWorker+0x10a9d
ffffe0032671e150 fffff80464a4d914 : ffffbc84c423a030 ffffe0032671e2c0 ffffbc84c4350000 0000000000000000 : dxgmms2!VidSchDdiNotifyInterrupt+0xd1
ffffe0032671e1a0 fffff80471d6b96f : ffffbc84c423a030 ffffbc84c4350000 ffffe0030000000e ffffe00300000000 : dxgkrnl!DxgNotifyInterruptCB+0x94
ffffe0032671e1d0 ffffbc84c423a030 : ffffbc84c4350000 ffffe0030000000e ffffe00300000000 fffff80471d6b8f1 : nvlddmkm+0xb9b96f
ffffe0032671e1d8 ffffbc84c4350000 : ffffe0030000000e ffffe00300000000 fffff80471d6b8f1 ffffbc84c4350000 : 0xffffbc84c423a030
ffffe0032671e1e0 ffffe0030000000e : ffffe00300000000 fffff80471d6b8f1 ffffbc84c4350000 0000000000000000 : 0xffffbc84c4350000
ffffe0032671e1e8 ffffe00300000000 : fffff80471d6b8f1 ffffbc84c4350000 0000000000000000 0000000000000000 : 0xffffe0030000000e
ffffe0032671e1f0 fffff80471d6b8f1 : ffffbc84c4350000 0000000000000000 0000000000000000 0000000000000000 : 0xffffe00300000000
ffffe0032671e1f8 ffffbc84c4350000 : 0000000000000000 0000000000000000 0000000000000000 0000000000000000 : nvlddmkm+0xb9b8f1
ffffe0032671e200 0000000000000000 : 0000000000000000 0000000000000000 0000000000000000 ffffbc84c423a030 : 0xffffbc84c4350000

SYMBOL_NAME: nvlddmkm+b9b96f

MODULE_NAME: nvlddmkm

IMAGE_NAME: nvlddmkm.sys

STACK_COMMAND: .cxr; .ecxr ; kb

BUCKET_ID_FUNC_OFFSET: b9b96f

FAILURE_BUCKET_ID: 0x119_5_DRIVER_FAULTED_SYSTEM_COMMAND_nvlddmkm!unknown_function

OS_VERSION: 10.0.19041.1

BUILDLAB_STR: vb_release

OSPLATFORM_TYPE: x64

OSNAME: Windows 10

FAILURE_ID_HASH: {55a61c3c-91b1-e527-dcff-f2f0d7348227}