Hello!
I ran the test code on an NVIDIA RTX A6000 GPU and it fails with the following error:
Use GPU
Epoch: 0
1.000078866541965
Traceback (most recent call last):
  File "C:\Users\hp\.conda\envs\Pytorch\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
    exec(code, globals, locals)
  File "d:\jktong\1cnn-fit-exp-size\250by250\test_forum.py", line 87, in <module>
    outputs = net(inputs)
  File "C:\Users\hp\.conda\envs\Pytorch\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "d:\jktong\1cnn-fit-exp-size\250by250\test_forum.py", line 42, in forward
    x = nn.LeakyReLU(0.1)( self.conv3(x) )
  File "C:\Users\hp\.conda\envs\Pytorch\lib\site-packages\torch\nn\modules\module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\hp\.conda\envs\Pytorch\lib\site-packages\torch\nn\modules\conv.py", line 463, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "C:\Users\hp\.conda\envs\Pytorch\lib\site-packages\torch\nn\modules\conv.py", line 459, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: CUDA error: CUBLAS_STATUS_INTERNAL_ERROR when calling `cublasDgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
To debug the code, I set the environment variable
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
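As I understand it, this makes kernel launches synchronous, so the reported stack trace points at the kernel that actually failed; it has to take effect before the first CUDA call, which is why it sits near the top of the script:

import os

# Make CUDA kernel launches synchronous so an error surfaces at the
# failing call; must be set before the CUDA context is created, i.e.
# before any tensor or module is moved to the GPU.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'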
The complete code I am running is listed below:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        #self.conv1 = nn.Conv2d(3, 6, 5)
        #self.fc1 = nn.Linear(72*12, 768)
        #self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(2*3*64*64, 80*80)
        self.fc2 = nn.Linear(80*80, 125*125)
        #self.fc3 = nn.Linear(2*3*16*16, 60*60)
        self.us = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        self.conv1 = nn.Conv2d(1, 64, 5, padding=2)  # padding=2 keeps the spatial size (a 5x5 kernel alone would shrink it by 4)
        self.conv2 = nn.Conv2d(64, 32, 5, padding=2)
        self.conv3 = nn.Conv2d(32, 16, 5, padding=2)
        self.conv4 = nn.Conv2d(16, 4, 5, padding=2)
        self.conv5 = nn.Conv2d(4, 1, 5, padding=2)
        # With ReLU and without ReLU the results are similar

    def forward(self, x):
        x = x.view(-1, 2*3*64*64)
        #x = self.conv1(x)
        #x = x.view(-1, 72*12)
        ## Fully connected layers
        x = self.fc1(x)
        #x = F.relu(self.fc1(x))
        x = nn.LeakyReLU(0.1)(x)
        x = self.fc2(x)
        #x = F.relu(self.fc2(x))
        x = nn.LeakyReLU(0.1)(x)
        x = torch.reshape(x, (np.shape(x)[0], -1, 125, 125))
        ## Upsampling
        x = self.us(x)
        ## Five convolution layers
        x = nn.LeakyReLU(0.1)( self.conv1(x) )
        x = nn.LeakyReLU(0.1)( self.conv2(x) )
        x = nn.LeakyReLU(0.1)( self.conv3(x) )
        x = nn.LeakyReLU(0.1)( self.conv4(x) )
        x = self.conv5(x)
        x = x.view(np.shape(x)[0], -1)
        #x = self.conv1(x)
        return x

# Generate the model
torch.backends.cudnn.enabled = False
net = Net().double()

# Batch size too small, no parallelization, only 1 GPU
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Use GPU")
else:
    device = torch.device("cpu")
    print("Use CPU")

x = torch.randn(64*100, 2, 3, 64, 64).double()
y = torch.randn(64*100, 62500).double()
dataset = TensorDataset(x, y)
trainloader = DataLoader(dataset, batch_size=64)
criterion = nn.MSELoss()

# Use all 3 GPUs
#if torch.cuda.device_count() > 1:
#    print("Let's use", torch.cuda.device_count(), "GPUs!")
#    net = nn.DataParallel(net)

# Put the model on the selected device
net.to(device)

# Train the network
nepoch = 4
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
for epoch in range(nepoch):
    print('Epoch: %d\n' % (epoch))
    for i, data in enumerate(trainloader, 0):
        #print(i)
        inputs, labels = data[0].to(device), data[1].to(device)
        # Zero the gradients
        optimizer.zero_grad()
        # Forward, backward (compute gradients), optimize (apply gradients to weights)
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Print loss information
        print(loss.item())
I have searched for solutions, and one explanation says that because only a single GPU is installed in the workstation, Windows will force CUDA to quit under heavy computational load via the TdrDelay timeout, whose default value is 2 seconds. I tried extending the timeout to 90 s and even 200 s, but the same error still appears.
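For reference, a minimal sketch for reading back the TdrDelay value (assuming the standard GraphicsDrivers registry key; changing it needs administrator rights and a reboot to take effect):

import winreg

# Read the TDR delay Windows uses before resetting a "hung" GPU kernel
key_path = r"SYSTEM\CurrentControlSet\Control\GraphicsDrivers"
with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, key_path) as key:
    try:
        value, _ = winreg.QueryValueEx(key, "TdrDelay")
        print("TdrDelay =", value, "seconds")
    except FileNotFoundError:
        # Value absent means the Windows default (2 seconds) is in effect
        print("TdrDelay not set; default of 2 seconds applies")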
If I don't set
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
PyTorch instead reports an illegal memory access error. Note that with the same single GPU on Ubuntu, no error is reported at all. So I would like to know what causes the problem and how to solve it. Thanks!
PS: My NVIDIA driver version is 522.06, which was installed along with CUDA 11.8. My torch version is 1.13.0+cu116.
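In case it helps, this is a quick sketch for confirming the build/driver combination (torch.version.cuda reports the CUDA version the wheel was built against, not the driver's):

import torch

# What the installed wheel was built against vs. what the machine has
print(torch.__version__)                     # e.g. 1.13.0+cu116
print(torch.version.cuda)                    # CUDA toolkit version of the wheel build
print(torch.backends.cudnn.version())        # bundled cuDNN version
print(torch.cuda.get_device_name(0))         # e.g. NVIDIA RTX A6000
print(torch.cuda.get_device_capability(0))   # (8, 6) on Ampere cards like the A6000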