I GET RuntimeError: CUDA error: an illegal memory access was encountered

Hello everyone,
I hope someone can help me.
My problem is that I'm using the k-fold method for cross-validation, which means I have 5 iterations, each fitting the model on a training set and evaluating it on a testing set.
I wrote a loop over my folds. When I only do the training it works normally, but when I do the validation afterwards I get this error: RuntimeError: CUDA error: an illegal memory access was encountered.
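
For context, XTrain/XTest/YTrain/YTest in the code below hold the data of the 5 folds (XTrain[l] is the training part of fold l). Roughly, such a split can be built along these lines; this is only an illustrative sketch (not my exact split code), and gt_ed here is a stand-in name for the list of ground-truth volumes:

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=0)
XTrain, XTest, YTrain, YTest = [], [], [], []
for train_idx, test_idx in kf.split(data_ed):
    # one entry per fold: 80% of the volumes for training, 20% for testing
    XTrain.append([data_ed[i] for i in train_idx])
    YTrain.append([gt_ed[i] for i in train_idx])
    XTest.append([data_ed[i] for i in test_idx])
    YTest.append([gt_ed[i] for i in test_idx])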
Note that I already tried the following check:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Additional info when using CUDA
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3, 1), 'GB')

and I get:

Using device: cuda

GeForce GTX 1080 Ti
Memory Usage:
Allocated: 0.4 GB
Cached: 1.9 GB

Here is my code:

# imports needed by this snippet (UNet, init_weights, Preprocessing, resize_image,
# resize_gt, Resize, crop and the per-fold data are defined elsewhere)
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

for l in range(0,len(XTrain)):

    loss=0.0
    loss_test=0.0

    # a fresh network for every fold
    Net=UNet(1,4)
    Net.apply(init_weights)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # torch.backends.cudnn.benchmark = True
    # torch.cuda.set_device(0)
    Net.to(device)
    size=[]
    size_test=[]
    taille=[]
    x={}
    y={}
    z={}
    x_test={}
    y_test={}
    z_test={}
    data_train_norm=[]
    data_test_norm=[]
    gt_train_norm=[]
    gt_test_norm=[]

    # preprocess the training and testing volumes of fold l
    for k in range(0,int(0.8*len(data_ed))):
        data_train_norm.append(Preprocessing(XTrain[l][k]))
        gt_train_norm.append((YTrain[l][k]))
        size.append(data_train_norm[k].shape)
        x[k],y[k],z[k]=size[k]

    for k in range(0,int(0.2*len(data_ed))):
        data_test_norm.append(Preprocessing(XTest[l][k]))
        gt_test_norm.append((YTest[l][k]))
        size_test.append(data_test_norm[k].shape)
        x_test[k],y_test[k],z_test[k]=size_test[k]

    
    
    image_resized_train=[]
    image_resized_test=[]
    image_resized_train_gt=[]
    image_resized_test_gt=[]

    for i in range(0,int(0.8*len(data_ed))):
        for j in range(z[i]):
            image_resized_train.append(resize_image(data_train_norm[i][:,:,j],H_train[l][i][1],1.25))
            image_resized_train_gt.append(resize_gt(gt_train_norm[i][:,:,j],H_train[l][i][1],1.25))

    for i in range(0,int(0.2*len(data_ed))):
        for j in range(z_test[i]):
            image_resized_test.append(resize_image(data_test_norm[i][:,:,j],H_test[l][i][1],1.25))
            image_resized_test_gt.append(resize_gt(gt_test_norm[i][:,:,j],H_test[l][i][1],1.25))

    final_train=[]
    final_test=[]
    final_train_gt=[]
    final_test_gt=[]
    img_train=[]
    img_test=[]
    img_train_gt=[]
    img_test_gt=[]
    width=557
    height=667
    data_tensor_test=[]
    data_tensor_train=[]
    gt_tensor_test=[]
    gt_tensor_train=[]

    # resize to a common size, crop to 256x256 and convert to tensors
    for i in range(0,len(image_resized_train)):
        img_train.append(Resize(image_resized_train[i],width,height))
        final_train.append(crop(img_train[i],256,256))
        data_tensor_train.append(torch.from_numpy(np.array(final_train[i])).float())
        # ground truth (train)
        img_train_gt.append(Resize(image_resized_train_gt[i],width,height))
        final_train_gt.append(crop(img_train_gt[i],256,256))
        gt_tensor_train.append(torch.from_numpy(np.array(final_train_gt[i])).float())

    for i in range(0,len(image_resized_test)):
        img_test.append(Resize(image_resized_test[i],width,height))
        final_test.append(crop(img_test[i],256,256))
        data_tensor_test.append(torch.from_numpy(np.array(final_test[i])).float())
        # ground truth (test)
        img_test_gt.append(Resize(image_resized_test_gt[i],width,height))
        final_test_gt.append(crop(img_test_gt[i],256,256))
        gt_tensor_test.append(torch.from_numpy(np.array(final_test_gt[i])).float())

    X_train=torch.stack(data_tensor_train)
    X_test=torch.stack(data_tensor_test)
    Y_train=torch.stack(gt_tensor_train)
    Y_test=torch.stack(gt_tensor_test)
    print(l,X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

    class LoadDataset_train(Dataset):

        def __init__(self):
            # load data
            self.X=X_train
            self.Y=Y_train
            self.len=X_train.shape[0]

        def __len__(self):
            # dataset's size
            return self.len

        def __getitem__(self, idx):
            X,Y = self.X[idx], self.Y[idx]
            # add the channel dimension
            X = X.view(1,X.shape[0],X.shape[1])
            sample=X,Y
            return sample

    class LoadDataset_test(Dataset):

        def __init__(self):
            # load data
            self.X=X_test
            self.Y=Y_test
            self.len=X_test.shape[0]

        def __len__(self):
            # dataset's size
            return self.len

        def __getitem__(self, idx):
            X,Y = self.X[idx], self.Y[idx]
            # add the channel dimension
            X = X.view(1,X.shape[0],X.shape[1])
            sample=X,Y
            return sample

    dataset=LoadDataset_train()
    trainloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=4, shuffle=True, num_workers=1)
    dataset_test=LoadDataset_test()
    trainloader_test = torch.utils.data.DataLoader(dataset=dataset_test, batch_size=1, shuffle=True, num_workers=1)


    # print(trainloader)

    learning_rate=10e-3
    # class weights used in the loss function
    weights = torch.tensor([0.07,0.31,0.31,0.31]).to(device)
    # define the loss function
    criterion = nn.CrossEntropyLoss(weights)
    # define the Adam optimizer
    optimizer=optim.Adam(Net.parameters(), lr=learning_rate, weight_decay=10e-8)

    n_epochs=10
    loss_values=[]
    loss_values_test=[]

    for epoch in range(n_epochs):
        Net.train()
        running_loss = 0.0

        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device).long()

            optimizer.zero_grad()
            outputs=Net(inputs)
            loss=criterion(outputs,labels)
            running_loss+=loss.item()
            loss.backward()
            optimizer.step()

            print(i)

        loss_values.append(running_loss/len(trainloader))

        running_loss_test=0.0

        Net.eval()
        with torch.no_grad():
            for j, data_test in enumerate(trainloader_test,0):
                inputs_test, labels_test = data_test[0].to(device), data_test[1].to(device).long()
                outputs_test=Net(inputs_test)
                loss_test=criterion(outputs_test,labels_test)
                running_loss_test+=loss_test.item()

        loss_values_test.append(running_loss_test/len(trainloader_test))

THANK YOU !!!

Could you check whether the labels contain only values in the range [0, nb_classes-1]?
If you are using PyTorch 1.5.0, could you update to 1.5.1? We had a bug in 1.5.0 that skipped valid assertions inside some CUDA methods and instead crashed with these unfriendly CUDA errors.
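
A minimal sketch of such a check (using trainloader and nb_classes=4 from your code above):

nb_classes = 4
for _, labels in trainloader:
    labels = labels.long()
    assert labels.min() >= 0 and labels.max() <= nb_classes - 1, \
        "labels out of range: min={}, max={}".format(labels.min().item(), labels.max().item())

print(torch.__version__)  # should show 1.5.1 or newer after the update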


Thank you very much for the reply :slight_smile:
Indeed, it was because of the label values; I wasn't using the real labels…!
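Concretely, the labels need to be class indices in [0, 3] for nn.CrossEntropyLoss. A rough sketch of how the raw ground-truth values can be mapped to class indices (the raw values {0, 85, 170, 255} are only an example and have to match the actual mask encoding):

value_to_class = {0: 0, 85: 1, 170: 2, 255: 3}

def to_class_indices(mask):
    # mask: tensor with raw ground-truth values; returns class indices 0..3
    out = torch.zeros_like(mask, dtype=torch.long)
    for raw, cls in value_to_class.items():
        out[mask == raw] = cls
    return out
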
After training and validation I got these losses using k-fold cross-validation (5 iterations); the BLUE curve is the training loss and the ORANGE curve is the testing loss:

[Loss curves for group 1 through group 5: training (blue) vs. testing (orange)]

Do you think what I'm getting is good?
Thank you!

The loss curves look reasonable.
Whether they are "good" depends a bit on your use case and on whether the validation and training accuracy are in the range you want them to be. :wink:
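
If you want a single number to compare across folds, a minimal sketch of the pixel accuracy on the test loader (assuming outputs of shape [batch, 4, H, W] and labels of shape [batch, H, W], as in your code):

correct, total = 0, 0
Net.eval()
with torch.no_grad():
    for inputs, labels in trainloader_test:
        inputs, labels = inputs.to(device), labels.to(device).long()
        preds = Net(inputs).argmax(dim=1)  # predicted class per pixel, [batch, H, W]
        correct += (preds == labels).sum().item()
        total += labels.numel()
print('pixel accuracy:', correct / total)

For segmentation, a per-class Dice score is usually more informative than plain pixel accuracy.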


Thank you very much @ptrblck, it's so kind of you. I hope you have a great day!!! :pray: :smiley: