Cuda Run Time need help diagnosing the Error when ran on CPU

StrafeNDestroy · July 27, 2022, 7:05pm

Hello I was editing my DataSet Class so it could pick up multiple bounding boxes and labels when it collects data. All I did was add a for loop to the getitem to extract all the boxes and got this error when running it on GPU

RuntimeError: CUDA error: device-side assert triggered"

I was told to run the model on a CPU rather than GPU and got this error message

/usr/local/lib/python3.7/dist-packages/torch/__init__.py in _assert(condition, message)
    831     if type(condition) is not torch.Tensor and has_torch_function((condition,)):
    832         return handle_torch_function(_assert, (condition,), condition, message)
--> 833     assert condition, message
    834 
    835 ################################################################################

AssertionError: Expected target boxes to be of type Tensor, got <class 'builtin_function_or_method'>.

Which leaves me confused as when I check the output at index 9 in my dataset class it says the boxes is a tensor.

This is my output when indexing the dataset.

(tensor([[[0.7961, 0.7922, 0.7843,  ..., 0.5608, 0.5608, 0.5608],
          [0.8039, 0.8000, 0.7922,  ..., 0.5529, 0.5529, 0.5529],
          [0.8235, 0.8235, 0.8157,  ..., 0.5294, 0.5294, 0.5294],
          ...,
          [0.5255, 0.5255, 0.5255,  ..., 0.4431, 0.4392, 0.4392],
          [0.5137, 0.5137, 0.5137,  ..., 0.4392, 0.4353, 0.4353],
          [0.5098, 0.5098, 0.5098,  ..., 0.4392, 0.4353, 0.4353]],
 
         [[0.7843, 0.7804, 0.7725,  ..., 0.5333, 0.5333, 0.5333],
          [0.7922, 0.7882, 0.7804,  ..., 0.5255, 0.5255, 0.5255],
          [0.8118, 0.8118, 0.8039,  ..., 0.5020, 0.5020, 0.5020],
          ...,
          [0.4980, 0.4980, 0.4980,  ..., 0.4471, 0.4431, 0.4431],
          [0.4863, 0.4863, 0.4863,  ..., 0.4431, 0.4392, 0.4392],
          [0.4824, 0.4824, 0.4824,  ..., 0.4431, 0.4392, 0.4392]],
 
         [[0.7176, 0.7137, 0.7059,  ..., 0.5020, 0.5020, 0.5020],
          [0.7255, 0.7216, 0.7137,  ..., 0.4941, 0.4941, 0.4941],
          [0.7451, 0.7451, 0.7373,  ..., 0.4706, 0.4706, 0.4706],
          ...,
          [0.4235, 0.4235, 0.4235,  ..., 0.4235, 0.4196, 0.4196],
          [0.4118, 0.4118, 0.4118,  ..., 0.4196, 0.4157, 0.4157],
          [0.4078, 0.4078, 0.4078,  ..., 0.4196, 0.4157, 0.4157]]]),
 {'area': tensor(1006000),
  'boxes': tensor([[ 52,   7, 948, 999]]),
  'image_id': tensor(9),
  'iscrowd': tensor([0]),
  'labels': tensor([1])})

Here is my Dataset Class Breakdown

# Create Custom DataSet
class ObjectDetectionDataset(torch.utils.data.Dataset):
  def __init__(self,phase:str,transforms=None):
    image_dir=f"/content/drive/My Drive/{phase} set/images"
    annotations_dir=f"/content/drive/My Drive/{phase} set/labels"
    self.image_dir = Path(image_dir) # image directory path
    self.annotations_dir = Path(annotations_dir) # annotations directory path
    # self.transforms = transforms # Transforms 
  
    self.imgs_names = list(sorted(self.image_dir.iterdir()))  # create list of path objects to photos, sorted()
    self.imgs_names = [Path(self.image_dir/img_name)for img_name in self.imgs_names]
  
    self.annotation_names = list(sorted(self.annotations_dir.iterdir()))
    self.annotation_names = [Path(annotations_dir/ann_name)for ann_name in self.annotation_names]

  # What happens when __getitem__ is used on the object, example dataset_object[0]
  def __getitem__(self,index):
    # Grabing path to image at "index" and transforming to tensor between 0-1
    img_path = self.imgs_names[index]  # Getting Image path object at idx and display the image
    image_pic = Image.open(img_path)
    rows = image_pic.size[0]
    columns = image_pic.size[1]
    resize_amount = (1024,1024)
    image_resized = transforms.Resize(resize_amount)(image_pic)
    image = transforms.ToTensor()(image_resized)
    
    #----------------------------------------------------------------------------------------------------

    # Grabing path to bounding box at "index" and grabbing its contents 
    annotation_path = self.annotation_names[index]
    annotation_tree = ET.parse(annotation_path)
    bounding_box = []
    for element in annotation_tree.findall("object"):
      bound_box_obj = element.find("bndbox")    
      resize_ratio_rows = resize_amount[0]/ rows
      resize_ratio_columns = resize_amount[1]/columns
      x_max = int(bound_box_obj.find('xmax').text) 
      x_min = int(bound_box_obj.find('xmin').text)
      y_max = int(bound_box_obj.find('ymax').text)
      y_min = int(bound_box_obj.find('ymin').text)
      x2 = round(x_max * resize_ratio_rows)
      x1 = round(x_min * resize_ratio_rows)
      y2 = round(y_max * resize_ratio_columns)
      y1 = round(y_min * resize_ratio_columns)
      blist = [x1,y1,x2,y2]
      bounding_box.append(blist)
      #-------------------------------------------
    bounding_box = torch.tensor(bounding_box)
  



   #----------------------------------------------------------------------------------------------------

   # Getting Label
    # label_list =["Truck","Car","Jeep"]
    label_list =["background","raccoon"]
    annotation_path = self.annotation_names[index]
    annotation_tree = ET.parse(annotation_path)
    label_name = annotation_tree.find("object").find("name").text
    
    if label_name in label_list:
      label = (label_list.index(label_name)) 
      label = torch.tensor([label],dtype=torch.int64)
  #----------------------------------------------------------------------------------------------------

  # Calculating Area
    area = torch.tensor((x1+x2)*(y1+y2))
  #----------------------------------------------------------------------------------------------------
   
  # Creating Image_Ids
    image_id = torch.tensor(index)

  #----------------------------------------------------------------------------------------------------
  # Setting "iscrowd to zero"
    iscrowd = torch.zeros(1,dtype=torch.int64)
  #----------------------------------------------------------------------------------------------------

  # Creating Tagets Dictionary 
    target = {}
    target["boxes"] = bounding_box
    target["labels"] = label
    target["image_id"] = image_id
    target["area"] = area
    target["iscrowd"] = iscrowd      

    return image,target

  def __len__(self):
    return len(self.imgs_names)

This is where the error occurred in training

lossHist = LossAverager()
valLossHist = LossAverager()
column_names = ["Epoch","Train_loss","Valid_loss","Error_rate","Duration"]
df  = pd.DataFrame(columns = column_names)
for epoch in tqdm(range(1,EPOCHS)):
    
    start_time = time()
    model.train() # setting out model to train
    lossHist.reset()  # resets our values,averages,sums,counts in lossHist
    for images, targets in dl_test:
        images = torch.stack(images).cpu 
        targets = [{k: v.cpu for k, v in t.items()} for t in targets] # sending targets to the GPU  
        bs = BATCH_SIZE
  -->   loss_dict = model(images, targets) # passing our model a single batch of images with repective targets       
        totalLoss = sum(loss for loss in loss_dict.values()) # adds up all the losses from the models output
        lossValue = totalLoss.item() # Converts tensor loss to interger Loss        
        lossHist.update(lossValue,bs) 
        optimizer.zero_grad() # zero outs any previous gradietns from our training
        totalLoss.backward() # finds the derivative of the total loss 
        optimizer.step() # optimizer takes a step based on derivative
    if lr_scheduler is not None:
      lr_scheduler.step(totalLoss)

    df = df.append({"Epoch":epoch,"Train_loss":lossHist.avg,"Duration":str(datetime.timedelta(seconds = time() - start_time))[2:7],"Valid_loss":valLossHist.avg},ignore_index=True)
torch.save(model.state_dict(), r'model_raccoonsv2.pth')
df

ptrblck · July 28, 2022, 5:37am

This line of code:

images = torch.stack(images).cpu

is wrong as you would need to call .cpu(). Otherwise you are passing the cpu function to the model as given in the error message.

StrafeNDestroy · July 31, 2022, 5:26pm

Wow thank that was really simple thanks.