Inplace operation error_

raesol · May 22, 2022, 1:52pm

i was trying to train the yolo v1 but, i keept finding the following error.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 4096]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
how can the error be solved? thanks in advance

Training & Testing.

import time

model = model.to(device)

train_iter = 0

test_iter = 0

start_time = time.time()

for epoch in range(1, max_epoch):

# print('\n')

train_total_loss = 0.0

total_batch = 0

# print('Starting epoch {} / {}'.format(epoch, num_epochs))

# Learning rate scheduling

if epoch in [50, 150]:

    lr *= 0.1

    for param_group in optimizer.param_groups:

        param_group['lr'] = lr

if epoch < last_epoch:

    continue

model.train()

_start_time = time.time()

for x, y in train_dloader:

    # P3.1. implement training pipeline here

    #load data by batches

    train_iter +=1

    b_size = x.size(0)

    imgs = x.to(device)

    labels = y.to(device)

    #forward pass to compute loss

    predictions = model(imgs)

    loss_xy, loss_wh, loss_obj, loss_noobj, loss_class = compute_loss.forward(predictions,labels) #how do i compute a loss in here??

    loss = (lambda_coord*loss_xy + lambda_coord*loss_wh + loss_obj + lambda_noobj*loss_noobj + loss_class)/float(batch_size)

    train_total_loss+=loss.item()*b_size

    total_batch += b_size

    #backward to update weights of the model

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

time_taken = time.time()-_start_time

print(f'Epoch {epoch}/{max_epoch} || train loss={train_total_loss/float(total_batch):.4f} % time={time_taken:.3f} secs')

_start_time = time.time()

model.eval()

test_total_batch = 0

test_total_loss = 0.0

with torch.no_grad():

    for x, y in test_dloader:

      # P3.2. implement testing pipeline here

      #load data by batches

      b_size = x.size(0)

      imgs = x.to(device)

      labels = y.to(device)

      #forward pass to compute loss

      predictions = model(imgs)

      loss = compute_loss.forward(predictions,labels)

      test_total_loss += loss.item()*b_size()

      test_total_batch += b_size

      #backward to update weights of the model

      optimizer.zero_grad()

      loss.backward()

      optimizer.step()

test_time = time.time()-_start_time

print(f'Epoch {epoch}/{max_epoch} || test loss={test_total_loss/float(test_total_batch):.4f} % time={test_time:.3f} secs')

ckpt = {'model':model.state_dict(),

        'optimizer':optimizer.state_dict(),

        'epoch':epoch}

torch.save(ckpt, ckpt_path)

total_t = time.time()-start_time

print(f’total_time taken={total_t:.3f} secs’)

AlphaBetaGamma96 · May 22, 2022, 2:55pm

Have you set inplace=True for your ReLU activation functions in your network? If so, set them to False. You can’t backprop through an in-place operation.

raesol · May 22, 2022, 7:55pm

yes, i have set in_place to true in relu. I did try setting in_place to false , however, i was getting the same error

AlphaBetaGamma96 · May 22, 2022, 8:08pm

Would you mind sharing the model?

raesol · May 23, 2022, 1:04am

the model is the following
class Yolo(nn.Module):

def __init__(self, grid_size, num_boxes, num_classes):

    super(Yolo, self).__init__()

    self.S = grid_size

    self.B = num_boxes

    self.C = num_classes

    self.features = nn.Sequential(

        # P1.1. implement VGG16 backbone network here.

        nn.Conv2d(3,64,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.Conv2d(64,64,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.MaxPool2d(kernel_size=2,stride=2,padding=0,dilation=1,ceil_mode = False),

        nn.Conv2d(64,128,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.Conv2d(128,128,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.MaxPool2d(kernel_size=2,stride=2,padding=0,dilation=1,ceil_mode = False),

        nn.Conv2d(128,256,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.Conv2d(256,256,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.Conv2d(256,256,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.MaxPool2d(kernel_size=2,stride=2,padding=0,dilation=1,ceil_mode = False),

        nn.Conv2d(256,512,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.Conv2d(512,512,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.Conv2d(512,512,3,1,1),

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.MaxPool2d(kernel_size=2,stride=2,padding=0,dilation=1,ceil_mode = False),

        nn.Conv2d(512,512,3,1,1),

        # nn.ReLU(inplace = True),

        nn.ReLU(inplace=False),

        nn.Conv2d(512,512,3,1,1),

        # nn.ReLU(inplace = True),

        nn.ReLU(inplace=False),

        nn.Conv2d(512,512,3,1,1),

        # nn.ReLU(inplace = True),

        nn.ReLU(inplace=False),

        nn.MaxPool2d(kernel_size=2,stride=2,padding=0,dilation=1,ceil_mode = False),

    )

    self.detector = nn.Sequential(

        # P1.2. implement detection head here.

        nn.Linear(25088,4096,bias=True), #how do i know the in_feature??

        # nn.ReLU(inplace=True),

        nn.ReLU(inplace=False),

        nn.Dropout(p=0.5,inplace=True),

        nn.Linear(4096,1470,bias=True)

    )

def forward(self, x):

    x = self.features(x)

    x = x.view(x.size(0), -1)

    x = self.detector(x)

    x = F.sigmoid(x)

    x = x.view(-1, self.S, self.S, self.B*5+self.C)

    return x

model = Yolo(grid_size, num_boxes, num_classes)

model = model.to(device)

pretrained_weights = torch.load(pretrained_backbone_path)

model.load_state_dict(pretrained_weights)
and the follwing function shows the loss function used in the training process
class Loss(nn.Module):

def __init__(self, grid_size=7, num_bboxes=2, num_classes=20):

    """ Loss module for Yolo v1.

    Use grid_size, num_bboxes, num_classes information if necessary.

    Args:

        grid_size: (int) size of input grid.

        num_bboxes: (int) number of bboxes per each cell.

        num_classes: (int) number of the object classes.

    """

    super(Loss, self).__init__()

    self.S = grid_size

    self.B = num_bboxes

    self.C = num_classes

def compute_iou(self, bbox1, bbox2):

    """ Compute the IoU (Intersection over Union) of two set of bboxes, each bbox format: [x1, y1, x2, y2].

    Use this function if necessary.

    Args:

        bbox1: (Tensor) bounding bboxes, sized [N, 4].

        bbox2: (Tensor) bounding bboxes, sized [M, 4].

    Returns:

        (Tensor) IoU, sized [N, M].

    """

    N = bbox1.size(0)

    M = bbox2.size(0)

    # Compute left-top coordinate of the intersections

    lt = torch.max(

        bbox1[:, :2].unsqueeze(1).expand(N, M, 2), # [N, 2] -> [N, 1, 2] -> [N, M, 2]

        bbox2[:, :2].unsqueeze(0).expand(N, M, 2)  # [M, 2] -> [1, M, 2] -> [N, M, 2]

    )

    # Conpute right-bottom coordinate of the intersections

    rb = torch.min(

        bbox1[:, 2:].unsqueeze(1).expand(N, M, 2), # [N, 2] -> [N, 1, 2] -> [N, M, 2]

        bbox2[:, 2:].unsqueeze(0).expand(N, M, 2)  # [M, 2] -> [1, M, 2] -> [N, M, 2]

    )

    # Compute area of the intersections from the coordinates

    wh = rb - lt   # width and height of the intersection, [N, M, 2]

    wh[wh < 0] = 0 # clip at 0

    inter = wh[:, :, 0] * wh[:, :, 1] # [N, M]

    # Compute area of the bboxes

    area1 = (bbox1[:, 2] - bbox1[:, 0]) * (bbox1[:, 3] - bbox1[:, 1]) # [N, ]

    area2 = (bbox2[:, 2] - bbox2[:, 0]) * (bbox2[:, 3] - bbox2[:, 1]) # [M, ]

    area1 = area1.unsqueeze(1).expand_as(inter) # [N, ] -> [N, 1] -> [N, M]

    area2 = area2.unsqueeze(0).expand_as(inter) # [M, ] -> [1, M] -> [N, M]

    # Compute IoU from the areas

    union = area1 + area2 - inter # [N, M, 2]

    iou = inter / union           # [N, M, 2]

    return iou

def forward(self, pred_tensor, target_tensor):

    """ Compute loss.

    Args:

        pred_tensor (Tensor): predictions, sized [batch_size, S, S, Bx5+C], 5=len([x, y, w, h, conf]).

        target_tensor (Tensor):  targets, sized [batch_size, S, S, Bx5+C].

    Returns:

        loss_xy (Tensor): localization loss for center positions (x, y) of bboxes.

        loss_wh (Tensor): localization loss for width, height of bboxes.

        loss_obj (Tensor): objectness loss.

        loss_noobj (Tensor): no-objectness loss.

        loss_class (Tensor): classification loss.

    """

    # P2. Write your code here

    grid = self.S

    num_box = self.B

    num_class = self.C

    N = 5*num_box + 5

    # get masks for cells which contain and those which don't contain objects

    no_obj = target_tensor[...,4]==0

    no_obj = no_obj.unsqueeze(3).expand_as(target_tensor)

    exists_obj = target_tensor[...,4]>0

    exists_obj = exists_obj.unsqueeze(3).expand_as(target_tensor)

    ########CELLS WITH NO_OBJECT################################

    # prediction tensors and target tensors on cells which don't contain objects

    no_obj_target = target_tensor[no_obj].view(-1,N)

    no_obj_prediction = pred_tensor[no_obj].view(-1,N)

    ########CELLS WITH OBJECT################################

    obj_prediction = pred_tensor[exists_obj].view(-1,N)

    obj_target = target_tensor[exists_obj].view(-1,N)

    bounding_box_prediction = obj_prediction[...,:5*num_box].contiguous().view(-1,5)

    # bbox_prediction = bounding_box_predition.contiguous().view(-1,5)

    bounding_box_target = obj_target[...,:5*num_box].contiguous().view(-1,5)

    # bbox_target = bounding_box_target.contiguous().view(-1,5)

    ############# USE IoU FOR PREDICTING THE BBOX FOR EACH CELL ###

           ### masks for coordinates

    obj_res = torch.cuda.ByteTensor(bounding_box_target.size()).fill_(0)

    obj_nres = torch.cuda.ByteTensor(bounding_box_target.size()).fill_(1)

    bounding_box_target_iou = torch.zeros(bounding_box_target.size()).to(device)

    for i in range(0,bounding_box_target.size(0),num_box):

      prediction_box = bounding_box_prediction[i:i+num_box]

      prediction_xyxy = Variable(torch.FloatTensor(prediction_box.size()))

      # to compute iou correctly lets change the structure(x,y,w,h)->(x1,y1,w,h) while normalizing them simultaneously

      prediction_xyxy[...,:2] = prediction_box[...,:2]/float(grid) - prediction_box[...,2:4]/2

      prediction_xyxy[...,2:4] = prediction_box[...,:2]/float(grid) + prediction_box[...,2:4]/2

      target_box = bounding_box_target[i]

      target_box = bounding_box_target[i].view(-1,5)

      target_xyxy = Variable(torch.FloatTensor(target_box.size()))

      # to compute iou correctly lets change the structure(x,y,w,h)->(x1,y1,w,h) while normalizing them simultaneously

      target_xyxy[...,:2] = target_box[...,:2]/float(grid) - target_box[...,2:4]/2

      target_xyxy[...,2:4] = target_box[...,:2]/float(grid) + target_box[...,2:4]/2

      #let's now calculate intersection over union

      inter_ovr_un = self.compute_iou(prediction_xyxy[...,:4],target_xyxy[...,:4])

      max_iou,max_index = inter_ovr_un.max(0)

      max_index = max_index.data.to(device)

      obj_res[i+max_index], obj_nres[i+max_index] = 1,0

      bounding_box_target_iou[i+max_index,torch.LongTensor([4]).to(device)] = max_iou.data.to(device)

    bounding_box_target_iou = Variable(bounding_box_target_iou).to(device)

    target_iou = bounding_box_target_iou[obj_res].view(-1,5)

    ################################################################

    ######      compute loss on cell with no objects ###############

    ################################################################

    # for no object confidence mask

    no_obj_confidence = torch.cuda.ByteTensor(no_obj_prediction.size()).fill_(0)

    #update the confidence value of the cells with no object for both bboxes

    for box in range(num_box):

      no_obj_confidence[...,4+box*5] = 1

    # then find the prediction and target confidence values

    no_obj_conf_prediction = no_obj_prediction[no_obj_confidence]

    no_obj_conf_target = no_obj_target[no_obj_confidence]

    loss_noobj = F.mse_loss(no_obj_conf_prediction,no_obj_conf_target,reduction="sum")

    ##########################################################################

    ################  FOR BOUNDING BOXES COORDNATES  #########################

    ##########################################################################

    bounding_box_pred_res = bounding_box_prediction[obj_res].view(-1,5)

    bounding_box_target_res = bounding_box_target[obj_res].view(-1,5)

    loss_xy = F.mse_loss(bounding_box_pred_res[...,:2],bounding_box_target_res[...,:2], reduction="sum")

    ##########################################################################

    ################# FOR BOUNDING BOXES WIDH AND HEIGHT    ##################

    ##########################################################################

    loss_wh = F.mse_loss(torch.sqrt(bounding_box_pred_res[...,2:4]),torch.sqrt(bounding_box_target_res[...,2:4]),reduction="sum")

    ##########################################################################

    ######################        FOR OBJECT LOSSES  #########################

    ##########################################################################

    loss_obj = F.mse_loss(bounding_box_pred_res[...,4],bounding_box_target_res[...,4],reduction="sum")

    ##########################################################################

    ################# FOR CLASS LOSS(THOSE WITH OBJECTS) #####################

    ##########################################################################

    prediction_of_class = obj_prediction[...,5*num_box:]

    target_of_class = obj_target[...,5*num_box:]

    loss_class = F.mse_loss(prediction_of_class,target_of_class,reduction="sum")

    return loss_xy, loss_wh, loss_obj, loss_noobj, loss_class

compute_loss = Loss(grid_size, num_boxes, num_classes)

AlphaBetaGamma96 · May 23, 2022, 1:08am

Make sure to run your code with torch.autograd.set_detect_anomaly(True) to highlight more about the issue. Try replacing the highlighted line above with wh = torch.clamp(wh, min=0)

raesol · May 23, 2022, 1:23am

i tried your suggestion and got the following error.

AlphaBetaGamma96 · May 23, 2022, 11:02am

When you re-wrap Tensors like this

You break your computation graph, so replace those. The same goes for BtyeTensor too, and also there’s no need to use Variable() as it’s deprecated.

Have you ran your code within torch.autograd.set_detect_anomaly(True) decorator?

raesol · May 23, 2022, 11:38am

thank you so much for following up and helping me in giving insights in the issue. The problem is now gone. there was just a simple problem in the model i used. i used in_place true for both ReLu and Dropout. however, once i changed the in_place in Dropout to false keeping the in_place to true incase of ReLu, the whole problem disappread.

AlphaBetaGamma96 · May 23, 2022, 11:41am

Not a problem. A general rule of thumb is to not use inplace=True anywhere really. I’d advise you set all inplace flags to False.