Dimension out of range (expected to be in range of [-3, 2], but got 3)

Hi, I am trying to train an instance segmentation model with PyTorch. I am using torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True). I have 143 classes which are not part of the categories the model was pretrained on. I created my dataset class, which works properly: it returns the image and the target dict. Below is the dataset class.
import os
import json

import numpy as np
import torch
from PIL import Image


class clothDataset(torch.utils.data.Dataset):

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "image"))))
        self.annos = list(sorted(os.listdir(os.path.join(root, "annos"))))

    def extract_detectors(self, file_path):
        # parse the json and keep only the detection-object entries
        with open(file_path) as json_file:
            data = json.load(json_file)
        return [dict(data[detection_object]) for detection_object in data if type(data[detection_object]) == dict]

    def __getitem__(self, idx):
        annotation_path = os.path.join(self.root, 'annos')
        all_objects = self.extract_detectors(os.path.join(annotation_path, os.listdir(annotation_path)[idx]))

        # load images and masks
        num_objs = len(all_objects)

        img_path = os.path.join(self.root, "image", self.imgs[idx])
        masks = [detector['segmentation'][0] for detector in all_objects]
        boxes = [detector['bounding_box'] for detector in all_objects]
        labels = [detector['category_id'] for detector in all_objects]
        image_id = torch.tensor([idx])
        img = Image.open(img_path).convert("RGB")

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        if len(masks) > 1:
            # pad the polygon coordinate lists to the same length so they can be stacked
            max_len = max([len(x) for x in masks])
            masks = [np.pad(x, (0, max_len - len(x)), 'constant') for x in masks]
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        return len(self.imgs)

Any help at this point will be appreciated!

Based on the code snippet, I guess that boxes[:, 3] might fail.
Could you check the shape of this tensor before indexing it via:

print(boxes.shape)

and make sure dim1 has a size of 4?
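
Even if the shape is correct for one sample, it could still be wrong for another one. As a quick sanity check (just a sketch, assuming your dataset instance is called dataset and returns the target dict shown above), you could iterate over all samples once:

for i in range(len(dataset)):
    img, target = dataset[i]
    boxes, masks, labels = target["boxes"], target["masks"], target["labels"]
    # boxes should always be a [num_objs, 4] float tensor
    assert boxes.dim() == 2 and boxes.size(1) == 4, f"sample {i}: boxes has shape {boxes.shape}"
    # boxes, masks and labels should agree on the number of objects
    assert boxes.size(0) == masks.size(0) == labels.size(0), f"sample {i}: size mismatch"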

Hi ptrblck,

Thank you so much for replying. I ran the print code and the result was torch.Size([2, 4]).

How do I make the size of dim1 equal to 4 without side effects? Am I to pad it with zeros or ones? If I make dim1 = 4, will that not mean that there are four boxes when there are actually 2 in this example?

The printed size is alright, so the error might be raised in another line of code or the boxes tensor is smaller in some iterations.

Could you post the complete stack trace, please?

OK.

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py:3000: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and uses scale_factor directly, instead of relying on the computed output size. If you wish to keep the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details. 
  warnings.warn("The default behavior for interpolate/upsample with float scale_factor changed "
/usr/local/lib/python3.6/dist-packages/torchvision/ops/boxes.py:101: UserWarning: This overload of nonzero is deprecated:
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  keep = keep.nonzero().squeeze(1)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-14-2c80030da6ea> in <module>()
      4 for epoch in range(num_epochs):
      5     # train for one epoch, printing every 10 iterations
----> 6     train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
      7     # update the learning rate
      8     lr_scheduler.step()

8 frames
/content/engine.py in train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq)
     28         targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
     29 
---> 30         loss_dict = model(images, targets)
     31 
     32         losses = sum(loss for loss in loss_dict.values())

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/generalized_rcnn.py in forward(self, images, targets)
     97             features = OrderedDict([('0', features)])
     98         proposals, proposal_losses = self.rpn(images, features, targets)
---> 99         detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
    100         detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
    101 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/roi_heads.py in forward(self, features, proposals, image_shapes, targets)
    809                 rcnn_loss_mask = maskrcnn_loss(
    810                     mask_logits, mask_proposals,
--> 811                     gt_masks, gt_labels, pos_matched_idxs)
    812                 loss_mask = {
    813                     "loss_mask": rcnn_loss_mask

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/roi_heads.py in maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs)
    115     mask_targets = [
    116         project_masks_on_boxes(m, p, i, discretization_size)
--> 117         for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
    118     ]
    119 

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/roi_heads.py in <listcomp>(.0)
    115     mask_targets = [
    116         project_masks_on_boxes(m, p, i, discretization_size)
--> 117         for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
    118     ]
    119 

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/roi_heads.py in project_masks_on_boxes(gt_masks, boxes, matched_idxs, M)
     96     rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
     97     gt_masks = gt_masks[:, None].to(rois)
---> 98     return roi_align(gt_masks, rois, (M, M), 1.)[:, 0]
     99 
    100 

/usr/local/lib/python3.6/dist-packages/torchvision/ops/roi_align.py in roi_align(input, boxes, output_size, spatial_scale, sampling_ratio, aligned)
     43     return torch.ops.torchvision.roi_align(input, rois, spatial_scale,
     44                                            output_size[0], output_size[1],
---> 45                                            sampling_ratio, aligned)
     46 
     47 

IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)

Thanks for the stack trace. I don't know which operation fails exactly in roi_align, but I guess that the target tensor might contain invalid indices.
Could you post the images and targets as well as the model for the failed operation so that we could try to reproduce and debug it?
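
One way to grab the exact tensors for the failing iteration (just a sketch; the file name is arbitrary) would be to wrap the forward pass in train_one_epoch and save the inputs when it raises:

try:
    loss_dict = model(images, targets)
except Exception:
    # dump the batch that triggers the error so it can be reloaded and inspected offline
    torch.save({"images": images, "targets": targets}, "failing_batch.pt")
    raise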

Thank you ptrblck.

This is the model

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256)
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(256)
          )
        )
        (1): Bottleneck(
          (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256)
          (relu): ReLU(inplace=True)
        )
        (2): Bottleneck(
          (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256)
          (relu): ReLU(inplace=True)
        )
      )
      (layer2): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(128)
          (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(128)
          (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(512)
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
            (1): FrozenBatchNorm2d(512)
          )
        )
        (1): Bottleneck(
          (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(128)
          (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(128)
          (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(512)
          (relu): ReLU(inplace=True)
        )
        (2): Bottleneck(
          (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(128)
          (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(128)
          (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(512)
          (relu): ReLU(inplace=True)
        )
        (3): Bottleneck(
          (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(128)
          (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(128)
          (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(512)
          (relu): ReLU(inplace=True)
        )
      )
      (layer3): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(256)
          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(256)
          (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(1024)
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
            (1): FrozenBatchNorm2d(1024)
          )
        )
        (1): Bottleneck(
          (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(256)
          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(256)
          (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(1024)
          (relu): ReLU(inplace=True)
        )
        (2): Bottleneck(
          (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(256)
          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(256)
          (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(1024)
          (relu): ReLU(inplace=True)
        )
        (3): Bottleneck(
          (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(256)
          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(256)
          (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(1024)
          (relu): ReLU(inplace=True)
        )
        (4): Bottleneck(
          (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(256)
          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(256)
          (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(1024)
          (relu): ReLU(inplace=True)
        )
        (5): Bottleneck(
          (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(256)
          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(256)
          (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(1024)
          (relu): ReLU(inplace=True)
        )
      )
      (layer4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(512)
          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(512)
          (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(2048)
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
            (1): FrozenBatchNorm2d(2048)
          )
        )
        (1): Bottleneck(
          (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(512)
          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(512)
          (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(2048)
          (relu): ReLU(inplace=True)
        )
        (2): Bottleneck(
          (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(512)
          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(512)
          (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(2048)
          (relu): ReLU(inplace=True)
        )
      )
    )
    (fpn): FeaturePyramidNetwork(
      (inner_blocks): ModuleList(
        (0): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
        (2): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
        (3): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
      )
      (layer_blocks): ModuleList(
        (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (extra_blocks): LastLevelMaxPool()
    )
  )
  (rpn): RegionProposalNetwork(
    (anchor_generator): AnchorGenerator()
    (head): RPNHead(
      (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
      (bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (roi_heads): RoIHeads(
    (box_roi_pool): MultiScaleRoIAlign()
    (box_head): TwoMLPHead(
      (fc6): Linear(in_features=12544, out_features=1024, bias=True)
      (fc7): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (box_predictor): FastRCNNPredictor(
      (cls_score): Linear(in_features=1024, out_features=13, bias=True)
      (bbox_pred): Linear(in_features=1024, out_features=52, bias=True)
    )
    (mask_roi_pool): MultiScaleRoIAlign()
    (mask_head): MaskRCNNHeads(
      (mask_fcn1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu1): ReLU(inplace=True)
      (mask_fcn2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu2): ReLU(inplace=True)
      (mask_fcn3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu3): ReLU(inplace=True)
      (mask_fcn4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu4): ReLU(inplace=True)
    )
    (mask_predictor): MaskRCNNPredictor(
      (conv5_mask): ConvTranspose2d(256, 256, kernel_size=(2, 2), stride=(2, 2))
      (relu): ReLU(inplace=True)
      (mask_fcn_logits): Conv2d(256, 13, kernel_size=(1, 1), stride=(1, 1))
    )
  )
)

How would you like me to post the images and targets? Should I zip and upload here?

The shape, min and max values of both tensors could be enough.

OK, so that will be enough to troubleshoot, right?

That would be enough. Could you then post an executable code snippet I could use to reproduce the issue, please?
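
If the real data cannot be shared, a snippet using random tensors with the same shapes as your real samples would also work, e.g. (just a sketch, assuming 13 classes as in your model summary; adjust the shapes and values to mirror your data):

import torch
import torchvision

model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=False, num_classes=13)
model.train()

images = [torch.rand(3, 300, 400)]
targets = [{
    "boxes": torch.tensor([[10., 20., 100., 150.], [30., 40., 200., 250.]]),
    "labels": torch.tensor([1, 2]),
    "masks": torch.zeros((2, 300, 400), dtype=torch.uint8),  # one binary mask per object
    "image_id": torch.tensor([0]),
    "area": torch.tensor([11700., 35700.]),
    "iscrowd": torch.zeros((2,), dtype=torch.int64),
}]
loss_dict = model(images, targets)  # should return a dict of losses if the target format is valid
print(loss_dict)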

OK.

This is the code that runs the training.

from engine import train_one_epoch, evaluate

# '/content' is my Google Drive; I am using Google Colab with my drive mounted.
dataset = clothDataset('/content', get_transform(train=True))
dataset_test = clothDataset('/content', get_transform(train=False))

# split the dataset into a train and a test set
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)

learning_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                     step_size=3,
                                                     gamma=0.1)
# I want to train it for 10 epochs
for epoch in range(10):
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    learning_scheduler.step()

I suspect that the dataset may be contributing to the problem. Please let me know if you need any more information.

OK, but I don't know where to get the data from to run into the issue and debug it myself.
I think the best way forward would be if you could add:

print(images.min(), images.max(), images.shape)
print(targets.min(), targets.max(), targets.shape)

before calling loss_dict = model(images, targets) in train_one_epoch.
This should give us some information about potentially invalid inputs.
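
Note that images and targets are Python lists at this point in the reference engine.py, so if those calls fail directly, you could print the stats per element instead (a sketch):

for img, tgt in zip(images, targets):
    print(img.min(), img.max(), img.shape)
    for k, v in tgt.items():
        # skip min/max for empty tensors to avoid errors on samples without objects
        print(k, v.shape, v.min() if v.numel() > 0 else None, v.max() if v.numel() > 0 else None)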

Thank you. I did as you said and I am now getting an error. I have pasted the stack trace here.

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py:3000: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and uses scale_factor directly, instead of relying on the computed output size. If you wish to keep the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details. 
  warnings.warn("The default behavior for interpolate/upsample with float scale_factor changed "
/usr/local/lib/python3.6/dist-packages/torchvision/ops/boxes.py:101: UserWarning: This overload of nonzero is deprecated:
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  keep = keep.nonzero().squeeze(1)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-16-2c80030da6ea> in <module>()
      4 for epoch in range(num_epochs):
      5     # train for one epoch, printing every 10 iterations
----> 6     train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
      7     # update the learning rate
      8     lr_scheduler.step()

5 frames
/content/engine.py in train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq)
     28         targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
     29 
---> 30         print(images.min(), images.max(), images.shape)
     31         print(targets.min(), targets.max(), targets.shape)
     32 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/generalized_rcnn.py in forward(self, images, targets)
     97             features = OrderedDict([('0', features)])
     98         proposals, proposal_losses = self.rpn(images, features, targets)
---> 99         detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
    100         detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
    101 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/roi_heads.py in forward(self, features, proposals, image_shapes, targets)
    759             assert labels is not None and regression_targets is not None
    760             loss_classifier, loss_box_reg = fastrcnn_loss(
--> 761                 class_logits, box_regression, labels, regression_targets)
    762             losses = {
    763                 "loss_classifier": loss_classifier,

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/roi_heads.py in fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
     38     # the corresponding ground truth labels, to be used with
     39     # advanced indexing
---> 40     sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1)
     41     labels_pos = labels[sampled_pos_inds_subset]
     42     N, num_classes = class_logits.shape

RuntimeError: copy_if failed to synchronize: cudaErrorAssert: device-side assert triggered

Please what do I do next?

Did you get any outputs from the print statement?
Based on the stack trace it seems that a previous CUDA operation was failing already and the stack trace might point to the wrong line of code.
Could you rerun the code via CUDA_LAUNCH_BLOCKING=1 python script.py args and post the stack trace here?
Also, if you are using an older PyTorch version, could you update to the latest stable release or the nightly binary?
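
Since you seem to be running this in Colab rather than via python script.py, one way to get the same effect (just a sketch; the variable has to be set before the first CUDA call, ideally in the very first cell) is:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # forces synchronous CUDA calls so the stack trace points to the real failing op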

Hi ptrblck,

Thank you for your efforts. I later found that the problem was with the dataset I was using. I am good now. Thank you very much!

Hi GideonsMarch, may I know what the exact problem with your dataset was? I am also facing this problem now and I am not sure where it goes wrong.

I trained a Mask R-CNN model and my problem was that I had passed the whole mask as input, which gave me the same error. Therefore I used the following code to split each mask out of the binary image:

# Number of Bounding Boxes
obj_ids = np.array(list(range(len(boxes))))
# split the combined mask into one separate binary mask per object
masks = mask == obj_ids[:, None,None]
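
For reference, the torchvision instance segmentation tutorial derives the object ids from the mask itself and drops the background, roughly like this (a sketch, assuming a label-encoded mask image where 0 is background and each instance has its own pixel value; mask_path is a placeholder):

import numpy as np
from PIL import Image

mask = np.array(Image.open(mask_path))   # label-encoded mask, one pixel value per instance
obj_ids = np.unique(mask)                # all instance ids present in the mask
obj_ids = obj_ids[1:]                    # drop the background id (0)
masks = mask == obj_ids[:, None, None]   # boolean array of shape [num_instances, H, W]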