I re-ran the code with CUDA_LAUNCH_BLOCKING=1 and got the stack trace below.
Traceback (most recent call last):
File “train.py”, line 27, in main
return train(config)
File “/home/haicu/harshavardhan.subramanian/AIS_DL/AIS_DL/src/ml_pipeline_template/training_pipeline.py”, line 87, in train
trainer.fit(model=model, datamodule=datamodule)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py”, line 741, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py”, line 685, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py”, line 777, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py”, line 1199, in _run
self._dispatch()
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py”, line 1279, in _dispatch
self.training_type_plugin.start_training(self)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py”, line 202, in start_training
self._results = trainer.run_stage()
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py”, line 1289, in run_stage
return self._run_train()
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py”, line 1311, in _run_train
self._run_sanity_check(self.lightning_module)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py”, line 1375, in _run_sanity_check
self._evaluation_loop.run()
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/loops/base.py”, line 145, in run
self.advance(*args, **kwargs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py”, line 110, in advance
dl_outputs = self.epoch_loop.run(dataloader, dataloader_idx, dl_max_batches, self.num_dataloaders)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/loops/base.py”, line 145, in run
self.advance(*args, **kwargs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py”, line 122, in advance
output = self._evaluation_step(batch, batch_idx, dataloader_idx)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py”, line 217, in _evaluation_step
output = self.trainer.accelerator.validation_step(step_kwargs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py”, line 239, in validation_step
return self.training_type_plugin.validation_step(*step_kwargs.values())
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py”, line 219, in validation_step
return self.model.validation_step(*args, **kwargs)
File “/home/haicu/harshavardhan.subramanian/AIS_DL/AIS_DL/src/ml_pipeline_template/models/AIS_UNetModule.py”, line 106, in validation_step
loss, loss_mask, loss_box_reg, loss_classifier = self.step(batch)
File “/home/haicu/harshavardhan.subramanian/AIS_DL/AIS_DL/src/ml_pipeline_template/models/AIS_UNetModule.py”, line 55, in step
loss_dict = self.forward(images, targets)
File “/home/haicu/harshavardhan.subramanian/AIS_DL/AIS_DL/src/ml_pipeline_template/models/AIS_UNetModule.py”, line 40, in forward
return self.net(x,y)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torch/nn/modules/module.py”, line 1110, in _call_impl
return forward_call(*input, **kwargs)
File “/home/haicu/harshavardhan.subramanian/AIS_DL/AIS_DL/src/ml_pipeline_template/models/components/MRCNN.py”, line 51, in forward
return self.model(x,y)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torch/nn/modules/module.py”, line 1110, in _call_impl
return forward_call(*input, **kwargs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torchvision/models/detection/generalized_rcnn.py”, line 99, in forward
detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torch/nn/modules/module.py”, line 1110, in _call_impl
return forward_call(*input, **kwargs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torchvision/models/detection/roi_heads.py”, line 803, in forward
rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torchvision/models/detection/roi_heads.py”, line 113, in maskrcnn_loss
project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torchvision/models/detection/roi_heads.py”, line 113, in <listcomp>
project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torchvision/models/detection/roi_heads.py”, line 95, in project_masks_on_boxes
return roi_align(gt_masks, rois, (M, M), 1.0)[:, 0]
File “/home/haicu/harshavardhan.subramanian/miniconda3/envs/ml_template_env/lib/python3.7/site-packages/torchvision/ops/roi_align.py”, line 62, in roi_align
input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned
RuntimeError: CUDA error: an illegal memory access was encountered