This happens every few thousand iterations. Here is the error:
/opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/ATen/native/cuda/IndexKernel.cu:53: lambda [](int)->auto::operator()(int)->auto: block: [0,0,0], thread: [13,0,0] Assertion `index >$
-sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/ATen/native/cuda/IndexKernel.cu:53: lambda [](int)->auto::operator()(int)->auto: block: [0,0,0], thread: [13,0,0] Assertion `index >$
-sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/ATen/native/cuda/IndexKernel.cu:53: lambda [](int)->auto::operator()(int)->auto: block: [0,0,0], thread: [13,0,0] Assertion `index >$
-sizes[i] && index < sizes[i] && "index out of bounds"` failed.
THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/THC/THCGeneral.cpp line=383 error=59 : device-side assert triggered
Failed to apply mask
Failed to apply mask
torch.Size([6, 3, 40, 40])
torch.Size([6, 3, 10, 10])
torch.Size([6, 256, 256])
torch.Size([6, 256, 256])
40
10
Traceback (most recent call last):
File "/home/loopclosure/zzft-training/zzft/models/yolo.py", line 250, in build_targets
noobj_mask[:, i][resized_masks] = 0
RuntimeError: copy_if failed to synchronize: device-side assert triggered
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "runtime/train_detector.py", line 364, in <module>
main()
File "runtime/train_detector.py", line 277, in main
outputs = torch.nn.parallel.parallel_apply(replicas[1:], data_split, kwargs)
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply
raise output
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker
output = module(*input, **kwargs)
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/home/loopclosure/zzft-training/zzft/utils/common.py", line 83, in forward
return self.module(*args, **kwargs)
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/home/loopclosure/zzft-training/zzft/models/yolo.py", line 457, in forward
netout, layer_det_loss, layer_conf_loss, layer_cls_loss = module[0](x, targets=targets, classify=classify, target_has_desc=target_has_desc, masks=masks)
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/home/loopclosure/zzft-training/zzft/models/yolo.py", line 388, in forward
masks=masks,
File "/home/loopclosure/zzft-training/zzft/models/yolo.py", line 256, in build_targets
print(masks)
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/tensor.py", line 71, in __repr__
return torch._tensor_str._str(self)
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/_tensor_str.py", line 286, in _str
tensor_str = _tensor_str(self, indent)
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/_tensor_str.py", line 201, in _tensor_str
formatter = _Formatter(get_summarized_data(self) if summarize else self)
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/_tensor_str.py", line 236, in get_summarized_data
return torch.stack([get_summarized_data(x) for x in self])
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/_tensor_str.py", line 236, in <listcomp>
return torch.stack([get_summarized_data(x) for x in self])
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/_tensor_str.py", line 234, in get_summarized_data
return torch.stack([get_summarized_data(x) for x in (start + end)])
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/_tensor_str.py", line 234, in <listcomp>
return torch.stack([get_summarized_data(x) for x in (start + end)])
File "/home/alexmai/anaconda3/envs/th/lib/python3.6/site-packages/torch/_tensor_str.py", line 227, in get_summarized_data
return torch.cat((self[:PRINT_OPTS.edgeitems], self[-PRINT_OPTS.edgeitems:]))
RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/THC/THCCachingHostAllocator.cpp:265
The code is a modification of PyTorch-YOLOv3 from https://github.com/eriklindernoren/PyTorch-YOLOv3
def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres, masks=None):
# targets is an array batch_size x [batch_number, label, x, y, w, h]
device = pred_boxes.get_device()
ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor
FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor
nB = pred_boxes.size(0)
nA = pred_boxes.size(1)
nC = pred_cls.size(-1)
nG = pred_boxes.size(2)
# Output tensors
obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0).to(device)
noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1).to(device)
class_mask = FloatTensor(nB, nA, nG, nG).fill_(0).to(device)
iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0).to(device)
tx = FloatTensor(nB, nA, nG, nG).fill_(0).to(device)
ty = FloatTensor(nB, nA, nG, nG).fill_(0).to(device)
tw = FloatTensor(nB, nA, nG, nG).fill_(0).to(device)
th = FloatTensor(nB, nA, nG, nG).fill_(0).to(device)
tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0).to(device)
# Convert to position relative to box
target_boxes = target[:, 2:6] * nG
gxy = target_boxes[:, :2]
gwh = target_boxes[:, 2:]
# Get anchors with best iou (this is just to find anchors with the right size)
ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])
best_ious, best_n = ious.max(0)
# Separate target values
b, target_labels = target[:, :2].long().t()
gx, gy = gxy.t()
gw, gh = gwh.t()
gi, gj = gxy.long().t()
# Set masks
obj_mask[b, best_n, gj, gi] = 1
noobj_mask[b, best_n, gj, gi] = 0
# Convert noinfo_mask to grid size
if masks is not None:
# ic(masks.shape)
try:
resized_masks = F.interpolate(masks.unsqueeze(1), size=nG, mode="nearest").squeeze(1) > 0.1
# ic(resized_masks)
# cv2.imshow("Mask", resized_masks[0].detach().cpu().numpy())
# cv2.waitKey(0)
for i in range(nA):
noobj_mask[:, i][resized_masks] = 0
except:
print("Failed to apply mask")
print(noobj_mask.shape)
print(masks.shape)
print(nG)
print(masks)
resized_masks = F.interpolate(masks.unsqueeze(1), size=nG, mode="nearest").squeeze(1) > 0.1
print(resized_masks)
print(ious.shape)
# Set noobj mask to zero where iou exceeds ignore threshold
for i, anchor_ious in enumerate(ious.t()):
noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0
# Coordinates
tx[b, best_n, gj, gi] = gx - gx.floor()
ty[b, best_n, gj, gi] = gy - gy.floor()
# Width and height
tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
# One-hot encoding of label
tcls[b, best_n, gj, gi, target_labels] = 1
# Compute label correctness and iou at best anchor
class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float()
iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False)
tconf = obj_mask.float()
return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf
This is the new code:
# Convert noinfo_mask to grid size
if masks is not None:
# ic(masks.shape)
try:
resized_masks = F.interpolate(masks.unsqueeze(1), size=nG, mode="nearest").squeeze(1) > 0.1
# ic(resized_masks)
# cv2.imshow("Mask", resized_masks[0].detach().cpu().numpy())
# cv2.waitKey(0)
for i in range(nA):
noobj_mask[:, i][resized_masks] = 0
except:
print("Failed to apply mask")
print(noobj_mask.shape)
print(masks.shape)
print(nG)
print(masks)
resized_masks = F.interpolate(masks.unsqueeze(1), size=nG, mode="nearest").squeeze(1) > 0.1
print(resized_masks)
print(ious.shape)
The error seems to happen at F.interpolate, which is confusing because the shape of the tensors makes sense when being passed into F.interpolate. I have printing the shape of the resized_masks but it always crashes before it can get to that.