Hi, I’m using complete_box_iou_loss and I’m getting the above-mentioned error ("Function 'DivBackward0' returned nan values in its 0th output") when I run training with anomaly detection enabled. Any ideas on how to solve this?
The full error output is:
Training: 0%| | 0/1 [00:00<?, ?it/s]/tmp/ipykernel_34/3114434181.py:83: UserWarning: Anomaly Detection has been enabled. This mode will increase the runtime and should only be enabled for debugging.
with torch.autograd.detect_anomaly(True):
/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:251: UserWarning: Error detected in DivBackward0. Traceback of forward call that caused the error:
File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
app.launch_new_instance()
File "/opt/conda/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
app.start()
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 701, in start
self.io_loop.start()
File "/opt/conda/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
self.asyncio_loop.run_forever()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
self._run_once()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
handle._run()
File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue
await self.process_one()
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 523, in process_one
await dispatch(*args)
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell
await result
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 767, in execute_request
reply_content = await reply_content
File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 429, in do_execute
res = shell.run_cell(
File "/opt/conda/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
return super().run_cell(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell
result = self._run_cell(
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell
result = runner(coro)
File "/opt/conda/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
coro.send(None)
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes
if await self.run_code(code, result, async_=asy):
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "/tmp/ipykernel_34/3143850637.py", line 9, in <module>
train(model=model,
File "/tmp/ipykernel_34/3114434181.py", line 156, in train
train_model(
File "/tmp/ipykernel_34/3114434181.py", line 89, in train_model
+ criterion(outputs[2], target2, SCALED_ANCHORS[2])
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/tmp/ipykernel_34/2076888010.py", line 156, in forward
iou_loss = complete_box_iou_loss(box_preds2[obj_mask], box_true2[obj_mask])
File "/opt/conda/lib/python3.10/site-packages/torchvision/ops/ciou_loss.py", line 61, in complete_box_iou_loss
v = (4 / (torch.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2)
(Triggered internally at /usr/local/src/pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:114.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Training: 0%| | 0/1 [00:01<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[33], line 9
4 # optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
5 # scheduler = CyclicLR(optimizer, base_lr=0.0001, max_lr=0.01)
8 scaler = torch.cuda.amp.GradScaler()
----> 9 train(model=model,
10 dataloaders=dataloaders,
11 criterion=criterion,
12 optimizer=optimizer,
13 scheduler=scheduler,
14 scaler=scaler,
15 device=DEVICE,
16 epochs=EPOCHS,
17 log_dir=os.getcwd())
18 torch.save(model.state_dict(),"Final-model.pth")
Cell In[29], line 156, in train(model, dataloaders, criterion, optimizer, scheduler, scaler, device, epochs, log_dir)
153 print(f"Epoch {epoch + 1}/{epochs}")
154 print("-" * 10)
--> 156 train_model(
157 model=model,
158 dataloader=dataloaders["train"],
159 criterion=criterion,
160 optimizer=optimizer,
161 scaler=scaler,
162 device=device,
163 writer=writer,
164 epoch=epoch,
165 )
167 validate_model(
168 model=model,
169 dataloader=dataloaders["val"],
(...)
175 epoch=epoch,
176 )
177 if epoch % 10 == 0:
Cell In[29], line 97, in train_model(model, dataloader, criterion, optimizer, scaler, device, writer, epoch, scheduler)
93 optimizer.zero_grad()
94 # scaler.scale(loss).backward()
95 # scaler.step(optimizer)
96 # scaler.update()
---> 97 loss.backward()
98 optimizer.step()
100 running_loss += loss.item() * inputs.size(0)
File /opt/conda/lib/python3.10/site-packages/torch/_tensor.py:492, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
482 if has_torch_function_unary(self):
483 return handle_torch_function(
484 Tensor.backward,
485 (self,),
(...)
490 inputs=inputs,
491 )
--> 492 torch.autograd.backward(
493 self, gradient, retain_graph, create_graph, inputs=inputs
494 )
File /opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:251, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
246 retain_graph = create_graph
248 # The reason we repeat the same comment below is that
249 # some Python versions print out the first line of a multi-line function
250 # calls in the traceback and some print out the last line
--> 251 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
252 tensors,
253 grad_tensors_,
254 retain_graph,
255 create_graph,
256 inputs,
257 allow_unreachable=True,
258 accumulate_grad=True,
259 )
RuntimeError: Function 'DivBackward0' returned nan values in its 0th output.
So the anomaly trace points at the w / h divisions inside torchvision’s complete_box_iou_loss (ciou_loss.py, line 61). The code I’m using to calculate the loss is:
obj_mask = target[..., 0] == 1     # cells responsible for an object
no_obj_mask = target[..., 0] == 0  # cells with no object

# Decode predictions: sigmoid for the xy offsets; wh from a scaled sigmoid times the anchors
box_pred_xy = torch.sigmoid(pred[..., 1:3])
box_pred_wh = ((torch.sigmoid(pred[..., 3:5]) * 2) ** 3) * anchors
box_preds = torch.cat([box_pred_xy, box_pred_wh], dim=-1)

# Targets are already stored as (cx, cy, w, h)
box_true_xy = target[..., 1:3]
box_true_wh = target[..., 3:5]
box_true = torch.cat([box_true_xy, box_true_wh], dim=-1)

# Coordinate loss: convert to corner format for the CIoU loss and clamp the
# coordinates to a minimum of 1e-6
box_preds2 = convert_cwch_xy(box_preds).clamp(1e-6)
box_true2 = convert_cwch_xy(box_true).clamp(1e-6)
iou_loss = complete_box_iou_loss(box_preds2[obj_mask], box_true2[obj_mask])
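Since clamp(1e-6) only bounds the corner coordinates from below, x2 - x1 or y2 - y1 can still collapse to zero after the conversion. A quick check like this right before the loss call (just a sketch, reusing the names above) should show whether zero-sized or non-finite boxes are reaching complete_box_iou_loss:

p = box_preds2[obj_mask]
t = box_true2[obj_mask]
# complete_box_iou_loss divides widths by heights (the atan(w / h) term
# in the traceback), so zero-height boxes are the prime suspect
print("non-finite preds:  ", (~torch.isfinite(p)).any().item())
print("non-finite targets:", (~torch.isfinite(t)).any().item())
print("zero-size preds:   ", ((p[..., 2] - p[..., 0]) * (p[..., 3] - p[..., 1]) == 0).any().item())
print("zero-size targets: ", ((t[..., 2] - t[..., 0]) * (t[..., 3] - t[..., 1]) == 0).any().item())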
Edit: I tried printing the model gradients and using nanmean() on the loss, and the gradients are all NaNs.
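I dumped them with roughly this loop (a sketch; my actual notebook cell differs slightly):

for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"Gradient for {name}:")
        print(param.grad)

which prints: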
Gradient for layers.0.conv.weight:
tensor([[[[nan, nan, nan],
          [nan, nan, nan],
          [nan, nan, nan]],
         ...,
         [[nan, nan, nan],
          [nan, nan, nan],
          [nan, nan, nan]]]], device='cuda:0')
Gradient for layers.0.conv.bias:
tensor([nan, nan, nan, ..., nan, nan, nan], device='cuda:0')
Gradient for layers.0.bn.weight:
tensor([nan, nan, nan, ..., nan, nan, nan], device='cuda:0')
Gradient for layers.0.bn.bias:
tensor([nan, nan, nan, ..., nan, nan, nan], device='cuda:0')
Gradient for layers.1.conv.weight:
tensor([[[[nan, nan, nan],
          [nan, nan, nan],
          [nan, nan, nan]],
         ...,
         [[nan, nan, nan],
          [nan, nan, nan],
          [nan, nan, nan]]]], device='cuda:0')
Gradient for layers.1.conv.bias:
tensor([nan, nan, nan, ..., nan, nan, nan], device='cuda:0')
Gradient for layers.1.bn.weight:
tensor([nan, nan, nan, ..., nan, nan, nan], device='cuda:0')
Gradient for layers.1.bn.bias:
tensor([nan, nan, nan, ..., nan, nan, nan], device='cuda:0')
(dump truncated here: every printed gradient entry, for every layer shown, is nan)
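Edit 2: Reading the traceback again, the failing op is the w_pred / h_pred division inside torchvision’s CIoU loss, and I believe a degenerate (zero-height) predicted box reproduces the same DivBackward0 NaN in isolation (a minimal sketch, separate from my actual pipeline):

import torch
from torchvision.ops import complete_box_iou_loss

# Zero-height predicted box: y1 == y2, so h_pred == 0 and w_pred / h_pred
# overflows to inf. The forward pass stays finite (atan(inf) = pi / 2),
# but the backward of that division evaluates 0 / 0 = nan, matching the
# anomaly report's "DivBackward0 returned nan values in its 0th output".
pred = torch.tensor([[0.2, 0.3, 0.6, 0.3]], requires_grad=True)
target = torch.tensor([[0.1, 0.1, 0.5, 0.5]])

loss = complete_box_iou_loss(pred, target, reduction="mean")
loss.backward()
print(loss.item())  # finite
print(pred.grad)    # all nan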