Optimizer: No inf checks were recorded for this optimizer

I want to get masks out of YOLOv5, so I made some changes to YOLOv5, but I get the following error:

Traceback (most recent call last):
  File "train.py", line 713, in <module>
    main(opt)
  File "train.py", line 610, in main
    train(opt.hyp, opt, device, callbacks)
  File "train.py", line 411, in train
    scaler_mask.step(optimizer_mask)  # optimizer.step
  File "/home/mopanzhong/.conda/envs/rope_detection/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 318, in step
    assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."
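
If I read torch/cuda/amp/grad_scaler.py correctly, this assertion fires when scaler.step(optimizer) runs but the internal unscale_ found no gradient on any parameter of that optimizer, so no inf/NaN check was ever recorded for it. I can reproduce the same assertion outside YOLOv5 with a toy example (the model here is made up, and it assumes a CUDA device since GradScaler needs one):

import torch

model = torch.nn.Linear(4, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()

out = model(torch.randn(2, 4, device="cuda"))

# Rebuilding the loss via torch.tensor() creates a fresh leaf tensor with no
# grad_fn, so backward() stops there and the model parameters get no gradients.
loss = torch.tensor([out.sum().item()], device="cuda", requires_grad=True)

scaler.scale(loss).backward()  # only `loss` itself receives a gradient
scaler.step(optimizer)         # AssertionError: No inf checks were recorded for this optimizer.
scaler.update()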

train.py:

optimizer.zero_grad()
optimizer_mask.zero_grad()
for ...:

    with amp.autocast(enabled=cuda):
        pred, mask_res = model(imgs, get_mask_res=True)  # forward
        loss, loss_mask, loss_items = compute_loss(pred, targets.to(device), mask_res=mask_res, img_masks=img_masks)  # loss scaled by batch_size

    # Backward
    print("$" * 20, loss, loss_mask)  # prints: tensor([3.15543], device='cuda:0', grad_fn=<MulBackward0>) tensor(0.74285, device='cuda:0', grad_fn=<MeanBackward0>)
    scaler.scale(loss).backward()
    scaler_mask.scale(loss_mask).backward()

    # Optimize
    if ni - last_opt_step >= accumulate:
        print("%" * 20, optimizer)
        print("%" * 20, optimizer_mask)

Printed output (both optimizers look identical):
%%%%%%%%%%%%%%%%%%%% SGD (
Parameter Group 0
dampening: 0
initial_lr: 0.01
lr: 0.0
momentum: 0.8
nesterov: True
weight_decay: 0

Parameter Group 1
dampening: 0
initial_lr: 0.01
lr: 0.0
momentum: 0.8
nesterov: True
weight_decay: 0.0005

Parameter Group 2
dampening: 0
initial_lr: 0.01
lr: 0.1
momentum: 0.8
nesterov: True
weight_decay: 0
)
%%%%%%%%%%%%%%%%%%%% SGD (
Parameter Group 0
dampening: 0
initial_lr: 0.01
lr: 0.0
momentum: 0.8
nesterov: True
weight_decay: 0

Parameter Group 1
dampening: 0
initial_lr: 0.01
lr: 0.0
momentum: 0.8
nesterov: True
weight_decay: 0.0005

Parameter Group 2
dampening: 0
initial_lr: 0.01
lr: 0.1
momentum: 0.8
nesterov: True
weight_decay: 0
)

        scaler.step(optimizer)  # optimizer.step
        scaler.update()
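        # (Not part of my code, just a check I'm considering adding here:
        # count how many of optimizer_mask's parameters actually received a
        # gradient; if it prints 0, backward() never reached them.)
        # n_with_grad = sum(p.grad is not None
        #                   for g in optimizer_mask.param_groups
        #                   for p in g["params"])
        # print("optimizer_mask params with grad:", n_with_grad)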
        scaler_mask.step(optimizer_mask)  # fails here (train.py line 411 in the traceback)
        scaler_mask.update()
        optimizer.zero_grad()
        optimizer_mask.zero_grad()
        if ema:
            ema.update(model)
        last_opt_step = ni

.....
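
For reference, the stock loop I started from follows the usual single-scaler AMP pattern, roughly like this (a simplified sketch, not the exact yolov5 code; the loader variable is a stand-in):

scaler = torch.cuda.amp.GradScaler()
optimizer.zero_grad()
for ni, (imgs, targets) in enumerate(loader):
    with amp.autocast(enabled=cuda):
        pred = model(imgs)  # forward
        loss, loss_items = compute_loss(pred, targets.to(device))

    scaler.scale(loss).backward()  # one scaled backward fills in the grads

    if ni - last_opt_step >= accumulate:
        scaler.step(optimizer)  # unscales, checks for infs/NaNs, then steps
        scaler.update()
        optimizer.zero_grad()
        last_opt_step = ni

My version splits this into two (loss, scaler, optimizer) triples, so each scaler's step() expects its own scaled backward to have produced gradients for its optimizer's parameters.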

And in compute_loss:

# mask_losses is a list, like [1, 2, 3, 4, 5]
mask_loss = torch.tensor(mask_losses, device=device, requires_grad=True)
mask_losses = torch.tensor(mask_losses, device=device).mean(0, keepdim=True)
print(mask_losses)
print(lbox, lobj, lcls)
# print(torch.cat((lbox, lobj, lcls)).detach())
return (lbox + lobj + lcls) * bs, mask_loss.mean(), torch.cat((lbox, lobj, lcls, mask_losses)).detach()
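
My suspicion is the torch.tensor(...) calls above: torch.tensor(list) copies the values into a brand-new leaf tensor with no grad_fn, so loss_mask is disconnected from the model, scaler_mask.scale(loss_mask).backward() never reaches optimizer_mask's parameters, and found_inf_per_device stays empty. A quick standalone check (names are made up):

import torch

w = torch.randn(3, requires_grad=True)
parts = [(w * i).sum() for i in range(1, 4)]  # losses connected to w

rebuilt = torch.tensor([p.item() for p in parts], requires_grad=True)
stacked = torch.stack(parts)

print(rebuilt.grad_fn)  # None: a new leaf, cut off from w's graph
print(stacked.grad_fn)  # a StackBackward node: still connected to w

stacked.mean().backward()
print(w.grad)  # gradients arrive at w
# rebuilt.mean().backward() would only set rebuilt.grad, never w.grad

If that is the cause, is torch.stack(mask_losses) (keeping .detach() only on the copy returned for logging) the right replacement for both torch.tensor(...) lines?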