I tried to zero out the loss tensor and continue, but it didn't work. It reports "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn":

```python
with torch.cuda.amp.autocast():
    loss, _, _ = model(samples, mask_ratio=args.mask_ratio)

loss_value = loss.item()

if not math.isfinite(loss_value):
    print(f"Loss is {loss_value} in iter {data_iter_step}, continue training")
    print(loss.shape)
    # attempted fix: replace the non-finite loss with zeros and keep training
    loss = torch.zeros_like(loss)
    loss_value = 0

loss /= accum_iter
loss_scaler(loss, optimizer, parameters=model.parameters(),
            update_grad=(data_iter_step + 1) % accum_iter == 0)
if (data_iter_step + 1) % accum_iter == 0:
    optimizer.zero_grad()

torch.cuda.synchronize()

metric_logger.update(loss=loss_value)

lr = optimizer.param_groups[0]["lr"]
metric_logger.update(lr=lr)

loss_value_reduce = misc.all_reduce_mean(loss_value)
if log_writer is not None and (data_iter_step + 1) % accum_iter == 0:
    """ We use epoch_1000x as the x-axis in tensorboard.
    This calibrates different curves when batch size changes.
    """
    epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000)
    log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x)
    log_writer.add_scalar('lr', lr, epoch_1000x)
```
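If I understand the error correctly, the problem seems to come from `torch.zeros_like(loss)` itself: it returns a fresh tensor with `requires_grad=False` and no `grad_fn`, so the `backward()` that `loss_scaler` eventually calls has nothing to differentiate. A minimal sketch outside my training loop (assuming the scaler ends up calling `loss.backward()`) reproduces the same message:

```python
import torch

w = torch.randn(3, requires_grad=True)
loss = (w ** 2).sum()          # attached to the graph, backward() would work here

loss = torch.zeros_like(loss)  # new tensor: requires_grad=False, grad_fn=None
print(loss.requires_grad, loss.grad_fn)  # False None

loss.backward()  # RuntimeError: element 0 of tensors does not require grad
                 # and does not have a grad_fn
```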