Issue calculating gradient

I’m attempting to calculate the gradient w.r.t. an input using the formula

(self.gamma / 2.0) * (torch.norm(grad(output.mean(), inpt)[0]) ** 2)

where grad is the torch.autograd function, and both output and inpt require gradients. In some runs, it works fine; however, it often comes up with the error

RuntimeError: grad can be implicitly created only for scalar outputs

I’ve tried fixing the issue based on past discussions (i.e. Loss.backward() raises error 'grad can be implicitly created only for scalar outputs' and Error: grad can be implicitly created only for scalar outputs) but to no avail. Any help would be greatly appreciated.

Could you share a reproducible snippet? I could not reproduce this with

>>> import torch
>>> input = torch.randn(1000, 1000)
>>> input.requires_grad = True
>>> weight = torch.randn(1000, 1000)
>>> weight.requires_grad = True
>>> output = input * weight
>>> torch.autograd.grad(output.mean(), input)
(tensor([[ 3.6471e-07, -5.4263e-07,  3.5560e-08,  ..., -2.5698e-08,
         -4.6246e-07, -1.2394e-06],
        [-2.9459e-07,  2.4342e-06,  9.5083e-07,  ...,  2.1179e-06,
          4.3364e-07,  9.8099e-07],
        [ 3.5879e-07, -2.9793e-07, -3.0315e-08,  ..., -8.2863e-07,
         -3.8005e-07,  9.4537e-07],
        ...,
        [ 9.3157e-07, -4.0749e-08, -9.6082e-07,  ...,  9.7867e-07,
         -5.8472e-07,  8.4605e-07],
        [ 1.3695e-06, -3.3069e-07, -1.4985e-07,  ..., -2.6728e-07,
         -1.3704e-07,  5.8181e-07],
        [ 7.2464e-07, -4.6793e-07, -4.7812e-07,  ...,  7.8470e-08,
          1.1135e-06, -3.6685e-08]]),)
>>>

Sure thing, here is the function that is calling:

class DiscGradReg(nn.Module):
    """Discriminator gradient regularization for inversion"""

    def __init__(self, gamma: float = 10.):
        """
        Args:
            net: String representing the network to use for LPIPS loss (default: alex)
        """
        super().__init__()
        self.gamma = gamma

    def forward(self, inpt, output):
        # Disc represents the discriminator loss
        return (self.gamma / 2.0) * (torch.norm(grad(output.mean(), inpt)[0]) ** 2)

and this is the main training loop

def search(**kwargs):
    device = torch.device('cuda')
    opts = dnnlib.EasyDict(kwargs) # Command line arguments.


    # Determine the number of classes
    num_classes = 0  

    for folders in os.listdir(opts['data']):
        num_classes += 1  

    # Define transforms 
    tsfms = transforms.Compose([
        transforms.Resize(256, interpolation=1),
        transforms.CenterCrop(256),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    # Calculate number of steps to accumulate gradient 
    accumulate_steps = (opts['batch'] // opts['batch_gpu']) if opts.get('batch_gpu') else 1

    dataset = ImageFolder(opts['data'], transform=tsfms, target_transform=lambda x: F.one_hot(torch.tensor(x, dtype=torch.int64), num_classes).float())
    dataloader = DataLoader(
        dataset, 
        shuffle=True, 
        batch_size=opts['batch_gpu'] if opts.get('batch_gpu') else opts['batch'], 
        drop_last=True,
        num_workers=opts['workers']
    )

    encoder_loss = EncoderLoss()
    discriminator_grad_reg = DiscGradReg()

    # Load Generator in evaluation mode
    with dnnlib.util.open_url(opts['pkl']) as f:
        G = legacy.load_network_pkl(f)['G_ema'].to(device)
    G.eval()

    with dnnlib.util.open_url(opts['pkl']) as f:
        D = legacy.load_network_pkl(f)['D'].to(device)
    D.train()

    E = FastGANEncoder()
    E.to(device)
    E.train()

    # Initialize optimizers
    optim_d = AdamW(D.parameters())
    optim_e = AdamW(E.parameters())

    print(f"\n\nTraining for {opts['epochs']} epochs with batch size {str(opts['batch_gpu']) + ' and total batch ' + str(opts['batch']) if opts.get('batch_gpu') else int(opts['batch'])} on {num_classes} classes...\n\n")

    for epoch in range(opts['epochs']):
        running_loss_d = 0.
        running_loss_e = 0.

        # Iterate over dataset 
        iters = 0

        for i, (imgs, labels) in enumerate(dataloader, 0):
            with autocast():
                imgs, labels = imgs.cuda(), labels.cuda()

                # Encode the batch
                z_pred = E(imgs)

                # Pass the batch through the generator
                reconsts = G(z_pred, labels)

                # Pass the images through the discriminator
                fake_score = D(reconsts, labels)
                real_score = D(imgs.requires_grad_(), labels)

                # Calculate the loss 
                loss_e = encoder_loss(imgs, reconsts, fake_score)
                loss_d = fake_score - real_score + discriminator_grad_reg(imgs, real_score.requires_grad_())

                # Determine whether or not to back prop
                if (iters + 1) % accumulate_steps == 0:
                    iters = 0
                    loss_e.backward()
                    loss_d.backward()
                    optim_e.step()
                    optim_d.step()            
            
            iters += 1

        print(f"Running loss of the discriminator at epoch {epoch + 1}: {running_loss_d}")
        print(f"Running loss of the encoder at epoch {epoch + 1}: {running_loss_e}")

I hope that helps, I truly do appreciate your help. The function interfaces with the click library, so that’s why it loads in options from the dictionary opts.

I’ve found that the issue stems from one of my other loss functions instead of the autograd function…