WGAN model with this error: RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In my model, I have a critic and I use the Wasserstein distance with gradient penalty. Here is the loss function in the model class:

def wgan_gp_reg(self, x_real, x_fake, center=1., lambda_gp=10.0):
    
    batch_size = x_real.shape[0]
    eps = torch.rand(batch_size, 1, 1,  device=self.device, dtype=x_real.dtype)
    
    eps = eps.expand_as(x_real)
    #eps = torch.randn_like(x_real).to(self.device)
    x_interp = torch.autograd.Variable((eps * x_real + (1 - eps) * x_fake), requires_grad=True)
    d_out = self.discriminator(x_interp)

    gradients = torch.autograd.grad(inputs = x_interp,
                                   outputs  = d_out,
                                   grad_outputs = torch.ones_like(d_out, device=self.device),
                                   create_graph = True,
                                   retain_graph = True,
                                   )[0]

    gradients = gradients.view(gradients.size(), -1)
    gradient_penalty = (((gradients + 1e-16).norm(2, dim=1) - center) ** 2).mean() * lambda_gp
    return gradient_penalty
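
For context, this penalty enters the critic loss roughly as at the call site shown in the traceback below (a sketch; x_real and x_fake are the real and generated batches, disc_real and disc_fake the corresponding critic outputs):

# Hypothetical critic-update step; names mirror those in the traceback further down
disc_real = self.discriminator(x_real)
disc_fake = self.discriminator(x_fake)
gradient_penalty = self.wgan_gp_reg(x_real, x_fake)
d_loss = -torch.mean(disc_real) + torch.mean(disc_fake) + gradient_penalty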

I also use torch.autocast and torch.cuda.amp.GradScaler together during training:

def train(epoch):

    modelstate.model.train()

    total_loss = 0
    total_batches = 0
    total_points = 0
    if torch.cuda.is_available():
        scaler = torch.cuda.amp.GradScaler()
    for i, (u, y) in enumerate(loader_train):
        u = u.to(device)
        y = y.to(device)
        modelstate.optimizer.zero_grad()
        if torch.cuda.is_available():
            with torch.autocast(device_type='cuda', dtype=torch.float32) and torch.backends.cudnn.flags(enabled=False):
                loss_ = modelstate.model(u, y)
                
            scaled_grad_params = torch.autograd.grad(outputs=scaler.scale(loss_),
                                                    inputs=modelstate.model.parameters(),
                                                    create_graph=True,
                                                    #retain_graph=True,
                                                    allow_unused=True #Whether to allow differentiation of unused parameters.
                                                    )

                
            inv_scale = 1./scaler.get_scale()
            
            grad_params = [ p * inv_scale if p is not None and not torch.isnan(p).any() else torch.tensor(0, device=device, dtype=torch.float32) for p in scaled_grad_params ]
            
            with torch.autocast(device_type='cuda', dtype=torch.float32):
                #grad_norm = torch.tensor(0, device=grad_params[0].device, dtype=grad_params[0].dtype)
                grad_norm = 0
                for grad in grad_params:
                    grad_norm += grad.pow(2).sum()
                    grad_norm = grad_norm**0.5
                # Compute the L2 Norm as penalty and add that to loss
                loss_ = loss_ + grad_norm
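
As a side note on the with torch.autocast(...) and torch.backends.cudnn.flags(...) line: in Python, with a and b: evaluates the expression first and only enters the right-hand context manager (the left one, being truthy, is created but never entered), so entering both requires separating them with a comma. A minimal, self-contained sketch of that behavior:

import contextlib

@contextlib.contextmanager
def ctx(name):
    print("enter", name)
    yield
    print("exit", name)

with ctx("left") and ctx("right"):   # only "right" is entered
    pass
# prints: enter right / exit right

with ctx("left"), ctx("right"):      # both are entered, exited in reverse order
    pass
# prints: enter left / enter right / exit right / exit left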

However, I got this error:

→ 2827 gradient_penalty = self.wgan_gp_reg(input_feature, fake_input_feature)
2828 d_loss = -torch.mean(disc_real) + torch.mean(disc_fake) + gradient_penalty
2829 total_loss += d_loss

/tmp/ipykernel_59758/571975632.py in wgan_gp_reg(self, x_real, x_fake, center, lambda_gp)
2691 d_out = self.discriminator(x_interp)
2692
→ 2693 gradients = torch.autograd.grad(inputs = x_interp,
2694 outputs = d_out,
2695 grad_outputs = torch.ones_like(d_out, device=self.device),

~/anaconda3/lib/python3.9/site-packages/torch/autograd/__init__.py in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused, is_grads_batched)
    298         return _vmap_internals._vmap(vjp, 0, 0, allow_none_pass_through=True)(grad_outputs_)
    299     else:
→ 300         return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    301             t_outputs, grad_outputs_, retain_graph, create_graph, t_inputs,
    302             allow_unused, accumulate_grad=False)  # Calls into the C++ engine to run the backward pass

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

I tried the same critic with the Wasserstein gradient penalty module above when I did not use torch.autocast, and there was no error. I am wondering whether this error is related to autocast and how I can integrate the two together?

Could you post a minimal and executable code snippet to reproduce the issue, please, as I don’t see how autocast could interact with the computation graph creation in your code snippet?

@ptrblck, thanks for your amazing work answering questions. Here is the discriminator part of the model, which uses the gradient penalty and produces the error:

from torchgan.models import Generator, Discriminator
class RGANDiscriminator(Discriminator):
    def __init__(self,
                 sequence_length,
                 input_size,
                 hidden_size=None,
                 num_layers=1,
                 dropout=0,
                 last_layer=None,
                 device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                 **kwargs):

        hidden_size = hidden_size or input_size
        self.device = device
        self.input_size = input_size
        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.label_type = "none"
        # Set kwargs (might override above attributes)
        for key, value in kwargs.items():
            setattr(self, key, value)

        super(RGANDiscriminator, self).__init__(self.input_size,
                                                self.label_type)

        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                           num_layers=num_layers, batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, 1)
        self.last_layer = last_layer

        nn.init.xavier_normal_(self.linear.weight)

    def forward(self, x, length):
        h0 = torch.autograd.Variable(torch.randn((self.num_layers, x.size(0), self.hidden_size)), requires_grad=True).to(self.device)
        c0 = torch.autograd.Variable(torch.randn((self.num_layers, x.size(0), self.hidden_size)), requires_grad=True).to(self.device)

        packed = nn.utils.rnn.pack_padded_sequence(
            x, length, batch_first=True, enforce_sorted=False
        )
        out_packed, (_, _) = self.rnn(packed, (h0, c0))
        y, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
        y = self.dropout(y)
        y = self.linear(y)
        return y if self.last_layer is None else self.last_layer(y)

I also explored more on the web and changed some parameters. I noticed that in the wgan_gp_reg module, these two changes made the error disappear:

    d_out =  torch.autograd.Variable(d_out, requires_grad = True)
    gradients = torch.autograd.grad(inputs = x_interp,
                                   outputs  = d_out,
                                   grad_outputs = torch.ones_like(d_out, device=self.device).type_as(x_real),
                                   create_graph = True,
                                   retain_graph = True,
                                   allow_unused=True 
                                   )[0]

But I got a new error message:

→ 2830 gradient_penalty = self.wgan_gp_reg(input_feature, fake_input_feature)
2831 d_loss = -torch.mean(disc_real) + torch.mean(disc_fake) + gradient_penalty
2832 total_loss += d_loss

/tmp/ipykernel_59758/1668205513.py in wgan_gp_reg(self, x_real, x_fake, center, lambda_gp)
2700 )[0]
2701
→ 2702 gradients = gradients.view(gradients.size(), -1)
2703 gradient_penalty = (((gradients + 1e-16).norm(2, dim=1) - center) ** 2).mean() * lambda_gp
2704

AttributeError: 'NoneType' object has no attribute 'view'

I am wondering under which circumstances the gradient becomes None?

Your proposed solution won’t work since you are rewrapping a tensor and are thus breaking the computation graph, which explains why the gradients are None for previously used parameters.
Also, Variables are deprecated since PyTorch 0.4, so don’t use them anymore.
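
A minimal sketch of that effect (using .detach() in place of the deprecated Variable wrapper): once the output is rewrapped it is no longer connected to the input, so torch.autograd.grad with allow_unused=True simply returns None:

import torch

x = torch.randn(4, 3, requires_grad=True)
out = (x * 2).sum()

# Rewrapping the output creates a new leaf tensor with no graph back to x
out_rewrapped = out.detach().requires_grad_(True)

grad = torch.autograd.grad(outputs=out_rewrapped, inputs=x, allow_unused=True)[0]
print(grad)  # None, since x was never used to compute out_rewrapped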

Your code snippet is unfortunately not executable, so I used random inputs which seem to work:

class RGANDiscriminator(nn.Module):
    def __init__(self,
                            sequence_length,
                            input_size,
                            hidden_size=None,
                            num_layers=1,
                            dropout=0,
                            last_layer=None,
                            device = "cpu",
                            **kwargs):
        super().__init__()
        hidden_size = hidden_size or input_size
        self.device = device
        self.input_size = input_size
        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.label_type ="none"
        # Set kwargs (might override above attributes)
        for key, value in kwargs.items():
            setattr(self, key, value)

        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, 1)
        self.last_layer = last_layer

        nn.init.xavier_normal_(self.linear.weight)

    def forward(self, x, length):
        h0 =  torch.randn((self.num_layers, x.size(0), self.hidden_size), requires_grad=True).to(self.device)
        c0 =  torch.randn((self.num_layers, x.size(0), self.hidden_size), requires_grad=True).to(self.device)
                
        packed = nn.utils.rnn.pack_padded_sequence(
            x, length, batch_first=True, enforce_sorted=False
        )
        out_packed, (_, _) = self.rnn(packed, (h0, c0))
        y, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
        y = self.dropout(y)
        y = self.linear(y)
        return y if self.last_layer is None else self.last_layer(y)
    
model = RGANDiscriminator(10, 10, 10)

x_interp = torch.randn(10, 10, 10, requires_grad=True)
d_out = model(x_interp, torch.tensor([10]))

gradients = torch.autograd.grad(inputs = x_interp,
                               outputs  = d_out,
                               grad_outputs = torch.ones_like(d_out),
                               create_graph = True,
                               retain_graph = True,
                               )[0]
print(gradients.abs().sum())
# tensor(6.1653, grad_fn=<SumBackward0>)

Actually, I noticed this is happening during validation, where I have:

def validate(loader):
        modelstate.model.eval()
        total_vloss = 0
        total_batches = 0
        total_points = 0
        with torch.no_grad():
            for i, (u, y) in enumerate(loader):
                u = u.to(device)
                y = y.to(device)
                vloss_, d_loss, hidden = modelstate.model(u, y)

                total_batches += u.size()[0]
                total_points += np.prod(u.shape)
                total_vloss += vloss_.item()

        return total_vloss / total_points  # total_batches

Do you have any suggestions on how to avoid this error?

In your validation step you are wrapping the forward pass into a with torch.no_grad() guard, which will disable Autograd and will save memory as the intermediate forward activations are not stored anymore.
Thus also no computation graph is created and the error is raised.
Calling .backward() during the validation step is uncommon and you should double check if you really want to update your model using the validation dataset as it could easily create a data leak.
If you really want to call backward in the validation phase, remove the no_grad guard.
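
If the penalty (and therefore torch.autograd.grad) really has to run during validation, one option is to keep the no_grad guard for the bookkeeping and re-enable autograd locally around the forward pass; a minimal sketch, assuming the validate loop from above:

with torch.no_grad():
    for i, (u, y) in enumerate(loader):
        u = u.to(device)
        y = y.to(device)
        # autograd is switched back on only for the forward pass that
        # internally calls torch.autograd.grad
        with torch.enable_grad():
            vloss_, d_loss, hidden = modelstate.model(u, y)

        total_batches += u.size()[0]
        total_points += np.prod(u.shape)
        total_vloss += vloss_.item()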

I took the gradient penalty computation out of the forward module and added the discriminator loss to the training process. I also separated the discriminator optimizer from the rest of the model. To get the remaining parameters of the whole model, I wrote the following line:

self._params = itertools.chain( self.enc.parameters(), self.enc_logvar.parameters(), self.enc_mean.parameters(), self.prior.parameters(), self.decoder.parameters())
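
One detail worth noting about this line: itertools.chain returns a one-shot iterator, so it yields the parameters only once and is empty afterwards. A minimal sketch of that behavior (the list conversion at the end is just an illustration, not taken from the original code):

import itertools
import torch.nn as nn

enc = nn.Linear(4, 4)
dec = nn.Linear(4, 4)

params = itertools.chain(enc.parameters(), dec.parameters())
print(len(list(params)))  # 4
print(len(list(params)))  # 0 -- the chain iterator is already exhausted

# Materializing the parameters keeps them available for repeated use
params = list(itertools.chain(enc.parameters(), dec.parameters()))
print(len(list(params)))  # 4 on every call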

The training part of the model now looks like this:

        modelstate.optimizer.zero_grad()
        modelstate.model.optimizer_discriminator.zero_grad()
        if torch.cuda.is_available():
            with torch.autocast(device_type='cuda', dtype=torch.float32) and torch.backends.cudnn.flags(enabled=False):
                loss_, disc_loss, hidden, real, fake = modelstate.model(u, y)
                
            scaled_grad_params = torch.autograd.grad(outputs=scaler.scale(loss_),
                                                    inputs= modelstate.model._params,
                                                    create_graph=True,
                                                    retain_graph=True,
                                                    allow_unused=True #Whether to allow differentiation of unused parameters.
                                                    )
 
            
            inv_scale = 1./scaler.get_scale()
            
            grad_params = [ p * inv_scale if p is not None and not torch.isnan(p).any() else torch.tensor(0, device=device, dtype=torch.float32) for p in scaled_grad_params ]
            
            with torch.autocast(device_type='cuda', dtype=torch.float32):
                
                grad_norm = 0
                for grad in grad_params:
                    grad_norm += grad.pow(2).sum()
                    grad_norm = grad_norm**0.5
                loss_ = loss_ + grad_norm

            assert not torch.isnan(loss_)
            scaler.scale(loss_).backward(retain_graph=True, inputs=list( modelstate.model._params))

            scaler.step(modelstate.optimizer)
            scaler.update()

            gradient_penalty = modelstate.model.wgan_gp_reg(real, fake)
            discriminator_loss = disc_loss + gradient_penalty
            discriminator_loss.backward()
            modelstate.model.optimizer_discriminator.step()

The validation pass works without an error now. However, during training, I get this new error:

<ipython-input-2-9f02e04438ad> in train(epoch)
   3196                     loss_, disc_loss, hidden, real, fake = modelstate.model(u, y)
   3197 
-> 3198                 scaled_grad_params = torch.autograd.grad(outputs=scaler.scale(loss_),
   3199                                                         inputs= modelstate.model._params,
   3200                                                         create_graph=True,

/usr/local/lib/python3.8/dist-packages/torch/autograd/__init__.py in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused, is_grads_batched)
    298         return _vmap_internals._vmap(vjp, 0, 0, allow_none_pass_through=True)(grad_outputs_)
    299     else:
--> 300         return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    301             t_outputs, grad_outputs_, retain_graph, create_graph, t_inputs,
    302             allow_unused, accumulate_grad=False)  # Calls into the C++ engine to run the backward pass

ValueError: grad requires non-empty inputs.

How should I fix this error?