hello together,

I am trying to reverse engineer the WGAN-GP and adapt it to my Pix2Pix problem for depth estimation. Everything works fine, but my gradient penalty function gives me the following error:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

```
def _gradient_penalty(self, rgb, real_data, generated_data):
batch_size = real_data.size()[0]
# Calculate interpolation
alpha = torch.rand(batch_size, 1, 1, 1)
alpha = alpha.expand_as(real_data)
if self.settings["cuda"]:
alpha = alpha.cuda()
interpolated = alpha * real_data.data + (1 - alpha) * generated_data.data
interpolated = Variable(interpolated, requires_grad=True)
if self.settings["cuda"]:
interpolated = interpolated.cuda()
# Calculate probability of interpolated examples
prob_interpolated = self.diskriminator(rgb, interpolated)
concat = torch.cat((rgb, interpolated), dim=1)
concat = Variable(concat, requires_grad=True)
#allow_unused=True
Variable(prob_interpolated, requires_grad=True)
# Calculate gradients of probabilities with respect to examples
gradients = torch_grad(outputs=prob_interpolated.detach(), inputs=concat.detach(),
grad_outputs=torch.ones(
prob_interpolated.size()).cuda() if self.settings["cuda"] else torch.ones(
prob_interpolated.size()),
create_graph=True, retain_graph=True)[0]
# Gradients have shape (batch_size, num_channels, img_width, img_height),
# so flatten to easily take norm per example in batch
gradients = gradients.view(batch_size, -1)
self.losses['gradient_norm'] = gradients.norm(2, dim=1).mean().data[0]
# Derivatives of the gradient close to 0 can cause problems because of
# the square root, so manually calculate norm and add epsilon
gradients_norm = torch.sqrt(torch.sum(gradients ** 2, dim=1) + 1e-12)
# Return gradient penalty
return self.settings["gp_weight"] * ((gradients_norm - 1) ** 2).mean()
```