Prediction == Target but Gradient is > 0

import torch
import torch.nn as nn

class BinaryTverskyLossV2(nn.Module):
    def __init__(self, alpha=0.3, beta=0.7):
        super(BinaryTverskyLossV2, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.epsilon = 1e-6  # note: never added to the denominator below
        s = self.beta + self.alpha  # unused

    def forward(self, output, target):
        batch_size = output.size(0)
        output = output.view(batch_size, -1)
        target = target.view(batch_size, -1)
        P_G = torch.sum(output * target, 1)  # TP
        print("P_G: ", P_G)
        P_NG = torch.sum(output * (1. - target), 1)  # FP
        print("P_NG: ", P_NG)
        NP_G = torch.sum((1. - output) * target, 1)  # FN
        print("NP_G: ", NP_G)
        tversky_index = P_G / (P_G + self.alpha * P_NG + self.beta * NP_G)
        print("tversky_index: ", tversky_index)
        loss = 1. - tversky_index
        print("loss: ", loss)

        loss = torch.mean(loss)
        return loss

pred = nn.Parameter(torch.zeros(1, 1, 10, 10, dtype=torch.float32), requires_grad=True)
pred.data[..., 5:8, 5:8] = 1  # a 3x3 block of ones -> 9 foreground pixels

gt = pred.data.clone()  # target is identical to the prediction

btl = BinaryTverskyLossV2()
loss = btl(pred, gt)

loss.backward()

print("grad mean: ", pred.grad.mean())

This gives:

P_G:  tensor([9.], grad_fn=<SumBackward1>)
P_NG:  tensor([0.], grad_fn=<SumBackward1>)
NP_G:  tensor([0.], grad_fn=<SumBackward1>)
tversky_index:  tensor([1.], grad_fn=<DivBackward0>)
loss:  tensor([0.], grad_fn=<RsubBackward1>)
grad mean:  tensor(0.0233)

Why is my gradient > 0 when the prediction and target tensors are equal?

This is because, even though P_NG and NP_G evaluate to 0 here, their gradients with respect to output are not 0. Both terms are computed in a differentiable manner from output and sit in the denominator of the Tversky index, so moving any pixel of output changes the index. A loss value of 0 therefore does not imply a zero gradient: raising a background pixel would increase P_NG and lower tversky_index, and that slope exists even at the perfect prediction.
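
To make the 0.0233 concrete, you can work the per-pixel gradient out by hand. With output == target you have P_G = 9 and a denominator of 9, so the gradient of the loss is -beta / 9 ≈ -0.0778 on each of the 9 foreground pixels and +alpha / 9 ≈ +0.0333 on each of the 91 background pixels; averaged over all 100 pixels that is (9 * -0.0778 + 91 * 0.0333) / 100 ≈ 0.0233, exactly the mean you printed. Here is a minimal, self-contained sketch (same alpha/beta as your code, variable names tp/fp/fn standing in for your P_G/P_NG/NP_G) that checks this against autograd:

import torch

alpha, beta = 0.3, 0.7

pred = torch.zeros(1, 1, 10, 10, requires_grad=True)
with torch.no_grad():
    pred[..., 5:8, 5:8] = 1          # same 3x3 block of ones as above
gt = pred.detach().clone()

o, t = pred.view(1, -1), gt.view(1, -1)
tp = (o * t).sum(1)                  # P_G  = 9
fp = (o * (1. - t)).sum(1)           # P_NG = 0
fn = ((1. - o) * t).sum(1)           # NP_G = 0
loss = (1. - tp / (tp + alpha * fp + beta * fn)).mean()
loss.backward()

# loss is 0, but the per-pixel gradients are not:
print(pred.grad.unique())            # ~[-0.0778, 0.0333], i.e. -beta/9 and +alpha/9
print((9 * (-beta / 9) + 91 * (alpha / 9)) / 100)   # ~0.0233, the reported grad mean

So the nonzero mean gradient is the expected behaviour of the Tversky loss at a perfect prediction, not a bug in autograd.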