MSELoss gives the same result for different reduction values if my tensors have requires_grad=True

jackstraw · December 7, 2018, 9:03pm

# Seed the RNG so the random tensors below are reproducible.
torch.manual_seed(5)
# Both the input and the target are flagged as requiring gradients.
x = torch.rand(4, 3).requires_grad_()
y = torch.rand(4, 3).requires_grad_()

# Evaluate the MSE loss of x against y under each reduction setting.
# NOTE(review): presumably run in an interactive session, which echoes the
# resulting tuple.
(
# No explicit reduction argument.
torch.nn.functional.mse_loss(x, y),
# Explicit `reduction` strings.
torch.nn.functional.mse_loss(x, y, reduction='elementwise_mean'),
torch.nn.functional.mse_loss(x, y, reduction='sum'),
torch.nn.functional.mse_loss(x, y, reduction='none'),
# Calls using the boolean size_average / reduce keyword arguments instead
# of `reduction`, covering all four True/False combinations.
torch.nn.functional.mse_loss(x, y, size_average=True, reduce=True),
torch.nn.functional.mse_loss(x, y, size_average=True, reduce=False),
torch.nn.functional.mse_loss(x, y, size_average=False, reduce=True),
torch.nn.functional.mse_loss(x, y, size_average=False, reduce=False),
)

yields

(tensor(2.5974, grad_fn=<SumBackward0>),
 tensor(2.5974, grad_fn=<SumBackward0>),
 tensor(2.5974, grad_fn=<SumBackward0>),
 tensor(2.5974, grad_fn=<SumBackward0>),
 tensor(2.5974, grad_fn=<SumBackward0>),
 tensor(2.5974, grad_fn=<SumBackward0>),
 tensor(2.5974, grad_fn=<SumBackward0>),
 tensor(2.5974, grad_fn=<SumBackward0>))

whereas

# Seed the RNG so the random tensors below are reproducible.
torch.manual_seed(5)
# Plain tensors: requires_grad_() is NOT called on either of them here.
x = torch.rand(4, 3)
y = torch.rand(4, 3)

# Evaluate the MSE loss of x against y under each reduction setting.
# NOTE(review): presumably run in an interactive session, which echoes the
# resulting tuple.
(
# No explicit reduction argument.
torch.nn.functional.mse_loss(x, y),
# Explicit `reduction` strings.
torch.nn.functional.mse_loss(x, y, reduction='elementwise_mean'),
torch.nn.functional.mse_loss(x, y, reduction='sum'),
torch.nn.functional.mse_loss(x, y, reduction='none'),
# Calls using the boolean size_average / reduce keyword arguments instead
# of `reduction`, covering all four True/False combinations.
torch.nn.functional.mse_loss(x, y, size_average=True, reduce=True),
torch.nn.functional.mse_loss(x, y, size_average=True, reduce=False),
torch.nn.functional.mse_loss(x, y, size_average=False, reduce=True),
torch.nn.functional.mse_loss(x, y, size_average=False, reduce=False),
)

yields

(tensor(0.2164),
 tensor(0.2164),
 tensor(2.5974),
 tensor([[0.5743, 0.2843, 0.0370],
         [0.0579, 0.0037, 0.2846],
         [0.0116, 0.0332, 0.4051],
         [0.5369, 0.3200, 0.0486]]),
 tensor(0.2164),
 tensor([[0.5743, 0.2843, 0.0370],
         [0.0579, 0.0037, 0.2846],
         [0.0116, 0.0332, 0.4051],
         [0.5369, 0.3200, 0.0486]]),
 tensor(2.5974),
 tensor([[0.5743, 0.2843, 0.0370],
         [0.0579, 0.0037, 0.2846],
         [0.0116, 0.0332, 0.4051],
         [0.5369, 0.3200, 0.0486]]))

Why do I get different results when my tensors have requires_grad=True?
And why are the results identical for every value of reduction when my tensors have requires_grad=True?