Unexpected nan gradient

Hello everybody,

in the very simple example below, pytorch produces a nan-gradient. However, I would have expected a finite gradient. Is my math wrong or something strange happening inside pytorch?

import torch
def run_simple():
    #  y = s*[x1,x2] = 3 * [1,NaN]
    x = torch.tensor([1, float('NaN')]).float()
    scalor = torch.tensor([3]).float()
    scalor.requires_grad = True
    y = x * scalor

    # loss = y[mask_valid].mean() = s*x1 = 3
    mask_valid = torch.isfinite(x)
    loss = y[mask_valid].mean()  # = 3
    loss.backward()
    print(scalor.grad)  # [nan] ?! Would have expected dloss/ds = d(s*x1)/ds = x1 = 1

P.s: Tested with version 0.4.1 and 1.4.0