PyTorch tensor scaling

Is there a PyTorch function that scales tensors the way sklearn's StandardScaler does (example below)?

from sklearn import preprocessing

X = data[:, :num_inputs]
x_scaler = preprocessing.StandardScaler()
X_scaled = x_scaler.fit_transform(X)

From the class signature:

sklearn.preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)


You can easily clone the sklearn behavior using this small script:

import torch
from sklearn.preprocessing import StandardScaler

x = torch.randn(10, 5) * 10

# sklearn reference
scaler = StandardScaler()
arr_norm = scaler.fit_transform(x.numpy())

# PyTorch impl: per-feature mean/std over dim 0 (in place)
m = x.mean(0, keepdim=True)
s = x.std(0, unbiased=False, keepdim=True)
x -= m
x /= s

print(torch.allclose(x, torch.from_numpy(arr_norm)))  # should print True

Alternatively, you could of course just use the sklearn scaler directly, as tensor.numpy() and torch.from_numpy() return objects that share the underlying data, so converting back and forth is very cheap. :wink:
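For example, a minimal sketch of that round trip (assuming the data lives on the CPU and doesn't require gradients):

import torch
from sklearn.preprocessing import StandardScaler

x = torch.randn(10, 5) * 10

scaler = StandardScaler()
# tensor -> numpy (shared memory), scale in sklearn, result -> tensor (from_numpy shares memory with its input)
x_scaled = torch.from_numpy(scaler.fit_transform(x.numpy()))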


Thank you for the quick response and helpful advice.

I have an input that has requires_grad=True. I need to scale it, and I was wondering whether the solution in this post would break the graph such that the gradient cannot be computed later?

The x.numpy() operation will break the computation graph, so you should use the plain PyTorch approach.

If you need gradients for the input, I would also recommend not normalizing it in place, but creating a new normalized tensor instead.
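For instance, a minimal sketch of the out-of-place version (the shapes here are just for illustration):

import torch

x = torch.randn(10, 5, requires_grad=True)

m = x.mean(0, keepdim=True)
s = x.std(0, unbiased=False, keepdim=True)
# New tensor; x stays a leaf and the graph remains intact
x_norm = (x - m) / s

x_norm.sum().backward()
print(x.grad.shape)  # torch.Size([10, 5])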


Thanks for your reply. That makes a lot of sense. :slight_smile:

FWIW I had implemented something similar before stumbling upon StandardScaler; I gave it a slightly worse name :wink:

import unittest

import numpy as np
import torch
from torch import nn


class DatasetNorm1d(nn.Module):
    """Records per-feature dataset stats. kthxbye"""
    # Similar to: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
    def __init__(self, num_features):
        super().__init__()
        self._num_features = num_features
        # NaN marks the buffers as "not yet initialized".
        self.register_buffer('mean', torch.full((self._num_features,), np.nan))
        self.register_buffer('var', torch.full((self._num_features,), np.nan))

    def _is_initialized(self):
        return not (torch.isnan(self.mean).any() or torch.isnan(self.var).any())

    @torch.no_grad()
    def initialize(self, input_batch):
        """
        Args:
            input_batch: Batch that should represent *all* inputs for a given
                dataset.

        Example:

            norm.initialize(torch.cat([x for (x, _) in train_loader]))
        """
        # TODO(eric.cousineau): Use an accurate running computation?
        # See: https://github.com/pytorch/pytorch/blob/480851ad/aten/src/ATen/native/Normalization.cpp#L215-L269
        assert not self._is_initialized()
        N, L = input_batch.shape
        assert L == self._num_features
        assert N > 1
        # torch.var_mean returns (var, mean).
        var_mean = torch.var_mean(input_batch, dim=0)
        self.var[:], self.mean[:] = var_mean

    def forward(self, x):
        if not self._is_initialized():
            raise RuntimeError("This must be initialized on the dataset!")
        y = (x - self.mean) / torch.sqrt(self.var)
        return y

# Terse test code; `mut` refers to the module where DatasetNorm1d is defined.
class TestStuff(unittest.TestCase):
    def test_dataset_norm(self):
        xs = torch.Tensor([
            [1., 10.],
            [2., 20.],
            [3., 30.],
            [4., 40.],
        ])

        norm = mut.DatasetNorm1d(2)
        with self.assertRaises(RuntimeError):
            norm(xs)

        norm.initialize(xs)
        ys = norm(xs)
        np.testing.assert_array_equal(ys[:, 0].numpy(), ys[:, 1].numpy())
        self.assertEqual(list(norm.state_dict().keys()), ["mean", "var"])

In case this is useful to anyone, I wrapped ptrblck's code in a class for my convenience:

import torch

class TorchStandardScaler:
    def fit(self, x):
        # Per-feature statistics over dim 0, matching StandardScaler
        self.mean = x.mean(0, keepdim=True)
        self.std = x.std(0, unbiased=False, keepdim=True)

    def transform(self, x):
        # Note: normalizes x in place (the epsilon avoids division by zero)
        x -= self.mean
        x /= (self.std + 1e-7)
        return x


data = torch.tensor([[0, 0], [0, 0], [1, 1], [1, 1]]).float()
foo = TorchStandardScaler()
foo.fit(data)
print(f"mean {foo.mean}, std {foo.std}")
foo.transform(data)
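Following the earlier advice about gradients, a transform that returns a new tensor instead of modifying the input could look roughly like this (the class name is just illustrative):

class TorchStandardScalerOutOfPlace:
    def fit(self, x):
        self.mean = x.mean(0, keepdim=True)
        self.std = x.std(0, unbiased=False, keepdim=True)

    def transform(self, x):
        # Returns a new tensor, so an input with requires_grad=True keeps its graph
        return (x - self.mean) / (self.std + 1e-7)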