Custom LayerNorm vs PyTorch implementation

Hello,
I stumbled upon an implementation of LayerNorm that was based on ConvNeXt/models/convnext.py in the facebookresearch/ConvNeXt repository on GitHub. I wanted to compare it to GroupNorm, and the results are pretty weird. Here is a snippet:

import torch
import torch.nn as nn

class LayerNormFunction(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x, weight, bias, eps):
        ctx.eps = eps
        N, C, H, W = x.size()
        # normalize over the channel dimension only, separately for each (n, h, w) position
        var, mu = torch.var_mean(x, dim=1, keepdim=True, unbiased=False)
        y = (x - mu) / torch.sqrt(var + eps)
        # saved for a backward pass (omitted here, since only forward outputs are compared)
        ctx.save_for_backward(y, var, weight)
        y = weight.view(1, C, 1, 1) * y + bias.view(1, C, 1, 1)
        return y

class LayerNorm2d(nn.Module):

    def __init__(self, channels, eps=1e-5):
        super(LayerNorm2d, self).__init__()
        self.register_parameter('weight', nn.Parameter(torch.ones(channels)))
        self.register_parameter('bias', nn.Parameter(torch.zeros(channels)))
        self.eps = eps

    def forward(self, x):
        return LayerNormFunction.apply(x, self.weight, self.bias, self.eps)

torch.manual_seed(0)
x = torch.randn(1, 3, 5, 5)
g = nn.GroupNorm(1, 3)
l = nn.LayerNorm((3, 5, 5))
lc = LayerNorm2d(3)

y_g = g(x)
y_l = l(x)
y_lc = lc(x)
print(f"(y_g - y_lc).pow(2).sum().sqrt() = {(y_g - y_lc).pow(2).sum().sqrt()}")
print(f"(y_g - y_l).pow(2).sum().sqrt() = {(y_g - y_l).pow(2).sum().sqrt()}")
print(f"(y_l - y_lc).pow(2).sum().sqrt() = {(y_l - y_lc).pow(2).sum().sqrt()}")

Can someone explain why the results of LayerNorm2d are different from PyTorch's norms?
