In a meta-learning task, the gradient obtained with nn.KLDivLoss is zero, while the gradient obtained with a self-defined KL loss is not

Hi, I am working on a meta-learning task.

I found that the gradients produced by nn.KLDivLoss and by a self-defined KL loss are different:
If I use the official nn.KLDivLoss, the gradients of metanet are all zero.
But when I use the self-defined KL loss, the gradients of metanet are not zero.
Btw, when I use the official MSE loss, the gradients of metanet are also not zero.
Why does that happen?

I have to use the MetaModule. The reason, quoted from the issue linked below, is:

The reason is that we need to compute the gradients after the parameter updates, and the original implementations of the layers don’t really allow you to do that in a straightforward way. (I may be wrong, but I think it’s because the original update does something like `param.data = param.data - learning_rate * gradient`, so it is not going to compute the gradients after the update.) I found it easier to just define the weights as variables instead.

Please refer here: Why redefining the default pytorch nn.Modules? · Issue #2 · danieltan07/learning-to-reweight-examples · GitHub

Thank you for your help!

Here is the code:

Note: the first part only defines the models; the important part is at the end.

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from torch.autograd import Variable
import torch.nn.init as init

def to_var(x, requires_grad=True):
    """Wrap ``x`` in an autograd ``Variable``, on the GPU when one is available.

    Args:
        x: tensor to wrap.
        requires_grad: forwarded as the ``requires_grad`` flag of the result.

    Returns:
        ``Variable(x, requires_grad=requires_grad)``, built from the CUDA copy
        of ``x`` when ``torch.cuda.is_available()`` is true.
    """
    placed = x.cuda() if torch.cuda.is_available() else x
    return Variable(placed, requires_grad=requires_grad)

class MetaModule(nn.Module):
    """``nn.Module`` base class whose weights can be swapped for non-leaf tensors.

    Sub-classes expose their trainable tensors through ``named_leaves``.
    ``update_params`` then replaces each of them with ``param - lr * grad``
    via ``setattr``, so the updated weight stays connected to the autograd
    graph of the gradient it was computed from.
    """
    # adopted from: Adrien Ecoffet

    def params(self):
        """Yield every tensor reported by ``named_params`` (names dropped)."""
        for _, param in self.named_params(self):
            yield param

    def named_leaves(self):
        """Return ``(name, tensor)`` pairs owned directly by this module."""
        return []

    def named_submodules(self):
        """Return the module's sub-modules; the base class has none."""
        return []

    def named_params(self, curr_module=None, memo=None, prefix=''):
        """Recursively yield ``(dotted_name, tensor)`` pairs under ``curr_module``.

        Args:
            curr_module: module to walk; defaults to ``self``.
            memo: set of tensors already yielded, used to skip shared tensors.
            prefix: dotted path of ``curr_module`` relative to the root.
        """
        if curr_module is None:
            curr_module = self
        if memo is None:
            memo = set()

        # Meta modules report their weights through named_leaves(); plain
        # nn.Modules fall back to their registered parameters.
        if hasattr(curr_module, 'named_leaves'):
            own = curr_module.named_leaves()
        else:
            own = curr_module._parameters.items()

        for name, p in own:
            if p is not None and p not in memo:
                memo.add(p)
                yield prefix + ('.' if prefix else '') + name, p

        for mname, module in curr_module.named_children():
            submodule_prefix = prefix + ('.' if prefix else '') + mname
            yield from self.named_params(module, memo, submodule_prefix)

    def update_params(self, lr_inner, first_order=False, source_params=None, detach=False):
        """Apply one gradient-descent step to every tensor from ``named_params``.

        Args:
            lr_inner: step size.
            first_order: when true, the gradient is cut from its graph first.
            source_params: iterable of gradients in ``named_params`` order;
                when ``None`` each tensor's own ``.grad`` is used instead.
            detach: only read when ``source_params`` is ``None``; when true the
                tensors are detached in place instead of being updated.
        """
        if source_params is not None:
            for (name_t, param_t), grad in zip(self.named_params(self), source_params):
                if first_order:
                    grad = to_var(grad.detach().data)
                self.set_param(self, name_t, param_t - lr_inner * grad)
        else:
            for name, param in self.named_params(self):
                if not detach:
                    grad = param.grad
                    if first_order:
                        grad = to_var(grad.detach().data)
                    self.set_param(self, name, param - lr_inner * grad)
                else:
                    self.set_param(self, name, param.detach_())

    def set_param(self, curr_mod, name, param):
        """Assign ``param`` to the attribute addressed by dotted ``name``.

        Args:
            curr_mod: module that ``name`` is relative to.
            name: dotted attribute path, e.g. ``'layer1.0.conv1.weight'``.
            param: tensor to store.
        """
        if '.' in name:
            module_name, rest = name.split('.', 1)
            # Descend into the child that matches the first path component.
            for child_name, mod in curr_mod.named_children():
                if module_name == child_name:
                    self.set_param(mod, rest, param)
                    break
        else:
            setattr(curr_mod, name, param)

    def detach_params(self):
        """Replace every tensor by a detached version of itself."""
        for name, param in self.named_params(self):
            self.set_param(self, name, param.detach())

    def copy(self, other, same_var=False):
        """Copy the tensors of ``other`` into this module.

        Args:
            other: module to read tensors from (walked with ``named_params``).
            same_var: when true the very same tensor objects are shared;
                otherwise each one is cloned into a fresh tensor via ``to_var``.
        """
        for name, param in other.named_params(other):
            if not same_var:
                param = to_var(param.detach().clone(), requires_grad=True)
            self.set_param(self, name, param)

class MetaLinear(MetaModule):
    """Linear layer whose weight and bias are buffers that require grad.

    A throw-away ``nn.Linear`` is built from the constructor arguments and
    only its initial weight and bias values are kept.
    """

    def __init__(self, *args, **kwargs):
        super(MetaLinear, self).__init__()
        # Built only to borrow nn.Linear's argument handling and initialisation.
        ignore = nn.Linear(*args, **kwargs)

        self.register_buffer('weight', to_var(ignore.weight.detach(), requires_grad=True))
        if ignore.bias is not None:
            self.register_buffer('bias', to_var(ignore.bias.detach(), requires_grad=True))
        else:
            # nn.Linear(..., bias=False): keep the attribute so forward() works.
            self.register_buffer('bias', None)

    def forward(self, x):
        """Return ``F.linear(x, weight, bias)``."""
        return F.linear(x, self.weight, self.bias)

    def named_leaves(self):
        """Return the ``weight`` and ``bias`` tensors with their names."""
        return [('weight', self.weight), ('bias', self.bias)]

class MetaConv2d(MetaModule):
    """2-D convolution whose weight and bias are buffers that require grad.

    A throw-away ``nn.Conv2d`` is built from the constructor arguments; its
    hyper-parameters and initial weight and bias values are copied over.
    """

    def __init__(self, *args, **kwargs):
        super(MetaConv2d, self).__init__()
        # Built only to borrow nn.Conv2d's argument handling and initialisation.
        ignore = nn.Conv2d(*args, **kwargs)

        self.in_channels = ignore.in_channels
        self.out_channels = ignore.out_channels
        self.stride = ignore.stride
        self.padding = ignore.padding
        self.dilation = ignore.dilation
        self.groups = ignore.groups
        self.kernel_size = ignore.kernel_size

        self.register_buffer('weight', to_var(ignore.weight.detach(), requires_grad=True))

        if ignore.bias is not None:
            self.register_buffer('bias', to_var(ignore.bias.detach(), requires_grad=True))
        else:
            self.register_buffer('bias', None)

    def forward(self, x):
        """Return ``F.conv2d`` of ``x`` with the stored weight and settings."""
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)

    def named_leaves(self):
        """Return the ``weight`` and ``bias`` tensors with their names."""
        return [('weight', self.weight), ('bias', self.bias)]

class MetaBatchNorm2d(MetaModule):
    """2-D batch normalisation whose affine weight and bias are buffers that require grad.

    A throw-away ``nn.BatchNorm2d`` is built from the constructor arguments;
    its hyper-parameters and initial affine values are copied over.
    """

    def __init__(self, *args, **kwargs):
        super(MetaBatchNorm2d, self).__init__()
        # Built only to borrow nn.BatchNorm2d's argument handling and initialisation.
        ignore = nn.BatchNorm2d(*args, **kwargs)

        self.num_features = ignore.num_features
        self.eps = ignore.eps
        self.momentum = ignore.momentum
        self.affine = ignore.affine
        self.track_running_stats = ignore.track_running_stats

        if self.affine:
            self.register_buffer('weight', to_var(ignore.weight.detach(), requires_grad=True))
            self.register_buffer('bias', to_var(ignore.bias.detach(), requires_grad=True))
        else:
            # Keep the attributes defined so forward() and named_leaves() work.
            self.register_buffer('weight', None)
            self.register_buffer('bias', None)

        if self.track_running_stats:
            self.register_buffer('running_mean', torch.zeros(self.num_features))
            self.register_buffer('running_var', torch.ones(self.num_features))
        else:
            self.register_parameter('running_mean', None)
            self.register_parameter('running_var', None)

    def forward(self, x):
        """Return ``F.batch_norm`` of ``x``.

        Batch statistics are used while training or when no running
        statistics are tracked.
        """
        return F.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
                            self.training or not self.track_running_stats, self.momentum, self.eps)

    def named_leaves(self):
        """Return the ``weight`` and ``bias`` tensors with their names."""
        return [('weight', self.weight), ('bias', self.bias)]

class LambdaLayer(MetaModule):
    """Module that applies a stored callable to its input."""

    def __init__(self, lambd):
        """Store ``lambd``, the callable applied in ``forward``."""
        super().__init__()
        self.lambd = lambd

    def forward(self, x):
        """Return ``self.lambd(x)``."""
        fn = self.lambd
        return fn(x)

class BasicBlock(MetaModule):
    """Residual block: two 3x3 conv + batch-norm stages plus a shortcut.

    Args:
        in_planes: number of input channels.
        planes: number of output channels.
        stride: stride of the first convolution.
        option: shortcut type used when the input and output shapes differ.
            ``'A'`` subsamples the input and zero-pads its channels;
            ``'B'`` uses a 1x1 convolution followed by batch norm.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, option='A'):
        super(BasicBlock, self).__init__()
        self.conv1 = MetaConv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = MetaBatchNorm2d(planes)
        self.conv2 = MetaConv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = MetaBatchNorm2d(planes)

        # Identity shortcut unless the block changes the tensor shape.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            if option == 'A':
                # For CIFAR10 ResNet paper uses option A.
                # Take every second pixel, then pad planes//4 zero channels
                # on each side of the channel dimension.
                self.shortcut = LambdaLayer(lambda x:
                                            F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), "constant", 0))
            elif option == 'B':
                self.shortcut = nn.Sequential(
                    MetaConv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                    MetaBatchNorm2d(self.expansion * planes)
                )

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

def _weights_init(m):
    """Re-initialise the weight of a ``MetaLinear`` or ``MetaConv2d`` module.

    Intended to be passed to ``nn.Module.apply``; every other module type
    is left untouched.

    Args:
        m: the module to initialise.
    """
    if isinstance(m, (MetaLinear, MetaConv2d)):
        # NOTE(review): Kaiming-normal is assumed to be the intended scheme,
        # based on the file's otherwise unused `init` import. Confirm.
        init.kaiming_normal_(m.weight)

class ResNetDifferentSize(MetaModule):
    """ResNet built from meta layers, with three stages of configurable depth.

    Args:
        num_classes: size of the final linear layer's output.
        block: residual block class; it must provide an ``expansion`` attribute.
        num_blocks: number of blocks in each of the three stages. The default
            of five per stage is ResNet-32.
    """

    # default: resnet32
    def __init__(self, num_classes=10, block=BasicBlock, num_blocks=(5, 5, 5)):
        super(ResNetDifferentSize, self).__init__()
        self.in_planes = 16

        self.conv1 = MetaConv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = MetaBatchNorm2d(16)
        self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2)
        self.linear = MetaLinear(64, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage of ``num_blocks`` blocks.

        Only the first block uses ``stride``; the remaining ones use stride 1.
        ``self.in_planes`` is advanced to the stage's output width.
        """
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for block_stride in strides:
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch of images ``x``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        # Average-pool with a kernel as wide as the feature map, then flatten.
        out = F.avg_pool2d(out, out.size()[3])
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

class VNet(MetaModule):
    """Two-layer MLP applied to the concatenation of its two inputs.

    Args:
        input: width of the concatenated input (``x1`` and ``x2`` together).
        hidden1: width of the hidden layer.
        output: width of the output.
    """

    def __init__(self, input, hidden1, output):
        super(VNet, self).__init__()
        self.linear1 = MetaLinear(input, hidden1)
        self.relu1 = nn.ReLU(inplace=True)
        self.linear2 = MetaLinear(hidden1, output)

    def forward(self, x1, x2):
        """Return ``linear2(relu(linear1(cat(x1, x2))))``."""
        # Concatenate along dim 1 so linear1 sees one vector per sample.
        x = torch.cat((x1, x2), dim=1)
        x = self.linear1(x)
        x = self.relu1(x)
        out = self.linear2(x)
        return out

class TrainManager(object):
    """Minimal reproduction of one meta-learning step on fake data.

    Holds a ResNet (``model``), a ``VNet`` (``metanet``) and an optimizer for
    each. ``train`` performs a differentiable inner update of a temporary
    model and prints the gradients that reach ``metanet``.
    """

    def __init__(self):
        self.model = ResNetDifferentSize(num_classes=10, num_blocks=[1, 1, 1])
        self.metanet = VNet(10 * 2, 100, 10)
        # NOTE(review): to_var() moves weights to the GPU when CUDA is
        # available, while inputs are placed on this device. Confirm this
        # script is only meant to run on CPU-only machines.
        self.device = 'cpu'
        self.optimizer = optim.SGD(self.model.params(), lr=0.01, momentum=0.99, weight_decay=0.001)
        self.optimizer_metanet = torch.optim.Adam(self.metanet.params(), 1e-3, weight_decay=1e-4)

    @staticmethod
    def _kl_categorical(p_logit, q_logit):
        """Return the batch mean of KL(softmax(p_logit) || softmax(q_logit))."""
        _kl = torch.sum(F.softmax(p_logit, dim=-1) * (F.log_softmax(p_logit, dim=-1) - F.log_softmax(q_logit, dim=1)), 1)
        return torch.mean(_kl)

    def train(self):
        """Run ten meta-steps on random data, printing metanet's gradients."""
        for epoch in range(10):

            # fake data, just used as an example
            data, target = torch.randn((2, 3, 32, 32)), torch.tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
            data, target = data.to(self.device), target.to(self.device)

            # temporary model that receives the differentiable inner update
            meta_model = ResNetDifferentSize(num_classes=10, num_blocks=[1, 1, 1])
            meta_output = meta_model(data)

            # fake teacher outputs
            teacher_outputs = torch.tensor([[0, 0, 0.9, 0.1, 0, 0, 0, 0, 0, 0], [0, 0.1, 0.9, 0, 0, 0, 0, 0, 0, 0]])
            teacher_outputs = teacher_outputs.to(self.device)
            # The residual is predicted from values only: meta_output is
            # detached so the inner gradient does not flow through this input.
            residual_teacher_outputs = self.metanet(teacher_outputs, meta_output.detach())
            # Out-of-place add keeps the original teacher tensor untouched.
            teacher_outputs = teacher_outputs + residual_teacher_outputs

            # inner loss; swap in one of the alternatives to compare gradients
            loss_KD = self._kl_categorical(teacher_outputs, meta_output)
            # loss_KD = nn.KLDivLoss(reduction='batchmean')(F.log_softmax(meta_output, dim=1), F.softmax(teacher_outputs, dim=1)) # fake model KDloss
            # loss_KD = nn.MSELoss()(F.log_softmax(meta_output, dim=1), F.softmax(teacher_outputs, dim=1))
            l_f_meta = loss_KD

            # One-step update of meta_model. create_graph=True keeps the graph
            # of the gradients, so the updated weights depend on metanet.
            grads = torch.autograd.grad(l_f_meta, (meta_model.params()), create_graph=True)
            meta_model.update_params(lr_inner=0.1, source_params=grads)
            del grads

            # fake meta batch, just used as an example
            inputs_meta, targets_meta = torch.randn((2, 3, 32, 32)), torch.tensor([2, 3])  # torch.tensor([[0,0,0,0,0,0,1,0,0,0], [0,0,0,0,0,0,0,0,0,1]])
            inputs_meta, targets_meta = inputs_meta.to(self.device), targets_meta.to(self.device)

            # meta loss for metanet, evaluated with the updated meta_model
            y_g_hat = meta_model(inputs_meta)
            l_g_meta = F.cross_entropy(y_g_hat, targets_meta)

            # Back-propagate the meta loss so metanet's .grad fields are populated.
            self.optimizer_metanet.zero_grad()
            l_g_meta.backward()

            # Show the gradient of self.metanet:
            # with kl_categorical or nn.MSELoss as l_f_meta, the gradient is not zero;
            # with the official nn.KLDivLoss as l_f_meta, the gradient is zero.
            print('the gradient of self.metanet after l_g_meta.backward()!')
            for parms in self.metanet.params():
                print('-->grad_requirs:', parms.requires_grad, ' -->grad_value:', torch.sum(torch.abs(parms.grad)))

            # NOTE(review): the step is assumed from the "update metanet"
            # comment in the original script. Confirm.
            self.optimizer_metanet.step()

# Build the trainer; its constructor creates the model, the metanet and both optimizers.
trainer = TrainManager()