How to calculate loss properly?

Background here. I want to implement Neural Style Transfer using Pytorch from scratch (for educational purpose). There are 2 models, which are transformer net (T) and loss net (L). With given input x, I then will compute o = T(x). Then compute loss from L(x, o). But the loss from loss net did not propagate back to transformer net.

I don’t think I understand how pytorch autograd well enough to know what is going wrong. But the code below is what I came up with, and if I set lossNet on requires_grad to False. The code give me a error of loss does not have grad_fn.

Please advised, Thank you


Here are the simplify code of my current situation where Model A is transformer net and Model B is loss net

class ModelA(nn.Module):

    def __init__(self, requires_grad=True):
        super(ModelA, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 16, 3, padding=1)
        self.relu1 = torch.nn.ReLU(inplace=False)
        self.maxpool = torch.nn.MaxPool2d(2, padding=0)
        self.conv2 = torch.nn.Conv2d(16, 16, 1)
        self.relu2 = torch.nn.ReLU(inplace=False)
        self.conv3T = torch.nn.ConvTranspose2d(16, 3, 1)
        self.relu3 = torch.nn.ReLU(inplace=False)
        self.upSampling = torch.nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, in_x):
        x = in_x
        x = self.relu1(self.conv1(x))
        x = self.maxpool(x)
        x = self.relu2(self.conv2(x))
        x = self.relu3(self.conv3T(x))
        x = self.sigmoid(self.upSampling(x))
        return x

class ModelB(nn.Module):

    def __init__(self):
        super(ModelB, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 64, 3, padding=1)
        self.relu1 = torch.nn.ReLU(inplace=False)
        self.maxpool1 = torch.nn.MaxPool2d(2, padding=1)
        self.conv2 = torch.nn.Conv2d(64, 128, 3, padding=1)
        self.relu2 = torch.nn.ReLU(inplace=False)
        self.maxpool2 = torch.nn.MaxPool2d(2, padding=1)
        self.conv3 = torch.nn.Conv2d(128, 256, 3, padding=1)
        self.relu3 = torch.nn.ReLU(inplace=False)
        self.maxpool3 = torch.nn.MaxPool2d(2, padding=1)
    def forward(self, in_x, in_o):
        x = in_x
        o = in_o

        x = self.maxpool1(self.relu1(self.conv1(x)))
        x = self.maxpool2(self.relu2(self.conv2(x)))
        x = self.maxpool3(self.relu3(self.conv3(x)))

        o = self.maxpool1(self.relu1(self.conv1(o)))
        o = self.maxpool2(self.relu2(self.conv2(o)))
        o = self.maxpool3(self.relu3(self.conv3(o)))
        return x, o

The Model B is then separate into subnetwork and wrapped inside another class before merge again

class ModuleWrapper(nn.Module):

    def __init__(self, subnetwork, isUseCuda):

        super(ModuleWrapper, self).__init__()

        self.layers = list(subnetwork.children()) = nn.Sequential(*self.layers)

        self.loss = torch.tensor(0.0).float()
        self.not_inplace = lambda layer: nn.ReLU(inplace=False) if isinstance(layer, nn.ReLU) else layer

        if isUseCuda:
            self.loss = self.loss.cuda()
    def forward(self, in_x, in_o):
        if torch.cuda.is_available():
            x = in_x.cuda()
            o = in_o.cuda()
            x = in_x
            o = in_o
        for layer in self.layers:
            layer = self.not_inplace(layer)
            x = layer(x)
            o = layer(o)
            if torch.cuda.is_available():
                x = x.cuda()
                o = o.cuda()

        self.loss = F.mse_loss(x, o)
        return x, o

class MergeWrapper(nn.Module):
    def __init__(self, modules):
        super(MergeWrapper, self).__init__() = nn.Sequential(*modules)
    def forward(self, in_x, in_o):
        x, o = in_x, in_o
        for module in
            x, o = module(x, o)
        return x, o
    def get_module(self, index):
        assert 0 <= index < len(
    def max_seq(self):
        return len(

backboneNet= ModelA()
for param in backboneNet.parameters():
    param.requires_grad = True

lossNet = ModelB()
# Assume Model B is pretrained
for param in lossNet.parameters():
    param.requires_grad = False

layers = list(lossNet.children())
subnet = []
subnet.append(ModuleWrapper(nn.Sequential(*layers[0:3]), useCuda))
subnet.append(ModuleWrapper(nn.Sequential(*layers[3:6]), useCuda))
subnet.append(ModuleWrapper(nn.Sequential(*layers[6:9]), useCuda))

lossNet = MergeWrapper(subnet)

Then I test with the following code

testInput = torch.rand(1, 3, 8, 8)

def train(testInput, num_steps = 10):
    optimizer = optim.LBFGS([testInput.requires_grad_()])
    run = [0]
    while run[0] <= num_steps:
        def closure():
            x = testInput
            o = backboneNet(x)
            x =, 1)
            o =, 1)
            lossNet(x, o)
            loss = torch.tensor(0.0).float().to(device)
            for module in subnet:
                loss += module.loss
            run[0] += 1
            return loss


The printed loss value is not zero when set requires_grad = True on lossNet but the printed back-propagated layers is incorrect. And I have no idea how to fix the issue.



for param in backboneNet.parameters():
    param.requires_grad = True

is not necessary as these parameters are created as nn.Parameters and so will have requires_grad=True by default.

You can also replace loss = torch.tensor(0.0).float().to(device) by loss = 0.

Your code otherwise looks ok, the backward call should backpropagate in all the network.
Why do you think this does not happen? What is the print_backprop function doing?

print_backprop function print out the module in which the loss will back-propagate

Here is the code

def print_backprop(loss):
    acc = []
    tmp = loss.grad_fn
    while tmp != None:
            tmp = tmp.next_functions[0][0]

The result I’ve got is

['<AddBackward1 object at 0x107d03940>', '<AddBackward1 object at 0x107d03470>', 'None']

Also the loss value never change. So I assume that it loss is never propagate back into the network (Also from the print_backprop function).

This function is only looking at the first of the next functions. That might be why it does not show everything? next_functions is a list of tuples.

Thank you to point that out for me. I will modify my code shortly. But it still doesn’t explain why the loss is not changing at all. Any ideas?

The modified code now is

def loop_stack(loss, acc):
    if loss == None:
        print(list(map(lambda x: str(x)[1:-1].split(" ")[0], acc)))
    new_acc = acc[:] + [loss]
        losses_child = list(map(lambda x: x[0], loss.next_functions))
        for l in losses_child:
            loop_stack(l, new_acc)
        print(list(map(lambda x: str(x)[1:-1].split(" ")[0], acc)))
def print_backprop(loss):
    tmp = loss.grad_fn
    loop_stack(tmp, [])

The back-propagated stack is now make sense. But this doesn’t explain why the loss is not updating the backbone model. I can verify this by the following snippet


And like I mentioned before if I set lossNet model requires_grad=False, it raise this error
element 0 of tensors does not require grad and does not have a grad_fn and the output of print_backprop is also an empty list.
So I suspect my code isn’t correct, but I am lost on how to solve this problem

Update #1

After some debugging, I found that if I remove, 1) from my train function, the back-propagate stack is now include the backbone module’s layer. I think it is because clamp operation is not differentiable, so it just stop there. But the loss.backward() is still not updating the backbone model

Update #2

I think I have found the bug. The line where I wrote optim.LBFGS([testInput.requires_grad_()]) need to be changed to optim.LBFGS(backboneNet.parameters())

Thank you for your support @albanD


Ho I guessed you were trying to update your image only to generate a new one.
Yes otherwise, the parameters that are updated are the ones given to the optimizer only !

1 Like