# How to calculate loss properly?

Background here. I want to implement Neural Style Transfer using Pytorch from scratch (for educational purpose). There are 2 models, which are transformer net (T) and loss net (L). With given input x, I then will compute o = T(x). Then compute loss from L(x, o). But the loss from loss net did not propagate back to transformer net.

I don’t think I understand how pytorch autograd well enough to know what is going wrong. But the code below is what I came up with, and if I set lossNet on requires_grad to False. The code give me a error of loss does not have grad_fn.

## Code

Here are the simplify code of my current situation where Model A is transformer net and Model B is loss net

class ModelA(nn.Module):

super(ModelA, self).__init__()
self.conv1 = torch.nn.Conv2d(3, 16, 3, padding=1)
self.relu1 = torch.nn.ReLU(inplace=False)
self.conv2 = torch.nn.Conv2d(16, 16, 1)
self.relu2 = torch.nn.ReLU(inplace=False)
self.conv3T = torch.nn.ConvTranspose2d(16, 3, 1)
self.relu3 = torch.nn.ReLU(inplace=False)
self.upSampling = torch.nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
self.sigmoid = torch.nn.Sigmoid()

def forward(self, in_x):
x = in_x
x = self.relu1(self.conv1(x))
x = self.maxpool(x)
x = self.relu2(self.conv2(x))
x = self.relu3(self.conv3T(x))
x = self.sigmoid(self.upSampling(x))
return x

class ModelB(nn.Module):

def __init__(self):
super(ModelB, self).__init__()
self.conv1 = torch.nn.Conv2d(3, 64, 3, padding=1)
self.relu1 = torch.nn.ReLU(inplace=False)
self.conv2 = torch.nn.Conv2d(64, 128, 3, padding=1)
self.relu2 = torch.nn.ReLU(inplace=False)
self.conv3 = torch.nn.Conv2d(128, 256, 3, padding=1)
self.relu3 = torch.nn.ReLU(inplace=False)

def forward(self, in_x, in_o):
x = in_x
o = in_o

x = self.maxpool1(self.relu1(self.conv1(x)))
x = self.maxpool2(self.relu2(self.conv2(x)))
x = self.maxpool3(self.relu3(self.conv3(x)))

o = self.maxpool1(self.relu1(self.conv1(o)))
o = self.maxpool2(self.relu2(self.conv2(o)))
o = self.maxpool3(self.relu3(self.conv3(o)))

return x, o

The Model B is then separate into subnetwork and wrapped inside another class before merge again

class ModuleWrapper(nn.Module):

def __init__(self, subnetwork, isUseCuda):

super(ModuleWrapper, self).__init__()

self.layers = list(subnetwork.children())
self.net = nn.Sequential(*self.layers)

self.loss = torch.tensor(0.0).float()
self.not_inplace = lambda layer: nn.ReLU(inplace=False) if isinstance(layer, nn.ReLU) else layer

if isUseCuda:
self.loss = self.loss.cuda()

def forward(self, in_x, in_o):
if torch.cuda.is_available():
x = in_x.cuda()
o = in_o.cuda()
else:
x = in_x
o = in_o

for layer in self.layers:
layer = self.not_inplace(layer)
x = layer(x)
o = layer(o)
if torch.cuda.is_available():
x = x.cuda()
o = o.cuda()

self.loss = F.mse_loss(x, o)
return x, o

class MergeWrapper(nn.Module):
def __init__(self, modules):
super(MergeWrapper, self).__init__()
self.net = nn.Sequential(*modules)

def forward(self, in_x, in_o):
x, o = in_x, in_o
for module in self.net:
x, o = module(x, o)
return x, o

def get_module(self, index):
assert 0 <= index < len(self.net)
return self.net[index]

def max_seq(self):
return len(self.net)

backboneNet= ModelA()
for param in backboneNet.parameters():

lossNet = ModelB()
# Assume Model B is pretrained
for param in lossNet.parameters():

layers = list(lossNet.children())
subnet = []
subnet.append(ModuleWrapper(nn.Sequential(*layers[0:3]), useCuda))
subnet.append(ModuleWrapper(nn.Sequential(*layers[3:6]), useCuda))
subnet.append(ModuleWrapper(nn.Sequential(*layers[6:9]), useCuda))

lossNet = MergeWrapper(subnet)

Then I test with the following code

testInput = torch.rand(1, 3, 8, 8)

def train(testInput, num_steps = 10):
run = [0]

while run[0] <= num_steps:
def closure():
x = testInput
o = backboneNet(x)

x = x.data.clamp_(0, 1)
o = o.data.clamp_(0, 1)

lossNet(x, o)

loss = torch.tensor(0.0).float().to(device)

for module in subnet:
loss += module.loss
print(loss)
print_backprop(loss)

loss.backward(retain_graph=True)
run[0] += 1
return loss

optimizer.step(closure)

train(testInput)

The printed loss value is not zero when set requires_grad = True on lossNet but the printed back-propagated layers is incorrect. And I have no idea how to fix the issue.

Hi,

Doing

for param in backboneNet.parameters():

is not necessary as these parameters are created as nn.Parameters and so will have requires_grad=True by default.

You can also replace loss = torch.tensor(0.0).float().to(device) by loss = 0.

Your code otherwise looks ok, the backward call should backpropagate in all the network.
Why do you think this does not happen? What is the print_backprop function doing?

print_backprop function print out the module in which the loss will back-propagate

Here is the code

def print_backprop(loss):
acc = []
while tmp != None:
try:
tmp = tmp.next_functions[0][0]
acc.append(tmp.__str__())
except:
break
print("{}".format(acc))

The result I’ve got is

Also the loss value never change. So I assume that it loss is never propagate back into the network (Also from the print_backprop function).

This function is only looking at the first of the next functions. That might be why it does not show everything? next_functions is a list of tuples.

Thank you to point that out for me. I will modify my code shortly. But it still doesn’t explain why the loss is not changing at all. Any ideas?

The modified code now is

def loop_stack(loss, acc):
if loss == None:
print(list(map(lambda x: str(x)[1:-1].split(" ")[0], acc)))
return
new_acc = acc[:] + [loss]
try:
losses_child = list(map(lambda x: x[0], loss.next_functions))
for l in losses_child:
loop_stack(l, new_acc)
except:
print(list(map(lambda x: str(x)[1:-1].split(" ")[0], acc)))
return

def print_backprop(loss):
loop_stack(tmp, [])

The back-propagated stack is now make sense. But this doesn’t explain why the loss is not updating the backbone model. I can verify this by the following snippet

print(testInput)
print(backboneNet(testInput))
train(testInput)
print(testInput)
print(backboneNet(testInput))

And like I mentioned before if I set lossNet model requires_grad=False, it raise this error
element 0 of tensors does not require grad and does not have a grad_fn and the output of print_backprop is also an empty list.
So I suspect my code isn’t correct, but I am lost on how to solve this problem

## Update #1

After some debugging, I found that if I remove o.data.clamp_(0, 1) from my train function, the back-propagate stack is now include the backbone module’s layer. I think it is because clamp operation is not differentiable, so it just stop there. But the loss.backward() is still not updating the backbone model

## Update #2

I think I have found the bug. The line where I wrote optim.LBFGS([testInput.requires_grad_()]) need to be changed to optim.LBFGS(backboneNet.parameters())

Thank you for your support @albanD

Hi,

Ho I guessed you were trying to update your image only to generate a new one.
Yes otherwise, the parameters that are updated are the ones given to the optimizer only !

1 Like