How to calculate a loss twice with same input and same function

I want to calculate loss from same input twice but feed the result to different optimizer

def cal_loss(output, target):
    return F.nll_loss(output, target)

def train(args, model, device, train_loader, epoch):
    optimizer1 = optim.Adadelta(model.parameters(), lr=args.lr)
    optimizer2 = optim.Adadelta(model.parameters(), lr=args.lr)

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)

        optimizer1.zero_grad()
        loss1 = cal_loss(output, target)
        loss1.backward()
        optimizer1.step()

        optimizer2.zero_grad()
        loss2 = cal_loss(output, target)
        loss2.backward()
        optimizer2.step()

But I get the error on loss2.backward()

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

I tried to use torch.Tensor.clone to clone the input, but the error still occurs.

        loss1 = cal_loss(torch.Tensor.clone(output),
                         torch.Tensor.clone(target))
Click to view Mnist reproducable example
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def cal_loss(output, target):
    return F.nll_loss(output, target)

def train(args, model, device, train_loader, epoch):
    optimizer1 = optim.Adadelta(model.parameters(), lr=args.lr)
    optimizer2 = optim.Adadelta(model.parameters(), lr=args.lr)

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)

        optimizer1.zero_grad()
        loss1 = cal_loss(output, target)
        loss1.backward()
        optimizer1.step()

        optimizer2.zero_grad()
        loss2 = cal_loss(output, target)
        loss2.backward()
        optimizer2.step()

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
        ])
    dataset1 = datasets.MNIST('./data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('./data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)

    #scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, epoch)
        test(model, device, test_loader)
        #scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()

Hi Ynj!

If I understand your use case, this should be as simple as:

        optimizer1.zero_grad()
        loss1 = cal_loss(output, target)
        loss1.backward()
        # save gradients
        optimizer1.step()
        # restore gradients
        optimizer2.step()

loss1 and loss2 are the same, so calculating loss2 and calling
loss2.backward() (after zeroing the gradients) just repopulates the
gradients with what they were before.

Note, pytorch optimizers with momentum and weight decay implement
these features by modifying the gradients before performing the
parameter-update step. This is why we “restore gradients” before
calling optimizer2.step().

(I don’t know the specifics of Adadelta, so maybe it doesn’t modify the
gradients, but it might well, and if so, restoring the gradients would be
necessary.)

Best.

K. Frank

Thanks for your reply, Frank! This would work in this case but I have simplified the problem, the real case I’m facing is that the first loss is a sum of cal_loss with the other loss and the second loss is just the cal_loss, so I think it couldn’t be simplified to do one backward. Also the parameters passed into optimizer are different.

def cal_loss(output, target):
    return F.nll_loss(output, target)

def train(args, model, device, train_loader, epoch):
    optimizer1 = optim.Adadelta(model.parameters(), lr=args.lr)
    optimizer2 = optim.Adadelta(model.parameters(), lr=args.lr)

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)

        optimizer1.zero_grad()
        loss1 = cal_loss(output, target)
        loss2 = cal_loss2() # different loss calculation function
        loss3 = cal_loss3() # another different loss calculation function
        loss = loss1 + loss2 + loss3
        loss.backward()
        optimizer1.step()

        optimizer2.zero_grad()
        loss4 = cal_loss(output, target)
        loss4.backward()
        optimizer2.step()

What confuses me is why the error occurs when I copy the object with torch.Tensor.clone.

I try to fix it by adding retain_graph=True to the first backward and move optimizer1.step() after the second backward.

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)

        optimizer1.zero_grad()
        loss1 = cal_loss(output,
                         target)
        loss1.backward(retain_graph=True)

        optimizer2.zero_grad()
        loss2 = cal_loss(output,
                          target)
        loss2.backward()

        optimizer1.step()
        optimizer2.step()

But I’m not sure if it is suitable since the input for cal_loss is the same and if torch.Tensor.clone is needed.

Hi Ynj!

Using retain_graph = True should work for your use case by letting
you backpropagate through the output = model (data) part of the
graph twice as you backpropagate your two losses.

I would not recommend this. There is no point in doing so, and if your
two optimizers share any parameters (and modify the gradients, as I
assume Adadelta does), the call to optimizer1.step() will corrupt
the gradients used by optimizer2.step(). Move optimizer1.step()
to just after loss1.backward(retain_graph=True) and before
optimizer2.zero_grad().

There is no need for torch.Tensor.clone()

As an aside, it’s really unhelpful that you’re not posting your actual code.
Here, loss1 and loss2 are the same, but in your previous post you
emphasized that they are actually different (as is the case in the code in
your previous post).

And then in your previous post you said

but posted this code:

in which the same parameters are passed into the two Optimizers.

If you randomly modify your code before posting it, you are unlikely to
get useful answers to your questions.

The best way to post code is to post fully-self-contained, runnable code,
together with the actual output you get from running that code, followed
by an explanation of why that output isn’t what you wanted or expected.

If you have to simply or trim down your code to make it fully self-contained,
that’s actually a good thing, as doing so is the first step in your debugging
process.

Best.

K. Frank

1 Like

Thanks for your time and detailed explanation and sorry for not bring you good answering experience.

Move optimizer1.step() to just after loss1.backward(retain_graph=True) and before optimizer2.zero_grad() .

If doing so, I will get modified by an inplace error.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 32, 3, 3]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

As an aside, it’s really unhelpful that you’re not posting your actual code.

I’m sorry this causes inconvenience for you. If I post the actual code, it is not executable and quite long to read. So I decided to extract the key problem and create a minimal Mnist example and attach the reproducible code below the post.


It’s my bad, I didn’t notice that optimizer parameters matter much in this case, so is the optimizer. Util you point out that if your two optimizers share any parameters (and modify the gradients, as I assume Adadelta does), the call to optimizer1.step() will corrupt the gradients used by optimizer2.step().

You’re right, it’s my bad trimming my original code too much in this example[1]. And the simplicity seems to lead to a X-Y problem. Finally it turns out that the problem is not I want to calculate the same loss twice but the sequence of calling optimizer.step() and the twice loss.backward().

[1] I don’t know except F.nll_loss, which loss is suitable for Mnist example. So I only create the same loss but not create other loss methods.

Anyway, if anyone is interested, in the following code I define the original try into bad_train and the runnable fixed one into train.

Click to view code
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def cal_loss(output, target):
    return F.nll_loss(output, target)

def bad_train(args, model, device, train_loader, epoch):
    optimizer1 = optim.Adam(list(model.parameters())[:4], lr=args.lr)
    optimizer2 = optim.Adam(list(model.parameters())[4:], lr=args.lr)

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)

        optimizer1.zero_grad()
        loss1 = cal_loss(output,
                         target)
        loss1.backward()
        optimizer1.step()

        optimizer2.zero_grad()
        loss2 = cal_loss(output,
                         target)
        loss2.backward()
        optimizer2.step()

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss1: {:.6f}\tLoss2: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss1.item(), loss2.item()))
            if args.dry_run:
                break

def train(args, model, device, train_loader, epoch):
    optimizer1 = optim.Adam(list(model.parameters())[:4], lr=args.lr)
    optimizer2 = optim.Adam(list(model.parameters())[4:], lr=args.lr)

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)

        optimizer1.zero_grad()
        loss1 = cal_loss(output,
                         target)
        loss1.backward(retain_graph=True)

        optimizer2.zero_grad()
        loss2 = cal_loss(output,
                         target)
        loss2.backward()

        optimizer1.step()
        optimizer2.step()

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss1: {:.6f}\tLoss2: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss1.item(), loss2.item()))
            if args.dry_run:
                break

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
        ])
    dataset1 = datasets.MNIST('./data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('./data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)

    #scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, epoch)
        test(model, device, test_loader)
        #scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()