Memory blow-up for partitioned backpropagation

I am implementing a block-partitioned training method, i.e. I backpropagate only with respect to a contiguous slice of the model's parameter list and update only those parameters. I do the partitioned backpropagation as in the following MWE.

import argparse
import sys
import time
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torchvision import models


def main(args):
    cuda = torch.cuda.is_available()
    net = models.resnet50(num_classes=10)
    if cuda:
        net = net.to('cuda')
    # The usual CIFAR-10 dataloader function (a minimal version is given after the MWE)
    trainloader, testloader = get_dataloader(args.bs, args.workers, cuda)
    param_list = list(net.parameters())
    criterion = nn.CrossEntropyLoss()
    # Indices of the parameter tensors to backpropagate with respect to:
    # from the --start-part percentile of the list up to the end.
    rankstart = int(args.start_part * len(param_list) / 100)
    rankstop = len(param_list)
    started_at = time.perf_counter()
    for epoch in range(args.epochs):
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            if cuda:
                inputs, targets = inputs.to('cuda'), targets.to('cuda')
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            grad = torch.autograd.grad(loss, param_list[rankstart:rankstop])
            # Plain SGD-style update, applied only to the parameters in the partition.
            for p, d_p in zip(param_list[rankstart:rankstop], grad):
                p.data.add_(d_p, alpha=-args.lr)
            print("Epoch: ", epoch, "Batch-id: ", batch_idx, "Loss: ",
                  loss.item(), " Elapsed-time: ",
                  time.perf_counter() - started_at)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Partitioned Backprop')
    parser.add_argument(
        '--start-part',
        type=int,
        default=50,
        help='percentage index of the first tensor in the partition (default: 50)')
    parser.add_argument('--workers',
                        default=4,
                        type=int,
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--bs',
                        default=256,
                        type=int,
                        help='Batch size (default: 256)')
    parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
    parser.add_argument('--epochs', default=1, type=int, help='Epochs')
    args = parser.parse_args()
    try:
        main(args)
        print("Completed!")
    except KeyboardInterrupt:
        print("Stopped!")
        sys.exit()
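
For completeness, get_dataloader just builds the standard CIFAR-10 train/test loaders; a minimal version (the exact transforms and normalization constants shouldn't matter for the memory behaviour) looks like this:

def get_dataloader(batch_size, workers, cuda):
    # Standard CIFAR-10 train/test loaders with basic normalization.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616)),
    ])
    trainset = datasets.CIFAR10(root='./data', train=True,
                                download=True, transform=transform)
    testset = datasets.CIFAR10(root='./data', train=False,
                               download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True,
        num_workers=workers, pin_memory=cuda)
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False,
        num_workers=workers, pin_memory=cuda)
    return trainloader, testloader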

The --start-part argument determines, as a percentage of the length of the parameter list, which parameter tensor (a layer's weight or bias) backpropagation starts from; everything from that index to the end of the list gets a gradient and is updated. Now, if I backpropagate with respect to the entire model, i.e. --start-part 0, the memory footprint on my GPU is approximately 1590 MB, whereas with --start-part 50, i.e. backprop with respect to only the latter half of the parameters, the footprint is approximately 1840 MB. Isn't that counterintuitive?
Is there something wrong with the way I am using torch.autograd.grad here?
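
(For an in-process check of the footprint, one can also print the peak usage of PyTorch's caching allocator after a few batches; it excludes the CUDA context, so the absolute number will be lower than the figures above, but it makes the comparison between the two settings easy to reproduce:)

# Peak memory held by PyTorch's caching allocator, in MiB.
print("Peak allocated: %.0f MB" % (torch.cuda.max_memory_allocated() / 2**20))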