Why is the backward propagation time not accumulated?

I ran a test to profile the runtime of the forward and backward passes from the input layer to any given layer of the neural network, and I found that the runtime does not accumulate. As I understand it, the forward and backward passes execute as a chain, so the time should accumulate. How can this be explained? My code is shown below.
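
To make the expectation concrete, here is a minimal sketch of the comparison I have in mind (a toy nn.Sequential timed on the CPU with time.perf_counter, not my actual setup): a slice that starts earlier in the chain runs more layers, so I expect its forward + backward time to be larger.

import time
import torch
import torch.nn as nn

# Toy chain used only to illustrate the expectation; the real model is the AlexNet below.
chain = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 512), nn.ReLU())
x = torch.randn(64, 512)

def time_from(start_idx, iters=100):
    # Average time of forward + backward through chain[start_idx:].
    total = 0.0
    for _ in range(iters):
        t0 = time.perf_counter()
        out = chain[start_idx:](x)
        out.backward(torch.randn_like(out))
        total += time.perf_counter() - t0
    return total / iters

# Expectation: time_from(0) >= time_from(2), because the slice starting at 0 runs more layers.
print(time_from(0), time_from(2))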

import time

import torch
import torch.nn as nn
import torch.optim as optim

__all__ = ['AlexNet', 'alexnet']

class AlexNet(nn.Module):
    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), # 0
            nn.ReLU(inplace=True), # 1
            nn.MaxPool2d(kernel_size=3, stride=2), # 2
            nn.Conv2d(64, 192, kernel_size=5, padding=2), # 3
            nn.ReLU(inplace=True), # 4
            nn.MaxPool2d(kernel_size=3, stride=2), # 5
            nn.Conv2d(192, 384, kernel_size=3, padding=1), # 6
            nn.ReLU(inplace=True), # 7
            nn.Conv2d(384, 256, kernel_size=3, padding=1), # 8
            nn.ReLU(inplace=True), # 9
            nn.Conv2d(256, 256, kernel_size=3, padding=1), # 10
            nn.ReLU(inplace=True), # 11
            nn.MaxPool2d(kernel_size=3, stride=2), # 12
            nn.AdaptiveAvgPool2d((6, 6)), # 13
            nn.Flatten(), # 14
            nn.Dropout(), # 15
            nn.Linear(256 * 6 * 6, 4096), # 16
            nn.ReLU(inplace=True), # 17
            nn.Dropout(), # 18
            nn.Linear(4096, 4096), # 19
            nn.ReLU(inplace=True), # 20
            nn.Linear(4096, num_classes), # 21
        )

    def forward(self, x, partition=0):
        if partition == 0:
            x = self.features(x)
        elif partition == 1:
            x = self.features[2:](x)
        elif partition == 2:
            x = self.features[3:](x)
        elif partition == 3:
            x = self.features[5:](x)
        elif partition == 4:
            x = self.features[6:](x)
        elif partition == 5:
            x = self.features[8:](x)
        elif partition == 6:
            x = self.features[10:](x)
        elif partition == 7:
            x = self.features[12:](x)
        elif partition == 8:
            x = self.features[13:](x)
        elif partition == 9:
            x = self.features[14:](x)
        elif partition == 10:
            x = self.features[15:](x)
        elif partition == 11:
            x = self.features[16:](x)
        elif partition == 12:
            x = self.features[18:](x)
        elif partition == 13:
            x = self.features[19:](x)
        elif partition == 14:
            x = self.features[21:](x)
        else:
            print('Please give a valid partition point.')
        return x

def alexnet(num_classes=1000):
    model = AlexNet(num_classes)
    return model
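
For clarity, this is how the partitioned forward is meant to be used (a minimal usage sketch, assuming 224x224 ImageNet-sized inputs): partition 0 runs the whole network, while partition k > 0 resumes from a slice of self.features, so its input has to be the intermediate activation produced by the skipped layers. The loop below then profiles one partition per outer iteration.

model = alexnet(num_classes=1000)
img = torch.randn(1, 3, 224, 224)

out_full = model(img, partition=0)   # whole network -> shape (1, 1000)
mid = model.features[:2](img)        # activation after layers 0-1 -> shape (1, 64, 55, 55)
out_part = model(mid, partition=1)   # resume at features[2:] -> shape (1, 1000)
print(out_full.shape, mid.shape, out_part.shape)
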
saveTime = []
for each_par in range(1, partition_number):
    partition = int(network_dict[each_par][0])
    print('each_par:', partition)
    Model_mobile = set_mobile_model_parameters(Model_mobile, partition)
    Model_mobile.train()
    optimizer_mobile = optim.SGD(filter(lambda p: p.requires_grad, Model_mobile.parameters()), lr=0.001, momentum=0.9)

    runtime = 0
    for i, data in enumerate(trainloader):
        if i >= iterNum:
            break
        data, target = data[0].to(device), data[1].to(device)

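        # Wait for any pending GPU work so the timer starts from a clean point.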
        torch.cuda.synchronize(device=device)
        start_time = time.time()
        optimizer_mobile.zero_grad()
        middle = Model_mobile(data, partition=each_par)
        # print(middle.size())
        if partition == partition_number - 1:
            middle = criterion(middle, target)
            middle.backward()
        else:
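            # No loss is available at an intermediate partition, so a random upstream
            # gradient is fed into autograd to run the partial backward pass.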
            gradient = torch.randn(middle.size())
            # print(gradient.size())
            gradient = gradient.to(device)
            torch.autograd.backward(middle, grad_tensors=gradient)

        optimizer_mobile.step()

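        # Wait for the forward/backward/step to finish on the GPU before reading the end time.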
        torch.cuda.synchronize(device=device)
        end_time = time.time()
        runtime += end_time - start_time

    saveTime.append(runtime/iterNum)
    print('runtime:', runtime/iterNum)

The results are shown below:

each_par: 2
runtime: 0.015620875358581542
each_par: 3
runtime: 0.04686017036437988
each_par: 5
runtime: 0.028117918968200685
each_par: 6
runtime: 0.020307302474975586
each_par: 8
runtime: 0.014059686660766601
each_par: 10
runtime: 0.0140580415725708
each_par: 12
runtime: 0.015621137619018555
each_par: 13
runtime: 0.01718132495880127
each_par: 14
runtime: 0.018743491172790526
each_par: 15
runtime: 0.017183089256286622
each_par: 16
runtime: 0.017182421684265137
each_par: 18
runtime: 0.02342941761016846
each_par: 19
runtime: 0.02186737060546875
each_par: 21
runtime: 0.02499072551727295
each_par: 22
runtime: 0.02187349796295166
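
For reference, here is the same timed region written with torch.cuda.Event instead of time.time(). It is only a sketch that reuses the names from the loop above (Model_mobile, optimizer_mobile, criterion, data, target, each_par) and restates the measurement I have in mind; it does not change the experiment.

start_evt = torch.cuda.Event(enable_timing=True)
end_evt = torch.cuda.Event(enable_timing=True)

start_evt.record()
optimizer_mobile.zero_grad()
middle = Model_mobile(data, partition=each_par)
if partition == partition_number - 1:
    loss = criterion(middle, target)
    loss.backward()
else:
    # Random upstream gradient, as in the loop above.
    torch.autograd.backward(middle, grad_tensors=torch.randn_like(middle))
optimizer_mobile.step()
end_evt.record()

torch.cuda.synchronize(device=device)
runtime += start_evt.elapsed_time(end_evt) / 1000.0  # elapsed_time() returns milliseconds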