PyTorch question: loss.backward() takes too much time!

I cannot reproduce it on Google Colab using the following script:

import torch
from torch import nn
from datetime import datetime

class ResidualLearningNet(nn.Module):
    # Four identical residual blocks, each: 11x11 conv -> 1x1 conv -> 7x7 conv,
    # with a skip connection added around every block.
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv5 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv6 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
        self.conv7 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv8 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv9 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
        self.conv10 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv11 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv12 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
    def forward(self, x):
        # Block 1: conv stack plus skip connection
        identity1 = x
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        out = out + identity1

        # Block 2
        identity2 = out
        out = self.conv4(out)
        out = self.conv5(out)
        out = self.conv6(out)
        out = out + identity2

        # Block 3
        identity3 = out
        out = self.conv7(out)
        out = self.conv8(out)
        out = self.conv9(out)
        out = out + identity3

        # Block 4
        identity4 = out
        out = self.conv10(out)
        out = self.conv11(out)
        out = self.conv12(out)
        out = out + identity4

        return out

model = ResidualLearningNet().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
criterion = nn.MSELoss()

n_epochs = 6
for epoch in range(n_epochs):
    train_loss = 0
    model.train()
    for iteration in range(10):
        print('Iteration:', iteration, 'started')
        torch.cuda.synchronize()  # make sure no pending GPU work leaks into the timing
        start_iteration = datetime.now()
        optimizer.zero_grad()

        Xstage1 = torch.randn(128, 1, 64, 64, device='cuda')
        output = model(Xstage1)
        X_batch = torch.rand_like(output)
        loss = criterion(output, X_batch)
        loss.backward()  # retain_graph=True is unnecessary here: a fresh graph is built every iteration
        optimizer.step()
        train_loss += loss.item() * X_batch.shape[0]
        torch.cuda.synchronize()  # wait for the backward/step kernels to finish before stopping the clock
        end_iteration = datetime.now()
        print('Iteration stopped.')
        print('Iteration time:', end_iteration - start_iteration)

Output on Colab:

Iteration: 0 started
Iteration stopped.
Iteration time: 0:00:00.609628
Iteration: 1 started
Iteration stopped.
Iteration time: 0:00:00.560231
Iteration: 2 started
Iteration stopped.
Iteration time: 0:00:00.559885
Iteration: 3 started
Iteration stopped.
Iteration time: 0:00:00.560044
Iteration: 4 started
Iteration stopped.
Iteration time: 0:00:00.559213
Iteration: 5 started
Iteration stopped.
Iteration time: 0:00:00.560005
Iteration: 6 started
Iteration stopped.
Iteration time: 0:00:00.559196
Iteration: 7 started
Iteration stopped.
Iteration time: 0:00:00.560828
Iteration: 8 started
Iteration stopped.
Iteration time: 0:00:00.559657
Iteration: 9 started
Iteration stopped.
Iteration time: 0:00:00.560312
Iteration: 0 started
Iteration stopped.
Iteration time: 0:00:00.560171
...
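
One note on methodology: CUDA kernels launch asynchronously, so without the torch.cuda.synchronize() calls above, the wall-clock timer can attribute queued GPU work to whichever later call happens to block first (often loss.backward() or loss.item()), making backward look slower than it really is. Here is a minimal sketch of the same measurement using CUDA events instead of datetime; torch.cuda.Event is the actual PyTorch API, while the surrounding variable names are just illustrative:

import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()            # enqueue a timestamp on the current CUDA stream
# ... forward pass, loss.backward(), optimizer.step() go here ...
end.record()              # enqueue the closing timestamp

torch.cuda.synchronize()  # wait until both events have actually been recorded
print('Iteration time: %.1f ms' % start.elapsed_time(end))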

Where do you have your dataset stored: Google Drive or the Colab environment?

I once had the same experience, and it was because my dataset was stored in Google Drive. After I copied the dataset from Google Drive to the Colab environment, the training times became reasonable.

Kindly note that you will have to copy it again every time your runtime session is reset.
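
A minimal sketch of that copy step, assuming the dataset lives under /content/drive/MyDrive/dataset (both paths are placeholders to adjust):

import shutil
from google.colab import drive

drive.mount('/content/drive')  # prompts for authorization on first run

# Copy from the network-backed Drive mount to fast local Colab storage.
shutil.copytree('/content/drive/MyDrive/dataset', '/content/dataset')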