PyTorch question: loss.backward() takes too much time!

Hello!
I have a question about simulating a network in PyTorch.
This is the network that I want to simulate (architecture diagram omitted):

and here is the code I wrote for it:

class ResidualLearningNet(nn.Module):
    def __init__(self):
        super(ResidualLearningNet, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv5 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv6 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
        self.conv7 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv8 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv9 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
        self.conv10 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv11 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv12 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
    def forward(self, x):
        identify1=x
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out) 
        out = out + identify1
        identify2=out
        out = self.conv4(out)
        out = self.conv5(out)
        out = self.conv6(out) 
        out = out + identify2
        identify3=out
        out = self.conv7(out)
        out = self.conv8(out)
        out = self.conv9(out) 
        out = out + identify3
        identify4=out
      
        out = self.conv10(out)
        out = self.conv11(out)
        out = self.conv12(out) 
        out = out + identify4

        return out

Is it wrong?

Its loss.backward() call takes hours.
Would you mind helping me fix my problem, please?

Hi,

You mean doing a single backward call takes hours?
Can you give a small code sample that reproduces that?

Thanks for your response.
Yes, doing a single backward call takes approximately an hour.

**Instantiating the Network, Optimizer, and Loss Criterion:**
model=ResidualLearningNet()
model.apply(weights_init)
model.to(device)
optimizer=torch.optim.SGD(model.parameters(),lr=0.00001)
criterion=nn.MSELoss()
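
(weights_init isn’t defined anywhere in the thread. Purely as a hypothetical placeholder so the snippet above is runnable, it could be something like the following; it is not the original function.)

import torch.nn as nn

def weights_init(m):
    # Hypothetical stand-in for the unshown weights_init:
    # Kaiming-normal init for conv weights, zero biases; BatchNorm layers keep their defaults.
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)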

**Training**
start_time = datetime.now()
n_epochs=10
print_every=1
val_loss_min=np.Inf
for epoch in range(n_epochs):
  train_loss=0
  model.train()
  for iteration,(Xstage1,X_batch) in enumerate(Train_Loader):
    optimizer.zero_grad()
    output=model(Xstage1.to(device))
    loss=criterion(output,X_batch.to(device))
    retain_graph=True
    loss.backward(retain_graph=True)
    optimizer.step()
    train_loss+=loss.item()*X_batch.shape[0]
    if iteration % print_every==0:
      is_training=model.training
      val_loss=0
      model.eval()
      for (Xstage1_valid,X_valid_batch) in Valid_Loader:
        output=model(Xstage1_valid.to(device))
        loss=criterion(output,X_valid_batch.to(device))
        val_loss+=loss.item()*X_valid_batch.shape[0]
      val_loss=val_loss/len(Valid_Loader.sampler)
      if val_loss<=val_loss_min:
        print('\tValidation loss decreased ({:.6f}-->{:.6f}). Saving Model...'.format(val_loss_min,val_loss))
        torch.save(model.state_dict(),'model_second_step.pt')
        val_loss_min=val_loss
      model.train(mode=is_training)   
  train_loss=train_loss/len(Train_Loader.sampler)
  print('Epoch: {} \tTraining loss:{:.6f}'.format(epoch+1,train_loss))
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Did you mean this part?

Thanks for the code.
Here the timing covers the whole training loop, not a single backward call. It is possible that this takes an hour, depending on how big your dataset is (and how big your model already is).

Thanks.
A single backward takes an hour, not one epoch.
Is that OK?

I wrote the script below to time the backward pass, and it says that it took ~120 ms on my laptop.
If you use your code above for timing, you are timing the whole training loop, not just a single backward call.

import torch
from torch import nn
import time

class ResidualLearningNet(nn.Module):
    def __init__(self):
        super(ResidualLearningNet, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv5 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv6 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
        self.conv7 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv8 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv9 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
        self.conv10 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, padding=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.conv11 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.conv12 = nn.Sequential(
            nn.Conv2d(32, 1, kernel_size=7, padding=3)
        )
    def forward(self, x):
        identify1=x
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out) 
        out = out + identify1
        identify2=out
        out = self.conv4(out)
        out = self.conv5(out)
        out = self.conv6(out) 
        out = out + identify2
        identify3=out
        out = self.conv7(out)
        out = self.conv8(out)
        out = self.conv9(out) 
        out = out + identify3
        identify4=out
      
        out = self.conv10(out)
        out = self.conv11(out)
        out = self.conv12(out) 
        out = out + identify4

        return out

model = ResidualLearningNet()

inp = torch.rand(2, 1, 64, 64)
out = model(inp)

start = time.time()
out.sum().backward()
print("Took: ", time.time() - start)

I’m very thankful for your reply.
But I cannot figure out which part of the code takes so much time.
One batch of 128 samples takes 1.5 seconds in your script, but 1 hour in my code.
Is it because of the initialization of the network?

model=ResidualLearningNet()
model.apply(weights_init)
model.to(device)
optimizer=torch.optim.SGD(model.parameters(),lr=0.000001)
criterion=nn.MSELoss()

I wrote the code in Google Colab. If it helps, I can send you the link to my code.

How big is your dataset? If one batch takes 1.5 s and the whole thing takes 1 h, that would mean about 2400 batches. For 10 epochs, that’s 240 batches per epoch, so roughly 30k samples in your dataset (at a batch size of 128).
Is that correct?
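
As a rough back-of-the-envelope check (assuming 1.5 s per batch, 1 h in total, 10 epochs, and the batch size of 128 mentioned above):

# Rough estimate only, not a measurement:
seconds_total = 60 * 60                  # one hour of training
batches_total = seconds_total / 1.5      # ~2400 batches in total
batches_per_epoch = batches_total / 10   # ~240 batches per epoch
samples = batches_per_epoch * 128        # ~30k samples in the dataset
print(batches_total, batches_per_epoch, samples)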

No, I said before that one BATCH takes one hour… :pensive: :pensive: :pensive: :frowning_face:
I have 663 batches, so one epoch would take 663 hours :disappointed: :sob:

As @albanD explained, your code profiles the complete training.
Did you change the code to profile a single iteration?
What does @albanD’s code return if you run it on your machine?

Thanks for your response.
@albanD’s code took 280 ms on my laptop. I cannot figure out what the difference between them is, or which part of my code is responsible for the backward pass taking so much time.

Could you explain this part? I cannot understand it:

Did you change the code to profile a single iteration?

Your code profiles the complete training including all epochs:

start_time = datetime.now()
for epoch in range(n_epochs):
  train_loss=0
  model.train()
  for iteration,(Xstage1,X_batch) in enumerate(Train_Loader):
    optimizer.zero_grad()
    ...
  print('Epoch: {} \tTraining loss:{:.6f}'.format(epoch+1,train_loss))
end_time = datetime.now()

start_time is recorded before the first epoch and iteration are executed, while end_time is recorded after all epochs are done.

@albanD’s code profiles just the backward call:

start = time.time()
out.sum().backward()
print("Took: ", time.time() - start)

So it’s not a single batch that is taking one hour, but the complete training, i.e. 10 epochs.

I added a print statement to see when a single loss.backward() call finished. There was almost a one-hour gap between two outputs of that print statement.
I wrote the code in Google Colab. If it helps, I can send you the link to my code.

Sure, feel free to post the link here.

You can find my code here:
https://colab.research.google.com/drive/1AfernpzyhNpxX6Y5gU31YwqVR_sisVZu?usp=sharing

Thanks for the code.
In the cells that use datetime to profile the code, you are still starting the timer before the first epoch and stopping it after all epochs are done.
The output also shows this:

Epoch: 100 	Training loss:0.020610
Duration: 0:07:08.332585

As you can see, 100 epochs were performed before the timer was stopped.

If you want to profile a single backward call, you would need to stick to the posted code, i.e. start the timer before the backward pass, perform the backward pass, and stop the timer.
If you are working with a GPU, you should additionally synchronize the code before starting and stopping the timer via torch.cuda.synchronize().
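
Putting both points together, a minimal per-iteration timing sketch could look like the one below (this assumes the model, criterion, optimizer, Train_Loader, and device from earlier in the thread; it is an illustration, not the original code):

import torch
from datetime import datetime

# Illustrative sketch: time only the backward pass of a single iteration.
# Assumes `model`, `criterion`, `optimizer`, `Train_Loader`, and `device` are defined as above.
for Xstage1, X_batch in Train_Loader:
    optimizer.zero_grad()
    output = model(Xstage1.to(device))
    loss = criterion(output, X_batch.to(device))

    if torch.cuda.is_available():
        torch.cuda.synchronize()   # make sure the forward pass has finished on the GPU
    start = datetime.now()
    loss.backward()                # the only work between the two timestamps
    if torch.cuda.is_available():
        torch.cuda.synchronize()   # wait for the backward kernels to finish
    print('backward took:', datetime.now() - start)

    optimizer.step()
    break                          # profile a single iteration only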

Hi @ptrblck, thanks for the reply. I changed the timing part of the code, and it showed that one iteration takes one hour…

start_time = datetime.now()
n_epochs=10
print_every=1
val_loss_min=np.Inf
for epoch in range(n_epochs):
  train_loss=0
  model.train()
  for iteration,(Xstage1,X_batch) in enumerate(Train_Loader):
    optimizer.zero_grad()
    output=model(Xstage1.to(device))
    loss=criterion(output,X_batch.to(device))
    retain_graph=True
    end_time = datetime.now()
    print('before:',end_time-start_time)
    loss.backward(retain_graph=True)
    end_time_back = datetime.now()
    print('backward',end_time_back-end_time)
    optimizer.step()
    train_loss+=loss.item()*X_batch.shape[0]
    if iteration % print_every==0:
      is_training=model.training
      val_loss=0
      model.eval()
      for (Xstage1_valid,X_valid_batch) in Valid_Loader:
        output=model(Xstage1_valid.to(device))
        loss=criterion(output,X_valid_batch.to(device))
        val_loss+=loss.item()*X_valid_batch.shape[0]
      val_loss=val_loss/len(Valid_Loader.sampler)
      if val_loss<=val_loss_min:
        print('\tValidation loss decreased ({:.6f}-->{:.6f}). Saving Model...'.format(val_loss_min,val_loss))
        torch.save(model.state_dict(),'/content/drive/My Drive/dataset/model_second_step.pt')
        val_loss_min=val_loss
      model.train(mode=is_training)   
  train_loss=train_loss/len(Train_Loader.sampler)
  print('Epoch: {} \tTraining loss:{:.6f}'.format(epoch+1,train_loss))
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

The output is:

before: 0:00:00.055158
backward 1:02:52.721540

One hour for a single iteration :frowning_face:

Could you post the complete output, please?
You are still using other timers before the training starts and ends, so I’m not convinced that you are indeed profiling a single iteration.

I used Google Colab to run the code. After 5 iterations it stopped. I will run it again and post the output again.

This is the code:

n_epochs=6
print_every=1
val_loss_min=np.Inf
for epoch in range(n_epochs):
  train_loss=0
  model.train()
  for iteration,(Xstage1,X_batch) in enumerate(Train_Loader):
    print('Iteration:',iteration,'started')
    torch.cuda.synchronize()
    start_iteration=datetime.now()
    optimizer.zero_grad()
    output=model(Xstage1.to(device))
    loss=criterion(output,X_batch.to(device))
    retain_graph=True
    loss.backward(retain_graph=True)
    optimizer.step()
    train_loss+=loss.item()*X_batch.shape[0]
    torch.cuda.synchronize()
    end_iteration = datetime.now() 
    print('Iteration stopped.')
    print('Iteration time:',end_iteration-start_iteration)
    if iteration % print_every==0:
      is_training=model.training
      val_loss=0
      model.eval()
      for (Xstage1_valid,X_valid_batch) in Valid_Loader:
        output=model(Xstage1_valid.to(device))
        loss=criterion(output,X_valid_batch.to(device))
        val_loss+=loss.item()*X_valid_batch.shape[0]
      val_loss=val_loss/len(Valid_Loader.sampler)
      if val_loss<=val_loss_min:
        print('\tValidation loss decreased ({:.6f}-->{:.6f}). Saving Model...'.format(val_loss_min,val_loss))
        torch.save(model.state_dict(),'/content/drive/My Drive/dataset/model_second_step.pt')
        val_loss_min=val_loss
      model.train(mode=is_training)   
  train_loss=train_loss/len(Train_Loader.sampler)
  print('Epoch: {} \tTraining loss:{:.6f}'.format(epoch+1,train_loss))
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

And the complete output is:

Iteration: 0 started
Iteration stopped.
Iteration time: 1:04:42.598970
	Validation loss decreased (inf-->0.008777). Saving Model...
Iteration: 1 started
Iteration stopped.
Iteration time: 1:04:23.584298
	Validation loss decreased (0.008777-->0.008767). Saving Model...
Iteration: 2 started
Iteration stopped.
Iteration time: 1:05:28.936429
	Validation loss decreased (0.008767-->0.008762). Saving Model...
Iteration: 3 started
Iteration stopped.
Iteration time: 1:03:29.209401
	Validation loss decreased (0.008762-->0.008758). Saving Model...
Iteration: 4 started
Iteration stopped.
Iteration time: 1:03:13.451551
	Validation loss decreased (0.008758-->0.008754). Saving Model...
Iteration: 5 started
Iteration stopped.
Iteration time: 1:03:05.595077
	Validation loss decreased (0.008754-->0.008751). Saving Model...
Iteration: 6 started
Iteration stopped.
Iteration time: 1:03:31.586236
	Validation loss decreased (0.008751-->0.008749). Saving Model...
Iteration: 7 started
Iteration stopped.
Iteration time: 1:05:06.575140
Iteration: 8 started