# Backward process is too slow in my customized network!

I wanted to do a 1D convolution on a circle (a diffraction signal circle), so I customized a CircleConv layer and built a network with it. But the network is too slow to train on my dataset. The code is as follows:

```python
import torch
import torch.nn as nn
import torch.nn.init as init
from collections import OrderedDict
import time

class CircleConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1, stride=1):
        super(CircleConv, self).__init__()

        self.kernel_size = kernel_size
        self.kernel_size_number = kernel_size * 1
        self.out_channels = out_channels
        self.dilation = (dilation, dilation)
        self.in_channels = in_channels
        self.weights = nn.Parameter(torch.Tensor(self.out_channels, self.in_channels, self.kernel_size_number))
        init.xavier_uniform_(self.weights)

    def forward(self, x):
        """x shape: (batch, channels, l)"""
        windows = self.calculateWindows(x)
        result = torch.zeros(
            [x.shape[0] * self.out_channels, x.shape[-1]], dtype=torch.float,
            device=x.device)
        # loop over input channels and output channels, one matmul per pair
        for channel in range(x.shape[1]):
            for i_convNumber in range(self.out_channels):
                xx = torch.matmul(windows[channel], self.weights[i_convNumber][channel])
                xx = xx.view(-1, x.shape[-1])
                result[i_convNumber * xx.shape[0]: (i_convNumber + 1) * xx.shape[0]] += xx
        result = result.view(self.out_channels, x.shape[0], x.shape[-1]).permute(1, 0, 2)
        return result

    def calculateWindows(self, x):
        length = x.shape[-1]
        half = self.kernel_size // 2
        # circular padding: wrap the ends of the signal around
        triple = torch.cat(tensors=[x[:, :, -half:], x, x[:, :, :half]], dim=-1)
        lt = []
        for step in range(length):
            window = triple[:, :, half + step - half:half + step + half + 1]
            window = window.reshape(x.shape[0], -1)
            lt.append(window)
        windows = torch.stack(lt, dim=2)
        windows = windows.transpose(1, 2).contiguous().view(-1, x.shape[1], self.kernel_size_number)
        windows = windows.transpose(0, 1)

        return windows

class CircleConvCNN(nn.Module):
    def __init__(self, in_channels, output_features):
        super(CircleConvCNN, self).__init__()
        self.net1 = nn.Sequential(
            CircleConv(in_channels, 32), nn.GELU(),
            CircleConv(32, 32), nn.GELU(),
            CircleConv(32, 64), nn.GELU(),
            CircleConv(64, 64), nn.GELU(),
            CircleConv(64, 128), nn.GELU(),
            CircleConv(128, 128), nn.GELU())
        # global average pooling over the length dimension, so net2 receives 128 features
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.net2 = nn.Sequential(OrderedDict([
            ('linear1', nn.Linear(128, 1024)), ('activation1', nn.GELU()),
            ('linear2', nn.Linear(1024, 512)), ('activation2', nn.GELU()),
            ('linear3', nn.Linear(512, output_features))]))

    def forward(self, x):
        x = x.view(x.shape[0], x.shape[1], 1)
        x = x.permute(0, 2, 1)
        x = self.net1(x)
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        output = self.net2(x)
        return output

# Below is the test code
CNN = CircleConvCNN(1, 9)
CNN.to('cuda')
for i in range(10):
    print('time tick:')
    ticks = time.time()
    x = torch.randn(800, 147).to('cuda')
    out = CNN(x)
    print('forward elapsed time:{}'.format(time.time() - ticks))
    print('time tick:')
    ticks = time.time()
    out.mean().backward()
    print('backward elapsed time:{}'.format(time.time() - ticks))
```

The output of the above code is:

```
time tick:
forward elapsed time:2.740222454071045
time tick:
backward elapsed time:85.08087038993835
time tick:
forward elapsed time:2.3308119773864746
time tick:
backward elapsed time:84.98330068588257
time tick:
forward elapsed time:2.3467724323272705
time tick:
......
```

Why is the backpropagation so slow? Maybe some operations are slow, but I could not find them. Can someone give me some suggestions?

CUDA operations are executed asynchronously, so you would need to synchronize the code manually via `torch.cuda.synchronize()` before starting and stopping the timers, or use e.g. `torch.utils.benchmark`, which will synchronize for you and will also add warmup iterations for valid profiling.
Without syncs you would accumulate the duration of previous operations into a blocking one and would thus get invalid times.
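
For reference, a minimal sketch of what the `torch.utils.benchmark` approach could look like here (my addition, not from the thread; it assumes the `CNN` model and the `(800, 147)` input from the question):

```python
import torch
import torch.utils.benchmark as benchmark

x = torch.randn(800, 147, device='cuda')

# Timer handles CUDA synchronization and warmup internally
fwd = benchmark.Timer(stmt='CNN(x)', globals={'CNN': CNN, 'x': x})
# note: this statement necessarily measures forward + backward together
bwd = benchmark.Timer(stmt='CNN(x).mean().backward()', globals={'CNN': CNN, 'x': x})

print(fwd.timeit(10))
print(bwd.timeit(10))
```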

I revised the test code as follows:

```python
CNN = CircleConvCNN(1, 9)
CNN.to('cuda')
for i in range(10):
    torch.cuda.synchronize()
    print('time tick:')
    ticks = time.time()
    x = torch.randn(800, 147).to('cuda')
    out = CNN(x)
    torch.cuda.synchronize()
    print('forward elapsed time:{}'.format(time.time() - ticks))
    torch.cuda.synchronize()
    print('time tick:')
    ticks = time.time()
    out.mean().backward()
    torch.cuda.synchronize()
    print('backward elapsed time:{}'.format(time.time() - ticks))
```

So the timestamps are now taken only after the CUDA operations have completed.
But the operations I defined (mainly in the CircleConv layer) are still very slow; how can I make them faster?

You could profile the different steps in this module and check where the bottleneck is.
Generally, I would try to see if the loops (using `range(length)` and `range(x.shape[1])`) can be avoided, in case these are indeed the bottleneck. Also, you might want to check `unfold`, which could be faster than your slicing.
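
For illustration, here is a minimal sketch (my addition, not from the thread) of how the window extraction and the per-channel matmuls could be vectorized with `Tensor.unfold` and `einsum`, assuming the same circular padding as `calculateWindows`:

```python
import torch

def circle_conv_unfold(x, weights):
    """x: (batch, in_channels, length); weights: (out_channels, in_channels, kernel_size)."""
    kernel_size = weights.shape[-1]
    half = kernel_size // 2
    # circular padding, same torch.cat trick as in calculateWindows
    triple = torch.cat([x[:, :, -half:], x, x[:, :, :half]], dim=-1)
    # (batch, in_channels, length, kernel_size): one sliding window per output position
    windows = triple.unfold(2, kernel_size, 1)
    # contract in_channels and kernel positions in a single vectorized call
    return torch.einsum('bclk,ock->bol', windows, weights)

out = circle_conv_unfold(torch.randn(800, 1, 147), torch.randn(32, 1, 3))
print(out.shape)  # torch.Size([800, 32, 147])
```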

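Regarding the profiling suggestion above, here is a minimal sketch (my addition) using `torch.profiler`, which also attributes time to the individual backward kernels; it assumes the `CNN` model and input `x` from the question:

```python
import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    out = CNN(x)
    out.mean().backward()

# operator-level breakdown sorted by total CUDA time; backward kernels show up
# under entries such as MmBackward or the autograd-engine evaluate_function rows
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
```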

I don't know how to profile the backward steps, since the time bottleneck happens in the backward pass. To solve the problem, I used the functions already implemented in PyTorch: by replacing the window extraction and matrix multiplication with nn.Conv1d, the backward pass is far faster than before.

```python
class CircleConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
        super(CircleConv, self).__init__()
        assert kernel_size % 2 != 0, 'kernel size should be odd!'
        self.kernel_size = kernel_size
        self.kernel_size_number = kernel_size * 1
        self.out_channels = out_channels
        # the sliding-window extraction and matmuls are replaced by a plain nn.Conv1d
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
                               kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        length = x.shape[-1]
        half = self.kernel_size // 2
        assert length >= half, 'length is shorter than half the kernel!'
        # circular padding: wrap the signal ends around, then run the ordinary convolution
        triple = torch.cat(tensors=[x[:, :, -half:], x, x[:, :, :half]], dim=-1)
        result = self.conv1(triple)
        return result
```
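
As a quick sanity check (my addition, not part of the original post), the Conv1d-based layer keeps the circle length unchanged, since the circular padding adds exactly `kernel_size - 1` samples before the unpadded convolution:

```python
layer = CircleConv(in_channels=1, out_channels=32, kernel_size=3).to('cuda')
x = torch.randn(800, 1, 147, device='cuda')
print(layer(x).shape)  # torch.Size([800, 32, 147])
```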

The time record is as follows:

```
forward elapsed time:1.2164795398712158
backward elapsed time:0.11817550659179688
forward elapsed time:0.009972095489501953
backward elapsed time:0.025435686111450195
forward elapsed time:0.009973287582397461
backward elapsed time:0.02593064308166504
forward elapsed time:0.009973287582397461
backward elapsed time:0.02593064308166504
forward elapsed time:0.009973287582397461
backward elapsed time:0.02593088150024414
forward elapsed time:0.009973526000976562
backward elapsed time:0.024933338165283203
forward elapsed time:0.009972810745239258
backward elapsed time:0.0249330997467041
```

With my limited knowledge of programming, I still don't understand what caused the heavy time consumption in the backward pass.