I found that indexing is very slow in backpropagation. Here is a minimal benchmark:
import time

import numpy as np
import torch
from torch.autograd import Variable

_num_units = 1000


def FC_layer(inputs, num_units=_num_units):
    # Ordinary fully connected layer: one [input_size] x [input_size, num_units] matmul.
    input_size = inputs.size()[0]
    W = Variable(torch.rand(input_size, num_units), requires_grad=True)
    ret = torch.matmul(inputs, W)
    # Avoid large numbers
    # ret = torch.sigmoid(ret)
    return ret


def stupid_FC_layer(inputs, num_units=_num_units):
    # Same computation, but done one column at a time, with each result
    # written into ret by index assignment.
    input_size = inputs.size()[0]
    W = Variable(torch.rand(input_size, num_units), requires_grad=True)
    ret = Variable(torch.rand(num_units))
    splited_W = torch.split(W, 1, dim=1)
    for i in range(num_units):
        ret[num_units - 1 - i] = torch.matmul(inputs, splited_W[i])
    # Avoid large numbers
    # ret = torch.sigmoid(ret)
    return ret


def another_stupid_FC_layer(inputs, num_units=_num_units):
    # Same computation, one column at a time, collected with torch.cat
    # instead of index assignment.
    input_size = inputs.size()[0]
    W = Variable(torch.rand(input_size, num_units), requires_grad=True)
    ret_list = []
    splited_W = torch.split(W, 1, dim=1)
    for i in range(num_units):
        ret_list.append(torch.matmul(inputs, splited_W[i]))
    ret = torch.cat(ret_list)
    # Avoid large numbers
    # ret = torch.sigmoid(ret)
    return ret
if __name__ == "__main__":
    num_layers = 2

    print("=" * 80)
    print("Common FC layer Test")
    print("-" * 80)
    print("Construct and Forward Time:")
    T0 = time.time()
    inputs = Variable(torch.rand(_num_units))
    for i in range(num_layers):
        inputs = FC_layer(inputs)
    inputs = FC_layer(inputs, 1)
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))
    print("-" * 80)
    print("Backward Time:")
    T0 = time.time()
    inputs.backward()
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))

    print("=" * 80)
    print("Stupid FC layer Test")
    print("-" * 80)
    print("Construct and Forward Time:")
    T0 = time.time()
    inputs = Variable(torch.rand(_num_units))
    for i in range(num_layers):
        inputs = stupid_FC_layer(inputs)
    inputs = stupid_FC_layer(inputs, 1)
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))
    print("-" * 80)
    print("Backward Time:")
    T0 = time.time()
    inputs.backward()
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))

    print("=" * 80)
    print("Another Stupid FC layer Test")
    print("-" * 80)
    print("Construct and Forward Time:")
    T0 = time.time()
    inputs = Variable(torch.rand(_num_units))
    for i in range(num_layers):
        inputs = another_stupid_FC_layer(inputs)
    inputs = another_stupid_FC_layer(inputs, 1)
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))
    print("-" * 80)
    print("Backward Time:")
    T0 = time.time()
    inputs.backward()
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))
Running this gives:
================================================================================
Common FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
Elapsed 17.241 ms
--------------------------------------------------------------------------------
Backward Time:
Elapsed 2.436 ms
================================================================================
Stupid FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
Elapsed 137.253 ms
--------------------------------------------------------------------------------
Backward Time:
Elapsed 2317.587 ms
================================================================================
Another Stupid FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
Elapsed 123.154 ms
--------------------------------------------------------------------------------
Backward Time:
Elapsed 1749.845 ms
Isn’t that expected? You are comparing one tensor multiplication with many small ones. While they are mathematically equivalent, the single multiplication can be optimized better based on its size, and it has far less overhead. It is natural that the second is much slower.
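For reference, a quick sanity check confirms that the two formulations compute the same values. This is just a minimal sketch, assuming a PyTorch version recent enough to have torch.allclose:

import torch

n, m = 1000, 1000
x = torch.rand(n)
W = torch.rand(n, m)

full = torch.matmul(x, W)                     # one [n] x [n, m] matmul
cols = torch.cat([torch.matmul(x, w)          # m separate [n] x [n, 1] matmuls
                  for w in torch.split(W, 1, dim=1)])
print(torch.allclose(full, cols))             # True, up to float accumulation order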
================================================================================
Common FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
------------- --------------- --------------- --------------- --------------- ---------------
Name CPU time CUDA time Calls CPU total CUDA total
------------- --------------- --------------- --------------- --------------- ---------------
unsqueeze 13.605us 0.000us 2 27.210us 0.000us
mm 5156.834us 0.000us 2 10313.668us 0.000us
squeeze_ 5.965us 0.000us 2 11.929us 0.000us
Elapsed 61.172 ms
--------------------------------------------------------------------------------
Backward Time:
------------------------------------- --------------- --------------- --------------- --------------- ---------------
Name CPU time CUDA time Calls CPU total CUDA total
------------------------------------- --------------- --------------- --------------- --------------- ---------------
N5torch8autograd9GraphRootE 5.604us 0.000us 1 5.604us 0.000us
SqueezeBackward1 3.305us 0.000us 2 6.610us 0.000us
unsqueeze 2.368us 0.000us 2 4.736us 0.000us
MmBackward 2872.008us 0.000us 2 5744.016us 0.000us
t 3.864us 0.000us 3 11.593us 0.000us
mm 1905.354us 0.000us 3 5716.063us 0.000us
UnsqueezeBackward 2.018us 0.000us 1 2.018us 0.000us
squeeze 1.411us 0.000us 1 1.411us 0.000us
N5torch8autograd14AccumulateGradE 2083.309us 0.000us 2 4166.618us 0.000us
N5torch8autograd5CloneE 2078.855us 0.000us 2 4157.710us 0.000us
Elapsed 28.548 ms
================================================================================
Stupid FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
------------- --------------- --------------- --------------- --------------- ---------------
Name CPU time CUDA time Calls CPU total CUDA total
------------- --------------- --------------- --------------- --------------- ---------------
narrow 0.752us 0.000us 3000 2255.612us 0.000us
unsqueeze 0.717us 0.000us 3001 2150.282us 0.000us
mm 24.225us 0.000us 3001 72700.511us 0.000us
squeeze_ 0.579us 0.000us 3001 1736.924us 0.000us
SetItem 8.354us 0.000us 3000 25061.452us 0.000us
Elapsed 232.678 ms
--------------------------------------------------------------------------------
Backward Time:
------------------------------------- --------------- --------------- --------------- --------------- ---------------
Name CPU time CUDA time Calls CPU total CUDA total
------------------------------------- --------------- --------------- --------------- --------------- ---------------
N5torch8autograd9GraphRootE 4.172us 0.000us 1 4.172us 0.000us
SqueezeBackward1 3.720us 0.000us 3001 11162.951us 0.000us
unsqueeze 2.465us 0.000us 3001 7398.137us 0.000us
MmBackward 21.510us 0.000us 3001 64551.329us 0.000us
t 1.542us 0.000us 3002 4630.286us 0.000us
mm 16.302us 0.000us 3002 48937.424us 0.000us
UnsqueezeBackward 1.444us 0.000us 1 1.444us 0.000us
squeeze 0.937us 0.000us 1 0.937us 0.000us
N5torch8autograd14AccumulateGradE 2602.311us 0.000us 2 5204.621us 0.000us
N5torch8autograd5CloneE 2598.121us 0.000us 2 5196.242us 0.000us
SetItemBackward 90.268us 0.000us 3000 270804.749us 0.000us
clone 9.292us 0.000us 3000 27875.561us 0.000us
SetItem 28.317us 0.000us 3000 84951.591us 0.000us
Index 11.494us 0.000us 3000 34483.173us 0.000us
view 5.437us 0.000us 3000 16311.041us 0.000us
NarrowBackward 2183.574us 0.000us 3000 6550722.268us 0.000us
_unnarrow 2181.987us 0.000us 3000 6545962.471us 0.000us
N5torch8autograd3AddE 5623.963us 0.000us 2999 16866264.974us 0.000us
Elapsed 24832.020 ms
This is the profiler's output. It doesn't make sense that a stupid FC layer (a [3000] x [3000, 1] matmul repeated 3000 times) needs 25 s for the backward pass.
It shows that most of the time is consumed by NarrowBackward/_unnarrow and autograd Add operations.
I have no idea how this is implemented internally. In my opinion, NarrowBackward and _unnarrow should be mostly index computation, and the autograd Adds should be mostly element-wise additions, yet they are much, much slower than ordinary element-wise additions.
Narrow is a very cheap operation in the forward pass, because it doesn't have to do much, but it is expensive in the backward pass, since you must allocate a full-size tensor and fill everything outside the slice with zeros. This is a good opportunity for the optimizer.
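To make that concrete, here is a minimal sketch of the work the backward of a single narrow/slice implies. This is my own illustration of the idea, not the actual autograd implementation:

import torch

W = torch.rand(3000, 3000)
col = W.narrow(1, 0, 1)                 # forward: just a view, no copy

grad_col = torch.ones(3000, 1)          # incoming gradient for that one column
# backward: allocate a full-size zero tensor and copy the slice gradient into it;
# this full [3000, 3000] tensor is then added into W's accumulated gradient
grad_W = torch.zeros_like(W)
grad_W.narrow(1, 0, 1).copy_(grad_col)

Doing this once per column means thousands of full-size allocations and full-size adds, which lines up with the NarrowBackward/_unnarrow and Add totals in the profile above.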
Hi~ I also encountered this problem. My code is like this:
# fd_prob: [batch_size, tgt_len, vocab_size], means words probability distribution
# bd_hyp: [batch_size, infer_len], means another output labels
# fd_bd_attn: [batch_size, tgt_len, infer_len], means edit probability distribution
# fd_p_gen: [batch_size, tgt_len, 1], means copy mode probability
batch_size, tgt_len, _ = fd_prob.size()
_, infer_len = bd_hyp.size()
# incorporate copy mode
for i in range(batch_size):
    for j in range(tgt_len):
        for k in range(infer_len):
            fd_prob[i][j][bd_hyp[i][k]] += (1 - fd_p_gen[i][j][0]) * fd_bd_attn[i][j][k]
loss = criterion(fd_prob, ground_truth)
loss.backward()
I modified the output distribution with a three-level loop that does a lot of indexing, and I found that the forward pass costs less than 1 s while the backward pass costs more than 1.5 min, which is unacceptable. Have you found a solution, or is there an elegant way to do this?
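One possible vectorized rewrite of that triple loop is a single scatter-add along the vocabulary dimension. A sketch with toy shapes, assuming a PyTorch version that provides the out-of-place Tensor.scatter_add and the tensor shapes described in the comments above; duplicate indices in bd_hyp accumulate, which matches the += semantics of the loop:

import torch

batch_size, tgt_len, vocab_size, infer_len = 2, 5, 10, 3   # toy sizes, for illustration

fd_prob = torch.rand(batch_size, tgt_len, vocab_size, requires_grad=True)
fd_bd_attn = torch.rand(batch_size, tgt_len, infer_len)
fd_p_gen = torch.rand(batch_size, tgt_len, 1)
bd_hyp = torch.randint(0, vocab_size, (batch_size, infer_len))

# contribution of every (j, k) pair: [B, T, 1] broadcasts against [B, T, I]
contrib = (1 - fd_p_gen) * fd_bd_attn                    # [B, T, I]
# replicate the label indices across the tgt_len dimension
index = bd_hyp.unsqueeze(1).expand(-1, tgt_len, -1)      # [B, T, I]
# one out-of-place scatter-add replaces all batch_size * tgt_len * infer_len item writes
fd_prob = fd_prob.scatter_add(2, index, contrib)

Since the whole update is then a single autograd op, the backward pass avoids the thousands of SetItem/Index nodes that the Python loop creates.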