Indexing is very slow for backpropagation

I found that indexing is very slow for backpropagation. Here is a benchmark that compares a normal fully connected layer against two versions that compute the same result one column at a time:

import time

import numpy as np
import torch
from torch.autograd import Variable

_num_units = 1000

def FC_layer(inputs, num_units=_num_units):
    input_size = inputs.size()[0]
    W = Variable(torch.rand(input_size, num_units), requires_grad=True)
    # A single matmul adds one node to the autograd graph.
    ret = torch.matmul(inputs, W)

    # Avoid large numbers
    # ret = torch.sigmoid(ret)
    return ret

def stupid_FC_layer(inputs, num_units=_num_units):
    input_size = inputs.size()[0]
    W = Variable(torch.rand(input_size, num_units), requires_grad=True)
    ret = Variable(torch.rand(num_units))

    # Compute the output one column at a time and write each element into
    # ret by index; every assignment adds a SetItem node to the graph.
    splited_W = torch.split(W, 1, dim=1)
    for i in range(num_units):
        ret[num_units - 1 - i] = torch.matmul(inputs, splited_W[i])

    # Avoid large numbers
    # ret = torch.sigmoid(ret)
    return ret

def another_stupid_FC_layer(inputs, num_units=_num_units):
    input_size = inputs.size()[0]
    W = Variable(torch.rand(input_size, num_units), requires_grad=True)

    # Same per-column computation, but collect the pieces in a Python list
    # and concatenate once instead of assigning by index.
    ret_list = []
    splited_W = torch.split(W, 1, dim=1)
    for i in range(num_units):
        ret_list.append(torch.matmul(inputs, splited_W[i]))
    ret = torch.cat(ret_list)

    # Avoid large numbers
    # ret = torch.sigmoid(ret)
    return ret

if __name__ == "__main__":
    num_layers = 2

    print("=" * 80)
    print("Common FC layer Test")
    print("-" * 80)
    print("Construct and Forward Time:")
    T0 = time.time()
    inputs = Variable(torch.rand(_num_units))
    for _ in range(num_layers):
        inputs = FC_layer(inputs)
    # Reduce to a single output so backward() needs no grad argument.
    inputs = FC_layer(inputs, 1)
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))
    print("-" * 80)
    print("Backward Time:")
    T0 = time.time()
    inputs.backward()
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))

    print("=" * 80)
    print("Stupid FC layer Test")
    print("-" * 80)
    print("Construct and Forward Time:")
    T0 = time.time()
    inputs = Variable(torch.rand(_num_units))
    for _ in range(num_layers):
        inputs = stupid_FC_layer(inputs)
    inputs = stupid_FC_layer(inputs, 1)
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))
    print("-" * 80)
    print("Backward Time:")
    T0 = time.time()
    inputs.backward()
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))

    print("=" * 80)
    print("Another Stupid FC layer Test")
    print("-" * 80)
    print("Construct and Forward Time:")
    T0 = time.time()
    inputs = Variable(torch.rand(_num_units))
    for _ in range(num_layers):
        inputs = another_stupid_FC_layer(inputs)
    inputs = another_stupid_FC_layer(inputs, 1)
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))
    print("-" * 80)
    print("Backward Time:")
    T0 = time.time()
    inputs.backward()
    T1 = time.time()
    print('Elapsed %.3f ms' % ((T1 - T0) * 1000.0))

It shows the following:

================================================================================
Common FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
Elapsed 17.241 ms
--------------------------------------------------------------------------------
Backward Time:
Elapsed 2.436 ms
================================================================================
Stupid FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
Elapsed 137.253 ms
--------------------------------------------------------------------------------
Backward Time:
Elapsed 2317.587 ms
================================================================================
Another Stupid FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
Elapsed 123.154 ms
--------------------------------------------------------------------------------
Backward Time:
Elapsed 1749.845 ms

I am using Python 3.6 and PyTorch 0.2.0.

There have been performance improvements to autograd in PyTorch 0.3 (coming soon). You could try building from source or waiting for the release.

I have tested on PyTorch 0.3, but it didn't show any improvement on this point.

Isn't that expected? You are comparing a single tensor multiplication with many small ones. While they are mathematically equivalent, the single multiplication can be optimized better for its size and carries less overhead. It is natural that the second version is much slower.

It could be slower, but not by that much. Here is the profiler's output for the common and stupid versions:

================================================================================
Common FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
-------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                  CPU time        CUDA time            Calls        CPU total       CUDA total
-------------  ---------------  ---------------  ---------------  ---------------  ---------------
unsqueeze             13.605us          0.000us                2         27.210us          0.000us
mm                  5156.834us          0.000us                2      10313.668us          0.000us
squeeze_               5.965us          0.000us                2         11.929us          0.000us

Elapsed 61.172 ms
--------------------------------------------------------------------------------
Backward Time:
-------------------------------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                                          CPU time        CUDA time            Calls        CPU total       CUDA total
-------------------------------------  ---------------  ---------------  ---------------  ---------------  ---------------
N5torch8autograd9GraphRootE                    5.604us          0.000us                1          5.604us          0.000us
SqueezeBackward1                               3.305us          0.000us                2          6.610us          0.000us
unsqueeze                                      2.368us          0.000us                2          4.736us          0.000us
MmBackward                                  2872.008us          0.000us                2       5744.016us          0.000us
t                                              3.864us          0.000us                3         11.593us          0.000us
mm                                          1905.354us          0.000us                3       5716.063us          0.000us
UnsqueezeBackward                              2.018us          0.000us                1          2.018us          0.000us
squeeze                                        1.411us          0.000us                1          1.411us          0.000us
N5torch8autograd14AccumulateGradE           2083.309us          0.000us                2       4166.618us          0.000us
N5torch8autograd5CloneE                     2078.855us          0.000us                2       4157.710us          0.000us

Elapsed 28.548 ms
================================================================================
Stupid FC layer Test
--------------------------------------------------------------------------------
Construct and Forward Time:
-------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                  CPU time        CUDA time            Calls        CPU total       CUDA total
-------------  ---------------  ---------------  ---------------  ---------------  ---------------
narrow                 0.752us          0.000us             3000       2255.612us          0.000us
unsqueeze              0.717us          0.000us             3001       2150.282us          0.000us
mm                    24.225us          0.000us             3001      72700.511us          0.000us
squeeze_               0.579us          0.000us             3001       1736.924us          0.000us
SetItem                8.354us          0.000us             3000      25061.452us          0.000us

Elapsed 232.678 ms
--------------------------------------------------------------------------------
Backward Time:
-------------------------------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                                          CPU time        CUDA time            Calls        CPU total       CUDA total
-------------------------------------  ---------------  ---------------  ---------------  ---------------  ---------------
N5torch8autograd9GraphRootE                    4.172us          0.000us                1          4.172us          0.000us
SqueezeBackward1                               3.720us          0.000us             3001      11162.951us          0.000us
unsqueeze                                      2.465us          0.000us             3001       7398.137us          0.000us
MmBackward                                    21.510us          0.000us             3001      64551.329us          0.000us
t                                              1.542us          0.000us             3002       4630.286us          0.000us
mm                                            16.302us          0.000us             3002      48937.424us          0.000us
UnsqueezeBackward                              1.444us          0.000us                1          1.444us          0.000us
squeeze                                        0.937us          0.000us                1          0.937us          0.000us
N5torch8autograd14AccumulateGradE           2602.311us          0.000us                2       5204.621us          0.000us
N5torch8autograd5CloneE                     2598.121us          0.000us                2       5196.242us          0.000us
SetItemBackward                               90.268us          0.000us             3000     270804.749us          0.000us
clone                                          9.292us          0.000us             3000      27875.561us          0.000us
SetItem                                       28.317us          0.000us             3000      84951.591us          0.000us
Index                                         11.494us          0.000us             3000      34483.173us          0.000us
view                                           5.437us          0.000us             3000      16311.041us          0.000us
NarrowBackward                              2183.574us          0.000us             3000    6550722.268us          0.000us
_unnarrow                                   2181.987us          0.000us             3000    6545962.471us          0.000us
N5torch8autograd3AddE                       5623.963us          0.000us             2999   16866264.974us          0.000us

Elapsed 24832.020 ms

This is the profiler's result. It doesn't make sense that a stupid FC layer ([3000] * [3000, 1] matmul operations, 3000 times) needs 25 s for the backward pass.
It shows that most of the time is consumed by NarrowBackward/_unnarrow and the autograd Add operations.
I have no idea how these are implemented internally. As far as I can tell, NarrowBackward and _unnarrow should mostly be index computations, and autograd's Add should mostly be element-wise additions, yet autograd's Add is much, much slower than an ordinary element-wise addition.
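
For reference, tables like the ones above come from the autograd profiler. A minimal sketch of how to produce one (the profiler API landed around 0.3, so availability and the exact output format depend on your version):

import torch
from torch.autograd import Variable, profiler

inputs = Variable(torch.rand(1000))
W = Variable(torch.rand(1000, 1000), requires_grad=True)
out = torch.matmul(inputs, W).sum()

# Record every op executed inside the block, then print the per-op table.
with profiler.profile() as prof:
    out.backward()
print(prof)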

I found this issue about backward for narrow operations. https://github.com/ezyang/pytorch/issues/255

Narrow is a very cheap operation in the forward pass, because it doesn't have to do much, but it is expensive in the backward pass, since you must allocate a full-size tensor and zero-pad it out. This is a good opportunity for the optimizer.
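
To make that concrete, here is a rough sketch of the work NarrowBackward implies for a single column. This is plain tensor code as a cost model, not the actual autograd internals:

import torch

input_size, num_units = 1000, 1000
grad_col = torch.rand(input_size, 1)  # gradient arriving at one narrowed column

# _unnarrow: allocate a full-size zero tensor and copy the column
# gradient into its slot, so the shape matches the original W.
grad_W_i = torch.zeros(input_size, num_units)
i = 0  # the column this narrow came from
grad_W_i[:, i:i + 1] = grad_col

# autograd's Add then sums num_units of these full-size tensors to
# accumulate the gradient of W, touching roughly
# num_units * (input_size * num_units) floats in total instead of
# a single [input_size, num_units] write.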

Hi~ I also encountered this problem. My code looks like this:

# fd_prob: [batch_size, tgt_len, vocab_size], means words probability distribution
# bd_hyp: [batch_size, infer_len], means another output labels
# fd_bd_attn: [batch_size, tgt_len, infer_len], means edit probability distribution
# fd_p_gen: [batch_size, tgt_len, 1], means copy mode probability
batch_size, tgt_len, _ = fd_prob.size()
_, infer_len = bd_hyp.size()
# incorporate copy mode
for i in range(batch_size):
    for j in range(tgt_len):
        for k in range(infer_len):
            fd_prob[i][j][bd_hyp[i][k]] += (1 - fd_p_gen[i][j][0]) * fd_bd_attn[i][j][k]
loss = criterion(fd_prob, ground_truth)
loss.backward()

I modified the output distribution with a 3-level nested loop that contains lots of indexing, and I found that the forward pass costs less than 1 s while the backward pass costs more than 1.5 min, which is unacceptable. Have you found a solution, or is there an elegant way to do this?
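
One possible way out is to replace the triple loop with a single scatter-add over the vocabulary dimension, so autograd sees one node instead of batch_size * tgt_len * infer_len SetItem nodes. A minimal sketch, assuming a PyTorch version that provides Tensor.scatter_add; the sizes below are hypothetical stand-ins for yours:

import torch

batch_size, tgt_len, vocab_size, infer_len = 2, 5, 100, 7
fd_prob = torch.rand(batch_size, tgt_len, vocab_size)
bd_hyp = torch.randint(0, vocab_size, (batch_size, infer_len))
fd_bd_attn = torch.rand(batch_size, tgt_len, infer_len)
fd_p_gen = torch.rand(batch_size, tgt_len, 1)

# Contribution of every (target step j, hypothesis position k) pair;
# (1 - fd_p_gen) broadcasts over the infer_len dimension.
contrib = (1 - fd_p_gen) * fd_bd_attn              # [batch, tgt_len, infer_len]

# Repeat the label indices across tgt_len so index[i][j][k] = bd_hyp[i][k].
index = bd_hyp.unsqueeze(1).repeat(1, tgt_len, 1)  # [batch, tgt_len, infer_len]

# out[i][j][index[i][j][k]] += contrib[i][j][k], in one differentiable op.
fd_prob = fd_prob.scatter_add(2, index, contrib)

The out-of-place scatter_add also avoids mutating fd_prob in place, which keeps the graph much simpler than thousands of in-place index assignments.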