Hi,
I am trying to use pack and unpack for a bidirectional RNN with variable-length inputs. I noticed a significant performance loss: with the pack and unpack steps it is about 10x slower. Is this normal, or am I using it wrong?
Here is the benchmark code I used:
run_lstm_bf() is about 10x faster than run_lstm_bf_pack() when measured with %timeit in IPython.
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence as pack, pad_packed_sequence as unpack

# Input: batch of 20 sequences, max length 700, feature size 512 (batch_first layout)
x = Variable(torch.randn(20, 700, 512).cuda())
# Same data in seq-first layout (700, 20, 512) for the batch_first=False LSTM
x_bs = Variable(x.data.clone().transpose(1, 0))
# All sequences are given the same length of 690 (shorter than the padded 700)
xlen = [690] * 20
# Bidirectional LSTMs (batch_first and seq-first) plus LSTMCells for a manual loop
llstm_bf = nn.LSTM(512, 512, bidirectional=True, batch_first=True)
llstm_bs = nn.LSTM(512, 512, bidirectional=True, batch_first=False)
llstmcell = nn.LSTMCell(512, 512)
llstmcell_bw = nn.LSTMCell(512, 512)
llstm_bf.cuda()
llstm_bs.cuda()
llstmcell.cuda()
llstmcell_bw.cuda()
def run_lstm_bf():
    # Plain LSTM on the padded batch_first tensor, no pack/unpack
    res, _ = llstm_bf(x)
    res.sum().backward()

def run_lstm_bs():
    # Same, but with the seq-first layout
    res, _ = llstm_bs(x_bs)
    res.sum().backward()
def run_lstm_bf_pack():
    # Pack the padded batch_first tensor, run the LSTM, then unpack
    xpack = pack(x, xlen, True)
    res, _ = llstm_bf(xpack)
    xunpack = unpack(res, True)
    res = xunpack[0]
    res.sum().backward()

def run_lstm_bs_pack():
    # Same, but with the seq-first layout
    xpack = pack(x_bs, xlen, False)
    res, _ = llstm_bs(xpack)
    xunpack = unpack(res, False)
    res = xunpack[0]
    res.sum().backward()
init_h, init_c = Variable(torch.cuda.FloatTensor(20, 512).zero_()), Variable(torch.cuda.FloatTensor(20, 512).zero_())

def run_lstmcell():
    # Manual bidirectional loop with LSTMCell, for comparison
    # Forward direction
    prev_h, prev_c = (init_h, init_c)
    all_h = []
    for ii in range(x.size(1)):
        prev_h, prev_c = llstmcell(x[:, ii], (prev_h, prev_c))
        all_h.append(prev_h)
    # Backward direction
    prev_h, prev_c = (init_h, init_c)
    all_h_bw = []
    for ii in range(x.size(1) - 1, -1, -1):
        prev_h, prev_c = llstmcell_bw(x[:, ii], (prev_h, prev_c))
        all_h_bw.append(prev_h)
    # Concatenate forward and backward hidden states along the feature dimension
    all_h = torch.stack(all_h)
    all_h_bw = torch.stack(all_h_bw)
    all_h_comb = torch.cat([all_h, all_h_bw], 2)
    all_h_comb.sum().backward()
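
For reference, this is roughly the timing harness I used to sanity-check the %timeit numbers. Since CUDA kernels run asynchronously, I call torch.cuda.synchronize() before reading the clock; this is just a minimal sketch, and the iteration count of 10 is arbitrary:

import time

def time_fn(fn, n_iter=10):
    # Warm-up run so one-time setup (e.g. allocator caching) is not measured
    fn()
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(n_iter):
        fn()
    # Wait for all queued CUDA work to finish before stopping the clock
    torch.cuda.synchronize()
    return (time.time() - start) / n_iter

print('bf      :', time_fn(run_lstm_bf))
print('bf_pack :', time_fn(run_lstm_bf_pack))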