The `pack sequence -> recurrent network -> unpack sequence` pattern in a LSTM training with nn.DataParallel

hello, when i use the pack sequence -> recurrent network -> unpack sequence pattern in a LSTM training with nn.DataParallel, i encounter a very strange problem.
here is my code:

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_layer, hidden_size, num_classes, 
            rnn_type='lstm', dropout=0.0, bidirect=True, residual=False):
        super(LSTM, self).__init__()
        if bidirect:
            layer = [nn.LSTM( input_size=input_size, hidden_size=hidden_size, batch_first=True, bidirectional=bidirect, dropout=dropout)] 
            for i in range(hidden_layer):
                if i == hidden_layer-1:
                    layer.append(nn.LSTM(hidden_size*2, num_classes, batch_first=True, dropout=0.0, bidirectional=bidirect))
                else:
                    layer.append(nn.LSTM(hidden_size*2, hidden_size, batch_first=True, dropout=0.0, bidirectional=bidirect))
            self.lstm = nn.Sequential(*layer)
        else:
            pass
   def forward (self, x, input_len):
        
        _, idx_sort = th.sort(input_len, dim=0, descending=True)
        _, idx_unsort = th.sort(idx_sort, dim=0)
        x = x.index_select(0, Variable(idx_sort))
        input_len = input_len[idx_sort]
        x = x.cuda()
        max_length = x.shape[1]
        x = rnn_pack.pack_padded_sequence(x, input_len, batch_first=True)
        _x = self.lstm(x)_
        out , _ = rnn_pack.pad_packed_sequence(x, total_length=max_length, batch_first=True)
        out = out.index_select(0, Variable(idx_unsort))
        return out
def train_one_epoch(nnet, criterion, optimizer, train_loader, num_parallel, is_rnn=True):
    nnet.train()
    for index, (key, feats, labels, len_list) in enumerate(train_loader):
        labels = labels.view(labels.shape[0], labels.shape[1])
        input_len = np.array(len_list)
        optimizer.zero_grad()
        if is_rnn:
            label_mat = labels.view(labels.size(0) * labels.size(1))
        targets = Variable(label_mat.cuda())
        input_len = th.from_numpy(input_len)
        out = nnet(feats, input_len)
         ……………………
nnet = LSTM((args.left_context + args.right_context + 1) * args.feat_dim, args.hidden_layer, args.hidden_size, args.num_classes, rnn_type=net_type, dropout=args.dropout, bidirect=bidirect, residual=residual)
nnet = nn.DataParallel(nnet, device_ids=[0,1,2])

when I run the main script, the error occurs as follows:

Traceback (most recent call last):
  File "./train/train_rnn_pack_sort.py", line 195, in <module>
    train(args)
  File "./train/train_rnn_pack_sort.py", line 150, in train
    tr_frame_acc = train_epoch(nnet, criterion, optimizer, train_loader, num_parallel, train_dataset.num_frames, is_rnn=True)
  File "./train/train_rnn_pack_sort.py", line 60, in train_epoch
    train_frame, pos_frames = common_pack_sort.train_one_epoch(nnet, criterion, optimizer, train_loader, num_parallel, is_rnn)
  File "/search/speech/wangqingnan/asr_tools/pytorch/asr_egs/common/common_pack_sort.py", line 107, in train_one_epoch
    out = nnet(feats, input_len)
  File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/data_parallel.py", line 114, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/data_parallel.py", line 124, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/parallel_apply.py", line 65, in parallel_apply
    raise output
AttributeError: 'tuple' object has no attribute 'size'

and if i use “x = self.lstm(x.data)” instead of “x = self.lstm(x)” in the forward method,the error shows as follows:

Traceback (most recent call last):
  File "./train/train_rnn_pack_sort.py", line 195, in <module>
    train(args)
  File "./train/train_rnn_pack_sort.py", line 150, in train
    tr_frame_acc = train_epoch(nnet, criterion, optimizer, train_loader, num_parallel, train_dataset.num_frames, is_rnn=True)
  File "./train/train_rnn_pack_sort.py", line 60, in train_epoch
    train_frame, pos_frames = common_pack_sort.train_one_epoch(nnet, criterion, optimizer, train_loader, num_parallel, is_rnn)
  File "/search/speech/wangqingnan/asr_tools/pytorch/asr_egs/common/common_pack_sort.py", line 107, in train_one_epoch
    out = nnet(feats, input_len)
  File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/data_parallel.py", line 114, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/data_parallel.py", line 124, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/parallel_apply.py", line 65, in parallel_apply
    raise output
RuntimeError: input must have 3 dimensions, got 2

However, I remember that with one GPU, “x” is still a PackedSequence object, and there is no problems in the forward propagation.
Any help would be appreciated…