Hello, when I use the pack sequence -> recurrent network -> unpack sequence
pattern while training an LSTM with nn.DataParallel, I run into a very strange problem.
Here is my code:
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.utils.rnn as rnn_pack   # my alias for torch.nn.utils.rnn
from torch.autograd import Variable


class LSTM(nn.Module):
    def __init__(self, input_size, hidden_layer, hidden_size, num_classes,
                 rnn_type='lstm', dropout=0.0, bidirect=True, residual=False):
        super(LSTM, self).__init__()
        if bidirect:
            layer = [nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                             batch_first=True, bidirectional=bidirect, dropout=dropout)]
            for i in range(hidden_layer):
                if i == hidden_layer - 1:
                    layer.append(nn.LSTM(hidden_size * 2, num_classes, batch_first=True,
                                         dropout=0.0, bidirectional=bidirect))
                else:
                    layer.append(nn.LSTM(hidden_size * 2, hidden_size, batch_first=True,
                                         dropout=0.0, bidirectional=bidirect))
            self.lstm = nn.Sequential(*layer)
        else:
            pass

    def forward(self, x, input_len):
        # sort by length (descending) for pack_padded_sequence, and remember how to undo it
        _, idx_sort = th.sort(input_len, dim=0, descending=True)
        _, idx_unsort = th.sort(idx_sort, dim=0)
        x = x.index_select(0, Variable(idx_sort))
        input_len = input_len[idx_sort]
        x = x.cuda()
        max_length = x.shape[1]
        x = rnn_pack.pack_padded_sequence(x, input_len, batch_first=True)
        x = self.lstm(x)
        out, _ = rnn_pack.pad_packed_sequence(x, total_length=max_length, batch_first=True)
        out = out.index_select(0, Variable(idx_unsort))
        return out
def train_one_epoch(nnet, criterion, optimizer, train_loader, num_parallel, is_rnn=True):
    nnet.train()
    for index, (key, feats, labels, len_list) in enumerate(train_loader):
        labels = labels.view(labels.shape[0], labels.shape[1])
        input_len = np.array(len_list)
        optimizer.zero_grad()
        if is_rnn:
            label_mat = labels.view(labels.size(0) * labels.size(1))
            targets = Variable(label_mat.cuda())
            input_len = th.from_numpy(input_len)
            out = nnet(feats, input_len)
            ……………………

nnet = LSTM((args.left_context + args.right_context + 1) * args.feat_dim,
            args.hidden_layer, args.hidden_size, args.num_classes,
            rnn_type=net_type, dropout=args.dropout,
            bidirect=bidirect, residual=residual)
nnet = nn.DataParallel(nnet, device_ids=[0, 1, 2])
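(For what it's worth, my understanding is that DataParallel scatters every tensor argument along dim 0 before calling each replica's forward, so each GPU should get a slice of feats and the matching slice of input_len. A hypothetical Probe module, purely to illustrate what I mean, not part of my actual script:)

import torch as th
import torch.nn as nn

class Probe(nn.Module):
    # hypothetical module, only to see what each replica's forward receives
    def forward(self, feats, input_len):
        print(feats.shape, input_len)
        return feats

probe = nn.DataParallel(Probe().cuda(), device_ids=[0, 1, 2])
feats = th.randn(6, 100, 440)                     # (batch, time, feat)
input_len = th.tensor([100, 90, 80, 70, 60, 50])
probe(feats, input_len)                           # each replica should print a 2-utterance slice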
When I run the main script, I get the following error:
Traceback (most recent call last):
File "./train/train_rnn_pack_sort.py", line 195, in <module>
train(args)
File "./train/train_rnn_pack_sort.py", line 150, in train
tr_frame_acc = train_epoch(nnet, criterion, optimizer, train_loader, num_parallel, train_dataset.num_frames, is_rnn=True)
File "./train/train_rnn_pack_sort.py", line 60, in train_epoch
train_frame, pos_frames = common_pack_sort.train_one_epoch(nnet, criterion, optimizer, train_loader, num_parallel, is_rnn)
File "/search/speech/wangqingnan/asr_tools/pytorch/asr_egs/common/common_pack_sort.py", line 107, in train_one_epoch
out = nnet(feats, input_len)
File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
result = self.forward(*input, **kwargs)
File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/data_parallel.py", line 114, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/data_parallel.py", line 124, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/parallel_apply.py", line 65, in parallel_apply
raise output
AttributeError: 'tuple' object has no attribute 'size'
And if I use “x = self.lstm(x.data)” instead of “x = self.lstm(x)” in the forward method, the error is as follows:
Traceback (most recent call last):
File "./train/train_rnn_pack_sort.py", line 195, in <module>
train(args)
File "./train/train_rnn_pack_sort.py", line 150, in train
tr_frame_acc = train_epoch(nnet, criterion, optimizer, train_loader, num_parallel, train_dataset.num_frames, is_rnn=True)
File "./train/train_rnn_pack_sort.py", line 60, in train_epoch
train_frame, pos_frames = common_pack_sort.train_one_epoch(nnet, criterion, optimizer, train_loader, num_parallel, is_rnn)
File "/search/speech/wangqingnan/asr_tools/pytorch/asr_egs/common/common_pack_sort.py", line 107, in train_one_epoch
out = nnet(feats, input_len)
File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
result = self.forward(*input, **kwargs)
File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/data_parallel.py", line 114, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/data_parallel.py", line 124, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/search/speech/wangqingnan/Anaconda/anaconda2/lib/python2.7/site-packages/torch/nn/parallel/parallel_apply.py", line 65, in parallel_apply
raise output
RuntimeError: input must have 3 dimensions, got 2
However, I remember that with a single GPU, “x” is still a PackedSequence object and the forward pass runs without any problems.
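For comparison, here is a stripped-down sketch of the single-GPU pack -> LSTM -> unpack pattern I have in mind, with a plain nn.LSTM and made-up sizes (not my real model, just to show the case that works for me):

import torch as th
import torch.nn as nn
import torch.nn.utils.rnn as rnn_pack

lstm = nn.LSTM(input_size=40, hidden_size=32, batch_first=True, bidirectional=True)

x = th.randn(4, 10, 40)               # (batch, max_time, feat), already sorted by length
input_len = th.tensor([10, 9, 7, 5])  # descending lengths, one per utterance
max_length = x.shape[1]

packed = rnn_pack.pack_padded_sequence(x, input_len, batch_first=True)
packed_out, _ = lstm(packed)          # nn.LSTM accepts a PackedSequence directly
out, out_len = rnn_pack.pad_packed_sequence(packed_out, total_length=max_length,
                                            batch_first=True)
print(out.shape)                      # (4, 10, 64): padded back to max_length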
Any help would be appreciated…