Multi-layer RNN with DataParallel

Here is my minimal example:

import torch.nn as nn
import torch
import torch.nn.init as weight_init

class Network(nn.Module):
    """Two-layer batch-first LSTM whose weights are initialized from N(0, 0.1).

    Args:
        input_size: size of each input feature vector.
        hidden_size: size of the LSTM hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True)
        # Use the in-place ``normal_`` initializer: the out-of-place
        # ``weight_init.normal`` was deprecated in torch 0.4 and later removed,
        # so the original call fails on modern PyTorch.
        for p in self.lstm.parameters():
            weight_init.normal_(p, std=0.1)

    def forward(self, input_var, h0):
        """Run the LSTM.

        Args:
            input_var: tensor of shape (batch, seq_len, input_size) — batch_first.
            h0: tuple ``(h_0, c_0)``, each of shape (num_layers, batch, hidden_size).

        Returns:
            Tuple of (output, (h_n, c_n)).
        """
        output, ht = self.lstm(input_var, h0)
        return output, ht

# Repro script: wraps the network in DataParallel and feeds an explicit
# initial hidden state. Requires at least one CUDA device.
net = Network(256,256)
net.cuda()
dp=torch.nn.DataParallel(net)
# input_var: (batch=1, seq_len=32, features=256), batch-first.
input_var=torch.autograd.Variable(torch.rand(1,32,256).cuda())
# h0/c0: (num_layers=2, batch=1, hidden=256) — note dim 0 is the LAYER
# dimension, not the batch dimension.
h0=torch.autograd.Variable(torch.randn(2,1,256).cuda())
c0=torch.autograd.Variable(torch.randn(2,1,256).cuda())
h=(h0,c0)

# NOTE(review): this is where the reported RuntimeError occurs.
# DataParallel scatters EVERY positional argument along dim 0 across the
# GPUs, so h0/c0 get chunked on their num_layers dimension, producing a
# (1, 1, 256) hidden state while the LSTM expects (2, 1, 256). The hidden
# state would need its batch dimension first (or be created inside
# forward) to survive the scatter — TODO confirm against the
# torch.nn.DataParallel scatter semantics.
out, ht=dp(input_var,h)

The error is then: RuntimeError: Expected hidden size (2, 1L, 256), got (1L, 1L, 256L). I believe this happens because PyTorch cuts the input tensor into pieces and sends them to different GPUs, but I think the hidden state should not be split that way. As for your example, I will simplify it and try it out.