DataParallel LSTM wrong dimensions with 8 GPUs

I posted this issue about 5 months ago; now I’m back at this project and was eagerly awaiting 0.3 to come out, but the issue remains.

This code works with a single GPU, fails with more than one (specifically 8):

import torch
import torch.nn as nn
from torch.autograd import Variable

class StepRNN(nn.Module):
    """Single-step character RNN: embedding -> LSTM -> linear decoder.

    Each forward call processes exactly one time step; the caller threads
    the LSTM hidden state between successive calls.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.encoder = nn.Embedding(input_size, hidden_size)
        # Parentheses already continue the call; the original backslash
        # line continuations were unnecessary and have been removed.
        self.rnn = nn.LSTM(input_size=hidden_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Run one time step.

        input: LongTensor of token ids, shape (batch,).
        hidden: (h_0, c_0) tuple, each of shape (num_layers, batch, hidden_size).
        Returns (logits of shape (batch, output_size), updated hidden).
        """
        batch_size = input.size(0)
        encoded = self.encoder(input)
        # nn.LSTM expects (seq_len, batch, features); seq_len is 1 here.
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zero-initialized (h_0, c_0) pair on the GPU.

        Fix: the original applied .cuda() inconsistently — once to the
        Variable and once to the underlying tensor; both lines now match.
        """
        return (Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda(),
                Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda())


# Build the per-step RNN; hidden_size is deliberately small for the repro.
decoder = StepRNN(
    input_size=100,
    hidden_size=8,
    output_size=100,
    num_layers=1)

# Replicate across all 8 GPUs, scattering inputs along the batch dim (dim=0).
decoder_dist = nn.DataParallel(decoder, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], dim=0)
decoder_dist.cuda()

batch_size = 16
hidden = decoder.init_hidden(batch_size)
# Fix: fill_(0) keeps token ids inside the embedding range [0, input_size).
# An uninitialized LongTensor holds arbitrary memory and can crash the
# embedding lookup with an out-of-range index.
input_ = Variable(torch.LongTensor(batch_size, 10).fill_(0)).cuda()
target = Variable(torch.LongTensor(batch_size, 10).fill_(0)).cuda()

for c in range(10):
    decoder_dist(input_[:, c].contiguous(), hidden)

RuntimeError: Expected hidden size (1, 2, 8), got (1, 16, 8)

pytorch 0.3
CUDA 9
cuDNN 7

It obviously has something to do with batch_first being set to False (the default). Is this a bug? I’ll be happy to file it unless I’m missing something.

Your reading of the documentation / input-output shapes of each particular layer is wrong.

I fixed the example for you:

import torch
import torch.nn as nn
from torch.autograd import Variable

class StepRNN(nn.Module):
    """One-step embedding -> LSTM -> linear decoder.

    Inputs and outputs carry an explicit leading sequence axis of length 1
    so that nn.DataParallel can scatter along dim=1 (the batch axis).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """input: (1, batch) token ids; hidden: (h, c), each (layers, batch, H)."""
        assert input.dim() == 2
        batch = input.size(1)
        # Flatten to (batch, 1) for the embedding lookup ...
        emb = self.encoder(input.view(batch, 1))
        # ... then restore the (seq=1, batch, features) layout the LSTM expects.
        emb = emb.view(1, emb.size(0), emb.size(2))
        rnn_out, hidden = self.rnn(emb, hidden)
        logits = self.decoder(rnn_out.view(batch, self.hidden_size))
        return logits.view(1, batch, -1), hidden

    def init_hidden(self, batch_size):
        """Return a zero-initialized (h_0, c_0) pair on the GPU."""
        shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = Variable(torch.zeros(*shape).cuda())
        c0 = Variable(torch.zeros(*shape).cuda())
        return (h0, c0)


# Fixed example: every tensor keeps an explicit sequence axis of length 1,
# and DataParallel scatters along dim=1 (the batch axis), so the per-replica
# hidden-state batch size matches the per-replica input batch size.
batch_size = 16
seq_length = 1
num_layers = 1

decoder = StepRNN(
    input_size=100,
    hidden_size=8,
    output_size=100,
    num_layers=num_layers,
)

# dim=1 scatters the batch axis; dim 0 (seq) stays intact on each replica.
decoder_dist = nn.DataParallel(decoder, dim=1)
decoder_dist.cuda()

hidden = decoder.init_hidden(batch_size)
# Zero-filled token ids keep the embedding lookup in range.
input_ = Variable(torch.zeros(1, batch_size, 10).long()).cuda()
target = Variable(torch.zeros(1, batch_size, 10).long()).cuda()

for step in range(10):
    output, hidden = decoder_dist(input_[:, :, step].contiguous(), hidden)
1 Like

Thank you so much for taking the time to point me in the right direction, very helpful!