Error when using DataParallel (when using LSTM)

I’ve been trying to run this code, which is currently on GitHub.

The authors have not replied to the same question asked in the past, so I ended up here.

When I run the code on one GPU it works fine, but when I try to run it on multiple GPUs (3 GPUs), I get the following error: RuntimeError: Expected hidden[0] size (1, 3, 512), got (1, 9, 512).

The model has an encoder and a decoder, and DataParallel is used for both. However, I only encounter problems with the decoder. Both the encoder and the decoder are LSTMs.
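
The wrapping is essentially the following (simplified; the real constructor arguments come from the repo's config):

# both modules are wrapped in nn.DataParallel before training (simplified sketch)
encoder = SpeakerEncoder(feature_size, hidden_size, dropout_ratio, bidirectional)
decoder = SpeakerDecoder(vocab_size, embedding_size, padding_idx, hidden_size, dropout_ratio)
encoder = torch.nn.DataParallel(encoder).cuda()
decoder = torch.nn.DataParallel(decoder).cuda()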

The full traceback is below.

Traceback (most recent call last):
  File "src/train.py", line 573, in <module>
    train_val()
  File "src/train.py", line 569, in train_val
    train( train_env, agent, val_envs, tok)
  File "src/train.py", line 277, in train
    agent.train(interval, feedback=args.agent_feedback_method) 
  File "/root/mount/rmm/src/agent.py", line 686, in train
    self.rollout(speaker_branching=self.train_branching)
  File "/root/mount/rmm/src/agent.py", line 414, in rollout
    prev_masks=masks, prev_entropys=entropys, prev_ml_loss=loss, train_rl=train_rl)
  File "/root/mount/rmm/src/speaker.py", line 78, in train
    prev_masks=prev_masks, prev_entropys=prev_entropys, prev_ml_loss=prev_ml_loss, train_rl=train_rl)
  File "/root/mount/rmm/src/speaker.py", line 213, in teacher_forcing
    logits, _, _ = self.decoder(insts, ctx, ctx_mask, h_t, c_t)  # <-- the error is raised here
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/usr/local/lib/python2.7/dist-packages/torch/_utils.py", line 394, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/root/mount/rmm/src/model.py", line 238, in forward
    x, (h1, c1) = self.lstm(embeds, (h0, c0))
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/rnn.py", line 556, in forward
    self.check_forward_args(input, hx, batch_sizes)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/rnn.py", line 512, in check_forward_args
    'Expected hidden[0] size {}, got {}')
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/rnn.py", line 176, in check_hidden_size
    raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
RuntimeError: Expected hidden[0] size (1, 3, 512), got (1, 9, 512)

The Encoder code is below.

class SpeakerEncoder(nn.Module):
    def __init__(self, feature_size, hidden_size, dropout_ratio, bidirectional, featdropout=0.3):
        super(SpeakerEncoder, self).__init__()
        self.num_directions = 2 if bidirectional else 1
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.feature_size = feature_size

        if bidirectional:
            print("BIDIR in speaker encoder!!")

        self.lstm = nn.LSTM(feature_size, self.hidden_size // self.num_directions, self.num_layers,
                            batch_first=True, dropout=dropout_ratio, bidirectional=bidirectional)
        self.drop = nn.Dropout(p=dropout_ratio)
        self.drop3 = nn.Dropout(p=featdropout)
        self.attention_layer = SoftDotAttention(self.hidden_size, feature_size)

        self.post_lstm = nn.LSTM(self.hidden_size, self.hidden_size // self.num_directions, self.num_layers,
                                 batch_first=True, dropout=dropout_ratio, bidirectional=bidirectional)

    def forward(self, action_embeds, feature, lengths, already_dropfeat=False):
        """
        :param action_embeds: (batch_size, length, 2048). The embedding of the action taken at each step
        :param feature: (batch_size, length, 36, 2048). The image features of the 36 views at each step
        :param lengths: not used here
        :return: context with shape (batch_size, length, hidden_size)
        """
        x = action_embeds
        if not already_dropfeat:
            x = self.drop3(x)            # Do not dropout the spatial features

        # LSTM on the action embed
        ctx, _ = self.lstm(x)
        ctx = self.drop(ctx)

        # Att and Handle with the shape
        batch_size, max_length, _ = ctx.size()
        if not already_dropfeat:
            feature = self.drop3(feature)   # Dropout the image feature
        # print(feature.size())
        # print(ctx.size())
        x, _ = self.attention_layer(                        # Attend to the feature map
            ctx.contiguous().view(-1, self.hidden_size),    # (batch, length, hidden) --> (batch x length, hidden)
            feature.view(batch_size * max_length, -1, self.feature_size),        # (batch, length, # of images, feature_size) --> (batch x length, # of images, feature_size)
        )
        x = x.view(batch_size, max_length, -1)
        x = self.drop(x)

        # Post LSTM layer
        x, _ = self.post_lstm(x)
        x = self.drop(x)

        return x

The Decoder code is below.

class SpeakerDecoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, padding_idx, hidden_size, dropout_ratio): #hidden size = rnn_dim = 512
        super(SpeakerDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.drop = nn.Dropout(dropout_ratio)
        self.attention_layer = SoftDotAttention(hidden_size)
        self.projection = nn.Linear(hidden_size, vocab_size)
        self.baseline_projection = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout_ratio),
            nn.Linear(128, 1)
        )

    def forward(self, words, ctx, ctx_mask, h0, c0):
        embeds = self.embedding(words)
        embeds = self.drop(embeds)
        x, (h1, c1) = self.lstm(embeds, (h0, c0))

        x = self.drop(x)

        # Get the size
        batchXlength = words.size(0) * words.size(1)
        multiplier = batchXlength // ctx.size(0)         # By using this, it also supports the beam-search

        # Att and Handle with the shape
        # Reshaping x          <the output> --> (b(word)*l(word), r)
        # Expand the ctx from  (b, a, r)    --> (b(word)*l(word), a, r)
        # Expand the ctx_mask  (b, a)       --> (b(word)*l(word), a)
        # print("IN:")
        # print(x.contiguous().view(batchXlength, self.hidden_size).size())
        # print(ctx.unsqueeze(1).expand(-1, multiplier, -1, -1).contiguous().view(batchXlength, -1, self.hidden_size).size())
        x, _ = self.attention_layer(
            x.contiguous().view(batchXlength, self.hidden_size),
            ctx.unsqueeze(1).expand(-1, multiplier, -1, -1).contiguous().view(batchXlength, -1, self.hidden_size),
            mask=ctx_mask.unsqueeze(1).expand(-1, multiplier, -1).contiguous().view(batchXlength, -1)
        )
        x = x.view(words.size(0), words.size(1), self.hidden_size)

        # Output the prediction logit
        x = self.drop(x)
        logit = self.projection(x)

        return logit, h1, c1

I would be very grateful for any help. Thanks.

I assume you are using nn.DataParallel? If so, note that it’s in maintenance mode, and we generally recommend using DistributedDataParallel instead, mainly for performance reasons but also for functionality.
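
For reference, a minimal DDP skeleton looks roughly like this (one process per GPU, with a toy LSTM and made-up sizes rather than your actual modules):

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

def run(rank, world_size):
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    # toy stand-in for the decoder; each process owns one GPU and one shard of the batch
    model = DDP(nn.LSTM(512, 512, batch_first=True).cuda(rank), device_ids=[rank])

    local_batch = 3                                   # the per-process batch size
    x = torch.randn(local_batch, 20, 512).cuda(rank)
    h0 = torch.zeros(1, local_batch, 512).cuda(rank)  # hidden state built with the local batch size
    c0 = torch.zeros(1, local_batch, 512).cuda(rank)
    out, _ = model(x, (h0, c0))                       # no mismatch, since nothing is re-split

if __name__ == "__main__":
    mp.spawn(run, args=(3,), nprocs=3)                # e.g. 3 GPUs -> 3 processes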

In this case nn.DataParallel will split all inputs along dim0, since it assumes this to be the batch dimension. My guess is that the hidden states are then being split incorrectly (or created with the wrong shape), because for an LSTM the hidden state’s batch dimension is dim1, not dim0.
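
You can reproduce the mismatch with a small standalone sketch (toy sizes, assuming 3 visible GPUs, not your actual model): the input of shape (batch, seq, hidden) is chunked along dim0 as intended, but the hidden state of shape (num_layers, batch, hidden) has its batch dimension at dim1, so it is not chunked the same way.

import torch
import torch.nn as nn

class ToyDecoder(nn.Module):
    def __init__(self):
        super(ToyDecoder, self).__init__()
        self.lstm = nn.LSTM(512, 512, batch_first=True)

    def forward(self, x, h0, c0):
        # x:  (batch, seq, 512) -> DataParallel chunks dim0, i.e. the batch (intended)
        # h0: (1, batch, 512)   -> dim0 is num_layers here, so the batch is not chunked
        return self.lstm(x, (h0, c0))

model = nn.DataParallel(ToyDecoder().cuda())  # assumes 3 visible GPUs
x = torch.randn(9, 20, 512).cuda()            # dim0 = 9, chunked into per-GPU batches of 3
h0 = torch.zeros(1, 9, 512).cuda()            # dim0 = 1, so the batch of 9 is never chunked
c0 = torch.zeros(1, 9, 512).cuda()
model(x, h0, c0)
# -> RuntimeError: Expected hidden[0] size (1, 3, 512), got (1, 9, 512)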


Thank you for your reply.

As you said, I tried initializing h_t and c_t with the dimensions switched so that the batch dimension comes first, and DataParallel did split them (before, it did not split them at all).

h_t = torch.zeros(batch_size, 1, self.rnn_dim).cuda()  # switched the dimensions
c_t = torch.zeros(batch_size, 1, self.rnn_dim).cuda()  # switched the dimensions
ctx_mask = utils.length2mask(lengths)

# Get Language Input
if insts is None:
    insts = self.gt_words(obs, for_nav=for_nav)  # Language Feature

# Decoder
logits, h_t, c_t = self.decoder(insts, ctx, ctx_mask, h_t, c_t)  # this is where h_t and c_t are used as input

Because the hidden states now have the shape (batch_size, 1, hidden_size), I permute them back to (1, batch_size, hidden_size) inside the decoder’s forward.

def forward(self, words, ctx, ctx_mask, h0, c0):
    embeds = self.embedding(words)
    embeds = self.drop(embeds)
    h0 = h0.permute(1, 0, 2)
    c0 = c0.permute(1, 0, 2)
    print("h0 new shape : ", h0.shape, "c0 new shape : ", c0.shape)
    x, (h1, c1) = self.lstm(embeds, (h0, c0))

However, it still raises an error.

RuntimeError: Expected hidden[0] size (1, 3, 512), got (3, 1, 512)

I printed the shapes of the permuted h0 and c0, and the output is below.

('h0 new shape : ', (1, 3, 512), 'c0 new shape : ', (1, 3, 512))
('h0 new shape : ', (1, 3, 512), 'c0 new shape : ', (1, 3, 512))
('h0 new shape : ', (1, 3, 512), 'c0 new shape : ', (1, 3, 512))
('h0 new shape : ', (1, 3, 512), 'c0 new shape : ', (1, 3, 512))
('h0 new shape : ', (1, 3, 512), 'c0 new shape : ', (1, 3, 512))
('h0 new shape : ', (1, 3, 512), 'c0 new shape : ', (1, 3, 512))
('h0 new shape : ', (3, 1, 512), 'c0 new shape : ', (3, 1, 512)('h0 new shape : ', )(
3, 1, 512), 'c0 new shape : '('h0 new shape : ', (3, 1, 512, ), 'c0 new shape : ', (3(3, 1, 512, 1, 512))
))

The printed shapes look weird (the last lines are interleaved), and I really don’t know why it prints like that. I also don’t understand why the permute is not taking effect.
I would be very grateful for some more help on this. Thanks.

Could you post a minimal and executable code snippet I could copy to reproduce the issue?