I have been trying to run code that is currently on GitHub. The authors have not replied when the same question was asked in the past, so I ended up asking here.
When I run the code on one GPU it works fine, but when I try to run it on multiple GPUs (3 GPUs), I get the following error: RuntimeError: Expected hidden[0] size (1, 3, 512), got (1, 9, 512).
The model has an encoder and a decoder, and DataParallel is used for both. However, I only run into problems with the decoder. Both the encoder and the decoder are LSTMs.
The full traceback is below, followed by a minimal sketch of what I think is happening.
Traceback (most recent call last):
  File "src/train.py", line 573, in <module>
    train_val()
  File "src/train.py", line 569, in train_val
    train(train_env, agent, val_envs, tok)
  File "src/train.py", line 277, in train
    agent.train(interval, feedback=args.agent_feedback_method)
  File "/root/mount/rmm/src/agent.py", line 686, in train
    self.rollout(speaker_branching=self.train_branching)
  File "/root/mount/rmm/src/agent.py", line 414, in rollout
    prev_masks=masks, prev_entropys=entropys, prev_ml_loss=loss, train_rl=train_rl)
  File "/root/mount/rmm/src/speaker.py", line 78, in train
    prev_masks=prev_masks, prev_entropys=prev_entropys, prev_ml_loss=prev_ml_loss, train_rl=train_rl)
  File "/root/mount/rmm/src/speaker.py", line 213, in teacher_forcing
    logits, _, _ = self.decoder(insts, ctx, ctx_mask, h_t, c_t)  # <-- error raised here
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/usr/local/lib/python2.7/dist-packages/torch/_utils.py", line 394, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/root/mount/rmm/src/model.py", line 238, in forward
    x, (h1, c1) = self.lstm(embeds, (h0, c0))
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/rnn.py", line 556, in forward
    self.check_forward_args(input, hx, batch_sizes)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/rnn.py", line 512, in check_forward_args
    'Expected hidden[0] size {}, got {}')
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/rnn.py", line 176, in check_hidden_size
    raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
RuntimeError: Expected hidden[0] size (1, 3, 512), got (1, 9, 512)
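If I understand nn.DataParallel correctly, it scatters every tensor argument along dim 0. The decoder's batch-first inputs therefore get split (a batch of 9 becomes 3 per GPU), but h_t / c_t have shape (num_layers, batch, hidden) with the batch on dim 1, so they are not split the same way, which seems to match the numbers in the error. The following standalone sketch (not the repo code; the sizes are just taken from the error message) reproduces the same mismatch for me on 3 GPUs:

import torch
import torch.nn as nn

class ToyDecoder(nn.Module):
    # Minimal stand-in for SpeakerDecoder: an LSTM that takes an explicit (h0, c0).
    def __init__(self, hidden_size=512):
        super(ToyDecoder, self).__init__()
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

    def forward(self, x, h0, c0):
        out, _ = self.lstm(x, (h0, c0))
        return out

decoder = nn.DataParallel(ToyDecoder().cuda(), device_ids=[0, 1, 2])
x = torch.randn(9, 20, 512).cuda()   # (batch=9, seq_len, hidden): batch on dim 0, gets split per GPU
h0 = torch.zeros(1, 9, 512).cuda()   # (num_layers, batch=9, hidden): batch on dim 1, not split the same way
c0 = torch.zeros(1, 9, 512).cuda()
decoder(x, h0, c0)  # RuntimeError: Expected hidden[0] size (1, 3, 512), got (1, 9, 512)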
The encoder code is as follows.
class SpeakerEncoder(nn.Module):
    def __init__(self, feature_size, hidden_size, dropout_ratio, bidirectional, featdropout=0.3):
        super(SpeakerEncoder, self).__init__()
        self.num_directions = 2 if bidirectional else 1
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.feature_size = feature_size

        if bidirectional:
            print("BIDIR in speaker encoder!!")

        self.lstm = nn.LSTM(feature_size, self.hidden_size // self.num_directions, self.num_layers,
                            batch_first=True, dropout=dropout_ratio, bidirectional=bidirectional)
        self.drop = nn.Dropout(p=dropout_ratio)
        self.drop3 = nn.Dropout(p=featdropout)
        self.attention_layer = SoftDotAttention(self.hidden_size, feature_size)
        self.post_lstm = nn.LSTM(self.hidden_size, self.hidden_size // self.num_directions, self.num_layers,
                                 batch_first=True, dropout=dropout_ratio, bidirectional=bidirectional)

    def forward(self, action_embeds, feature, lengths, already_dropfeat=False):
        """
        :param action_embeds: (batch_size, length, 2048). The feature of the view
        :param feature: (batch_size, length, 36, 2048). The action taken (with the image feature)
        :param lengths: Not used in it
        :return: context with shape (batch_size, length, hidden_size)
        """
        x = action_embeds
        if not already_dropfeat:
            x = self.drop3(x)  # Do not dropout the spatial features

        # LSTM on the action embed
        ctx, _ = self.lstm(x)
        ctx = self.drop(ctx)

        # Att and Handle with the shape
        batch_size, max_length, _ = ctx.size()
        if not already_dropfeat:
            feature = self.drop3(feature)  # Dropout the image feature
        # print(feature.size())
        # print(ctx.size())
        x, _ = self.attention_layer(  # Attend to the feature map
            ctx.contiguous().view(-1, self.hidden_size),  # (batch, length, hidden) --> (batch x length, hidden)
            feature.view(batch_size * max_length, -1, self.feature_size),  # (batch, length, # of images, feature_size) --> (batch x length, # of images, feature_size)
        )
        x = x.view(batch_size, max_length, -1)
        x = self.drop(x)

        # Post LSTM layer
        x, _ = self.post_lstm(x)
        x = self.drop(x)
        return x
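For what it is worth, the encoder never receives an explicit hidden state and all of its tensor inputs are batch-first, which I assume is why it runs fine under DataParallel. A toy version of that case (again a standalone sketch, not the repo code) works for me on 3 GPUs:

import torch
import torch.nn as nn

class ToyEncoder(nn.Module):
    # Stand-in for the encoder case: the LSTM hidden state is left at its default (zeros),
    # so the only tensor argument is batch-first and DataParallel can split it cleanly.
    def __init__(self, hidden_size=512):
        super(ToyEncoder, self).__init__()
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

    def forward(self, x):
        out, _ = self.lstm(x)
        return out

encoder = nn.DataParallel(ToyEncoder().cuda(), device_ids=[0, 1, 2])
out = encoder(torch.randn(9, 20, 512).cuda())  # batch 9 is split into 3 per GPU without errors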
The decoder code is as follows.
class SpeakerDecoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, padding_idx, hidden_size, dropout_ratio):  # hidden_size = rnn_dim = 512
        super(SpeakerDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.drop = nn.Dropout(dropout_ratio)
        self.attention_layer = SoftDotAttention(hidden_size)
        self.projection = nn.Linear(hidden_size, vocab_size)
        self.baseline_projection = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout_ratio),
            nn.Linear(128, 1)
        )

    def forward(self, words, ctx, ctx_mask, h0, c0):
        embeds = self.embedding(words)
        embeds = self.drop(embeds)
        x, (h1, c1) = self.lstm(embeds, (h0, c0))
        x = self.drop(x)

        # Get the size
        batchXlength = words.size(0) * words.size(1)
        multiplier = batchXlength // ctx.size(0)  # By using this, it also supports the beam-search

        # Att and Handle with the shape
        # Reshaping x <the output> --> (b(word)*l(word), r)
        # Expand the ctx from (b, a, r) --> (b(word)*l(word), a, r)
        # Expand the ctx_mask (b, a) --> (b(word)*l(word), a)
        # print("IN:")
        # print(x.contiguous().view(batchXlength, self.hidden_size).size())
        # print(ctx.unsqueeze(1).expand(-1, multiplier, -1, -1).contiguous().view(batchXlength, -1, self.hidden_size).size())
        x, _ = self.attention_layer(
            x.contiguous().view(batchXlength, self.hidden_size),
            ctx.unsqueeze(1).expand(-1, multiplier, -1, -1).contiguous().view(batchXlength, -1, self.hidden_size),
            mask=ctx_mask.unsqueeze(1).expand(-1, multiplier, -1).contiguous().view(batchXlength, -1)
        )
        x = x.view(words.size(0), words.size(1), self.hidden_size)

        # Output the prediction logit
        x = self.drop(x)
        logit = self.projection(x)
        return logit, h1, c1
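One workaround I have been considering (though I am not sure it is correct, which is partly why I am asking) is to pass the hidden state to the DataParallel-wrapped decoder with the batch dimension first, so that it gets scattered along dim 0 like the other inputs, and transpose it back inside. A rough sketch of what I mean (untested against the repo, and the wrapper name is made up):

import torch.nn as nn

class DecoderDPWrapper(nn.Module):
    # Hypothetical wrapper: accepts h0/c0 in (batch, num_layers, hidden) layout so that
    # nn.DataParallel scatters them along dim 0 together with the other batch-first inputs.
    def __init__(self, decoder):
        super(DecoderDPWrapper, self).__init__()
        self.decoder = decoder

    def forward(self, words, ctx, ctx_mask, h0_bf, c0_bf):
        h0 = h0_bf.transpose(0, 1).contiguous()  # back to (num_layers, batch_per_gpu, hidden)
        c0 = c0_bf.transpose(0, 1).contiguous()
        return self.decoder(words, ctx, ctx_mask, h0, c0)

# the call site in speaker.py would then pass the transposed hidden state, roughly:
# logits, h1, c1 = self.decoder(insts, ctx, ctx_mask, h_t.transpose(0, 1), c_t.transpose(0, 1))

I am not sure whether this is the intended way to use DataParallel here, or whether the returned h1 / c1 would be gathered back correctly across the GPUs.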
I would be very thankful for any help. Thanks.