I changed my code to use an LSTM like so:
import torch
import torch.nn as nn
from torch.autograd import Variable
class StepRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(input_size=hidden_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        batch_size = input.size(0)
        encoded = self.encoder(input)
        # One time step: reshape to (seq_len=1, batch, hidden), the default LSTM layout
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        # Hidden/cell states are (num_layers, batch, hidden_size)
        return (Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda(),
                Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda())
decoder = StepRNN(
    input_size=100,
    hidden_size=8,
    output_size=100,
    num_layers=1)
decoder_dist = nn.DataParallel(decoder, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], dim=0)
decoder_dist.cuda()

batch_size = 16
hidden = decoder.init_hidden(batch_size)
# Dummy data: random token indices in [0, input_size)
input_ = Variable(torch.LongTensor(batch_size, 10).random_(0, 100)).cuda()
target = Variable(torch.LongTensor(batch_size, 10).random_(0, 100)).cuda()

for c in range(10):
    decoder_dist(input_[:, c].contiguous(), hidden)
The result is again RuntimeError: Expected hidden size (1, 2, 8), got (1, 16, 8). Full trace. It doesn't seem to affect only GRU, so I modified the title of this post for future searches.
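If I understand the scatter step correctly, DataParallel splits every tensor argument along dim 0 across the 8 GPUs, so the per-step input of shape (16,) becomes chunks of 2, while the hidden state of shape (1, 16, 8) has its batch dimension on dim 1 and so isn't split the same way. Here's a quick shape check I did to convince myself; this is only my own sketch of what I think the scatter does, not the actual DataParallel internals:

import torch

batch_size, num_gpus = 16, 8
input_col = torch.zeros(batch_size).long()   # one time step, shape (16,)
hidden = torch.zeros(1, batch_size, 8)       # (num_layers, batch, hidden_size)

# Split along dim 0, the way I believe DataParallel scatters its arguments
input_chunks = input_col.chunk(num_gpus, dim=0)   # 8 chunks of shape (2,)
hidden_chunks = hidden.chunk(num_gpus, dim=0)     # only 1 chunk, still (1, 16, 8)

print(input_chunks[0].size())                        # torch.Size([2])
print(len(hidden_chunks), hidden_chunks[0].size())   # 1 torch.Size([1, 16, 8])

That would match the error: each replica gets a batch of 2 but still receives the full (1, 16, 8) hidden state.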
What is the right way to parallelize consistent with the PyTorch defaults? It seems like DataParallel expects the data in a non-standard layout, or am I missing something? As a beginner it's confusing to rethink everything I've learned just to use batch_first=True. How can I use DataParallel with the defaults, or how would I have to modify the code above to use batch_first=True? My rough attempt at a batch_first=True version is below.
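For reference, here's how I think the model would look with batch_first=True (the class name is just mine, and I'm not sure this is the right way to do it, so corrections are welcome):

class StepRNNBatchFirst(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.encoder = nn.Embedding(input_size, hidden_size)
        # batch_first=True: the LSTM now expects input of shape (batch, seq, feature)
        self.rnn = nn.LSTM(input_size=hidden_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           batch_first=True)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        batch_size = input.size(0)
        encoded = self.encoder(input)                               # (batch, hidden)
        output, hidden = self.rnn(encoded.view(batch_size, 1, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        # As far as I can tell, the hidden/cell states stay (num_layers, batch, hidden)
        # even with batch_first=True, which seems to be part of my confusion.
        return (Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda(),
                Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda())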
Thanks for any input.