I have a very simple LSTM model:
import torch
import torch.nn as nn

class MyLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(MyLSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.LSTM = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.LNN = nn.Linear(hidden_dim, input_dim)

    def forward(self, i, hc):
        o, _ = self.LSTM(i, hc)  # o: (batch, T, hidden_dim) since batch_first=True
        return self.LNN(o)       # projected back to (batch, T, input_dim)
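Just to make the expected shapes concrete, here is a quick sanity check with toy, made-up sizes:

model = MyLSTM(input_dim=4, hidden_dim=4)  # toy dimensions for illustration only
seq = torch.rand(16, 10, 4)                # (batch, T, input_dim), batch_first=True
h0 = torch.rand(1, 16, 4)                  # (num_layers, batch, hidden_dim)
c0 = torch.rand(1, 16, 4)
print(model(seq, (h0, c0)).size())         # torch.Size([16, 10, 4])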
I train it with the following code:
from torch.utils.data import DataLoader

params = {'batch_size': args.batch,
          'shuffle': False,
          'drop_last': True}
dataloader = DataLoader(train_set, **params)

hidden_dim = n_features
model = MyLSTM(n_features, hidden_dim)
if cuda:
    model.cuda()
optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)
loss_f = nn.MSELoss()

for epoch in range(args.epochs):
    for i, (seq, truth) in enumerate(dataloader):
        optimizer.zero_grad()
        # fresh random initial states per batch: (num_layers, batch, hidden_dim)
        h = torch.rand(1, args.batch, hidden_dim)
        c = torch.rand(1, args.batch, hidden_dim)
        if cuda:
            seq, truth, h, c = seq.cuda(), truth.cuda(), h.cuda(), c.cuda()
        output = model(seq, (h, c))
        loss = loss_f(output, truth)
        loss.backward()
        optimizer.step()
This works fine, but now I would like to use DataParallel to train my model on 2 GPUs.
So I changed the above code to:
if cuda:
    if len(gpus) > 1:
        model = nn.DataParallel(model, device_ids=[0, 1])
    model.cuda()
Also, it seems that the hidden state tensors must be resized as well, since each replica only sees args.batch // n_gpus samples:

h = torch.rand(1, args.batch // n_gpus, hidden_dim)
c = torch.rand(1, args.batch // n_gpus, hidden_dim)
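My understanding (which may be wrong) is that nn.DataParallel chunks every tensor argument of forward() along dim 0, one chunk per GPU. A toy sketch of what that splitting does to my shapes, with sizes shrunk for readability:

import torch

batch, n_gpus, T, n_features = 16, 2, 10, 4   # toy sizes, not my real ones
seq = torch.rand(batch, T, n_features)        # batch-first input
h = torch.rand(1, batch // n_gpus, n_features)

print([t.shape for t in seq.chunk(n_gpus, dim=0)])  # two chunks of (8, 10, 4)
print([t.shape for t in h.chunk(n_gpus, dim=0)])    # a single chunk: dim 0 has size 1

(Dim 0 of h is the num_layers dimension, which is 1, so I am not sure chunking along it does what I want.)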
Now, I face another problem (assuming batch size = 16):
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py:431: UserWarning: Using a target size (torch.Size([16, 1440, 3969])) that is different to the input size (torch.Size([8, 1440, 3969])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
Traceback (most recent call last):
File "py/train.py", line 144, in <module>
loss = loss_f(output, truth)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py", line 431, in forward
return F.mse_loss(input, target, reduction=self.reduction)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py", line 2215, in mse_loss
expanded_input, expanded_target = torch.broadcast_tensors(input, target)
File "/usr/local/lib/python3.6/dist-packages/torch/functional.py", line 52, in broadcast_tensors
return torch._C._VariableFunctions.broadcast_tensors(tensors)
RuntimeError: The size of tensor a (8) must match the size of tensor b (16) at non-singleton dimension 0
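The mismatch itself is easy to reproduce in isolation (shapes shrunk here; my real ones are (8, 1440, 3969) for the output and (16, 1440, 3969) for the target):

import torch
import torch.nn as nn

out = torch.rand(8, 10, 4)    # what comes back from the model: half the batch
tgt = torch.rand(16, 10, 4)   # the full-batch target
nn.MSELoss()(out, tgt)        # RuntimeError: tensor a (8) vs tensor b (16) at dim 0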
After a bit of digging, it seems that, somehow, only one replica of my model is actually used. Each replica correctly receives half of the batch (batch size 8), but since only one of them runs, the final output has batch size 8 instead of 16, hence the size mismatch error.
This is confirmed by the fact that, if I put a print(o.size()) inside the forward method of MyLSTM, I see only one torch.Size([8, T, n_features]), while I would expect two.
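For reference, this is the instrumented forward I used for that check:

def forward(self, i, hc):
    o, _ = self.LSTM(i, hc)
    print(o.size())  # with 2 replicas I would expect this to print twice per call
    return self.LNN(o)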
Any idea on the possible causes of this?