Hi there,
I am trying to run my model with DataParallel. When I pass the input to the model, the "source" tensors get divided into 2 sub-batches of shape torch.Size([64, 3, 41, 266]), but the target tensors do not. I am using a batch size of 128 and 2 GPUs, which should give 2 sub-batches of 64 elements each, yet that is not what I get for the targets: the target shape I am getting is torch.Size([184, 128]).

Could you tell me what I am doing wrong? Any suggestions would be appreciated. Thank you!
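For context, my understanding is that nn.DataParallel scatters every tensor argument along dim 0 by default, so a batch-second tensor like my target ([trg_len, batch]) would be chunked along the sequence dimension rather than the batch. Here is a minimal probe I put together to illustrate what I expect the scatter to do (ShapeProbe is just a toy module, not my model):

import torch
import torch.nn as nn

class ShapeProbe(nn.Module):
    """Toy module: just reports the chunk shapes each replica receives."""
    def forward(self, src, trg):
        print('replica sees src:', src.shape, ' trg:', trg.shape)
        return (src.sum() + trg.sum()).unsqueeze(0)

if torch.cuda.device_count() >= 2:
    probe = nn.DataParallel(ShapeProbe()).cuda()
    src = torch.randn(128, 3, 41, 266).cuda()  # batch-first: dim 0 is the batch
    trg = torch.randn(184, 128).cuda()         # batch-second: dim 0 is seq len
    probe(src, trg)
    # I would expect each replica to print src [64, 3, 41, 266]
    # but trg [92, 128], since the split always happens along dim 0.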
# -*- coding: utf-8 -*-
import random

import torch
import torch.nn as nn


def train(model, vocab, batch_size, train_dataloader, optimizer, criterion,
          device, clip, write_file):
    model.train()  # train mode: dropout and batch-norm updates are active
    epoch_loss = 0
    trg_seqs = open('logs/train_targets.txt', 'w')
    pred_seqs = open('logs/train_predicted.txt', 'w')

    for i, tdi in enumerate(train_dataloader):
        # setting gradients to zero
        optimizer.zero_grad()

        output, pred, encoder, decoder = model(tdi, vocab, True, True, 0.5)

        # the model unpacks tdi itself; unpack it here too so the target is
        # available for logging and for the loss
        (_, mml) = tdi
        trg = mml.to(device, dtype=torch.int64)

        # translating and storing trg and pred sequences in batches
        if write_file:
            batch_size = trg.shape[1]
            for idx in range(batch_size):
                trg_arr = [vocab.itos[itrg] for itrg in trg[:, idx]]
                trg_seqs.write(" ".join(trg_arr) + '\n')
                pred_arr = [vocab.itos[ipred] for ipred in pred.int()[:, idx]]
                pred_seqs.write(" ".join(pred_arr) + '\n')

        # trg:    [trg len, batch size]
        # output: [trg len, batch size, output dim]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)  # drop <sos>, flatten
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    trg_seqs.close()
    pred_seqs.close()
    net_loss = epoch_loss / len(train_dataloader)
    return net_loss, encoder, decoder
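For reference, this is roughly how I build and wrap the model and call train() (enc, dec, ENC_DIM, HID_DIM, VOCAB, CLIP, and train_dataloader stand in for my actual objects):

import torch
import torch.optim as optim

device = torch.device('cuda')
model = OpenNMTImg2Seq(enc, dec, device, ENC_DIM, HID_DIM)
model = torch.nn.DataParallel(model).to(device)

optimizer = optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss(ignore_index=VOCAB.stoi['<pad>'])

net_loss, encoder, decoder = train(model, VOCAB, 128, train_dataloader,
                                   optimizer, criterion, device, CLIP,
                                   write_file=True)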
The model class being called:
class OpenNMTImg2Seq(nn.Module):
    """
    Wrapper class: runs the encoder once, then the decoder token by token.
    """
    def __init__(self, encoder, decoder, device, encoder_dim, hid_dim):
        super(OpenNMTImg2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, tdi, vocab, write_flag=False, teacher_force_flag=False,
                teacher_forcing_ratio=0):
        (img, mml) = tdi
        trg = mml.to(self.device, dtype=torch.int64)
        # checking the shapes: the difference shows up here
        print('trg shape: ', trg.shape)
        src = img.to(self.device)
        print(' src, trg shape: ', src.shape, trg.shape)

        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_dim = self.decoder.output_dim

        # to store the separate outputs of each token: [trg_len, batch, output_dim]
        # (for each token: [batch, output_dim])
        outputs = torch.zeros(trg_len, batch_size, trg_dim).to(self.device)

        # run the encoder --> get the flattened feature vectors of the images
        # encoder_out: [HxW+1, B, H*2]; hidden/cell: [1, B, Hid]
        encoder_out, hidden, cell = self.encoder(src)
        print('encoder_out: ', encoder_out.shape)

        dec_src = trg[0, :]  # [B]
        if write_flag:
            pred_seq_per_batch = torch.zeros(trg.shape)
            init_idx = vocab.stoi['<sos>']  # 2
            pred_seq_per_batch[0, :] = torch.full(dec_src.shape, init_idx)

        for t in range(1, trg_len):
            # output: [B, output_dim]; hidden/cell: [1, B, Hid]
            output, hidden, cell = self.decoder(dec_src, encoder_out, hidden, cell)
            outputs[t] = output
            top1 = output.argmax(1)  # [batch_size]
            if write_flag:
                pred_seq_per_batch[t, :] = top1
            # decide whether teacher forcing should be used for the next step
            teacher_force = False
            if teacher_force_flag:
                teacher_force = random.random() < teacher_forcing_ratio
            dec_src = trg[t] if teacher_force else top1

        if write_flag:
            return outputs, pred_seq_per_batch, self.encoder, self.decoder
        return outputs, self.encoder, self.decoder
The shapes I am getting on GPU1 (or GPU0):
trg shape: torch.Size([183, 128])
src, trg shape: torch.Size([64, 3, 41, 266]) torch.Size([183, 128])
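In case it matters, one workaround I am considering (untested): since DataParallel always scatters along dim 0, I could pass the target batch-first so the batch dimension gets split, then transpose back inside the forward. A minimal sketch (BatchFirstWrapper is a hypothetical helper, not part of my current code):

import torch
import torch.nn as nn

class BatchFirstWrapper(nn.Module):
    """Hypothetical helper: accepts a batch-first target so DataParallel's
    dim-0 scatter splits the batch, then restores [trg_len, batch]."""
    def __init__(self, seq2seq):
        super().__init__()
        self.seq2seq = seq2seq

    def forward(self, img, mml_batch_first, vocab, *args, **kwargs):
        mml = mml_batch_first.transpose(0, 1)  # [B, T] -> [T, B] per replica
        return self.seq2seq((img, mml), vocab, *args, **kwargs)

# usage sketch: wrap, then call with the target transposed to [batch, trg_len]
# model = nn.DataParallel(BatchFirstWrapper(img2seq)).cuda()
# output, pred, enc, dec = model(img, mml.t(), vocab, True, True, 0.5)

I have not verified this, so I would still like to understand why the target is not being split in the first place.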