DataParallel is not splitting the target tensor

Hi there,
I am trying to run my model using DataParallel. When I pass the input to the model, the “source” tensors get divided into 2 sub-batches with dimensions src: torch.Size([64, 3, 41, 266]), while the target tensors do not. I am using a batch size of 128 and 2 GPUs, which should give 2 sub-batches of 64 elements, but that is not what I get for the target tensors. The shape of the target I am getting is: torch.Size([184, 128]).
Could you tell me what I am doing wrong? Any suggestions will be appreciated! Thank you!

# -*- coding: utf-8 -*-

import random
import time

import pandas
import torch
import torch.nn as nn

def train(model, vocab, batch_size, train_dataloader, optimizer, criterion, device, clip, write_file):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(tdi, vocab, True, True, 0.5)`` and expected
            to return ``(output, pred, encoder, decoder)``.
        vocab: object whose ``itos`` maps a token id to its string.
        batch_size: not used; the batch size is read from the target tensor.
            Kept so existing callers keep working.
        train_dataloader: sized iterable yielding ``(img, mml)`` batches,
            where ``mml`` is the target, indexed as [trg len, batch size].
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss, called as ``criterion(output, trg)`` on the
            flattened output and target.
        device: not used; the target is moved to the device of the model
            output so both loss operands always match. Kept for callers.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        write_file: when true, the target and predicted token sequences of
            every batch are written to the two log files.

    Returns:
        ``(net_loss, encoder, decoder)``. ``encoder`` and ``decoder`` are
        whatever the model returned for the last batch; for an empty
        dataloader the result is ``(0.0, None, None)``.
    """
    model.train()  # train mode is ON i.e. dropout and normalization tech. will be used

    epoch_loss = 0.0
    encoder = decoder = None

    # Both log files are (re)created on every call, as before; the context
    # manager guarantees they are flushed and closed even if a batch fails.
    with open('logs/train_targets.txt', 'w') as trg_seqs, \
            open('logs/train_predicted.txt', 'w') as pred_seqs:
        for tdi in train_dataloader:
            # setting gradients to zero
            optimizer.zero_grad()

            output, pred, encoder, decoder = model(tdi, vocab, True, True, 0.5)

            # The target is the second element of the batch (the model's
            # forward unpacks it the same way).
            _, mml = tdi
            trg = mml.to(output.device, dtype=torch.int64)

            # translating and storing trg and pred sequences in batches
            if write_file:
                for idx in range(trg.shape[1]):
                    trg_tokens = [vocab.itos[tok] for tok in trg[:, idx].tolist()]
                    trg_seqs.write(" ".join(trg_tokens) + '\n')

                    # pred may be a float tensor; cast so the ids can index itos
                    pred_tokens = [vocab.itos[tok] for tok in pred[:, idx].long().tolist()]
                    pred_seqs.write(" ".join(pred_tokens) + '\n')

            # trg = [trg len, batch size]
            # output = [trg len, batch size, output dim]
            # Position 0 is skipped in both, then everything is flattened.
            output_dim = output.shape[-1]
            flat_output = output[1:].reshape(-1, output_dim)
            flat_trg = trg[1:].reshape(-1)

            loss = criterion(flat_output, flat_trg)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            epoch_loss += loss.item()

    num_batches = len(train_dataloader)
    net_loss = epoch_loss / num_batches if num_batches else 0.0

    return net_loss, encoder, decoder


class OpenNMTImg2Seq(nn.Module):
    """Calling class: runs the encoder once, then the decoder token by token."""

    def __init__(self, encoder, decoder, device, encoder_dim, hid_dim):
        """Store the sub-modules and the target device.

        ``encoder_dim`` and ``hid_dim`` are accepted but not used here.
        """
        super(OpenNMTImg2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, tdi, vocab, write_flag=False, teacher_force_flag=False, teacher_forcing_ratio=0):
        """Decode a batch and return the per-step decoder outputs.

        Args:
            tdi: ``(img, mml)`` pair; ``mml`` is the target, indexed below as
                [trg_len, batch].
            vocab: object whose ``stoi`` contains ``'<sos>'`` (only read when
                ``write_flag`` is true).
            write_flag: when true, also return the argmax token ids per step.
            teacher_force_flag: when true, each step feeds the ground-truth
                token with probability ``teacher_forcing_ratio``.
            teacher_forcing_ratio: probability used for teacher forcing.

        Returns:
            ``(outputs, pred_seq_per_batch, encoder, decoder)`` when
            ``write_flag`` is true, otherwise ``(outputs, encoder, decoder)``.
            ``outputs`` is [trg_len, batch, output_dim]; row 0 stays zero.

        NOTE(review): nn.DataParallel splits tensor inputs along dim 0, so a
        [trg_len, batch] target is split along trg_len, not batch. The fix is
        presumably a batch-first target from the caller -- confirm.
        """
        (img, mml) = tdi
        # NOTE(review): the two device moves below are reconstructed; the
        # original expressions were lost -- confirm against the real source.
        trg = mml.to(self.device, dtype=torch.int64)
        # checking the shapes: can see the difference here.
        print('trg shape:  ',  trg.shape)
        src = img.to(self.device)
        print(' src, trg shape:  ', src.shape, trg.shape)

        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_dim = self.decoder.output_dim

        # to store all separate outputs of individual token
        outputs = torch.zeros(trg_len, batch_size, trg_dim, device=self.device)  # [trg_len, batch, output_dim]
        # run the encoder --> get flattened FV of images
        encoder_out, hidden, cell = self.encoder(src)       # enc_output: [HxW+1, B, H*2]   Hid/cell: [1, B, Hid]
        print('encoder_out: ', encoder_out.shape)

        dec_src = trg[0, :]   # first target token of every sequence, [B]

        if write_flag:
            pred_seq_per_batch = torch.zeros(trg.shape)
            init_idx = vocab.stoi['<sos>']  # 2
            pred_seq_per_batch[0, :] = torch.full(dec_src.shape, init_idx)

        for t in range(1, trg_len):

            output, hidden, cell = self.decoder(dec_src, encoder_out, hidden, cell)     # O: [B, out]   H: [1, B, Hid]
            # keep this step's scores; without it the returned tensor is all zeros
            outputs[t] = output
            top1 = output.argmax(1)     # [batch_size]

            if write_flag:
                pred_seq_per_batch[t, :] = top1
            # decide if teacher forcing should be used or not
            teacher_force = False
            if teacher_force_flag:
                teacher_force = random.random() < teacher_forcing_ratio

            dec_src = trg[t] if teacher_force else top1

        if write_flag:
            return outputs, pred_seq_per_batch, self.encoder, self.decoder
        return outputs, self.encoder, self.decoder

The shapes I am getting on GPU1 (and likewise on GPU0):

 trg shape:   torch.Size([183, 128])
 src, trg shape:   torch.Size([64, 3, 41, 266]) torch.Size([183, 128])

If you are using batch size of 128, won’t the target size be (128, x)? i.e., first dimension is the batch size.

Hi @InnovArul, I am using DataParallel, which should split the batch among the GPUs. If you look at the shape of src, it is divided into two equal batches of 64, while trg is still 128. This creates problems down the road while running the model.

I am not sure why it happens. I could not reproduce it yet.

import torch
import torch.nn as nn

class Model(nn.Module):
    """Minimal module that prints the shapes of its two inputs and echoes them."""

    def __init__(self):
        # nn.Module must be initialised before the instance can be used
        super().__init__()

    def forward(self, x, y):
        """Print ``x.shape`` and ``y.shape`` and return ``(x, y)`` unchanged."""
        print(x.shape, y.shape)
        return x, y

if __name__ == '__main__':
    # Wrap the echo module so that each replica prints the shapes it receives.
    parallel_model = nn.DataParallel(Model())

    # Two inputs whose first dimensions differ (20 vs 35).
    first = torch.randn(20, 35)
    second = torch.randn(35, 20)
    parallel_model(first, second)
torch.Size([10, 35]) torch.Size([18, 20])
torch.Size([10, 35]) torch.Size([17, 20])

@InnovArul Thank you very much for your help.
There was one small mistake that I was making. The shape of src was [Batch, channel, W, H] while that of trg was [seq_len, Batch]. It should be [Batch, seq_len]: DataParallel splits every input tensor along dimension 0, so the batch has to be the first dimension of each tensor.