Expected input batch_size (2550) to match target batch_size (11)

Hello. I’m building a Transformer with torchtext, following this tutorial.
I made my datasets with torchtext.

I think this error is caused by the padding length being different for each batch.
Do you know how to solve this problem?

train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), batch_sizes=(50, 50, 50), device=device)
for batch in train_iter:
  print("input size: ", batch.IN.shape)

This is part of the output.

input size:  torch.Size([33, 50])
input size:  torch.Size([55, 50])
input size:  torch.Size([26, 50])
input size:  torch.Size([43, 50])

Here is the model.

class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        # causal mask: 0 on and below the diagonal, -inf above it
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask=None):
        if src_mask is None:
            # fall back to a cached causal mask that matches the current sequence length
            if self.src_mask is None or self.src_mask.size(0) != src.size(0):
                device = src.device
                self.src_mask = self.generate_square_subsequent_mask(src.size(0)).to(device)
            src_mask = self.src_mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output
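
As a side note, a small standalone sketch (not part of the model) of what generate_square_subsequent_mask builds: a causal mask with 0 on and below the diagonal and -inf above it, so each position can only attend to itself and earlier positions.

import torch

sz = 4
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])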

These are the model's hyperparameters.

ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 50*40 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
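
Just to check the expected layout, a quick sketch with toy dimensions (not the real hyperparameters): the encoder output keeps the [seq_len, batch, emb] layout, and after the Linear decoder the last dimension becomes ntokens.

import torch
import torch.nn as nn

ninp, nhead, nhid, nlayers = 8, 2, 16, 2        # toy sizes, for a shape check only
layer = nn.TransformerEncoderLayer(ninp, nhead, nhid)
enc = nn.TransformerEncoder(layer, nlayers)
src = torch.randn(33, 50, ninp)                 # [seq_len, batch, emb], like the batches above
print(enc(src).shape)                           # torch.Size([33, 50, 8])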

What is the error?
torchtext actually creates the batches for you and pads the sequences (using the default <pad> token) depending on the input sequence length parameter. It would be good to post your torchtext code here too.

The error occurred in this function. I want to train the Transformer on my own datasets.

criterion = nn.CrossEntropyLoss()
lr = 5 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train(iterator):
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(SRC.vocab.stoi)
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)  # bptt: fixed sequence length, defined elsewhere as in the tutorial
    for i, batch in enumerate(iterator):
        #data, targets = get_batch(train_data, i)
        data = batch.SRC
        targets = batch.TRG
        #print(data.shape)
        optimizer.zero_grad()
        if data.shape[0] != bptt:
            src_mask = model.generate_square_subsequent_mask(data.shape[0]).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)  # <- this is the line that raises the ValueError below
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, i, len(iterator), scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(train_iter)
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

This is the error message.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-98-902b689e0b7c> in <module>()
      5 for epoch in range(1, epochs + 1):
      6     epoch_start_time = time.time()
----> 7     train(train_iter)
      8     val_loss = evaluate(model, val_data)
      9     print('-' * 89)

4 frames
<ipython-input-97-731a786093a5> in train(iterator)
     20             src_mask = model.generate_square_subsequent_mask(data.shape[0]).to(device)
     21         output = model(data, src_mask)
---> 22         loss = criterion(output.view(-1, ntokens), targets)
     23         loss.backward()
     24         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
    946     def forward(self, input: Tensor, target: Tensor) -> Tensor:
    947         return F.cross_entropy(input, target, weight=self.weight,
--> 948                                ignore_index=self.ignore_index, reduction=self.reduction)
    949 
    950 

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
   2420     if size_average is not None or reduce is not None:
   2421         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2422     return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
   2423 
   2424 

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   2214     if input.size(0) != target.size(0):
   2215         raise ValueError('Expected input batch_size ({}) to match target batch_size ({}).'
-> 2216                          .format(input.size(0), target.size(0)))
   2217     if dim == 2:
   2218         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)

ValueError: Expected input batch_size (660) to match target batch_size (9).

Two things:

  1. Can you share the part where you are reading and preparing the data and building the vocab? If you’re using a text field, maybe try:
    text_field = data.Field(lower=True, fix_length=100)
  2. I think this part (generating src_mask in the training loop) can be moved inside the model definition.

Make sure the input sequence length is the same for all batches, and then align it with the target length; see the sketch below.
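
For example, a sketch with dummy tensors (not your actual model): output.view(-1, ntokens) has seq_len * batch_size rows, while an un-flattened target keeps its shape [trg_len, batch_size], so the loss sees trg_len as the "batch size". If both fields are padded to the same fixed length (e.g. fix_length=100 on SRC and TRG) and the targets are flattened as well, the two sides match.

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
ntokens = 100                                        # toy vocabulary size
seq_len, batch_size = 35, 20                         # assume SRC and TRG are padded to the same length

output = torch.randn(seq_len, batch_size, ntokens)   # stand-in for the model output
targets = torch.randint(0, ntokens, (seq_len, batch_size))

# mismatch: criterion(output.view(-1, ntokens), targets)
#   -> the input has seq_len * batch_size rows, but target.size(0) is only seq_len

# match: flatten the targets too
loss = criterion(output.view(-1, ntokens), targets.view(-1))
print(loss.item())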

This is the data preparation part.

I think all the batches are the same size.

# this is the tokenizer for Japanese
import janome
from janome.tokenizer import Tokenizer
j_t = Tokenizer()
def tokenizer(text): 
    return [tok for tok in j_t.tokenize(text, wakati=True)]

SRC = data.Field(sequential=True, tokenize=tokenizer, init_token='<sos>',
                 eos_token='<eos>', lower=True)
TRG = data.Field(sequential=True, tokenize=tokenizer, init_token='<sos>',
                 eos_token='<eos>', lower=True)

train, val, test = data.TabularDataset.splits(
        path="./", train='train.tsv',
        validation='val.tsv', test='test.tsv', format='tsv',
        fields=[('SRC', SRC), ('TRG', TRG)])

SRC.build_vocab(train, min_freq=1)
TRG.build_vocab(train, min_freq=1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_batch_size = 20
test_batch_size = 10
eval_batch_size = 2
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), sort=False,
    batch_sizes=(train_batch_size, eval_batch_size, test_batch_size), device=device)
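
A quick way to check this with the iterators defined just above: without fix_length, both the padded length per batch and the SRC vs TRG lengths can differ.

for i, batch in enumerate(train_iter):
    print("SRC:", batch.SRC.shape, " TRG:", batch.TRG.shape)  # each is [seq_len, batch_size]
    if i == 3:
        break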

I was able to solve the problem with this advice.

Thank you for helping me!!!
