Expected input batch_size (3) to match target batch_size (1)

The error, with prints showing that the outputs and the captions end up with the same size and shape:

Before Outputs size:  torch.Size([3, 9956])
Before Outputs shape:  torch.Size([3, 9956])
Outputs size:  torch.Size([1, 3, 9])
Captions size:  torch.Size([1, 3, 9])
Captions shape:  torch.Size([1, 3, 9])
Outputs shape:  torch.Size([1, 3, 9])
Traceback (most recent call last):
  File "/home/Documents/projects/anothertry/pytorch-tutorial/tutorials/03-advanced/image_captioning/finetuneme.py", line 169, in <module>
    main(args)
  File "/home/Documents/projects/anothertry/pytorch-tutorial/tutorials/03-advanced/image_captioning/finetuneme.py", line 124, in main
    loss.backward()
  File "/home/anaconda3/lib/python3.11/site-packages/torch/_tensor.py", line 525, in backward
    torch.autograd.backward(
  File "/home/anaconda3/lib/python3.11/site-packages/torch/autograd/__init__.py", line 267, in backward
    _engine_run_backward(
  File "/home/anaconda3/lib/python3.11/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Expected input size [3, 9], got [3, 9, 256]

Code:

    model = FineTuneModel(args.embed_size, args.hidden_size, vocab_size, args.num_layers)
    model = model.to(device)

    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    
    
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())

    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    for epoch in range(args.num_epochs):
        model.train()
        total_loss = 0
        for i, (images, captions) in enumerate(dataloader):
            images = images.to(device)
            captions = captions.to(device)


            lengths = [len(cap) for cap in captions]
            outputs = model(images, captions, lengths)
            print("Before Outputs size: ", outputs.size())
            print("Before Outputs shape: ", outputs.shape)
            linear = nn.Linear(9956, 9).to(device)
            
            outputs = linear(outputs)
            outputs = outputs.unsqueeze(0)
            captions = captions.to(dtype=torch.float32)
            #outputs = outputs.view(3, 9)
            print("Outputs size: ", outputs.size())
            print("Captions size: ", captions.size())
            print("Captions shape: ", captions.shape)
            print("Outputs shape: ", outputs.shape)
            loss = criterion(outputs, captions)
            model.zero_grad()
            loss.backward()
            
            optimizer.step()

            total_loss += loss.item()
            print(f'Epoch [{epoch+1}/{args.num_epochs}], Loss: {total_loss/len(dataloader)}')

        torch.save(model.decoder.state_dict(),
                os.path.join(args.fine_path,
                                'decoder-1-1.ckpt'))

        torch.save(model.encoder.state_dict(),
                os.path.join(args.fine_path,
                                'encoder-1-1.ckpt'))

In case it’s needed too, here is my encoder:

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        #self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        self.bn = nn.LayerNorm(embed_size)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features

If I change the loss to this:

    loss = criterion(outputs.squeeze(0), captions.flatten())

the error changes to (at line 122 in main):

Expected input batch_size (3) to match target batch_size (27)

And if I instead do this:

    outputs = outputs.view(3, 9)

the error changes to:

File "/home/es/Documents/projects/anothertry/pytorch-tutorial/tutorials/03-advanced/image_captioning/finetuneme.py", line 122, in main
    loss = criterion(outputs, captions)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "//anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home//anaconda3/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1185, in forward
    return F.cross_entropy(input, target, weight=self.weight,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home//anaconda3/lib/python3.11/site-packages/torch/nn/functional.py", line 3086, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: Expected input batch_size (3) to match target batch_size (1).
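
For what it’s worth, the same shape complaints can be reproduced with plain dummy tensors that have the shapes from the prints above (just a sketch, not my real model):

    import torch
    from torch import nn

    criterion = nn.CrossEntropyLoss()
    outputs = torch.randn(3, 9)                          # shape after the extra nn.Linear(9956, 9)
    captions = torch.randint(0, 9, (1, 3, 9)).float()    # padded captions cast to float, as above

    # criterion(outputs, captions)            -> Expected input batch_size (3) to match target batch_size (1)
    # criterion(outputs, captions.flatten())  -> Expected input batch_size (3) to match target batch_size (27)
    loss = criterion(outputs.unsqueeze(0), captions)     # this forward pass runs; in my code the failure is in loss.backward()
    print(loss)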

PS: Don’t mind the file paths, I’m deleting the username.

Could you post a minimal and executable code snippet reproducing the issue?
Also, double post from here.

I’m not sure how I can do that. The script depends on the model, which lives in another file, and so does the vocabulary.
The training uses the full model, which needs both the CNN and the RNN. Could you give a hint on how to share a minimal and executable code snippet when it pulls together every little bit of my files?

PS:

If I remove the loss.backward() call, the error disappears and the training loop runs, but the model trains very badly.

This post explains it in detail. It’s helpful to be able to actually execute your code and debug it properly instead of trying to guess what the issue is, so try to narrow down the failing module and use random data to reproduce the error.
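
As a first check, you could run the criterion alone on random data to see which shapes it accepts. A minimal sketch (the 9956 and 27 below are only taken from your prints): nn.CrossEntropyLoss expects (N, num_classes) logits and an (N,)-shaped LongTensor of class indices, and with those shapes the backward pass also works:

    import torch
    from torch import nn

    criterion = nn.CrossEntropyLoss()

    vocab_size = 9956                    # from your "Before Outputs size" print
    num_tokens = 27                      # e.g. 3 captions * 9 tokens

    logits = torch.randn(num_tokens, vocab_size, requires_grad=True)  # stand-in for the decoder outputs
    targets = torch.randint(0, vocab_size, (num_tokens,))             # word indices as a LongTensor, not floats

    loss = criterion(logits, targets)    # shapes (N, C) and (N,) match
    loss.backward()
    print(loss.item(), logits.grad.shape)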

Here is the minimal executable code snippet reproducing the issue:

import os
import pickle
import argparse
import torch
from torch import nn 
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from PIL import Image
from collections import Counter
import nltk

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

image_to_caption = {
    '0': ['a women in a white shirt fighting a man',
            'a man touching a women',
            'a man assaulting a women'],
    #'1': ['asd', 'asd']
}


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.LayerNorm(embed_size)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length

        #Not necessary :-|
        #self.vocab_size = vocab_size

    def forward(self, features, captions, lengths):
        embeddings = self.embed(captions)
        
        features = features.unsqueeze(1).unsqueeze(2)
        features = features.expand(-1, 3, 9, -1)

        embeddings = torch.cat((features, embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True) 
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])
        return outputs

    def sample(self, features, states=None):
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seg_length):
            hiddens, states = self.lstm(inputs, states)
            outputs = self.linear(hiddens.squeeze(1))
            _, predicted = outputs.max(1)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)
            inputs = inputs.unsqueeze(1)
        sampled_ids = torch.stack(sampled_ids, 1) 
        return sampled_ids

class Vocabulary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

def build_vocab(image_to_caption, threshold):
    counter = Counter()
    for captions in image_to_caption.values():
        for caption in captions:
            tokens = nltk.tokenize.word_tokenize(caption.lower())
            counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

def main(args):
    vocab = build_vocab(image_to_caption, threshold=1)
    vocab_size = len(vocab)

    transform = transforms.Compose([
        transforms.Resize((args.crop_size)),
        transforms.Grayscale(num_output_channels=3),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])
    
    class CustomDataset(Dataset):
        def __init__(self, image_to_caption):
            self.image_to_caption = image_to_caption
            self.transform = transform

        def __len__(self):
            return 1
        
        def __getitem__(self, index):
            img = torch.randn(3, 9, 9948)
            print(img)
            #img = torch.Generator()
            #img = transform(img)

            def tokenize(caption, vocabulary):
                words = caption.split()
                tokens = []
                for word in words:
                    if word in vocabulary.word2idx:
                        tokens.append(vocabulary(word))
                return tokens
            
            captions = self.image_to_caption[str(index)]
            tokenized_captions = [torch.tensor(tokenize(caption, vocab)) for caption in captions]
            tokenized_captions = pad_sequence(tokenized_captions, batch_first=True)

            return img, tokenized_captions

    class FineTuneModel(nn.Module):
        def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
            super(FineTuneModel, self).__init__()
            self.encoder = EncoderCNN(embed_size)
            self.decoder = DecoderRNN(embed_size,
                                      hidden_size,
                                      vocab_size,
                                      num_layers)

        def forward(self, images, captions, lengths):
            features = self.encoder(images)
            outputs = self.decoder(features, captions, lengths)
            return outputs #CHECK DECODER -- checks out in this code
    
    dataset = CustomDataset(image_to_caption=image_to_caption)
    
    dataloader = DataLoader( dataset, batch_size=args.batch_size, shuffle=True, drop_last=False)

    model = FineTuneModel(args.embed_size, args.hidden_size, vocab_size, args.num_layers)

    model = model.to(device)

    """Model load here"""

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    for epoch in range(args.num_epochs):
        model.train()
        #Check model train and eval functions
        total_loss = 0
        for i, (images, captions) in enumerate(dataloader):
            images = images.to(device)
            captions = captions.to(device)

            lengths = [len(cap) for cap in captions]
            outputs = model(images, captions, lengths)

            print("\nBefore Outputs size: ", outputs.size(),
                  "\nBefore Captions size: ", captions.size())
            
            linear = nn.Linear(13, 9).to(device)
            outputs = linear(outputs)

            outputs = outputs.unsqueeze(0)

            print("\nAfter Outputs size: ", outputs.size(),
                  "\nAfter Captions size: ", captions.size(),
                  "\nAfter Outputs shape: ", outputs.shape,
                  "\nAfter Captions shape: ", captions.shape,
                  "\n")

            captions = captions.to(dtype=torch.float32)
            loss = criterion(outputs, captions) #check criterion variables

            print("\nLast Outputs size: ", outputs.size(),
                  "\nLast Captions size: ", captions.size(),
                  "\n \n")

            model.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item() # check this function
            print(f'Epoch [{epoch+1}/{args.num_epochs}], Loss: {total_loss/len(dataloader)}')

        """Model save here"""

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--crop_size', type=int, default=224, help='size for cropping images')
    #parser.add_argument('--vocab_path', type=str, default='data/vocab.pkl', help='path for vocab file')
    #parser.add_argument('--image_folder', type=str, default='finetuning/images', help='path for the images folder')
    parser.add_argument('--batch_size', type=int, default=1, help='batch size')
    parser.add_argument('--embed_size', type=int, default=256, help='dimension of word embedding vectors')
    parser.add_argument('--hidden_size', type=int, default=512, help='dimension of lstm hidden states')
    parser.add_argument('--num_layers', type=int, default=1, help='number of layers in lstm')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate for schedule')
    parser.add_argument('--num_epochs', type=int, default=1, help='number of epochs for training')

    args = parser.parse_args()
    print(args)
    main(args)

Hi, have you had a chance to try it yet?