ValueError: Expected input batch_size (512) to match target batch_size (112)

import pandas as pd
from PIL import Image
import cv2
import numpy as np
from ultralytics import YOLO
import matplotlib.pyplot as plt
import os
from torchvision import transforms
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
import re


license_label = pd.read_csv('/Users/velmurugan/Desktop/@/python_works/License plate detection/Licplatesrecognition_train.csv')
license_label.head()


class CRNNDataset(Dataset):
    def __init__(self, df, img_dir, transform=None):
        self.data = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = self.data.iloc[idx, 0]
        img_path = f"{self.img_dir}/{img_name}"
        image = Image.open(img_path).convert('L')  # Convert image to grayscale

        if self.transform:
            image = self.transform(image)

        label_str = self.data.iloc[idx, 1]
        label = [int(char) for char in label_str]

        return image, label
    
train_df = license_label.iloc[:870]
val_df = license_label.iloc[870:]
train_img_path = '/Users/velmurugan/Desktop/@/python_works/License plate detection/reco/train'
val_img_path = '/Users/velmurugan/Desktop/@/python_works/License plate detection/reco/val'

transform = transforms.Compose([
    transforms.Resize((32, 128)),  # Resize to match the input size of the character recognition model
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # Normalization
])

def collate_fn(batch):
    images, labels = zip(*batch)
    
    # Stack images into a batch tensor
    images = torch.stack(images, dim=0)
    
    # Pad labels into a batch tensor and reshape
    labels = [torch.tensor(label, dtype=torch.long) for label in labels]
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-1)
    labels_padded = labels_padded.view(-1)  # Flatten to match the output shape
    
    return images, labels_padded

# Creating train and val dataset and data loader

train_data = CRNNDataset(df=train_df,img_dir=train_img_path,transform=transform)
val_data = CRNNDataset(df=val_df,img_dir=val_img_path,transform=transform)

train_loader = DataLoader(train_data,batch_size=16,shuffle=True,collate_fn=collate_fn)
val_loader = DataLoader(val_data,batch_size=16,shuffle=False)

class CRNN(nn.Module):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
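        # LSTM input size per time step: 64 channels * feature-map height 8 (32 / 2 / 2 after the two poolings)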
        self.rnn = nn.LSTM(64 * 8, 128, bidirectional=True, num_layers=2, batch_first=True)
        self.fc1 = nn.Linear(128 * 2, num_classes)
    
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        
        batch_size, channels, height, width = x.size()
        x = x.permute(0, 3, 1, 2)  # Change to (batch, width, channels, height)
        x = x.reshape(batch_size, width, channels * height)  # Change to (batch, width, channels*height)

        x, _ = self.rnn(x)
        x = self.fc1(x)  # Apply FC layer to each time step
        return x

# Instantiate and train the CRNN model
num_classes = 10  # Only digits (0-9)
crnn_model = CRNN(num_classes)

def train_crnn_model(train_loader, model, num_epochs=10):
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for images, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(images)
            
            # Reshape outputs to match the labels
            batch_size, seq_len, num_classes = outputs.size()
            outputs = outputs.view(batch_size * seq_len, num_classes)
            
            # Flatten labels
            labels = labels.view(-1)
            
            # Ensure outputs and labels have the same batch size
            if outputs.size(0) != labels.size(0):
                raise ValueError(f"Expected input batch_size ({outputs.size(0)}) to match target batch_size ({labels.size(0)}).")
            
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader)}')

train_crnn_model(train_loader, crnn_model, 10)

I can't resolve this error. Please help me.

I might be wrong, but here's the detailed analysis I followed:

  • The model output has shape (16, 32, 10), representing (batch_size, seq_len, num_classes).
  • The label length varies from sample to sample.
  • However, since the exception message reports labels.size(0) as 112, and knowing that:
    • the labels were flattened (using .view(-1) twice, once in collate_fn and once in train_crnn_model),
    • batch_size is set to 16,
    • the maximum label length in that batch must be 112 / 16 = 7.
  • Since you used outputs = outputs.view(batch_size * seq_len, num_classes), you are assuming that seq_len equals the maximum padded label length, which is clearly not the case (32 != 7). A minimal shape check reproducing this is sketched after this list.
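To see the mismatch in isolation, here is a minimal sketch (assuming the CRNN class above, a batch of 16 grayscale images resized to 32x128, and plates of at most 7 digits, as in your run):

model = CRNN(num_classes=10)
dummy_images = torch.randn(16, 1, 32, 128)             # one batch from train_loader
outputs = model(dummy_images)
print(outputs.shape)                                    # torch.Size([16, 32, 10]); seq_len = 32 is the image width 128 after two poolings

dummy_labels = torch.randint(0, 10, (16, 7)).view(-1)   # padded to the longest plate (7 digits), then flattened as in collate_fn
print(outputs.reshape(-1, 10).shape[0], dummy_labels.shape[0])   # 512 vs 112 -> the ValueError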

Hence, to resolve the error, you should:

  • Keep the labels' shape as (batch_size, max_padded_len) (i.e., remove both .view(-1) calls)
  • Keep the outputs' shape as (batch_size, seq_len, num_classes) (i.e., remove outputs = outputs.view(batch_size * seq_len, num_classes))

However, I believe that would produce another error at the criterion line: for multi-dimensional inputs, nn.CrossEntropyLoss expects the class dimension second (input of shape (batch_size, num_classes, seq_len) against a target of shape (batch_size, seq_len)), and even then the model's 32 time steps would not line up with the 7 padded label positions.
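A minimal sketch of that second failure, assuming the shapes above (an illustration, not a fix):

criterion = nn.CrossEntropyLoss(ignore_index=-1)
outputs = torch.randn(16, 32, 10)               # (batch, seq_len, num_classes) from the model
labels_padded = torch.randint(0, 10, (16, 7))   # (batch, max_padded_len) from collate_fn without the final .view(-1)

# CrossEntropyLoss wants the class dimension second, so the call would have to be
# criterion(outputs.permute(0, 2, 1), labels_padded)
# and even that still fails, because the 32 model time steps don't line up with the 7 label positions.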

To understand the requirements better and suggest possible solutions, please provide a sample of the dataset, or mock-up data similar to the actual one.