Batch Size Value Error

I am trying to train my model, but I am getting this error:

ValueError: Expected input batch_size (5760) to match target batch_size (40)

The input to my model is

torch.Size([40, 16, 3, 112, 112])

16 is the number of frames per clip.
I have 7 classes to classify.

This is my model code

import torch
import torch.nn as nn


class VGG16(torch.nn.Module):

    def __init__(self, num_features, num_classes):
        super(VGG16, self).__init__()

        # calculate same padding:
        # (w - k + 2*p)/s + 1 = o
        # => p = (s(o-1) - w + k)/2

        self.block_1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3),
                      stride=(1, 1),
                      # (1(32-1) - 32 + 3)/2 = 1
                      padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.block_2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.block_3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.block_4 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.block_5 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                      stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.classifier = nn.Sequential(
            nn.Linear(512, 4096),
            nn.ReLU(True),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Linear(4096, num_classes)
        )

        # simple Gaussian weight initialisation
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d):
                #n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                #m.weight.data.normal_(0, np.sqrt(2. / n))
                m.weight.detach().normal_(0, 0.05)
                if m.bias is not None:
                    m.bias.detach().zero_()
            elif isinstance(m, torch.nn.Linear):
                m.weight.detach().normal_(0, 0.05)
                m.bias.detach().zero_()

    def forward(self, x):
        # flatten the batch and frame dimensions into one,
        # so [40, 16, 3, 112, 112] becomes [640, 3, 112, 112]
        x = x.contiguous().view(-1, 3, 112, 112)
        x = self.block_1(x)
        x = self.block_2(x)
        x = self.block_3(x)
        x = self.block_4(x)
        x = self.block_5(x)
        print("shape of second last layer", x.shape)

        x = self.classifier(x.view(-1, 512))
        print("shape of last layer", x.shape)

        return x

"

shape of second last layer torch.Size([640, 512, 3, 3])
shape of last layer torch.Size([5760, 7])

Hello,

I noticed some errors in your implementation. It looks like you are classifying RGB frames into seven classes. Here are the issues I found:

  1. After block_5 the feature map is [640, 512, 3, 3], but your classifier expects 512 input features per sample. Calling x.view(-1, 512) folds the 3 × 3 spatial grid into the batch dimension, which is where 640 × 3 × 3 = 5760 comes from. You need to pool the spatial dimensions away first, for example with x = F.adaptive_max_pool2d(x, (1, 1)), which downsamples each feature map to 1 × 1.
  2. When reshaping the tensor for the classifier, don't merge the batch dimension with anything else. Use x = torch.flatten(x, 1) instead, which flattens everything except the batch dimension (see the shape sketch below).
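
A minimal sketch of the shape arithmetic, using a dummy tensor in place of your block_5 output:

import torch
import torch.nn.functional as F

x = torch.randn(640, 512, 3, 3)           # output of block_5 for 40 videos x 16 frames

# Without pooling, view(-1, 512) folds the 3x3 spatial grid into the batch
# dimension: 640 * 3 * 3 = 5760 rows, which is the batch size in your error.
print(x.view(-1, 512).shape)              # torch.Size([5760, 512])

# With adaptive pooling + flatten, the batch dimension is preserved:
pooled = F.adaptive_max_pool2d(x, (1, 1))  # torch.Size([640, 512, 1, 1])
flat = torch.flatten(pooled, 1)            # torch.Size([640, 512])
print(flat.shape)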

I made the changes as you said, but it is giving me this error:
ValueError: Input dimension should be at least 3

def forward(self, x):
    x = x.contiguous().view(-1, 3, 112, 112)
    x = self.block_1(x)
    x = self.block_2(x)
    x = self.block_3(x)
    x = self.block_4(x)
    x = self.block_5(x)

    print("shape of second last layer", x.shape)

    x = torch.flatten(x, 1)

    x = self.classifier(x.view(-1, 512))
    print("shape of last layer", x.shape)

    # x is 2-D here ([N, num_classes]), so the 2-D pooling below fails:
    # it expects an input with spatial dimensions (at least 3-D)
    x = F.adaptive_max_pool2d(x, (1, 1))

    return x

You can use the following code snippet:

def forward(self, x):
    # flatten the batch and frame dimensions into one,
    # so [40, 16, 3, 112, 112] becomes [640, 3, 112, 112]
    x = x.contiguous().view(-1, 3, 112, 112)
    x = self.block_1(x)
    x = self.block_2(x)
    x = self.block_3(x)
    x = self.block_4(x)
    x = self.block_5(x)
    print("shape of second last layer", x.shape)

    x = torch.nn.functional.adaptive_max_pool2d(x, (1, 1))
    x = torch.flatten(x, 1)
    x = self.classifier(x)
    print("shape of last layer", x.shape)

    return x

But, to improve the implementation, the input to the model should ideally have the shape (batch, channels, height, width) rather than [40, 16, 3, 112, 112]. Also, in

x = x.contiguous().view(-1, 3, 112, 112)

you are assuming the input image size is (112, 112), which might change. You can use

x = x.contiguous().view(-1, 3, x.size(-2), x.size(-1))
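
For example, a quick check with a dummy tensor of your reported shape (the sizes here are just your numbers, nothing special):

import torch

x = torch.randn(40, 16, 3, 112, 112)                    # 40 clips x 16 frames x 3 x H x W
x = x.contiguous().view(-1, 3, x.size(-2), x.size(-1))  # works for any H, W
print(x.shape)                                          # torch.Size([640, 3, 112, 112])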

I used that code snippet, but again got the error:
ValueError: Expected input batch_size (640) to match target batch_size (40).

This error occurs because you are passing 40 × 16 = 640 frames to the model, so it now produces 640 frame-level predictions, but there are only 40 labels in the target batch. The mismatch is in your training loop, not in the model itself.

Yes, how can I adjust it back to 40, since I am passing video frames?
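
One possible way (just a sketch, assuming you want a single prediction per video) is to reshape the frame-level logits back to [batch, frames, classes] and average over the frame dimension before computing the loss:

import torch
import torch.nn.functional as F

batch_size, num_frames, num_classes = 40, 16, 7

logits = torch.randn(batch_size * num_frames, num_classes)   # model output: [640, 7]
targets = torch.randint(0, num_classes, (batch_size,))       # one label per video: [40]

# regroup the logits by video and average over the frame dimension -> [40, 7]
video_logits = logits.view(batch_size, num_frames, num_classes).mean(dim=1)

loss = F.cross_entropy(video_logits, targets)
print(video_logits.shape, loss.item())

Alternatively, you could repeat each label 16 times so that every frame gets its own target, and keep the 640 frame-level predictions as they are.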