ValueError: Expected input batch_size (49132) to match target batch_size (49128)

I'm getting the error shown above. Training works when the input size is 512x512, but it fails when the input size is 1024x512. The code is pasted below:

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import constant_init, kaiming_init, normal_init, xavier_init
from .vgg import VGG
from mmcv.runner import load_checkpoint

from mmdet.utils import get_root_logger
from mmdet.models.builder import BACKBONES

class SSDVGG(VGG):
    """VGG Backbone network for single-shot-detection.

    NOTE(review): this class was reconstructed from a paste in which several
    lines were lost (docstring terminators, the end of the ``__init__``
    signature, the ``super().__init__`` arguments, ``else:`` branches and
    ``append`` calls). Everything marked "restored" below follows the
    upstream MMDetection ``SSDVGG`` and must be confirmed against the
    author's actual file. ``BACKBONES`` is imported by this module but no
    registration decorator is visible on the class -- confirm whether one is
    missing.

    Args:
        input_size (int | Sequence[int]): Size of the input. An int ``n`` is
            treated as ``(n, n)``. The first element selects the entry of
            ``extra_setting`` and must be one of {300, 512, 1024}.
        depth (int): Depth of vgg, from {11, 13, 16, 19}.
        with_last_pool (bool): Restored; forwarded to the base ``VGG``.
        ceil_mode (bool): Restored; forwarded to the base ``VGG``.
        out_indices (Sequence[int]): Output from which stages.
        out_feature_indices (Sequence[int]): Indices into ``self.features``
            whose outputs are returned by ``forward``.
        l2_norm_scale (float): Restored; scale passed to ``L2Norm``.

    Example:
        >>> self = SSDVGG(input_size=300, depth=11)
        >>> self.eval()
        >>> inputs = torch.rand(1, 3, 300, 300)
        >>> level_outputs = self.forward(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        (1, 1024, 19, 19)
        (1, 512, 10, 10)
        (1, 256, 5, 5)
        (1, 256, 3, 3)
        (1, 256, 1, 1)
    """
    # Channel plan of the extra layers per input size. 'S' marks that the
    # next conv uses stride 2 (see ``_make_extra_layers``).
    extra_setting = {
        300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256),
        512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 256),
        1024: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 256),
    }

    def __init__(self,
                 input_size=(512, 512),
                 depth=16,
                 with_last_pool=False,
                 ceil_mode=True,
                 out_indices=(3, 4),
                 out_feature_indices=(22, 34),
                 l2_norm_scale=20.):
        # NOTE(review): ``depth``, ``with_last_pool``, ``ceil_mode`` and
        # ``l2_norm_scale`` (names, order and defaults) are restored from
        # upstream MMDetection; ``depth=16`` is an assumption that matches
        # the module-index notes below -- confirm.
        # TODO: in_channels for mmcv.VGG
        super(SSDVGG, self).__init__(
            depth,
            with_last_pool=with_last_pool,
            ceil_mode=ceil_mode,
            out_indices=out_indices)
        # assert input_size in (300, 512)

        # Accept the plain-int form used in the docstring example; the code
        # below indexes ``input_size[0]`` and would otherwise raise TypeError.
        if isinstance(input_size, int):
            input_size = (input_size, input_size)
        if input_size[0] not in self.extra_setting:
            raise KeyError(
                'input_size[0] must be one of '
                f'{sorted(self.extra_setting)}, got {input_size[0]!r}')
        self.input_size = input_size

        # Per the original author's note, the base VGG features end at
        # module index 29 before the layers below are appended.
        self.features.add_module(  # module index 30
            str(len(self.features)),
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
        self.features.add_module(  # module index 31
            str(len(self.features)),
            nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6))
        self.features.add_module(  # module index 32
            str(len(self.features)), nn.ReLU(inplace=True))
        self.features.add_module(  # module index 33
            str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1))
        self.features.add_module(  # module index 34
            str(len(self.features)), nn.ReLU(inplace=True))
        self.out_feature_indices = out_feature_indices

        self.inplanes = 1024
        self.extra = self._make_extra_layers(self.extra_setting[input_size[0]])
        # The first returned feature map is L2-normalized; its channel count
        # is read from the conv that precedes the first output index.
        self.l2_norm = L2Norm(
            self.features[out_feature_indices[0] - 1].out_channels,
            l2_norm_scale)

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.features.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)  # restored line -- confirm
                elif isinstance(m, nn.BatchNorm2d):
                    constant_init(m, 1)
                elif isinstance(m, nn.Linear):
                    normal_init(m, std=0.01)
        else:
            raise TypeError('pretrained must be a str or None')

        for m in self.extra.modules():
            if isinstance(m, nn.Conv2d):
                xavier_init(m, distribution='uniform')

        constant_init(self.l2_norm, self.l2_norm.scale)

    def forward(self, x):
        """Forward function.

        Returns the outputs of ``self.features`` at ``out_feature_indices``
        followed by the output of every second extra layer. The first output
        is L2-normalized. A single output is returned bare, otherwise a tuple.
        """
        outs = []
        for i, layer in enumerate(self.features):
            x = layer(x)
            if i in self.out_feature_indices:
                outs.append(x)
        for i, layer in enumerate(self.extra):
            x = F.relu(layer(x), inplace=True)
            # Extra layers alternate 1x1 / 3x3 convs; only the second conv
            # of each pair (odd index) contributes an output level.
            if i % 2 == 1:
                outs.append(x)
        outs[0] = self.l2_norm(outs[0])
        if len(outs) == 1:
            return outs[0]
        return tuple(outs)

    def _make_extra_layers(self, outplanes):
        """Build the extra conv layers described by ``outplanes``.

        Args:
            outplanes (Sequence[int | str]): Output channels per layer. An
                ``'S'`` entry means the conv built at that position takes its
                output channels from the following entry and uses stride 2;
                that following entry is then consumed without building
                another conv.

        Returns:
            nn.Sequential: The extra layers (convs only; ReLU is applied in
            ``forward``).
        """
        layers = []
        kernel_sizes = (1, 3)
        num_layers = 0
        outplane = None
        for i, plane in enumerate(outplanes):
            if self.inplanes == 'S':
                # The previous entry was the stride marker and already
                # consumed this entry's channel count.
                self.inplanes = outplane
                continue
            k = kernel_sizes[num_layers % 2]
            if plane == 'S':
                outplane = outplanes[i + 1]
                conv = nn.Conv2d(
                    self.inplanes, outplane, k, stride=2, padding=1)
            else:
                outplane = plane
                conv = nn.Conv2d(
                    self.inplanes, outplane, k, stride=1, padding=0)
            layers.append(conv)
            self.inplanes = plane
            num_layers += 1

        # The equivalent trailing conv for input size 512 is disabled
        # (it was commented out in the original):
        # if self.input_size[0] == 512:
        #     layers.append(nn.Conv2d(self.inplanes, 256, 4, padding=1))
        if self.input_size[0] == 1024:
            layers.append(nn.Conv2d(self.inplanes, 256, 4, padding=1))

        return nn.Sequential(*layers)

class L2Norm(nn.Module):
    """L2 normalization over dim 1 with a learnable per-channel weight."""

    def __init__(self, n_dims, scale=20., eps=1e-10):
        """L2 normalization layer.

        Args:
            n_dims (int): Number of dimensions to be normalized
            scale (float, optional): Defaults to 20..
            eps (float, optional): Used to avoid division by zero.
                Defaults to 1e-10.
        """
        super(L2Norm, self).__init__()
        self.n_dims = n_dims
        # ``torch.Tensor(n)`` would leave the weight as uninitialized memory
        # until an explicit init runs; start from ``scale`` instead, which is
        # the value ``SSDVGG.init_weights`` assigns to it as well.
        self.weight = nn.Parameter(
            torch.full((self.n_dims,), float(scale)))
        self.eps = eps
        self.scale = scale

    def forward(self, x):
        """Forward function.

        Divides ``x`` by its L2 norm taken over dim 1 (plus ``eps``) and
        multiplies by the per-channel weight. The weight is broadcast as
        ``[None, :, None, None]``, so ``x`` must be 4-D with ``n_dims``
        entries along dim 1. The result has the same dtype as ``x``.
        """
        # normalization layer convert to FP32 in FP16 training
        x_float = x.float()
        norm = x_float.pow(2).sum(1, keepdim=True).sqrt() + self.eps
        return (self.weight[None, :, None, None].float().expand_as(x_float) *
                x_float / norm).type_as(x)

Does anyone have a solution? Thanks in advance.

I cannot see any obvious errors, but since your code snippet is not executable, I also cannot debug it.
Based on the error message I guess you might flatten the predictions somehow and/or the batch size calculation is wrong.
A VGG with inputs of [49132, 3, 1024, 512] would need a lot of memory so you are either using the CPU where this might fit into the system RAM or the batch_size calculation is wrong.