I am trying to extract part of the pretrained mvit model but it’s giving me the following error RuntimeError: The size of tensor a (65537) must match the size of tensor b (50177) at non-singleton dimension 1. If I use x3d_l, it works just fine. Not sure what I am doing wrong.

class VideoModel(nn.Module):
def __init__(self):
super(VideoModel, self).__init__()
model_name = 'mvit_base_32x3'#'mvit_base_32x3' X3D_L - 77.44% and mvit_base_32x3 - 80.30 on kinetics400
original_model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)
self.features = nn.Sequential(
*list(original_model.children())[:-1] #mvit
#*list(original_model.blocks.children())[:-1] #x3d
)
def forward(self, x):
x = self.features(x)
return x
model = VideoModel()
x = torch.randn(1, 3,32,256,256)
pred = model(x)

If I change the size of x to (1,3,32,224,224) I get NotImplementedError but the following code works and produces the [1, 400] prediction probability.

model_name = 'mvit_base_32x3'#'mvit_base_32x3' X3D_L - 77.44% and mvit_base_32x3 - 80.30 on kinetics400
original_model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)
x = torch.randn(1, 3,32,224,224)
pred = original_model(x)