Model summary for the slowfast_r50 model

In the PyTorchVideo classification script below, which uses the pretrained slowfast_r50 model, I am not able to get a model summary; print(model) only shows the architecture blocks:
import torch
import json
from torchsummary import summary
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo,
)
from typing import Dict

# Device on which to run the model
# Set to "cuda" to load on GPU
device = "cpu"

# Pick a pretrained model and load the pretrained weights
model_name = "slowfast_r50"
model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)
print(model)
slow_input_size=[1,3,8,256,256]
fast_input_size=[1,3,32,256,256]

input_size=[slow_input_size,fast_input_size]

model_summary=summary(model,input_size)
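# NOTE: this summary call appears to be what triggers the TypeError in the traceback
# at the end of the post: torchsummary calls model(*x) with one tensor per entry in
# input_size, so the model receives two positional arguments instead of the single
# [slow, fast] list its forward() expects.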

print(model_summary)

# Save the entire model (including the architecture)
model_save = torch.save(model, "slowfast_r50_full_model.pth")
#print(model_save)

# Set to eval mode and move to the desired device
model = model.to(device)
model = model.eval()

with open("/home/mantra/Documents/Projects/Video/pytorchvideo_tutorial/kinetics_classnames.json", "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 32
sampling_rate = 2
frames_per_second = 30
alpha = 4
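# alpha is the frame-rate ratio between the fast and slow pathways:
# PackPathway below keeps num_frames // alpha evenly spaced frames for the
# slow pathway, so the 32-frame clip becomes 8 frames on the slow side.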

class PackPathway(torch.nn.Module):
    """
    Transform for converting video frames into a list of tensors,
    one per SlowFast pathway.
    """
    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list
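
# Quick sanity check (illustrative, not part of the original script): with
# num_frames = 32 and alpha = 4, PackPathway turns one (C, T, H, W) clip into
# a [slow, fast] pair with 8 and 32 frames respectively.
dummy_clip = torch.randn(3, 32, 256, 256)
slow_clip, fast_clip = PackPathway()(dummy_clip)
print(slow_clip.shape)  # torch.Size([3, 8, 256, 256])
print(fast_clip.shape)  # torch.Size([3, 32, 256, 256])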

transform = ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x / 255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=side_size),
            CenterCropVideo(crop_size),
            PackPathway(),
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (num_frames * sampling_rate) / frames_per_second
print(clip_duration)
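# With num_frames = 32, sampling_rate = 2 and frames_per_second = 30,
# this prints (32 * 2) / 30 ≈ 2.13 seconds of video per clip.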

# Load the example video
video_path = "/home/mantra/Documents/Projects/Video/pytorchvideo_tutorial/-1HT31BzADs_000118_000128.mp4"

# Select the duration of the clip to load by specifying the start and end duration.
# The start_sec should correspond to where the action occurs in the video.
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class
video = EncodedVideo.from_path(video_path)

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
print('video_data')
print(type(video_data['video']))
print(video_data['video'].shape)

print(type(video_data['audio']))
print(video_data['audio'].shape)

print('video data keys')
for key, value in video_data.items():
    print(key)

# Apply a transform to normalize the video input
video_data = transform(video_data)
print(video_data)
print(video_data['video'][0])

# Move the inputs to the desired device
inputs = video_data["video"]
inputs = [i.to(device)[None, ...] for i in inputs]
print(inputs[0].shape)
print(inputs[1].shape)

# Pass the input clip through the model
preds = model(inputs)

# Shapes of inputs[0] and inputs[1] printed above:
# torch.Size([1, 3, 8, 256, 256])
# torch.Size([1, 3, 32, 256, 256])

# Other attempts at getting a summary:
model_summary = summary(model, (1, 3, 8, 256, 256))
model_summary = summary(model, [[1, 3, 8, 256, 256], [1, 3, 32, 256, 256]])
print(model_summary)

print(preds)
print(preds.shape)

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
print(preds)
pred_classes = preds.topk(k=5).indices
print(pred_classes)
print(pred_classes.shape)
print(preds[0][253])
print(preds[0][151])
print(preds[0][182])
print(preds[0][367])
print(preds[0][160])

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes[0]]
print("Predicted labels: %s" % ", ".join(pred_class_names))

I am getting the following error while running the above script. Can anyone please help me resolve this issue?

Traceback (most recent call last):
  File "/home/mantra/Documents/Projects/Video/pytorchvideo_tutorial/slowfast_arch.py", line 14, in
    summary(model, input_sizes)
  File "/home/mantra/miniconda3/envs/video/lib/python3.12/site-packages/torchsummary/torchsummary.py", line 72, in summary
    model(*x)
  File "/home/mantra/miniconda3/envs/video/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mantra/miniconda3/envs/video/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
TypeError: Net.forward() takes 2 positional arguments but 3 were given
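
From the traceback it looks like torchsummary builds one random tensor per entry in input_size and calls model(*x), so slowfast_r50.forward(), which expects a single [slow, fast] list, receives two positional tensors instead. Below is a minimal sketch of one possible workaround, assuming the torchinfo package (a separate library, not used in the script above) and that its input_data argument forwards a nested list as a single positional argument:

import torch
from torchinfo import summary  # assumed installed: pip install torchinfo

model = torch.hub.load("facebookresearch/pytorchvideo", model="slowfast_r50", pretrained=True)
model = model.eval()

# Dummy inputs matching the shapes printed in the script above.
slow = torch.randn(1, 3, 8, 256, 256)   # slow pathway
fast = torch.randn(1, 3, 32, 256, 256)  # fast pathway

# The outer list becomes the positional arguments for forward(); the inner
# list is the single [slow, fast] argument slowfast_r50 expects.
summary(model, input_data=[[slow, fast]], depth=4)

# A plain parameter count, as a fallback that needs no extra package:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")

Is this the right way to get a layer-by-layer summary for SlowFast, or is there a better option?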