Extracting certain blocks from a pretrained model and removing some layers

I want to extract the features from certain blocks of the TimeSformer model and also want to remove the last two layers.


import torch
from timesformer.models.vit import TimeSformer

model = TimeSformer(img_size=224, num_classes=400, num_frames=8, attention_type='divided_space_time',  pretrained_model='/path/to/pretrained/model.pyth')

The print of the model is as follows:

TimeSformer(
(model): VisionTransformer(
(dropout): Dropout(p=0.0, inplace=False)
(patch_embed): PatchEmbed(
(proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
)
(pos_drop): Dropout(p=0.0, inplace=False)
(time_drop): Dropout(p=0.0, inplace=False)
(blocks): ModuleList(
(0): Block(
(norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(attn): Attention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(attn_drop): Dropout(p=0.0, inplace=False)
)
(temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(temporal_attn): Attention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(attn_drop): Dropout(p=0.0, inplace=False)
)
(temporal_fc): Linear(in_features=768, out_features=768, bias=True)
(drop_path): Identity()
(norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELU()
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(1): Block(
(norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(attn): Attention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(attn_drop): Dropout(p=0.0, inplace=False)
)
(temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(temporal_attn): Attention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(attn_drop): Dropout(p=0.0, inplace=False)
)
(temporal_fc): Linear(in_features=768, out_features=768, bias=True)
(drop_path): DropPath()
(norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELU()
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(2): Block(
(norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(attn): Attention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(attn_drop): Dropout(p=0.0, inplace=False)
)
(temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(temporal_attn): Attention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(attn_drop): Dropout(p=0.0, inplace=False)
)
(temporal_fc): Linear(in_features=768, out_features=768, bias=True)
(drop_path): DropPath()
(norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELU()
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
.
.
.
.
.
(11): Block(
(norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(attn): Attention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(attn_drop): Dropout(p=0.0, inplace=False)
)
(temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(temporal_attn): Attention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(attn_drop): Dropout(p=0.0, inplace=False)
)
(temporal_fc): Linear(in_features=768, out_features=768, bias=True)
(drop_path): DropPath()
(norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELU()
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
)
(norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True) **** I want to remove this layer*****
(head): Linear(in_features=768, out_features=400, bias=True) **** I want to remove this layer*****
)
)

Specifically, I want to extract the outputs of the 4th, 8th and 11th layers of the model and removing the lats two layers. How can I do this. I tried using TimeSformer.blocks[0] but that was not working.