I need to process and refine models automatically for many different purposes (split, concat, …), both in terms of the model structure and its weights. This works for some models (VGG, EfficientNetB3, Xception, ViT); however, for some variants of ViT (CrossViT, CrossFormer) I get error messages and still can't find a fix.
import torch
from vit_pytorch.cross_vit import CrossViT
import torch.nn as nn
# Build a CrossViT with a high-resolution ("sm") and a low-resolution ("lg") branch.
v = CrossViT(
    image_size=256,
    num_classes=1000,
    depth=4,               # number of multi-scale encoding blocks
    sm_dim=192,            # high res dimension
    sm_patch_size=16,      # high res patch size (should be smaller than lg_patch_size)
    sm_enc_depth=2,        # high res depth
    sm_enc_heads=8,        # high res heads
    sm_enc_mlp_dim=2048,   # high res feedforward dimension
    lg_dim=384,            # low res dimension
    lg_patch_size=64,      # low res patch size
    lg_enc_depth=3,        # low res depth
    lg_enc_heads=8,        # low res heads
    lg_enc_mlp_dim=2048,   # low res feedforward dimensions
    cross_attn_depth=2,    # cross attention rounds
    cross_attn_heads=8,    # cross attention heads
    dropout=0.1,
    emb_dropout=0.1,
)

# Flatten the model into a plain chain of its direct children. nn.Sequential
# passes each child's output straight into the next child, so any routing the
# model's own forward() performs between its sub-modules is not reproduced.
v = nn.Sequential(*v.children())

# Module.children() returns an iterator, which cannot be sliced directly;
# materialize it into a list first and then take the leading five children.
v1 = nn.Sequential(*list(v.children())[:5])

img = torch.randn(1, 3, 256, 256)

# NOTE(review): this forward pass is where the reported
# "Expected 4 dimensions, got 3" einops error surfaces — presumably a later
# child receives the 3-D output of an earlier child instead of the 4-D image.
pred = v(img)  # intended output shape: (1, 1000)
print(pred)
The CrossViT snippet above fails with: "einops.EinopsError: Expected 4 dimensions, got 3"
from vit_pytorch.crossformer import CrossFormer
# Four-stage CrossFormer classifier; each tuple entry configures one stage.
model = CrossFormer(
    num_classes=1000,                  # number of output classes
    dim=(64, 128, 256, 512),           # dimension at each stage
    depth=(2, 2, 8, 2),                # depth of transformer at each stage
    global_window_size=(8, 4, 2, 1),   # global window sizes at each stage
    # local window size (can be customized for each stage, but in paper,
    # held constant at 7 for all stages)
    local_window_size=7,
)

img = torch.randn(1, 3, 224, 224)

# Re-wrap the direct children as a sequential chain. nn.Sequential calls every
# child as a module; per the reported error, one of the children is an
# nn.ModuleList, which is only a container and defines no forward().
model = nn.Sequential(*model.children())

pred = model(img)  # intended output shape: (1, 1000)
The CrossFormer snippet above fails with: "NotImplementedError: Module [ModuleList] is missing the required "forward" function"