I have a Transformer model in which I declare an additional patch-embedding module (call it patch_embed_2) in the model's __init__(). The surprising thing I observe is that even though patch_embed_2 is never used during the forward pass, the training loss differs from the loss I get when patch_embed_2 is not declared in __init__() at all.
My reasoning is that since patch_embed_2 is not involved in the forward pass, it should not be involved in backpropagation either, so the training loss should be identical with and without declaring patch_embed_2 in __init__().
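To double-check that reasoning, here is a minimal sketch of the test I have in mind (model, images, and optimizer are placeholders for the actual training objects, and the loss below is just a stand-in for the real criterion). If the module is truly unused, both checks should print True:

import copy
import torch

# snapshot the unused module's weights before the training step
before = copy.deepcopy(model.patch_embed_2.state_dict())

loss = model(images).sum()  # placeholder forward/loss; the real training step goes here
loss.backward()

# the unused module should receive no gradients ...
print(all(p.grad is None for p in model.patch_embed_2.parameters()))

optimizer.step()

# ... and its weights should stay identical after the optimizer step
after = model.patch_embed_2.state_dict()
print(all(torch.equal(before[k], after[k]) for k in before))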
def __init__(self, img_size=224, patch_size=16, stride_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
             num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., camera=0, view=0,
             drop_path_rate=0., hybrid_backbone=None, norm_layer=nn.LayerNorm, local_feature=False, sie_xishu=1.0):
    super().__init__()
    self.num_classes = num_classes
    self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
    self.local_feature = local_feature
    if hybrid_backbone is not None:
        self.patch_embed = HybridEmbed(
            hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
    else:
        self.patch_embed = PatchEmbed_overlap(
            img_size=img_size, patch_size=patch_size, stride_size=stride_size, in_chans=in_chans,
            embed_dim=embed_dim)
    # patch_embed_2 is not used in the forward pass; merely declaring it in __init__()
    # changes the training loss, which should not happen according to my understanding
    self.patch_embed_2 = PatchEmbed_overlap(
        img_size=img_size, patch_size=patch_size, stride_size=stride_size, in_chans=in_chans,
        embed_dim=embed_dim)
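For reference, one possible explanation I have not verified yet: constructing patch_embed_2 draws from PyTorch's global RNG while initializing its weights, so every random operation that runs afterwards (initialization of later layers, dropout masks, data shuffling) sees a shifted random stream, even though patch_embed_2 never participates in forward or backward. A minimal standalone sketch of that effect, using nn.Linear as a stand-in for PatchEmbed_overlap:

import torch
import torch.nn as nn

def build(with_extra):
    torch.manual_seed(0)
    extra = nn.Linear(8, 8) if with_extra else None  # stands in for patch_embed_2
    head = nn.Linear(8, 8)                           # a later layer that IS used in forward
    return head.weight.clone()

w_without = build(with_extra=False)
w_with = build(with_extra=True)

# the "used" layer ends up with different initial weights once the extra module
# has consumed part of the random stream during its own initialization
print(torch.equal(w_without, w_with))  # False

If that is what is happening, the difference in loss would come from a different random stream at initialization rather than from backpropagation, but I would like confirmation.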