How to get the output from the middle of a transformer model?

I am trying to get the output of the 2nd TransformerBlock in the transformer_blocks ModuleList, just before that output goes into the 3rd block. I tried attaching a hook with register_forward_hook, but I only got None back.
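This is roughly how I set the hook up (a simplified sketch, not my exact code; activations, hook_fn and the call arguments are just placeholders):

# assuming model is the BERT instance defined below, and x / code are valid input tensors
activations = {}

def hook_fn(module, inputs, output):
    # a forward hook receives (module, inputs, output); I just stash the tensor,
    # its own return value is not what I read back later
    activations["block_1"] = output

# index 1 = the 2nd block in the ModuleList
handle = model.transformer_blocks[1].register_forward_hook(hook_fn)

out = model(x, code, eval=False)
print(activations.get("block_1"))   # expected the 2nd block's output here, but I get None
handle.remove()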

import torch
import torch.nn as nn

# BERTEmbedding and TransformerBlock come from the rest of my code (not shown)

class BERT(nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, max_len=512, dropout=0.1):
        """
        :param vocab_size: total vocabulary size (number of tokens)
        :param hidden: BERT model hidden size
        :param n_layers: number of Transformer blocks (layers)
        :param attn_heads: number of attention heads
        :param max_len: maximum sequence length
        :param dropout: dropout rate
        """

        super().__init__()

        cuda_condition = torch.cuda.is_available()
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads

        # paper noted they used 4*hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = hidden * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden, max_len=max_len)

        # multi-layers transformer blocks, deep network
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])

        # cls token
        self.cls_token = torch.tensor([vocab_size-1]).to(self.device)


    def forward(self, x, code, eval, flash_attn=None):#, segment_info):
        # attention masking for padded tokens
        # mask shape: [batch_size, 1, seq_len, seq_len] (ByteTensor)

        x = torch.cat([self.cls_token.expand(x.size(0), -1), x], dim = 1)

        # mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
        mask = None
        # embed the indexed sequence into a sequence of vectors
        x = self.embedding(x, code)#, segment_info)

        # running over multiple transformer blocks
        # note: the blocks are invoked via .forward() here, which skips
        # nn.Module.__call__ and therefore any registered forward hooks
        # (see the helper sketch after this class for pulling out an intermediate x)
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask, eval, flash_attn)

        return x
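If hooks turn out not to be the right tool here, my fallback idea is a small helper that repeats the same loop but also returns the activation after one chosen block. A rough sketch (forward_with_intermediate and return_layer are names I made up; they are not part of the model):

import torch

def forward_with_intermediate(model, x, code, eval, flash_attn=None, return_layer=1):
    # same steps as BERT.forward above, but keep a copy of x after one block
    x = torch.cat([model.cls_token.expand(x.size(0), -1), x], dim=1)
    x = model.embedding(x, code)
    intermediate = None
    for i, transformer in enumerate(model.transformer_blocks):
        x = transformer.forward(x, None, eval, flash_attn)   # mask is None, as in forward()
        if i == return_layer:
            intermediate = x          # output of the (return_layer + 1)-th block
    return x, intermediate

With return_layer=1, the second element of the returned tuple should be exactly the tensor that goes into the 3rd block.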


BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(9, 200, padding_idx=0)
    (position): PositionalEmbedding()
    (gene_code): GeneCodeEmbedding(
      (embedding): Embedding(297, 200)
      (embd_norm): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): ModuleList(
    (0-5): 6 x TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0-2): 3 x Linear(in_features=200, out_features=200, bias=True)
        )
        (output_linear): Linear(in_features=200, out_features=200, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=200, out_features=800, bias=True)
        (w_2): Linear(in_features=800, out_features=200, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (activation): GELU()
      )
      (input_sublayer): SublayerConnectionAttention(
        (norm): LayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (output_sublayer): SublayerConnectionFFNN(
        (norm): LayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)
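From this printout I assume the block I need is model.transformer_blocks[1], i.e. the submodule named "transformer_blocks.1". I double-checked the name like this (sketch):

# sketch: confirm the dotted name of the 2nd block before attaching the hook
blocks = dict(model.named_modules())
target = blocks["transformer_blocks.1"]   # same module as model.transformer_blocks[1]
print(type(target).__name__)              # should print TransformerBlock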