Please help: why does one of these models train roughly 100x slower per epoch, even though they look similar?

Model 1 — on my machine, one epoch takes:

0.08318 M parameters
 
GPTLanguageModel(
  (token_embedding_table): Embedding(65, 45)
  (position_embedding_table): Embedding(65, 45)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-2): 3 x Head(
            (key): Linear(in_features=45, out_features=15, bias=False)
            (query): Linear(in_features=45, out_features=15, bias=False)
            (value): Linear(in_features=45, out_features=15, bias=False)
            (dropout): Dropout(p=0.25, inplace=False)
          )
        )
        (proj): Linear(in_features=45, out_features=45, bias=True)
        (dropout): Dropout(p=0.25, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=45, out_features=180, bias=True)
          (1): ReLU()
          (2): Linear(in_features=180, out_features=45, bias=True)
          (3): Dropout(p=0.25, inplace=False)
        )
      )
      (ln1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
    )
    (1): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-2): 3 x Head(
            (key): Linear(in_features=45, out_features=15, bias=False)
            (query): Linear(in_features=45, out_features=15, bias=False)
            (value): Linear(in_features=45, out_features=15, bias=False)
            (dropout): Dropout(p=0.25, inplace=False)
          )
        )
        (proj): Linear(in_features=45, out_features=45, bias=True)
        (dropout): Dropout(p=0.25, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=45, out_features=180, bias=True)
          (1): ReLU()
          (2): Linear(in_features=180, out_features=45, bias=True)
          (3): Dropout(p=0.25, inplace=False)
        )
      )
      (ln1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
    )
    (2): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-2): 3 x Head(
            (key): Linear(in_features=45, out_features=15, bias=False)
            (query): Linear(in_features=45, out_features=15, bias=False)
            (value): Linear(in_features=45, out_features=15, bias=False)
            (dropout): Dropout(p=0.25, inplace=False)
          )
        )
        (proj): Linear(in_features=45, out_features=45, bias=True)
        (dropout): Dropout(p=0.25, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=45, out_features=180, bias=True)
          (1): ReLU()
          (2): Linear(in_features=180, out_features=45, bias=True)
          (3): Dropout(p=0.25, inplace=False)
        )
      )
      (ln1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
    )
  )
  (ln_f): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
  (lm_head): Linear(in_features=45, out_features=65, bias=True)
)

Model 2:

0.821902 M parameters
TransformerDecoder(
  (token_embedding): Embedding(65, 45)
  (pos_embedding): Embedding(65, 45)
  (decoder_layer): TransformerDecoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=45, out_features=45, bias=True)
    )
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=45, out_features=45, bias=True)
    )
    (linear1): Linear(in_features=45, out_features=2048, bias=True)
    (dropout): Dropout(p=0.25, inplace=False)
    (linear2): Linear(in_features=2048, out_features=45, bias=True)
    (norm1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
    (norm3): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.25, inplace=False)
    (dropout2): Dropout(p=0.25, inplace=False)
    (dropout3): Dropout(p=0.25, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=45, out_features=45, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=45, out_features=45, bias=True)
        )
        (linear1): Linear(in_features=45, out_features=2048, bias=True)
        (dropout): Dropout(p=0.25, inplace=False)
        (linear2): Linear(in_features=2048, out_features=45, bias=True)
        (norm1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.25, inplace=False)
        (dropout2): Dropout(p=0.25, inplace=False)
        (dropout3): Dropout(p=0.25, inplace=False)
      )
    )
  )
  (layer_norm): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
  (fc_out): Linear(in_features=45, out_features=65, bias=True)
)

Is this because linear1 in Model 2 has out_features=2048, and that layer is repeated across the three decoder layers, whereas Model 1's feed-forward layers are relatively small (out_features=180)?
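
For reference, here is a quick back-of-the-envelope check based only on the shapes printed above, plus a sketch of how the two models could be made comparable by passing dim_feedforward=180 (nn.TransformerDecoderLayer defaults dim_feedforward to 2048). The constructor arguments nhead=3 and batch_first=True are assumptions, not taken from the printout.

    import torch.nn as nn

    # Feed-forward parameter counts (weights + biases), from the printed shapes.
    ffwd_model1 = (45 * 180 + 180) + (180 * 45 + 45)      # ~16k per Block in Model 1
    ffwd_model2 = (45 * 2048 + 2048) + (2048 * 45 + 45)   # ~186k per TransformerDecoderLayer in Model 2
    print(ffwd_model1 * 3, ffwd_model2 * 3)               # ~49k vs ~559k across 3 layers

    # Sketch: shrink dim_feedforward (default 2048) so Model 2's feed-forward
    # width matches Model 1's. nhead=3 and batch_first=True are assumed here.
    layer = nn.TransformerDecoderLayer(
        d_model=45, nhead=3, dim_feedforward=180, dropout=0.25, batch_first=True
    )
    decoder = nn.TransformerDecoder(layer, num_layers=3)
    print(sum(p.numel() for p in decoder.parameters()) / 1e6, "M parameters")

Note also that the printed 0.821902 M appears to include the standalone decoder_layer attribute in addition to the three copies inside transformer_decoder, since nn.TransformerDecoder deep-copies the layer it is given rather than reusing it.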