Model 1 — on my machine, one epoch takes:
0.08318 M parameters
GPTLanguageModel(
(token_embedding_table): Embedding(65, 45)
(position_embedding_table): Embedding(65, 45)
(blocks): Sequential(
(0): Block(
(sa): MultiHeadAttention(
(heads): ModuleList(
(0-2): 3 x Head(
(key): Linear(in_features=45, out_features=15, bias=False)
(query): Linear(in_features=45, out_features=15, bias=False)
(value): Linear(in_features=45, out_features=15, bias=False)
(dropout): Dropout(p=0.25, inplace=False)
)
)
(proj): Linear(in_features=45, out_features=45, bias=True)
(dropout): Dropout(p=0.25, inplace=False)
)
(ffwd): FeedFoward(
(net): Sequential(
(0): Linear(in_features=45, out_features=180, bias=True)
(1): ReLU()
(2): Linear(in_features=180, out_features=45, bias=True)
(3): Dropout(p=0.25, inplace=False)
)
)
(ln1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(ln2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
)
(1): Block(
(sa): MultiHeadAttention(
(heads): ModuleList(
(0-2): 3 x Head(
(key): Linear(in_features=45, out_features=15, bias=False)
(query): Linear(in_features=45, out_features=15, bias=False)
(value): Linear(in_features=45, out_features=15, bias=False)
(dropout): Dropout(p=0.25, inplace=False)
)
)
(proj): Linear(in_features=45, out_features=45, bias=True)
(dropout): Dropout(p=0.25, inplace=False)
)
(ffwd): FeedFoward(
(net): Sequential(
(0): Linear(in_features=45, out_features=180, bias=True)
(1): ReLU()
(2): Linear(in_features=180, out_features=45, bias=True)
(3): Dropout(p=0.25, inplace=False)
)
)
(ln1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(ln2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
)
(2): Block(
(sa): MultiHeadAttention(
(heads): ModuleList(
(0-2): 3 x Head(
(key): Linear(in_features=45, out_features=15, bias=False)
(query): Linear(in_features=45, out_features=15, bias=False)
(value): Linear(in_features=45, out_features=15, bias=False)
(dropout): Dropout(p=0.25, inplace=False)
)
)
(proj): Linear(in_features=45, out_features=45, bias=True)
(dropout): Dropout(p=0.25, inplace=False)
)
(ffwd): FeedFoward(
(net): Sequential(
(0): Linear(in_features=45, out_features=180, bias=True)
(1): ReLU()
(2): Linear(in_features=180, out_features=45, bias=True)
(3): Dropout(p=0.25, inplace=False)
)
)
(ln1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(ln2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
)
)
(ln_f): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(lm_head): Linear(in_features=45, out_features=65, bias=True)
)
Model 2
0.821902 M parameters
TransformerDecoder(
(token_embedding): Embedding(65, 45)
(pos_embedding): Embedding(65, 45)
(decoder_layer): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=45, out_features=45, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=45, out_features=45, bias=True)
)
(linear1): Linear(in_features=45, out_features=2048, bias=True)
(dropout): Dropout(p=0.25, inplace=False)
(linear2): Linear(in_features=2048, out_features=45, bias=True)
(norm1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.25, inplace=False)
(dropout2): Dropout(p=0.25, inplace=False)
(dropout3): Dropout(p=0.25, inplace=False)
)
(transformer_decoder): TransformerDecoder(
(layers): ModuleList(
(0-2): 3 x TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=45, out_features=45, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=45, out_features=45, bias=True)
)
(linear1): Linear(in_features=45, out_features=2048, bias=True)
(dropout): Dropout(p=0.25, inplace=False)
(linear2): Linear(in_features=2048, out_features=45, bias=True)
(norm1): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.25, inplace=False)
(dropout2): Dropout(p=0.25, inplace=False)
(dropout3): Dropout(p=0.25, inplace=False)
)
)
)
(layer_norm): LayerNorm((45,), eps=1e-05, elementwise_affine=True)
(fc_out): Linear(in_features=45, out_features=65, bias=True)
)