`AttributeError: 'NoneType' object has no attribute 'size'`

I’m trying to feed the features through a 3-branch parallel model architecture (2× CNN + transformer encoder).

# TODO: change nn.Sequential to take an OrderedDict to make this more readable


class parallel_all_you_want(nn.Module):      # base module
    # Define all layers present in the network
    def __init__(self, num_emotions):
        super().__init__()                # initializes internal module state.
        
        ################## TRANSFORMER BLOCK #####################
        # MAXPOOL THE INPUT FEATURE MAP/TENSOR TO THE TRANSFORMER
        # rectangular kernel worked better here for the rectangular input
        self.transformer_maxpool = nn.MaxPool2d(kernel_size = [1, 4], stride = [1, 4])
        
        # define single transformer encoder layer
        # self-attention + feedforward network from "Attention is All you need"
        # 4 attention heads per layer, each layer with a 40 --> 512 --> 40 feedforward network
        transformer_layer = nn.TransformerEncoderLayer(
            d_model = 40, # input feature (frequency) dim after maxpooling: 40*259 -> 40*64 (MFCC*time)
            nhead = 4, # 4 attention heads in each multi-head self-attention block
            dim_feedforward = 512, # 2 layers in each encoder block's feedforward network: dim 40 --> 512 --> 40
            dropout = 0.4,
            activation = 'relu' # ReLU: avoids saturation/vanishing gradients, reduces compute time
            
        )
        
        # I'm using 4 instead of the 6 identical stacked encoder layers used in the Attention Is All You Need paper
        # Complete transformer block contains 4 full transformer encoder layers (each w/ multihead self-attention+feedforward)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers = 4)
        
        
        
        # ======================== 1ST PARALLEL 2D CONVOLUTION BLOCK ===========================
        
        # 3 sequential conv2D layers: (1, 40, 259) --> (16, 20, 129) --> (32, 5, 32) --> (64, 1, 8)
        self.conv2Dblock1 = nn.Sequential(
            
            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels = 1,  # input volume depth == input channel dim == 1
                out_channels = 16, # expand output feature map volume's depth to 16
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 2, stride = 2), #typical maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 2nd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 16,  
                out_channels = 32, # expand output feature map volume's depth to 32
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(32), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 32,  
                out_channels = 64, # expand output feature map volume's depth to 64
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(64), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
        )
        
        
        # ======================== 2ND PARALLEL 2D CONVOLUTION BLOCK ===========================
        
        # 3 sequential conv2D layers: (1, 40, 259) --> (16, 20, 129) --> (32, 5, 32) --> (64, 1, 8)
        self.conv2Dblock2 = nn.Sequential(
            
            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels = 1,  # input volume depth == input channel dim == 1
                out_channels = 16, # expand output feature map volume's depth to 16
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 2, stride = 2), #typical maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 2nd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 16,  
                out_channels = 32, # expand output feature map volume's depth to 32
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(32), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 32,  
                out_channels = 64, # expand output feature map volume's depth to 64
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(64), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
        )
        
        
        # ====================== FINAL LINEAR BLOCK ==========================
        # Linear layer takes the final concatenated embedding tensor
        #   from the parallel 2D CNN and transformer blocks and outputs 8 logits
        # Full transformer block outputs a 40*64 feature map, which we time-average to a length-40 1D array
        # 512*2 + 40 == 1064 input features --> 8 output emotions
        
        self.fc1_linear = nn.Linear(512 * 2 + 40, num_emotions)
        
        # SOFTMAX layer for the 8 output logits from the final FC linear layer
        self.softmax_out = nn.Softmax(dim = 1) # dim == 1 is the class (logit) dimension
        
    
    # define one complete parallel fwd pass of input features tensor through 2*conv+1*transformer blocks
    
    def forward(self, x):
        
        # =========== 1st parallel Conv2D block: 3 conv layers =================
        # create the final feature embedding from the 1st conv block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding1 = self.conv2Dblock1(x) # x = input features (N * C * H * W)
        
        # flatten final 64*1*8 feature map from the conv block to a length-512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim = 1)
        
        
        # =========== 2nd parallel Conv2D block: 3 conv layers =================
        # create the final feature embedding from the 2nd conv block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding2 = self.conv2Dblock2(x) # x = input features (N * C * H * W)
        
        # flatten final 64*1*8 feature map from the conv block to a length-512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim = 1)
        
        
        # ============ 4-encoder-layer Transformer block =================
        
        # maxpool input feature map along time: N*1*40*259 w/ 1*4 kernel --> N*1*40*64
        x_maxpool = self.transformer_maxpool(x)
        
        # remove channel dim: N*1*40*64 --> N*40*64
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)   # squeeze removes the size-1 channel dimension
        
        # convert maxpooled feature map format: batch * freq * time --> time * batch * freq format
        # because transformer encoder layer requires tensor in format: time * batch * embedding (freq)
        x = x_maxpool_reduced.permute(2, 0, 1)
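        # (note: nn.TransformerEncoderLayer defaults to (seq_len, batch, d_model) input;
        #  PyTorch 1.9+ also offers batch_first=True to keep (batch, seq, feature) instead)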
        
        # finally, pass reduced input feature map 'x' into transformer encoder
        transformer_output = self.transformer_encoder(x)
        
        ## create final feature embedding from the transformer block by taking the mean over the time dimension (now the 0th dim)
        # transformer outputs a 64*40 (time * MFCC embedding) feature map per sample; take the mean over time, i.e. a time average
        transformer_embedding = torch.mean(transfomer_output, dim = 0) 
        
        
        # ============ concatenate frequency embeddings from conv and transformer blocks ===============
        # concatenate tensors output by parallel 2*conv and 1*transformer blocks
        complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim = 1)
        
        
        # ============ final FC linear layer, need logits for loss =================
        output_logits = self.fc1_linear(complete_embedding)
        
        # ============ final softmax layer: use logits from FC linear, get softmax for prediction ==========
        output_softmax = self.softmax_out(output_logits)
        
        
        # need output logits to compute cross entropy loss, need softmax probabilities to predict class
        return output_logits, output_softmax
# need device to instantiate model
device = 'cuda'

# instantiate model 
model = parallel_all_you_want(len(emotions_dict)).to(device)

# include input feature map dims in call to summary()
summary(model, input_size=(1, 40, 259))

Input shapes: X_train: (26727, 1, 40, 259), y_train: (26727,)

I’m getting this error:
AttributeError: 'NoneType' object has no attribute 'size'
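
For context, this message by itself only says that `.size` was accessed on a None value somewhere (often inside a forward hook of a summary utility), so the full traceback is needed to locate the call site. A minimal snippet that reproduces the same message:

t = None
t.size()  # AttributeError: 'NoneType' object has no attribute 'size'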

I’m unable to reproduce the reported error, but I hit other errors instead:

# instantiate model 
model = parallel_all_you_want(10)

summary(model, input_size=(1, 40, 259))
# ValueError: expected 4D input (got 3D input)

which raises an error due to a wrong shape. Fixing this yields:

summary(model, input_size=(1, 1, 40, 259))
# NameError: name 'transfomer_output' is not defined

since you have a typo in your model. After fixing this, it works:

summary(model, input_size=(1, 1, 40, 259))
# ===============================================================================================
# Layer (type:depth-idx)                        Output Shape              Param #
# ===============================================================================================
# parallel_all_you_want                         [1, 10]                   --
# ├─Sequential: 1-1                             [1, 64, 1, 8]             --
# │    └─Conv2d: 2-1                            [1, 16, 40, 259]          160
# │    └─BatchNorm2d: 2-2                       [1, 16, 40, 259]          32
# │    └─ReLU: 2-3                              [1, 16, 40, 259]          --
# │    └─MaxPool2d: 2-4                         [1, 16, 20, 129]          --
# │    └─Dropout: 2-5                           [1, 16, 20, 129]          --
# │    └─Conv2d: 2-6                            [1, 32, 20, 129]          4,640
# │    └─BatchNorm2d: 2-7                       [1, 32, 20, 129]          64
# │    └─ReLU: 2-8                              [1, 32, 20, 129]          --
# │    └─MaxPool2d: 2-9                         [1, 32, 5, 32]            --
# │    └─Dropout: 2-10                          [1, 32, 5, 32]            --
# │    └─Conv2d: 2-11                           [1, 64, 5, 32]            18,496
# │    └─BatchNorm2d: 2-12                      [1, 64, 5, 32]            128
# │    └─ReLU: 2-13                             [1, 64, 5, 32]            --
# │    └─MaxPool2d: 2-14                        [1, 64, 1, 8]             --
# │    └─Dropout: 2-15                          [1, 64, 1, 8]             --
# ├─Sequential: 1-2                             [1, 64, 1, 8]             --
# │    └─Conv2d: 2-16                           [1, 16, 40, 259]          160
# │    └─BatchNorm2d: 2-17                      [1, 16, 40, 259]          32
# │    └─ReLU: 2-18                             [1, 16, 40, 259]          --
# │    └─MaxPool2d: 2-19                        [1, 16, 20, 129]          --
# │    └─Dropout: 2-20                          [1, 16, 20, 129]          --
# │    └─Conv2d: 2-21                           [1, 32, 20, 129]          4,640
# │    └─BatchNorm2d: 2-22                      [1, 32, 20, 129]          64
# │    └─ReLU: 2-23                             [1, 32, 20, 129]          --
# │    └─MaxPool2d: 2-24                        [1, 32, 5, 32]            --
# │    └─Dropout: 2-25                          [1, 32, 5, 32]            --
# │    └─Conv2d: 2-26                           [1, 64, 5, 32]            18,496
# │    └─BatchNorm2d: 2-27                      [1, 64, 5, 32]            128
# │    └─ReLU: 2-28                             [1, 64, 5, 32]            --
# │    └─MaxPool2d: 2-29                        [1, 64, 1, 8]             --
# │    └─Dropout: 2-30                          [1, 64, 1, 8]             --
# ├─MaxPool2d: 1-3                              [1, 1, 40, 64]            --
# ├─TransformerEncoder: 1-4                     [64, 1, 40]               --
# │    └─ModuleList: 2-31                       --                        --
# │    │    └─TransformerEncoderLayer: 3-1      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-2      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-3      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-4      [64, 1, 40]               48,232
# ├─Linear: 1-5                                 [1, 10]                   10,650
# ├─Softmax: 1-6                                [1, 10]                   --
# ===============================================================================================
# Total params: 250,618
# Trainable params: 250,618
# Non-trainable params: 0
# Total mult-adds (M): 43.86
# ===============================================================================================
# Input size (MB): 0.04
# Forward/backward pass size (MB): 9.57
# Params size (MB): 0.90
# Estimated Total Size (MB): 10.51
# ===============================================================================================
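
As a quick arithmetic check, the summary also confirms why the final linear layer expects 1064 input features:

# each conv branch flattens its (64, 1, 8) output map to 64*1*8 = 512 features;
# the time-averaged transformer embedding contributes d_model = 40 more
conv_branch = 64 * 1 * 8                 # 512
assert 2 * conv_branch + 40 == 1064      # matches nn.Linear(512 * 2 + 40, num_emotions)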

Thank you @ptrblck for looking into my problem.
I tried the solution and added one more dimension (1, 1, 40, 259). But now conv2d is not compatible: it expects a 4D input (batch * channel * h * w), and the input has increased to 5D.
Here is what I got:
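
If the extra dimension was also added to the data, that would explain the 5D input: torchinfo's input_size already includes the batch dimension, and X_train already carries its channel dimension, so no extra unsqueeze is needed anywhere. A minimal sketch of the distinction (the commented unsqueeze line is hypothetical, shown only as the pattern to avoid):

summary(model, input_size=(1, 1, 40, 259))      # (N, C, H, W): batch dim is part of input_size

batch = torch.from_numpy(X_train[:4]).float()   # (4, 1, 40, 259): already 4D
# batch = batch.unsqueeze(1)                    # DON'T: this would make it 5D and break Conv2d
logits, probs = model(batch.to(device))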

The website I’m referring the code from is given below; it might help you analyze how it works more clearly.

You are right and I forgot to mention that I’ve also removed the unsqueezing operation in the forward method:

x = self.pool(F.relu(self.conv1(x)))

EDIT: sorry, I think this was the wrong topic and thus the wrong model; your code still works for me after fixing the aforementioned issues.

Okay, I’ll check this out. Thanks for your consideration.

Just to be doubly sure, could you please send me the exact code you used when running it? I’d like to try that one. If the same error occurs, then I’d suspect the data. Is there any chance the error is occurring due to bad data?

Sure, here is the full code:

import torch
import torch.nn as nn


class parallel_all_you_want(nn.Module):      # base module
    # Define all layers present in the network
    def __init__(self, num_emotions):
        super().__init__()                # initializes internal module state.
        
        ################## TRANSFORMER BLOCK #####################
        # MAXPOOL THE INPUT FEATURE MAP/TENSOR TO THE TRANSFORMER
        # rectangular kernel worked better here for the rectangular input
        self.transformer_maxpool = nn.MaxPool2d(kernel_size = [1, 4], stride = [1, 4])
        
        # define single transformer encoder layer
        # self-attention + feedforward network from "Attention is All you need"
        # 4 attention heads per layer, each layer with a 40 --> 512 --> 40 feedforward network
        transformer_layer = nn.TransformerEncoderLayer(
            d_model = 40, # input feature (frequency) dim after maxpooling: 40*259 -> 40*64 (MFCC*time)
            nhead = 4, # 4 attention heads in each multi-head self-attention block
            dim_feedforward = 512, # 2 layers in each encoder block's feedforward network: dim 40 --> 512 --> 40
            dropout = 0.4,
            activation = 'relu' # ReLU: avoids saturation/vanishing gradients, reduces compute time
            
        )
        
        # I'm using 4 instead of the 6 identical stacked encoder layers used in the Attention Is All You Need paper
        # Complete transformer block contains 4 full transformer encoder layers (each w/ multihead self-attention+feedforward)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers = 4)
        
        
        
        # ======================== 1ST PARALLEL 2D CONVOLUTION BLOCK ===========================
        
        # 3 sequential conv2D layers: (1, 40, 259) --> (16, 20, 129) --> (32, 5, 32) --> (64, 1, 8)
        self.conv2Dblock1 = nn.Sequential(
            
            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels = 1,  # input volume depth == input channel dim == 1
                out_channels = 16, # expand output feature map volume's depth to 16
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 2, stride = 2), #typical maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 2nd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 16,  
                out_channels = 32, # expand output feature map volume's depth to 32
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(32), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 32,  
                out_channels = 64, # expand output feature map volume's depth to 64
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(64), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
        )
        
        
        # ======================== 2ND PARALLEL 2D CONVOLUTION BLOCK ===========================
        
        # 3 sequential conv2D layers: (1, 40, 259) --> (16, 20, 129) --> (32, 5, 32) --> (64, 1, 8)
        self.conv2Dblock2 = nn.Sequential(
            
            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels = 1,  # input volume depth == input channel dim == 1
                out_channels = 16, # expand output feature map volume's depth to 16
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 2, stride = 2), #typical maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 2nd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 16,  
                out_channels = 32, # expand output feature map volume's depth to 32
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(32), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 32,  
                out_channels = 64, # expand output feature map volume's depth to 64
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(64), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
        )
        
        
        # ====================== FINAL LINEAR BLOCK ==========================
        # Linear layer takes the final concatenated embedding tensor
        #   from the parallel 2D CNN and transformer blocks and outputs 8 logits
        # Full transformer block outputs a 40*64 feature map, which we time-average to a length-40 1D array
        # 512*2 + 40 == 1064 input features --> 8 output emotions
        
        self.fc1_linear = nn.Linear(512 * 2 + 40, num_emotions)
        
        # SOFTMAX layer for the 8 output logits from the final FC linear layer
        self.softmax_out = nn.Softmax(dim = 1) # dim == 1 is the class (logit) dimension
        
    
    # define one complete parallel fwd pass of input features tensor through 2*conv+1*transformer blocks
    
    def forward(self, x):
        
        # =========== 1st parallel Conv2D block: 3 conv layers =================
        # create the final feature embedding from the 1st conv block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding1 = self.conv2Dblock1(x) # x = input features (N * C * H * W)
        
        # flatten final 64*1*8 feature map from the conv block to a length-512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim = 1)
        
        
        # =========== 2nd parallel Conv2D block: 3 conv layers =================
        # create the final feature embedding from the 2nd conv block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding2 = self.conv2Dblock2(x) # x = input features (N * C * H * W)
        
        # flatten final 64*1*8 feature map from the conv block to a length-512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim = 1)
        
        
        # ============ 4-encoder-layer Transformer block =================
        
        # maxpool input feature map along time: N*1*40*259 w/ 1*4 kernel --> N*1*40*64
        x_maxpool = self.transformer_maxpool(x)
        
        # remove channel dim: N*1*40*64 --> N*40*64
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)   # squeeze removes the size-1 channel dimension
        
        # convert maxpooled feature map format: batch * freq * time --> time * batch * freq format
        # because transformer encoder layer requires tensor in format: time * batch * embedding (freq)
        x = x_maxpool_reduced.permute(2, 0, 1)
        
        # finally, pass reduced input feature map 'x' into transformer encoder
        transformer_output = self.transformer_encoder(x)
        
        ## create final feature embedding from the transformer block by taking the mean over the time dimension (now the 0th dim)
        # transformer outputs a 64*40 (time * MFCC embedding) feature map per sample; take the mean over time, i.e. a time average
        transformer_embedding = torch.mean(transformer_output, dim = 0) 
        
        
        # ============ concatenate frequency embeddings from conv and transformer blocks ===============
        # concatenate tensors output by parallel 2*conv and 1*transformer blocks
        complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim = 1)
        
        
        # ============ final FC linear layer, need logits for loss =================
        output_logits = self.fc1_linear(complete_embedding)
        
        # ============ final softmax layer: use logits from FC linear, get softmax for prediction ==========
        output_softmax = self.softmax_out(output_logits)
        
        
        # need output logits to compute cross entropy loss, need softmax probabilities to predict class
        return output_logits, output_softmax
    

from torchinfo import summary

# instantiate model 
model = parallel_all_you_want(10)

# summary(model, input_size=(1, 40, 259))
# ValueError: expected 4D input (got 3D input)

summary(model, input_size=(1, 1, 40, 259))
# ===============================================================================================
# Layer (type:depth-idx)                        Output Shape              Param #
# ===============================================================================================
# parallel_all_you_want                         [1, 10]                   --
# ├─Sequential: 1-1                             [1, 64, 1, 8]             --
# │    └─Conv2d: 2-1                            [1, 16, 40, 259]          160
# │    └─BatchNorm2d: 2-2                       [1, 16, 40, 259]          32
# │    └─ReLU: 2-3                              [1, 16, 40, 259]          --
# │    └─MaxPool2d: 2-4                         [1, 16, 20, 129]          --
# │    └─Dropout: 2-5                           [1, 16, 20, 129]          --
# │    └─Conv2d: 2-6                            [1, 32, 20, 129]          4,640
# │    └─BatchNorm2d: 2-7                       [1, 32, 20, 129]          64
# │    └─ReLU: 2-8                              [1, 32, 20, 129]          --
# │    └─MaxPool2d: 2-9                         [1, 32, 5, 32]            --
# │    └─Dropout: 2-10                          [1, 32, 5, 32]            --
# │    └─Conv2d: 2-11                           [1, 64, 5, 32]            18,496
# │    └─BatchNorm2d: 2-12                      [1, 64, 5, 32]            128
# │    └─ReLU: 2-13                             [1, 64, 5, 32]            --
# │    └─MaxPool2d: 2-14                        [1, 64, 1, 8]             --
# │    └─Dropout: 2-15                          [1, 64, 1, 8]             --
# ├─Sequential: 1-2                             [1, 64, 1, 8]             --
# │    └─Conv2d: 2-16                           [1, 16, 40, 259]          160
# │    └─BatchNorm2d: 2-17                      [1, 16, 40, 259]          32
# │    └─ReLU: 2-18                             [1, 16, 40, 259]          --
# │    └─MaxPool2d: 2-19                        [1, 16, 20, 129]          --
# │    └─Dropout: 2-20                          [1, 16, 20, 129]          --
# │    └─Conv2d: 2-21                           [1, 32, 20, 129]          4,640
# │    └─BatchNorm2d: 2-22                      [1, 32, 20, 129]          64
# │    └─ReLU: 2-23                             [1, 32, 20, 129]          --
# │    └─MaxPool2d: 2-24                        [1, 32, 5, 32]            --
# │    └─Dropout: 2-25                          [1, 32, 5, 32]            --
# │    └─Conv2d: 2-26                           [1, 64, 5, 32]            18,496
# │    └─BatchNorm2d: 2-27                      [1, 64, 5, 32]            128
# │    └─ReLU: 2-28                             [1, 64, 5, 32]            --
# │    └─MaxPool2d: 2-29                        [1, 64, 1, 8]             --
# │    └─Dropout: 2-30                          [1, 64, 1, 8]             --
# ├─MaxPool2d: 1-3                              [1, 1, 40, 64]            --
# ├─TransformerEncoder: 1-4                     [64, 1, 40]               --
# │    └─ModuleList: 2-31                       --                        --
# │    │    └─TransformerEncoderLayer: 3-1      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-2      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-3      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-4      [64, 1, 40]               48,232
# ├─Linear: 1-5                                 [1, 10]                   10,650
# ├─Softmax: 1-6                                [1, 10]                   --
# ===============================================================================================
# Total params: 250,618
# Trainable params: 250,618
# Non-trainable params: 0
# Total mult-adds (M): 43.86
# ===============================================================================================
# Input size (MB): 0.04
# Forward/backward pass size (MB): 9.57
# Params size (MB): 0.90
# Estimated Total Size (MB): 10.51
# ===============================================================================================
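
One follow-up note for training: nn.CrossEntropyLoss expects the raw logits (it applies log-softmax internally), so the softmax output is only for reading off class predictions. A minimal training-step sketch under that assumption (the optimizer choice and learning rate are placeholders):

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

def train_step(x, y):
    optimizer.zero_grad()
    output_logits, output_softmax = model(x)
    loss = criterion(output_logits, y)                # feed logits, not softmax probabilities
    loss.backward()
    optimizer.step()
    return loss.item(), output_softmax.argmax(dim=1)  # loss + predicted classes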

Finally, the model is error-free now. There was a typo. Thanks for your invaluable support. :sparkling_heart:
