`AttributeError: 'NoneType' object has no attribute 'size'`

Pranav_Belhekar · February 26, 2023, 12:09pm

I’m trying to feed the input features into a model architecture with 3 parallel branches (2 × CNN + a transformer encoder).

# TODO: change nn.Sequential to take a dict of named layers to make it more readable


class parallel_all_you_want(nn.Module):      # base module
    """Two parallel 2D-CNN branches plus a Transformer-encoder branch feeding one linear classifier.

    The same input tensor is passed through both convolution stacks and, after
    max-pooling along its last axis, through a 4-layer Transformer encoder. The
    three resulting embeddings are concatenated and mapped to ``num_emotions``
    logits. ``forward`` returns ``(logits, softmax_probabilities)``.

    NOTE(review): the shape comments below assume a (N, 1, 40, 259) input, the
    X_train shape quoted in this thread -- TODO confirm against the data pipeline.
    """

    # Define all layers present in the network
    def __init__(self, num_emotions):
        """Create all layers; ``num_emotions`` is the number of output classes (logits)."""
        super().__init__()                # initializes internal module state.
        
        ################## TRANSFORMER BLOCK #####################
        # MAXPOOL THE INPUT FEATURE MAP/TENSOR TO THE TRANSFORMER
        # rectangular kernel worked better here for the rectangular input
        # 1x4 kernel/stride: only the last axis is shortened (259 -> 64 for a 1*40*259 input)
        self.transformer_maxpool = nn.MaxPool2d(kernel_size = [1, 4], stride = [1, 4])
        
        # define single transformer encoder layer
        # self-attention + feedforward network from "Attention is All You Need"
        # 4 attention heads, followed by a 40 --> 512 --> 40 feedforward network
        transformer_layer = nn.TransformerEncoderLayer(
            d_model = 40, # embedding size per time step == size of the feature (frequency) axis, which pooling leaves at 40
            nhead = 4, # 4 attention heads in each multi-head self-attention
            dim_feedforward = 512, # 2 layers in each encoder block's feedforward network: dim 40-->512--->40
            dropout = 0.4,
            activation = 'relu' # ReLU: avoid saturation/time gradient/ reduce compute time
            
        )
        
        # I'm using 4 instead of the 6 identical stacked encoder layers used in the Attention is All You Need paper
        # Complete transformer block contains 4 full transformer encoder layers (each w/ multihead self-attention+feedforward)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers = 4)
        
        
        
        # ======================== 1ST PARALLEL 2D CONVOLUTION BLOCK ===========================
        
        # 3 sequential conv2D layers; for a (1, 40, 259) input: --> (16, 20, 129) -> (32, 5, 32) -> (64, 1, 8)
        self.conv2Dblock1 = nn.Sequential(
            
            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels = 1,  # input volume depth == input channel dim == 1
                out_channels = 16, # expand output feature map volume's depth to 16
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 2, stride = 2), #typical maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 2nd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 16,  
                out_channels = 32, # expand output feature map volume's depth to 32
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(32), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 32,  
                out_channels = 64, # expand output feature map volume's depth to 64
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(64), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
        )
        
        
        # ======================== 2ND PARALLEL 2D CONVOLUTION BLOCK ===========================
        
        # same layer stack as block 1, with its own separate weights
        # 3 sequential conv2D layers; for a (1, 40, 259) input: --> (16, 20, 129) -> (32, 5, 32) -> (64, 1, 8)
        self.conv2Dblock2 = nn.Sequential(
            
            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels = 1,  # input volume depth == input channel dim == 1
                out_channels = 16, # expand output feature map volume's depth to 16
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 2, stride = 2), #typical maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 2nd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 16,  
                out_channels = 32, # expand output feature map volume's depth to 32
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(32), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
            
            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels = 32,  
                out_channels = 64, # expand output feature map volume's depth to 64
                kernel_size = 3, # 3*3 stride 1 kernel
                stride = 1,
                padding = 1
            ),
            
            nn.BatchNorm2d(64), # batch normalize the output feature map before activation
            nn.ReLU(),  # feature map --> activation map
            nn.MaxPool2d(kernel_size = 4, stride = 4), # increase maxpool kernel size
            nn.Dropout(p = 0.3),
        )
        
        
        # ====================== FINAL LINEAR BLOCK ==========================
        # Linear layer that takes the final concatenated embedding tensor
        #   from the parallel 2D CNNs and the transformer block and outputs num_emotions logits
        # Each conv block contributes 64*1*8 == 512 features (for a 1*40*259 input);
        #   the transformer output is time-averaged to a length-40 vector
        # 512*2+40 == 1064 input features --> num_emotions output logits
        
        self.fc1_linear = nn.Linear(512 * 2 + 40, num_emotions)
        
        # SOFTMAX layer over the output logits from the final FC linear layer
        self.softmax_out = nn.Softmax(dim = 1) # dim == 1 is the class (logit) axis of the (batch, num_emotions) output
        
    
    # define one complete parallel fwd pass of input features tensor through 2*conv+1*transformer blocks
    
    def forward(self, x):
        """Run both conv branches and the transformer branch on ``x``; return ``(logits, softmax)``.

        NOTE(review): assumes ``x`` is 4-D (batch, 1, 40, time) -- TODO confirm with the caller.
        """
        
        # =========== 1st parallel Conv2D block: 3 conv layers =================
        # create final feature embedding from the 1st conv block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding1 = self.conv2Dblock1(x) # x = input features, presumably (N, C, H, W)
        
        # flatten final 64*1*8 feature map from the conv block to a length-512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim = 1)
        
        
        # =========== 2nd parallel Conv2D block: 3 conv layers =================
        # create final feature embedding from the 2nd conv block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding2 = self.conv2Dblock2(x) # x = input features, presumably (N, C, H, W)
        
        # flatten final 64*1*8 feature map from the conv block to a length-512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim = 1)
        
        
        # ============ 4-encoder-layer Transformer block =================
        
        # maxpool input feature map: 1*40*259 w/ 1*4 kernel  --> 1*40*64
        x_maxpool = self.transformer_maxpool(x)
        
        # remove channel dim: 1*40*64 --> 40*64
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)   # drops dim 1 only if it has size 1
        
        # convert maxpooled feature map format: batch * freq * time --> time * batch * freq format
        # because transformer encoder layer requires tensor in format: time * batch * embedding (freq)
        x = x_maxpool_reduced.permute(2, 0, 1)
        
        # finally, pass reduced input feature map 'x' into transformer encoder
        transformer_output = self.transformer_encoder(x)
        
        ## create final feature embedding from transformer layer by taking mean in the time dimension (now the 0th dim)
        # transformer output is time * batch * 40; averaging over dim 0 leaves batch * 40
        # NOTE(review): 'transfomer_output' below is misspelled -- the variable assigned above is
        #   'transformer_output' -- so this line raises NameError when forward() runs
        transformer_embedding = torch.mean(transfomer_output, dim = 0) 
        
        
        # ============ concatenate frequency embeddings from conv and transformer blocks ===============
        # concatenate tensors output by parallel 2*conv and 1*transformer blocks
        complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim = 1)
        
        
        # ============ final FC linear layer, need logits for loss =================
        output_logits = self.fc1_linear(complete_embedding)
        
        # ============ final softmax layer: use logits from FC linear, get softmax for prediction ==========
        output_softmax = self.softmax_out(output_logits)
        
        
        # need output logits to compute cross entropy loss, need softmax probabilities to predict class
        return output_logits, output_softmax

# need device to instantiate model
# NOTE(review): hard-codes CUDA; presumably `.to(device)` below fails on a machine without a GPU -- confirm target hardware
device = 'cuda'

# instantiate model with one output logit per entry of emotions_dict
# NOTE(review): emotions_dict is not defined in this snippet; presumably a label mapping created earlier in the notebook
model = parallel_all_you_want(len(emotions_dict)).to(device)

# include input feature map dims in call to summary()
# NOTE(review): `summary` is not imported in this snippet. With torchinfo's summary(), input_size
#   must include the batch axis, i.e. (1, 1, 40, 259); the 3-element size below is rejected with
#   "expected 4D input (got 3D input)" -- confirm which summary library is in use
summary(model, input_size=(1, 40, 259))

x_train input shape = X_train:(26727, 1, 40, 259), y_train:(26727,)

I’m getting error:
AttributeError: 'NoneType' object has no attribute 'size'

ptrblck · February 26, 2023, 8:29pm

I’m unable to reproduce the reported error, but I do get other errors instead:

# instantiate model 
model = parallel_all_you_want(10)

summary(model, input_size=(1, 40, 259))
# ValueError: expected 4D input (got 3D input)

which raises an error due to a wrong shape. Fixing this yields:

summary(model, input_size=(1, 1, 40, 259))
# NameError: name 'transfomer_output' is not defined

since you have a typo in your model (`transfomer_output` instead of `transformer_output`). After fixing this as well, the model works:

summary(model, input_size=(1, 1, 40, 259))
# ===============================================================================================
# Layer (type:depth-idx)                        Output Shape              Param #
# ===============================================================================================
# parallel_all_you_want                         [1, 10]                   --
# ├─Sequential: 1-1                             [1, 64, 1, 8]             --
# │    └─Conv2d: 2-1                            [1, 16, 40, 259]          160
# │    └─BatchNorm2d: 2-2                       [1, 16, 40, 259]          32
# │    └─ReLU: 2-3                              [1, 16, 40, 259]          --
# │    └─MaxPool2d: 2-4                         [1, 16, 20, 129]          --
# │    └─Dropout: 2-5                           [1, 16, 20, 129]          --
# │    └─Conv2d: 2-6                            [1, 32, 20, 129]          4,640
# │    └─BatchNorm2d: 2-7                       [1, 32, 20, 129]          64
# │    └─ReLU: 2-8                              [1, 32, 20, 129]          --
# │    └─MaxPool2d: 2-9                         [1, 32, 5, 32]            --
# │    └─Dropout: 2-10                          [1, 32, 5, 32]            --
# │    └─Conv2d: 2-11                           [1, 64, 5, 32]            18,496
# │    └─BatchNorm2d: 2-12                      [1, 64, 5, 32]            128
# │    └─ReLU: 2-13                             [1, 64, 5, 32]            --
# │    └─MaxPool2d: 2-14                        [1, 64, 1, 8]             --
# │    └─Dropout: 2-15                          [1, 64, 1, 8]             --
# ├─Sequential: 1-2                             [1, 64, 1, 8]             --
# │    └─Conv2d: 2-16                           [1, 16, 40, 259]          160
# │    └─BatchNorm2d: 2-17                      [1, 16, 40, 259]          32
# │    └─ReLU: 2-18                             [1, 16, 40, 259]          --
# │    └─MaxPool2d: 2-19                        [1, 16, 20, 129]          --
# │    └─Dropout: 2-20                          [1, 16, 20, 129]          --
# │    └─Conv2d: 2-21                           [1, 32, 20, 129]          4,640
# │    └─BatchNorm2d: 2-22                      [1, 32, 20, 129]          64
# │    └─ReLU: 2-23                             [1, 32, 20, 129]          --
# │    └─MaxPool2d: 2-24                        [1, 32, 5, 32]            --
# │    └─Dropout: 2-25                          [1, 32, 5, 32]            --
# │    └─Conv2d: 2-26                           [1, 64, 5, 32]            18,496
# │    └─BatchNorm2d: 2-27                      [1, 64, 5, 32]            128
# │    └─ReLU: 2-28                             [1, 64, 5, 32]            --
# │    └─MaxPool2d: 2-29                        [1, 64, 1, 8]             --
# │    └─Dropout: 2-30                          [1, 64, 1, 8]             --
# ├─MaxPool2d: 1-3                              [1, 1, 40, 64]            --
# ├─TransformerEncoder: 1-4                     [64, 1, 40]               --
# │    └─ModuleList: 2-31                       --                        --
# │    │    └─TransformerEncoderLayer: 3-1      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-2      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-3      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-4      [64, 1, 40]               48,232
# ├─Linear: 1-5                                 [1, 10]                   10,650
# ├─Softmax: 1-6                                [1, 10]                   --
# ===============================================================================================
# Total params: 250,618
# Trainable params: 250,618
# Non-trainable params: 0
# Total mult-adds (M): 43.86
# ===============================================================================================
# Input size (MB): 0.04
# Forward/backward pass size (MB): 9.57
# Params size (MB): 0.90
# Estimated Total Size (MB): 10.51
# ===============================================================================================

Pranav_Belhekar · February 26, 2023, 10:00pm

Thank you @ptrblck for considering my problem.
I tried the solution and added one more dimension, (1, 1, 40, 259). But now Conv2d rejects the input: it expects 4 dimensions (batch * channel * w * h), and the input has grown to 5.
Here is what I got:

The code I’m referring to comes from the repository linked below; it might help you analyze how the model works more clearly.

github.com

IliaZenkov/transformer-cnn-emotion-recognition/blob/main/notebooks/Parallel_is_All_You_Want.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "vZ9z9-sX60Ct"
   },
   "source": [
    "# Parallel is All You Want: Combining Spatial and Temporal Feature Representions of Speech Emotion by Parallelizing CNNs and Transformer-Encoders\n",
    "# Abstract\n",
    "In this notebook, I'm going to build upon my [Intro to Speech Audio Classification repo](https://github.com/IliaZenkov/sklearn-audio-classification) and build two parallel convolutional neural networks (CNN) in parallel with a Transformer encoder network to classify audio data. We're working on the [RAVDESS dataset](https://smartlaboratory.org/ravdess/) to classify emotions from one of 8 classes. We combine the CNN for spatial feature representation and the Transformer for temporal feature representation. We augment the training data by increasing variation in the dataset to reduce overfitting; we use Additive White Gaussian Noise (AWGN) to augment the RAVDESS dataset three-fold for a total of 4320 audio samples.\n",
    "\n",
    "We harness the image-classification and spatial feature representation power of the CNN by treating MFCC plots as grayscale images; their width is a time scale, their height is a frequency scale. The value of each pixel in the MFCC is the intensity of the audio signal at a particular range of mel frequencies at a time step. \n",
    "\n",
    "Because of the sequential nature of the data, we will also use the Transformer to try and model as accurately as possible the temporal relationships between pitch transitions in emotions.  \n",
    "\n",
    "This notebook takes inspirations from a variety of recent advances in deep learning and network architectures; in particular, stacked and parallel CNN networks combined with multi-head self-attention layers from the Transformer Encoder. I hypothesize that the expansion of CNN filter channel dimensions and reduction of feature maps will provide the most expressive feature representation at the lowest computational cost, while the Transformer-Encoder is used with the hypothesis that the network will learn to predict frequency distributions of different emotions according to the global structure of the MFCC plot (and indirectly, mel spectrogram) of each emotion. **With the strength of the CNN in spatial feature representation and Transformer in sequence encoding, I manage to achieve an 80.44% accuracy on a hold-out test set from the RAVDESS dataset.**\n",
    "\n",
    "<!--TABLE OF CONTENTS-->\n",
    "# Table of Contents\n",

This file has been truncated. show original

ptrblck · February 26, 2023, 10:28pm

You are right and I forgot to mention that I’ve also removed the unsqueezing operation in the forward method:

x = self.pool(F.relu(self.conv1(x)))

EDIT: sorry, I think this was the wrong topic and thus the wrong model and your code still works for me after fixing the aforementioned issues.

Pranav_Belhekar · February 27, 2023, 3:56am

Okay, I must check this out. Thanks for your consideration.

Pranav_Belhekar · February 27, 2023, 4:01am

Just to be doubly sure, could you please send me the exact code you ran? I think I should try that version. If the same error occurs, then I would suspect the data. Is there any chance that the error is caused by bad data?

ptrblck · February 27, 2023, 4:19am

Sure, here is the full code:

import torch
import torch.nn as nn


class parallel_all_you_want(nn.Module):
    """Two parallel 2D-CNN branches plus a Transformer-encoder branch and a linear classifier.

    The same 4-D input ``(batch, 1, 40, time)`` is fed to:

    * two identically structured (but separately parameterised) convolution
      stacks, each flattened to a 512-element embedding, and
    * a 4-layer Transformer encoder that sees the input max-pooled along the
      time axis and whose output is averaged over time to a 40-element embedding.

    The three embeddings are concatenated (512 * 2 + 40 == 1064 features) and
    mapped to ``num_emotions`` logits.

    The fixed 1064-feature classifier input means the time axis must reduce to
    8 steps in the conv branches (2 * 4 * 4 pooling); a width of 259 does, as
    in the thread's ``(N, 1, 40, 259)`` data.

    Args:
        num_emotions: number of output classes (size of the logit vector).
    """

    def __init__(self, num_emotions):
        super().__init__()

        # ---------------- Transformer branch ----------------
        # A 1x4 kernel/stride pools only the time axis (e.g. 259 -> 64) and
        # leaves the 40-bin feature axis, which becomes d_model, untouched.
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1, 4], stride=[1, 4])

        # One encoder layer: 4-head self-attention + 40 -> 512 -> 40 feedforward.
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=40,
            nhead=4,
            dim_feedforward=512,
            dropout=0.4,
            activation='relu',
        )
        # 4 stacked encoder layers (the original paper uses 6).
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        # ---------------- Parallel convolution branches ----------------
        # Built in this order (after the transformer, before the classifier) so
        # parameter registration and seeded initialisation match the layout
        # where both stacks were written out inline.
        self.conv2Dblock1 = self._make_conv_block()
        self.conv2Dblock2 = self._make_conv_block()

        # ---------------- Classifier head ----------------
        self.fc1_linear = nn.Linear(512 * 2 + 40, num_emotions)
        # dim=1 is the class axis of the (batch, num_emotions) logits.
        self.softmax_out = nn.Softmax(dim=1)

    @staticmethod
    def _make_conv_block():
        """Build one conv stack: 3 x (Conv3x3 -> BatchNorm -> ReLU -> MaxPool -> Dropout).

        Channels grow 1 -> 16 -> 32 -> 64 while pooling uses kernels 2, 4, 4, so a
        (1, 40, 259) map shrinks to (16, 20, 129) -> (32, 5, 32) -> (64, 1, 8).
        """
        layers = []
        in_channels = 1
        for out_channels, pool in ((16, 2), (32, 4), (64, 4)):
            layers += [
                # padding=1 with a 3x3 kernel keeps the spatial size; only the pool shrinks it
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=pool, stride=pool),
                nn.Dropout(p=0.3),
            ]
            in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run all three branches on ``x`` and classify the concatenated embedding.

        Args:
            x: tensor of shape ``(batch, 1, 40, time)``.

        Returns:
            ``(output_logits, output_softmax)``, both ``(batch, num_emotions)``;
            use the logits for a cross-entropy loss and the softmax for prediction.

        Raises:
            ValueError: if ``x`` is not 4-dimensional (e.g. the batch axis is missing).
        """
        # Fail early with an actionable message instead of a layer-internal error.
        if x.dim() != 4:
            raise ValueError(
                f"expected 4D input (batch, 1, freq, time), got {x.dim()}D input "
                f"with shape {tuple(x.shape)}"
            )

        # Conv branches: (batch, 64, 1, 8) feature maps flattened to (batch, 512).
        conv2d_embedding1 = torch.flatten(self.conv2Dblock1(x), start_dim=1)
        conv2d_embedding2 = torch.flatten(self.conv2Dblock2(x), start_dim=1)

        # Transformer branch: pool time, drop the singleton channel axis, then
        # reorder (batch, freq, time) -> (time, batch, freq), the layout the
        # encoder expects by default (batch_first=False).
        x_maxpool = self.transformer_maxpool(x)
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)
        sequence = x_maxpool_reduced.permute(2, 0, 1)
        transformer_output = self.transformer_encoder(sequence)

        # Average over the time axis (dim 0): (time, batch, 40) -> (batch, 40).
        transformer_embedding = torch.mean(transformer_output, dim=0)

        # Concatenate along the feature axis: (batch, 512 + 512 + 40).
        complete_embedding = torch.cat(
            [conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim=1
        )

        output_logits = self.fc1_linear(complete_embedding)
        output_softmax = self.softmax_out(output_logits)
        return output_logits, output_softmax
    

from torchinfo import summary

# Build the network with 10 output classes.
model = parallel_all_you_want(num_emotions=10)

# torchinfo needs the batch axis in ``input_size``. The 3-element form fails:
#   summary(model, input_size=(1, 40, 259))
#   ValueError: expected 4D input (got 3D input)
# so pass the full (batch, channel, freq, time) shape instead.
summary(model, input_size=(1, 1, 40, 259))
# ===============================================================================================
# Layer (type:depth-idx)                        Output Shape              Param #
# ===============================================================================================
# parallel_all_you_want                         [1, 10]                   --
# ├─Sequential: 1-1                             [1, 64, 1, 8]             --
# │    └─Conv2d: 2-1                            [1, 16, 40, 259]          160
# │    └─BatchNorm2d: 2-2                       [1, 16, 40, 259]          32
# │    └─ReLU: 2-3                              [1, 16, 40, 259]          --
# │    └─MaxPool2d: 2-4                         [1, 16, 20, 129]          --
# │    └─Dropout: 2-5                           [1, 16, 20, 129]          --
# │    └─Conv2d: 2-6                            [1, 32, 20, 129]          4,640
# │    └─BatchNorm2d: 2-7                       [1, 32, 20, 129]          64
# │    └─ReLU: 2-8                              [1, 32, 20, 129]          --
# │    └─MaxPool2d: 2-9                         [1, 32, 5, 32]            --
# │    └─Dropout: 2-10                          [1, 32, 5, 32]            --
# │    └─Conv2d: 2-11                           [1, 64, 5, 32]            18,496
# │    └─BatchNorm2d: 2-12                      [1, 64, 5, 32]            128
# │    └─ReLU: 2-13                             [1, 64, 5, 32]            --
# │    └─MaxPool2d: 2-14                        [1, 64, 1, 8]             --
# │    └─Dropout: 2-15                          [1, 64, 1, 8]             --
# ├─Sequential: 1-2                             [1, 64, 1, 8]             --
# │    └─Conv2d: 2-16                           [1, 16, 40, 259]          160
# │    └─BatchNorm2d: 2-17                      [1, 16, 40, 259]          32
# │    └─ReLU: 2-18                             [1, 16, 40, 259]          --
# │    └─MaxPool2d: 2-19                        [1, 16, 20, 129]          --
# │    └─Dropout: 2-20                          [1, 16, 20, 129]          --
# │    └─Conv2d: 2-21                           [1, 32, 20, 129]          4,640
# │    └─BatchNorm2d: 2-22                      [1, 32, 20, 129]          64
# │    └─ReLU: 2-23                             [1, 32, 20, 129]          --
# │    └─MaxPool2d: 2-24                        [1, 32, 5, 32]            --
# │    └─Dropout: 2-25                          [1, 32, 5, 32]            --
# │    └─Conv2d: 2-26                           [1, 64, 5, 32]            18,496
# │    └─BatchNorm2d: 2-27                      [1, 64, 5, 32]            128
# │    └─ReLU: 2-28                             [1, 64, 5, 32]            --
# │    └─MaxPool2d: 2-29                        [1, 64, 1, 8]             --
# │    └─Dropout: 2-30                          [1, 64, 1, 8]             --
# ├─MaxPool2d: 1-3                              [1, 1, 40, 64]            --
# ├─TransformerEncoder: 1-4                     [64, 1, 40]               --
# │    └─ModuleList: 2-31                       --                        --
# │    │    └─TransformerEncoderLayer: 3-1      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-2      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-3      [64, 1, 40]               48,232
# │    │    └─TransformerEncoderLayer: 3-4      [64, 1, 40]               48,232
# ├─Linear: 1-5                                 [1, 10]                   10,650
# ├─Softmax: 1-6                                [1, 10]                   --
# ===============================================================================================
# Total params: 250,618
# Trainable params: 250,618
# Non-trainable params: 0
# Total mult-adds (M): 43.86
# ===============================================================================================
# Input size (MB): 0.04
# Forward/backward pass size (MB): 9.57
# Params size (MB): 0.90
# Estimated Total Size (MB): 10.51
# ===============================================================================================

Pranav_Belhekar · February 27, 2023, 4:33am

Finally, the model is error-free now. The problem was a typo. Thanks for your invaluable support.