RuntimeError "shape is invalid for input of size" in MultiheadAttention of Transformer decoder

I am building a CNN+Transformer model for speech recognition. The data flows correctly from the CNN through the encoder, but the decoder part always raises a strange error:

RuntimeError: shape '[16, 736, 32]' is invalid for input of size 589824

The error is raised when I pass the target and the encoder output (memory) into the decoder of the transformer, and I don't know how to fix it. I also tried creating a mask for the target and passing it in, but that fails with its own shape error: the model expects the mask to have shape [batch_size, batch_size], while mine comes out as [seq_len, seq_len].
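The mask was built roughly like this (a standard causal mask; I am reconstructing it from memory, so the exact code may have differed slightly):

def generate_tgt_mask(seq_len, device='cpu'):
    # reconstruction of my mask code: float('-inf') above the diagonal, 0.0 elsewhere,
    # equivalent to what nn.Transformer.generate_square_subsequent_mask produces
    return torch.triu(torch.full((seq_len, seq_len), float('-inf'), device=device), diagonal=1)  # (seq_len, seq_len)

In case it helps, here is how my model is defined: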

import torch
import math
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer

def sinusoidal_positional_encoding(seq_len, d_model, device='cpu'):
    pos_encodings = torch.zeros(seq_len, d_model).to(device)
    
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1).to(device)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)).to(device)
    
    pos_encodings[:, 0::2] = torch.sin(position * div_term)
    pos_encodings[:, 1::2] = torch.cos(position * div_term)
    
    pos_encodings = pos_encodings.unsqueeze(0)  # (1, seq_len, d_model)
    return pos_encodings
 

# 1-D convolution block over the time axis: Conv1d -> GELU -> BatchNorm1d
class Conv1DGELUBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super(Conv1DGELUBlock, self).__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride)
        self.gelu = nn.GELU()
        self.norm = nn.BatchNorm1d(out_channels)
    
    def forward(self, x):
        x = self.conv(x)
        x = self.gelu(x)
        x = self.norm(x)
        return x

class AudioTransformer(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, vocab_size, device='cpu'):
        super(AudioTransformer, self).__init__()
        self.conv1 = Conv1DGELUBlock(input_dim, d_model, kernel_size=3, stride=2)
        self.conv2 = Conv1DGELUBlock(d_model, d_model, kernel_size=3, stride=2)
        
        self.d_model = d_model
        self.positional_encoding = sinusoidal_positional_encoding(max_seq_length, d_model, device)
        
        self.encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.encoder = TransformerEncoder(self.encoder_layer, num_layers=num_encoder_layers)
        
        self.decoder_layer = TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.decoder = TransformerDecoder(self.decoder_layer, num_layers=num_decoder_layers)
        
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.output_layer = nn.Linear(d_model, vocab_size)
    
    def forward(self, src, tgt):
        src = src.squeeze(1)  # (batch, 1, input_dim, time) -> (batch, input_dim, time)

        src = self.conv1(src)
        src = self.conv2(src)  # (batch, d_model, time'), time downsampled ~4x by the two stride-2 convs

        seq_length = src.size(2)

        src *= math.sqrt(self.d_model)

        pe = self.positional_encoding[:, :seq_length, :].transpose(2, 1)  # (1, d_model, seq_length)

        src = src + pe

        src = src.transpose(2, 1)  # (batch, seq_length, d_model)

        memory = self.encoder(src)
                
        tgt_emb = self.embedding(tgt)  # (batch, tgt_seq_len, d_model)

        tgt_seq_length = tgt_emb.size(1)
        tgt_pe = self.positional_encoding[:, :tgt_seq_length, :]
        tgt_emb = tgt_emb + tgt_pe

        output = self.decoder(tgt_emb, memory)  # <-- the RuntimeError is raised here
        
        output = self.output_layer(output)
        
        return output
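
For completeness, here is a minimal way to call the model that triggers the error. The hyperparameters in my real training script may differ slightly, but these values are consistent with the sizes in the traceback above:

model = AudioTransformer(
    input_dim=80,             # e.g. 80 mel filterbank features (illustrative value)
    d_model=256,              # 256 / 8 heads = head_dim 32, matching the '32' in the error
    nhead=8,
    num_encoder_layers=4,
    num_decoder_layers=4,
    dim_feedforward=1024,
    max_seq_length=1024,
    vocab_size=1000,
    device='cpu',
)

src = torch.randn(16, 1, 80, 580)        # (batch, channel, input_dim, time) spectrogram
tgt = torch.randint(0, 1000, (16, 92))   # (batch, tgt_seq_len) token ids
out = model(src, tgt)                    # RuntimeError: shape '[16, 736, 32]' is invalid ...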