I am trying to build a CNN+Transformer model for speech recognition. The model processes the data correctly from the CNN through the encoder, but when it reaches the decoder I always get a strange error:
RuntimeError: shape '[16, 736, 32]' is invalid for input of size 589824
This always happens when I pass the target and the encoder output (memory) into the transformer's decoder, and I don't know how to fix it. I tried creating a mask for the target and passing it in, but that leads to errors with the mask's shape: the model expects the mask to be [batch_size, batch_size], while mine ends up being [seq_len, seq_len]. If it helps, here is how my model is defined, followed by what I tried for the mask and roughly how I call the model:
import torch
import math
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
def sinusoidal_positional_encoding(seq_len, d_model, device='cpu'):
    pos_encodings = torch.zeros(seq_len, d_model).to(device)
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1).to(device)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)).to(device)
    pos_encodings[:, 0::2] = torch.sin(position * div_term)
    pos_encodings[:, 1::2] = torch.cos(position * div_term)
    pos_encodings = pos_encodings.unsqueeze(0)  # (1, seq_len, d_model)
    return pos_encodings
class Conv1DGELUBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super(Conv1DGELUBlock, self).__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride)
        self.gelu = nn.GELU()
        self.norm = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.gelu(x)
        x = self.norm(x)
        return x
class AudioTransformer(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, vocab_size, device='cpu'):
        super(AudioTransformer, self).__init__()
        self.conv1 = Conv1DGELUBlock(input_dim, d_model, kernel_size=3, stride=2)
        self.conv2 = Conv1DGELUBlock(d_model, d_model, kernel_size=3, stride=2)
        self.d_model = d_model
        self.positional_encoding = sinusoidal_positional_encoding(max_seq_length, d_model, device)
        self.encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.encoder = TransformerEncoder(self.encoder_layer, num_layers=num_encoder_layers)
        self.decoder_layer = TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.decoder = TransformerDecoder(self.decoder_layer, num_layers=num_decoder_layers)
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.output_layer = nn.Linear(d_model, vocab_size)
    def forward(self, src, tgt):
        src = src.squeeze(1)                 # (batch, input_dim, time)
        src = self.conv1(src)                # (batch, d_model, time')
        src = self.conv2(src)                # (batch, d_model, time'')
        seq_length = src.size(2)
        src *= math.sqrt(self.d_model)
        pe = self.positional_encoding[:, :seq_length, :].transpose(2, 1)  # (1, d_model, seq_length)
        src = src + pe
        src = src.transpose(2, 1)            # (batch, seq_length, d_model)
        memory = self.encoder(src)
        tgt_emb = self.embedding(tgt)        # (batch, tgt_seq_length, d_model)
        tgt_seq_length = tgt_emb.size(1)
        tgt_pe = self.positional_encoding[:, :tgt_seq_length, :]
        tgt_emb = tgt_emb + tgt_pe
        output = self.decoder(tgt_emb, memory)  # <-- this is where the RuntimeError is raised
        output = self.output_layer(output)
        return output
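The mask I tried to pass looked roughly like this (simplified from memory; the helper name and exact wiring are illustrative, not copied verbatim from my code):

def generate_square_subsequent_mask(size):
    # upper-triangular matrix with -inf above the diagonal and 0 elsewhere,
    # so each target position can only attend to earlier positions
    mask = torch.triu(torch.ones(size, size), diagonal=1)
    return mask.masked_fill(mask == 1, float('-inf'))

# inside forward(), just before the decoder call:
tgt_mask = generate_square_subsequent_mask(tgt_emb.size(1)).to(tgt_emb.device)
output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

This is the version that complains the mask should have shape [batch_size, batch_size] instead of [seq_len, seq_len].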
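And this is roughly how I build and call the model; the hyperparameters and tensor shapes below are placeholders to show the layout I use, not my exact values:

# placeholder values, only to show how the model is constructed and called
model = AudioTransformer(input_dim=80, d_model=256, nhead=8,
                         num_encoder_layers=4, num_decoder_layers=4,
                         dim_feedforward=1024, max_seq_length=4000,
                         vocab_size=1000)
src = torch.randn(16, 1, 80, 3000)       # (batch, 1, n_mels, time) spectrogram batch
tgt = torch.randint(0, 1000, (16, 100))  # (batch, tgt_seq_len) token ids
out = model(src, tgt)                    # the RuntimeError comes from the self.decoder(...) call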