RuntimeError: The size of tensor a (32) must match the size of tensor b (128) at non-singleton dimension 1

Hello, I’m getting an error while trying to add positional encoding to my model. The goal is to have a separate output branch for each target I want to analyze, where each target has a different number of classes. However, the tensors don’t seem to line up correctly.
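
To make the intent concrete, this is roughly the call pattern I am aiming for (the shapes here are what I expect based on output_sizes below, not what I actually get):

outputs = model(batch)              # dict with one entry per target
outputs['key'].shape                # expected: (batch, 24) -- 24 classes for 'key'
outputs['quality'].shape            # expected: (batch, 10) -- 10 classes for 'quality'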

The transformer model looks like this:

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torchsummary import summary

class EncoderLayer(nn.Module):
    def __init__(self, d_model, nhead):
        super(EncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead)
        self.linear1 = nn.Linear(d_model, 256)
        self.dropout = nn.Dropout(0.1)
        self.linear2 = nn.Linear(256, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Self-attention sub-layer with residual connection
        attn_output, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_output)
        x = self.norm1(x)
        # Feed-forward sub-layer with residual connection
        ff = F.relu(self.linear1(x))
        ff = self.dropout(ff)
        ff = self.linear2(ff)
        x = x + self.dropout(ff)
        x = self.norm2(x)
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, nhead):
        super(DecoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead)
        self.linear1 = nn.Linear(d_model, 256)
        self.dropout = nn.Dropout(0.1)
        self.linear2 = nn.Linear(256, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Self-attention sub-layer with residual connection
        attn_output, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_output)
        x = self.norm1(x)
        # Feed-forward sub-layer with residual connection
        ff = F.relu(self.linear1(x))
        ff = self.dropout(ff)
        ff = self.linear2(ff)
        x = x + self.dropout(ff)
        x = self.norm2(x)
        return x


class TransformerModel(nn.Module):
    def __init__(self, input_size, output_sizes, d_model=128, nhead=8, num_encoder_layers=6, num_decoder_layers=6):
        super(TransformerModel, self).__init__()

        # Input embedding layer
        self.embedding = nn.Linear(input_size, d_model)

        # Shared Layers for Input Processing
        self.shared_layers = nn.Sequential(
            nn.Linear(d_model, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU()
        )

        # Positional encoding for input
        self.input_positional_encoding = PositionalEncoding(d_model)

        # Encoder layers
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, nhead) for _ in range(num_encoder_layers)
        ])

        # Output embedding layer for encoder
        self.output_embedding = nn.Linear(d_model, input_size)

        # Positional encoding for output of encoder
        self.output_positional_encoding = PositionalEncoding(d_model)

        # Decoder layers
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, nhead) for _ in range(num_decoder_layers)
        ])

        # Linear layer for final prediction for each target
        self.linears = nn.ModuleDict({
            target: nn.Linear(d_model, output_size) for target, output_size in output_sizes.items()
        })

        # Softmax layer
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        print(f"Input shape: {x.shape}")
    
        # Input embedding
        x = self.embedding(x)
        print(f"Embedded shape: {x.shape}")

        # Shared layers for input processing
        x = self.shared_layers(x)
        print(f"Shared layers shape: {x.shape}")

        # Add positional encoding for input
        x = self.input_positional_encoding(x)
        print(f"PE input shape: {x.shape}")

        # Encoder layers
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x)

        # Output embedding for encoder
        x = self.output_embedding(x)
        print(f"Output embedding shape: {x.shape}")

        # Add positional encoding for output of encoder
        x = self.output_positional_encoding(x)
        print(f"PE output shape: {x.shape}")

        # Decoder layers
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x)

        # Linear layers for final prediction for each target
        outputs = {target: linear(x) for target, linear in self.linears.items()}

        # Apply softmax if needed
        outputs = {target: self.softmax(output) for target, output in outputs.items()}

        return outputs


# Positional Encoding module
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)
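        # encoding is now (1, max_len, d_model) so it can broadcast across the batch dimension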


    def forward(self, x):
        return x + self.encoding[:, :x.size(1)].detach()
    



input_size = 6  
output_sizes = {'key': 24, 'pri_deg': 21, 'sec_deg': 21, 'quality': 10, 'inversion': 4, 'boundary': 4}
model = TransformerModel(input_size, output_sizes)

print(model)
# Example usage
input_data = torch.randn(32, input_size)  
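# NOTE: 2-D input (batch, features); I am not sure whether the model also needs an explicit sequence dimension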
output_data = model(input_data)

# Print the summary
# summary(model, input_size=(32, input_size))

I am getting the error:
RuntimeError: The size of tensor a (32) must match the size of tensor b (128) at non-singleton dimension 1
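
From the shape prints, the failure seems to happen inside PositionalEncoding.forward, before any encoder layer runs. A stripped-down snippet that (as far as I can tell) reproduces the same broadcasting error with the shapes involved (32 = batch size, 128 = d_model, 512 = max_len):

import torch

x = torch.randn(32, 128)             # what reaches PositionalEncoding.forward: (batch, d_model)
encoding = torch.zeros(1, 512, 128)  # the precomputed table: (1, max_len, d_model)
pe = encoding[:, :x.size(1)]         # x.size(1) is 128, so this slice is (1, 128, 128)
out = x + pe                         # RuntimeError: size 32 vs 128 at non-singleton dimension 1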