RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x5 and 16x16)

I know this question has been asked a lot; however, I haven't been able to solve it yet.

Here is my code:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(X_train, y_train)
valid_dataset = TensorDataset(X_valid, y_valid)

train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True)

class MyTransformerModel(nn.Module):
    def __init__(self, d_model, nhead, num_encoder_layers, dim_feedforward, dropout):
        super(MyTransformerModel, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.decoder = nn.Linear(16, 16)

    def forward(self, src):
        encoded = self.transformer_encoder(src)
        pooled_output = encoded.mean(dim=1)  # Apply mean along sequence length
        logits = self.decoder(pooled_output)
        probabilities = torch.sigmoid(logits)
        predictions = torch.round(probabilities)
        return predictions

model = MyTransformerModel(d_model=512, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, eps=1e-6)

def flat_accuracy(preds, labels):
    pred_flat = preds.flatten()
    labels_flat = labels.flatten()
    return torch.sum(pred_flat == labels_flat).item() / len(labels_flat)

epochs = 10

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    total_train_accuracy = 0

    for batch, (X, y) in enumerate(train_data_loader):
        optimizer.zero_grad()
        output = model(X)
        loss = loss_function(output.float(), y.float())
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        total_train_accuracy += flat_accuracy(output, y)

    avg_train_loss = total_train_loss / len(train_data_loader)
    avg_train_accuracy = total_train_accuracy / len(train_data_loader)
    print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.2f}, Training Accuracy: {avg_train_accuracy:.2f}')

    # Validation phase
    model.eval()
    total_val_loss = 0
    total_val_accuracy = 0

    with torch.no_grad():
        for batch, (X, y) in enumerate(valid_data_loader):
            output = model(X)
            val_loss = loss_function(output.float(), y.float())
            total_val_loss += val_loss.item()
            total_val_accuracy += flat_accuracy(output, y)

    avg_val_loss = total_val_loss / len(valid_data_loader)
    avg_val_accuracy = total_val_accuracy / len(valid_data_loader)
    print(f'Epoch {epoch+1}, Validation Loss: {avg_val_loss:.2f}, Validation Accuracy: {avg_val_accuracy:.2f}')

This is the output of encoded.shape for each batch:

torch.Size([16, 512])
torch.Size([16, 512])
torch.Size([16, 512])
... (the same shape repeats for all remaining full batches) ...
torch.Size([16, 512])
torch.Size([5, 512])

So the last batch is the problematic one. How do I solve this?

Thanks in advance!

Your model definition is a bit weird, as it seems you are using the batch size (and thus the number of samples in the current batch) as the feature dimension for the decoder.

This code represents your approach:

lin = nn.Linear(16, 16)

x = torch.randn(16, 512)

out = lin(x.mean(dim=1))
print(out.shape)
# torch.Size([16])

Note that the linear layer will treat the 1D input as a single sample with 16 features and will output a single sample with 16 output features, since it implicitly adds the missing batch dimension.
I doubt you really want to use this approach, as you will see shape mismatches for any input batch size other than 16.
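
For illustration, here is a minimal sketch of my own (not your code) that reproduces the exact error from the title once the smaller final batch of 5 samples arrives:

import torch
import torch.nn as nn

# Sketch: with a batch of 5, the mean over dim=1 is a 1D tensor of size 5,
# which no longer matches the layer's in_features of 16.
lin = nn.Linear(16, 16)

x = torch.randn(5, 512)       # the last, incomplete batch
out = lin(x.mean(dim=1))
# RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x5 and 16x16)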

If you want to reduce dim1 via encoded.mean(dim=1), use .mean(dim=1, keepdim=True) and set in_features=1 in the decoder, as sketched below.
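
A minimal sketch of that variant (my own illustration, keeping the rest of the forward pass unchanged):

import torch
import torch.nn as nn

# Sketch: keep the reduced dimension with keepdim=True and give the decoder in_features=1.
decoder = nn.Linear(1, 16)

x = torch.randn(5, 512)                  # now works for any batch size, including the last one
pooled = x.mean(dim=1, keepdim=True)     # shape: [5, 1]
out = decoder(pooled)                    # shape: [5, 16]
print(out.shape)                         # torch.Size([5, 16])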

Thanks for the reply. My model definition is weird because the task I have is also weird. I need to train a randomly initialized transformer, but it has to be encoder-only. So I cannot change anything about the decoder, since there is no decoder. Right now I've been skipping the batches by using this:

if X.size(0) != 16:
    continue

I know this is not the right way to solve the issue; I just haven't been able to solve it any other way.
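
(If skipping the incomplete batch is acceptable at all, the DataLoader's drop_last flag seems to be the cleaner way to do the same thing. A minimal sketch, reusing train_dataset and valid_dataset from my post above:

from torch.utils.data import DataLoader

# Sketch: drop_last=True simply discards the final incomplete batch,
# so no manual size check is needed inside the training loop.
train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True, drop_last=True)

)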