I know this question has been asked a lot, however I wasn’t able to solve it yet.

Here is my code

import torch

import torch.nn as nn

from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(X_train, y_train)

valid_dataset = TensorDataset(X_valid, y_valid)

train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

valid_data_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True)

class MyTransformerModel(nn.Module):

def **init**(self, d_model, nhead, num_encoder_layers, dim_feedforward, dropout):

super(MyTransformerModel, self).**init**()

encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,dim_feedforward=dim_feedforward, dropout=dropout)

self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

self.decoder = nn.Linear(16, 16)

```
def forward(self, src):
encoded = self.transformer_encoder(src)
pooled_output = encoded.mean(dim=1) # Apply mean along sequence length
logits = self.decoder(pooled_output)
probabilities = torch.sigmoid(logits)
predictions = torch.round(probabilities)
return predictions
```

model = MyTransformerModel(d_model=512, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1)

loss_function = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, eps=1e-6)

def flat_accuracy(preds, labels):

pred_flat = preds.flatten()

labels_flat = labels.flatten()

return torch.sum(pred_flat == labels_flat).item() / len(labels_flat)

epochs = 10

for epoch in range(epochs):

model.train()

total_train_loss = 0

total_train_accuracy = 0

```
for batch, (X, y) in enumerate(train_data_loader):
optimizer.zero_grad()
output = model(X)
loss = loss_function(output.float(), y.float())
loss.backward()
optimizer.step()
total_train_loss += loss.item()
total_train_accuracy += flat_accuracy(output, y)
avg_train_loss = total_train_loss / len(train_data_loader)
avg_train_accuracy = total_train_accuracy / len(train_data_loader)
print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.2f}, Training Accuracy: {avg_train_accuracy:.2f}')
# Validation phase
model.eval()
total_val_loss = 0
total_eval_accuracy = 0
with torch.no_grad():
for batch, (X, y) in enumerate(valid_data_loader):
output = model(X)
val_loss = loss_function(output.float(), y.float())
total_val_loss += val_loss.item()
total_val_accuracy += flat_accuracy(output, y)
avg_val_loss = total_val_loss / len(validation_dataloader)
avg_val_accuracy = total_val_accuracy / len(valid_data_loader)
print(f'Epoch {epoch+1}, Validation Loss: {avg_val_loss:.2f}, Validation Accuracy: {avg_val_accuracy:.2f}')
```

this is the output of encoded.shape

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([16, 512])

torch.Size([5, 512])

so the last batch is the problematic one. How do I solve this?

Thanks in advance!