Constant Validation Loss and Accuracy

Dear All,

I am new to Machine Learning and Transformers. I am trying to develop an online Transformer-based time-series anomaly detection model. The problem I'm facing is that the model doesn't seem to learn, judging by the loss values during training. Moreover, the model output in eval() mode is constant no matter what the input is. I've been working on this for three weeks now and have tried everything I found online on the topic, such as tuning hyperparameters and using lower learning rates, but still no luck.
My training.csv dataset consists of two input columns plus a label column for training. Each input batch has shape [32, 20, 2] (batch_size, seq_length, input_dim), and the output shape is [20, 1] for each sequence, which is supposed to be the binary decision for each data point in the sequence.
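For reference, here is a minimal shape check of what I expect a single batch to look like (random placeholder tensors, not my real data):

import torch

batch_size, sequence_length, input_dim = 32, 20, 2
x = torch.randn(batch_size, sequence_length, input_dim)             # one batch of float features
y = torch.randint(0, 2, (batch_size, sequence_length, 1)).float()   # per-timestep binary labels
print(x.shape)  # torch.Size([32, 20, 2])
print(y.shape)  # torch.Size([32, 20, 1])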
Below is my code:

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

sequence_length = 20
batch_size = 32
input_dim = 2
output_dim = input_dim * sequence_length
d_model = 16
nhead = 4
dim_feedforward = 512
num_layers = 8

        
class TransformerAD(nn.Module):
    def __init__(self, input_dim, output_dim, sequence_length):
        super(TransformerAD, self).__init__()
        self.sequence_length = sequence_length
        self.input_transform = nn.Linear(input_dim, d_model)  # Transform the input features to d_model
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_layer, num_layers=num_layers)
        self.decoder = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.input_transform(x)              # [batch, seq_len, input_dim] -> [batch, seq_len, d_model]
        encoded = self.transformer_encoder(x)    # [batch, seq_len, d_model]
        decoded = self.decoder(encoded)          # [batch, seq_len, 1] logits, one per time step
        return decoded


class CustomDataset(Dataset):
    def __init__(self, csv_file, sequence_length, split='train', validation_split=0.2):
        self.data = pd.read_csv(csv_file)
        self.sequence_length = sequence_length

        # Split data into training and validation
        total_samples = len(self.data) - sequence_length + 1
        split_index = int(total_samples * (1 - validation_split))
        if split == 'train':
            self.data = self.data.iloc[:split_index + sequence_length - 1]
        elif split == 'validation':
            self.data = self.data.iloc[split_index:]

    def __len__(self):
        # Ensure that a complete sequence can be formed
        return len(self.data) - self.sequence_length + 1

    def __getitem__(self, idx):
        # Extract a sequence and the corresponding label
        sequence = self.data.iloc[idx:idx + self.sequence_length, 1:-1].values.astype(np.float32)
        label = self.data.iloc[idx:idx + self.sequence_length, -1].values.astype(np.float32)
        return torch.tensor(sequence), torch.tensor(label)

# Training dataset and dataloader
train_dataset = CustomDataset('training.csv', sequence_length, split='train')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# Validation dataset and dataloader
validation_dataset = CustomDataset('training.csv', sequence_length, split='validation')
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

def evaluate(model, validation_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for tensor_sequence, tensor_label in validation_loader:
            tensor_sequence = tensor_sequence.to(device)
            tensor_label = tensor_label.unsqueeze(2).to(device)

            output = model(tensor_sequence)
            loss = criterion(output, tensor_label)
            total_loss += loss.item()

            # Accuracy: convert logits to probabilities and round to get the predicted class
            predicted_labels = output.sigmoid().round()
            correct_predictions += (predicted_labels == tensor_label).sum().item()  # both are [batch, seq_len, 1]
            total_predictions += tensor_label.numel()

    average_loss = total_loss / len(validation_loader)
    accuracy = correct_predictions / total_predictions
    return average_loss, accuracy


def train(model, train_loader, criterion, optimizer, scheduler, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        print(f"Starting Epoch [{epoch+1}/{num_epochs}]")
        for tensor_sequence, tensor_label in train_loader:
            tensor_sequence = tensor_sequence.to(device)
            tensor_label = tensor_label.unsqueeze(2).to(device)
            optimizer.zero_grad()
            output = model(tensor_sequence)
            loss = criterion(output, tensor_label)
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()
            scheduler.step()

        average_loss = total_loss / len(train_loader)
        print(f"Finished Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")
        average_val_loss, val_accuracy = evaluate(model, validation_loader, criterion, device)
        print(f"Validation Loss: {average_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")
        torch.save(model.state_dict(), 'model.pth')


if torch.cuda.is_available():
    torch.cuda.set_device(1)  #GPU 1
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
# Initialize the model, criterion, optimizer, and scheduler
model = TransformerAD(input_dim, output_dim, sequence_length)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
weights_tensor = torch.tensor([0.3568, 0.6432])
criterion = nn.BCEWithLogitsLoss(pos_weight=weights_tensor[1]).to(device)

# Start the training process
num_epochs = 10
train(model, train_loader, criterion, optimizer, scheduler, num_epochs)

And this is the result:

Starting Epoch [1/10]
Finished Epoch [1/10], Loss: 0.5849
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [2/10]
Finished Epoch [2/10], Loss: 0.5831
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [3/10]
Finished Epoch [3/10], Loss: 0.5831
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [4/10]
Finished Epoch [4/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [5/10]
Finished Epoch [5/10], Loss: 0.5831
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [6/10]
Finished Epoch [6/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [7/10]
Finished Epoch [7/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [8/10]
Finished Epoch [8/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [9/10]
Finished Epoch [9/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [10/10]
Finished Epoch [10/10], Loss: 0.5831
Validation Loss: 0.7225, Accuracy: 18.3222

Typically, you want an embedding layer before the transformer. This will convert tokens into learnable feature vectors.
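For token-style inputs that would look roughly like this (a minimal sketch, assuming an integer vocabulary, which may not apply to your data):

import torch
import torch.nn as nn

vocab_size, d_model = 1000, 16
embedding = nn.Embedding(vocab_size, d_model)     # maps integer token ids to learnable d_model vectors
tokens = torch.randint(0, vocab_size, (32, 20))   # [batch, seq_len] of token ids (LongTensor)
vectors = embedding(tokens)                       # [32, 20, 16] float features fed to the transformer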

Is your input data integers or vectors of floats?

Have you tried working through the Transformer tutorial first?

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

Thanks for the response!

My input data is a float time-series. That's why I thought I didn't need an embedding layer: nn.Embedding expects inputs of type Long or Int and is typically used for embedding discrete data such as word tokens. But I may be wrong.
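In other words, I'm currently relying on the linear projection to play that role for continuous features, roughly like this (a minimal sketch of my understanding, which may be wrong):

import torch
import torch.nn as nn

d_model = 16
input_transform = nn.Linear(2, d_model)   # projects the 2 float features at each timestep to d_model
x = torch.randn(32, 20, 2)                # float time-series batch
features = input_transform(x)             # [32, 20, 16], the same shape an embedding layer would produce
# nn.Embedding, by contrast, requires integer indices, e.g. nn.Embedding(vocab_size, d_model)(token_ids)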

Thanks for sharing the tutorial. I will definitely check it out.