Dear All,

I am new to Machine Learning and Transformers. I am trying to develop an online Transformer Based time-series anomaly detection model. The problems I’m facing is that the model doesn’t seem to learn as I’m looking at the loss values during training. Morover, the model output is constant in eval() mode no matter what the input is. I’ve been working on this for 3 weeks now and tried everything I found online on the topic like tuning hyper parameters and using lower learning rates. But still I got no luck.

My training.csv dataset consists of two columns (input data) in addition to labels for training. The input data shape is [32*20*2] (batch_size,seq_length,input_dim), and the output shape is [20*1] which is supposed to be the binary decision for each datapoint in the sequence.

Below is my code:

```
import torch
import torch.nn as nn
import numpy as np
import datetime
import time
import subprocess
import pandas as pd
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
sequence_length = 20
batch_size = 32
input_dim = 2
output_dim = input_dim * sequence_length
d_model=16
nhead=4
dim_feedforward=512
num_layers=8
class TransformerAD(nn.Module):
def __init__(self, input_dim, output_dim, sequence_length):
super(TransformerAD, self).__init__()
self.sequence_length = sequence_length
self.input_transform = nn.Linear(input_dim, d_model) # Transform the input features to d_model
self.transformer_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, batch_first=True)
self.transformer_encoder = nn.TransformerEncoder(self.transformer_layer, num_layers=num_layers)
self.decoder = nn.Linear(d_model, 1)
def forward(self, x):
x = self.input_transform(x)
encoded = self.transformer_encoder(x)
decoded = self.decoder(encoded)
return decoded
class CustomDataset(Dataset):
def __init__(self, csv_file, sequence_length, split='train', validation_split=0.2):
self.data = pd.read_csv(csv_file)
self.sequence_length = sequence_length
# Split data into training and validation
total_samples = len(self.data) - sequence_length + 1
split_index = int(total_samples * (1 - validation_split))
if split == 'train':
self.data = self.data.iloc[:split_index + sequence_length - 1]
elif split == 'validation':
self.data = self.data.iloc[split_index:]
def __len__(self):
# Ensure that a complete sequence can be formed
return len(self.data) - self.sequence_length + 1
def __getitem__(self, idx):
# Extract a sequence and the corresponding label
sequence = self.data.iloc[idx:idx + self.sequence_length, 1:-1].values.astype(np.float32)
label = self.data.iloc[idx:idx + self.sequence_length, -1].values.astype(np.float32)
return torch.tensor(sequence), torch.tensor(label)
# Training dataset and dataloader
train_dataset = CustomDataset('training.csv', sequence_length, split='train')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
# Validation dataset and dataloader
validation_dataset = CustomDataset('training.csv', sequence_length, split='validation')
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
def evaluate(model, validation_loader, criterion, device):
model.eval()
total_loss = 0
correct_predictions = 0
total_predictions = 0
with torch.no_grad():
for tensor_sequence, tensor_label in validation_loader:
tensor_sequence = tensor_sequence.to(device)
tensor_label = tensor_label.unsqueeze(2).to(device)
output = model(tensor_sequence)
loss = criterion(output, tensor_label)
total_loss += loss.item()
# Accuracy
predicted_labels = output.sigmoid().round() # Convert to probability and round to get predicted class
correct_predictions += (predicted_labels == tensor_label.unsqueeze(1)).sum().item()
total_predictions += tensor_label.numel()
average_loss = total_loss / len(validation_loader)
accuracy = correct_predictions / total_predictions
return average_loss, accuracy
def train(model, train_loader, criterion, optimizer, scheduler, num_epochs):
for epoch in range(num_epochs):
model.train()
total_loss = 0
print(f"Starting Epoch [{epoch+1}/{num_epochs}]")
for tensor_sequence, tensor_label in train_loader:
tensor_sequence = tensor_sequence.to(device)
tensor_label = tensor_label.unsqueeze(2).to(device)
optimizer.zero_grad()
output = model(tensor_sequence)
loss = criterion(output, tensor_label)
total_loss += loss.item()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
optimizer.step()
scheduler.step()
average_loss = total_loss / len(train_loader)
print(f"Finished Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")
average_val_loss, val_accuracy = evaluate(model, validation_loader, criterion, device)
print(f"Validation Loss: {average_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")
torch.save(model.state_dict(), 'model.pth')
if torch.cuda.is_available():
torch.cuda.set_device(1) #GPU 1
device = torch.device("cuda")
else:
device = torch.device("cpu")
# Initialize the model, criterion, optimizer, and scheduler
model = TransformerAD(input_dim, output_dim, sequence_length)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
weights_tensor = torch.tensor([0.3568, 0.6432])
criterion = nn.BCEWithLogitsLoss(pos_weight=weights_tensor[1]).to(device)
# Start the training process
num_epochs = 10
train(model, train_loader, criterion, optimizer, scheduler, num_epochs)
```

And this is the result:

```
Starting Epoch [1/10]
Finished Epoch [1/10], Loss: 0.5849
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [2/10]
Finished Epoch [2/10], Loss: 0.5831
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [3/10]
Finished Epoch [3/10], Loss: 0.5831
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [4/10]
Finished Epoch [4/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [5/10]
Finished Epoch [5/10], Loss: 0.5831
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [6/10]
Finished Epoch [6/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [7/10]
Finished Epoch [7/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [8/10]
Finished Epoch [8/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [9/10]
Finished Epoch [9/10], Loss: 0.5832
Validation Loss: 0.7225, Accuracy: 18.3222
Starting Epoch [10/10]
Finished Epoch [10/10], Loss: 0.5831
Validation Loss: 0.7225, Accuracy: 18.3222
```