Accuracy is not increasing

I have tried every method I could, but my accuracy is stuck between 50% and 60%.
The loss is decreasing, but the accuracy is not improving.

My whole code:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# 80% train, 12% validation, 8% test
train, val = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)
val, test = train_test_split(val, test_size=0.4, random_state=42, shuffle=True)

train_loader = torch.utils.data.DataLoader(train, batch_size=64, num_workers=2, shuffle=True)
val_loader = torch.utils.data.DataLoader(val, batch_size=64, num_workers=2, shuffle=True)
test_loader = torch.utils.data.DataLoader(test, batch_size=64, num_workers=2, shuffle=True)

class RelationAwareFeatureExtractor(nn.Module):
    def __init__(self):
        super(RelationAwareFeatureExtractor, self).__init__()

        # ConvNet layers
        self.conv1 = nn.Conv2d(4, 8, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(16, 28, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv4 = nn.Conv2d(28, 32, kernel_size=3, stride=1, padding=1)
#         self.dropout1 = nn.Dropout(0.5)
        self.batchnorm1 = nn.BatchNorm2d(8)
        self.batchnorm2 = nn.BatchNorm2d(16)
        self.batchnorm3 = nn.BatchNorm2d(28)
        self.fc1 = nn.Linear(131072, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 250)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.batchnorm1(x)
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
#         x = self.dropout1(x)
        x = self.batchnorm2(x)
        x = F.relu(self.conv3(x))
        x = self.pool2(x)
        x = self.batchnorm3(x)
        x = F.relu(self.conv4(x))
        # Flatten the tensor before fully connected layers
        x = torch.flatten(x, start_dim=1)  # Flatten dimensions except batch dimension
        # Fully connected layers
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

class SelfAttentionXFA(nn.Module):
    def __init__(self, hidden_size):
        super(SelfAttentionXFA, self).__init__()
        self.query = nn.Linear(in_features=250, out_features=250)
        self.key = nn.Linear(in_features=250, out_features=250)
        self.value = nn.Linear(in_features=250, out_features=250)
        self.Wc = torch.nn.Conv2d(1, 1, kernel_size=(1, 250))

    def forward(self,x):
        
        # Compute Query, Key, and Value matrices
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        
        batch = x.shape[0]
        
        # note: Wf is created inside forward, so it is re-initialized with fresh
        # random weights on every call and its parameters are never trained
        Wf = torch.nn.Conv2d(1, 1, kernel_size=(1, batch))
        
        # PART 1 - NORMALIZATION
        
        # L2 normalization along the feature dimension
        q_norm = q / torch.norm(q, dim=1, keepdim=True, p=2)
        k_norm = k / torch.norm(k, dim=1, keepdim=True, p=2)
        
        
        # PART 2 - CALCULATION FOR kc_hat AND kf_hat
        
        # kc_hat
        
        k_norm = k_norm.view(-1,1,1,250)
        
        kc_hat = self.Wc(k_norm)
        
        kc_hat = kc_hat.view(-1,1)
        
        
        # kf_hat
        
        k_norm = k_norm.view(-1,250)
        k_norm_t = k_norm.t()
        
        k_norm = k_norm_t.view(250,1,1,-1)
        
        kf_hat = Wf(k_norm)
        
        kf_hat = kf_hat.view(-1,1)
        kf_hat = kf_hat.t()
        
        # PART 3 - MATRIX MULTIPLICATION: XFA(Q, K, V) = lambda * V * Q_hat^T * K_hat_f * K_hat_c
        
        temperature = 0.001
        
#         multiplying all the matrices
        mat1 = v*temperature
        q_norm_t = q_norm.t()
        mat2 = torch.matmul(mat1,q_norm_t)
        mat3 = torch.matmul(mat2,kc_hat)
        x = torch.matmul(mat3,kf_hat)
        return x

class ConditionalRandomFields(nn.Module):
    def __init__(self, size):
        super(ConditionalRandomFields, self).__init__()
        self.size = size
        self.unary_energy = nn.Linear(size, 250)
        self.pairwise_energy = nn.Linear(size, 250)
        
        #This is a learnable parameter that represents the compatibility between labels.
        #It is a square matrix of size `(self.size, self.size)`, initialized with uniform
        #random values (see the nn.init.uniform_ call below).
        self.compatibility_matrix = nn.Parameter(torch.Tensor(size, size))
#         self.softmax   = nn.Softmax(dim=1)
        self.ln1 = nn.LayerNorm(250)
        # Initialize compatibility matrix with random values
        nn.init.uniform_(self.compatibility_matrix, -1, 2)

    #computes the unary energy for each element in the sequence using the `self.unary_energy` linear layer. 
    #This energy represents the model's confidence in assigning each label to each element independently.
    def compute_unary_energy(self, bi):
#         print("\ncompute_unary_energy",self.unary_energy(bi))
        return self.unary_energy(bi)
    
    #`H` represents some energy values (possibly computed from a previous step). 
    #`It computes the pairwise energy between labels for each element in the sequence using the `self.pairwise_energy` linear layer
    def compute_pairwise_energy(self, H):
#         print("\npairwise_energy",self.pairwise_energy(H))
        return self.pairwise_energy(H)
    
    #performs a matrix multiplication with the learnable `self.compatibility_matrix`.
    #this step transforms the pairwise energy values based on the compatibility between labels
    def compatibility_transform(self, Ep):
#         print("\ncompatibility_matrix",torch.matmul(Ep, self.compatibility_matrix))
        return torch.matmul(Ep, self.compatibility_matrix)

    def forward(self, bi):
        # Step 3: Initialize marginal distribution by unary energy
        #Initialize the marginal distribution `E` by computing the unary energy `Eu` using `compute_unary_energy`.
        Eu = self.compute_unary_energy(bi)
        E = Eu
        for _ in range(5):
            # Step 6: Compute pairwise energy
            Ep = self.compute_pairwise_energy(E)
            # Step 7: Compatibility transform
            Ep_comp = self.compatibility_transform(Ep)
            # Step 8: Update the marginal distribution
            #Update the marginal distribution `E` by subtracting the unary energy `Eu` from the compatibility-transformed energy `Ep_comp`.
            E = Ep_comp - Eu
#             # Step 9: Normalization
#             E = self.softmax(E)
            E = self.ln1(E)
#         print('E shape',E.shape)
        return E

#the total loss is a combination of binary classification loss, regularization terms for smoothness and sparsity, and a 
#contrastive loss. The relative importance of these components is controlled by the hyperparameters `alpha1` and `alpha2`. 
#The model is trained to minimize this total loss during the training process.
import matplotlib.pyplot as plt

class ContrastiveLoss(nn.Module):
    def __init__(self, alpha1, alpha2, lambda1, lambda2):
        super(ContrastiveLoss, self).__init__()
        self.alpha1 = alpha1
        self.alpha2 = alpha2
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        
        # Initialize empty lists to store binary loss values separately for training and validation
    def binary_classification_loss(self, predicted_scores, target_labels):
        
        loss = F.binary_cross_entropy_with_logits(predicted_scores, target_labels.float())

#         print("\nbinary_classification_loss",loss)
        return loss
    
#The `sparsity_loss` method calculates a sparsity regularization term. It computes the L1 norm of the `f_values` tensor and 
#scales it by `lambda2`. This term encourages the model's output to have sparse activations
    def sparsity_loss(self, f_values):
        #sparsity_term = lambda2 / 2 * torch.sum(f_values)
        sparsity_term = self.lambda2 * torch.sum(f_values)
#         print("sparsity_term",sparsity_term)
        return sparsity_term

#The `temporal_smoothness_loss` method calculates a smoothness regularization term. 
#It computes the sum of squared differences between adjacent values in the `f_values` tensor, normalized by `(f_values.size(0) - 1)`, and scales it by `lambda1`. 
#This term encourages smooth transitions in the model's output over time.
    def temporal_smoothness_loss(self, f_values):
        #smoothness_term = lambda1 / (f_values.size(0) - 1) * torch.sum((f_values[:] - f_values[:]) ** 2)
        #num_frames = f_values.size(0)
        # Calculate the temporal smoothness loss
#         smoothness_term = lambda1  * torch.sum((f_values[1:] - f_values[:-1])** 2)/(num_frames - 1)
        smoothness_term = self.lambda1 * torch.sum((f_values[1:] - f_values[:-1]) ** 2)
#         print("smoothness_term",smoothness_term)
        return smoothness_term
    
#The `contrastive_loss` method calculates a contrastive loss term. It takes two lists of embeddings, `embeddings_bp` 
#(positive embeddings) and `embeddings_bn` (negative embeddings). It computes the Euclidean distance (L2 norm) between all
#pairs of embeddings (`emb_i` and `emb_j`) and sums them up. This term encourages positive embeddings to be close to each other
#and negative embeddings to be far apart. The result is scaled by `alpha1`.
    def contrastive_loss(self, embeddings_bp, embeddings_bn):
        num_bp = len(embeddings_bp)
        num_bn = len(embeddings_bn)
        
        if num_bp == 0 or num_bn == 0:
            return 0.0
        
        embeddings_bp = torch.cat(embeddings_bp, dim=0)
        embeddings_bn = torch.cat(embeddings_bn, dim=0)
        
        # Calculate the contrastive loss
        loss = 0.0
        for emb_i in embeddings_bp:
            for emb_j in embeddings_bn:
                loss += torch.norm(emb_i - emb_j, p=2) ** 2
        
        loss /= (num_bp * num_bn)
        
        loss *= self.alpha1
#         print("contrastive_loss",loss)
        return loss

    def forward(self, outputs, targets, embeddings_bp, embeddings_bn,is_training=True):
        binary_loss = self.binary_classification_loss(outputs, targets)
        smoothness_loss = self.temporal_smoothness_loss(outputs)
        sparsity_loss = self.sparsity_loss(outputs)
        contrastive_loss = self.contrastive_loss(embeddings_bp, embeddings_bn)
        
        total_loss = binary_loss + self.alpha1 * smoothness_loss + self.alpha2 * sparsity_loss + contrastive_loss
        return total_loss

class AnomalyDetector(nn.Module):
    def __init__(self):
        super(AnomalyDetector, self).__init__()
        
        # Feature extractor
        self.feature_extractor = RelationAwareFeatureExtractor()
        # Self-attention layer
        self.self_attention = SelfAttentionXFA(250)

        # Conditional random fields layer
        self.conditional_random_fields = ConditionalRandomFields(250)
        self.fc = nn.Linear(250, 1)

    def forward(self, x):
        # Extract features
        x = self.feature_extractor(x)
        
        x = self.self_attention(x)
        
        log_likelihood = self.conditional_random_fields(x)
        output = self.fc(log_likelihood)
        
        return output

# Create an instance of the AnomalyDetector
model = AnomalyDetector()

num_classes = 2
num_epochs = 50
learning_rate = 0.001

# Set the alpha values
# alpha1 = 8e-5
# alpha2 = 8e-5
alpha1 = 0.001
alpha2 = 0.001
lambda1 = 0.01
lambda2 = 0.01

# Define the loss function
criterion = ContrastiveLoss(alpha1, alpha2, lambda1, lambda2)

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) #,betas=(0.9, 0.999), eps=1e-08, weight_decay=0.0,
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# Create a learning rate scheduler
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.6)

# These are referenced in the training loop below
train_loss_history = []
verbose = True

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    train_auc = []
    total_loss = []
   # model.to('cuda')
    
    embeddings_bp = []
    embeddings_bn = []
    
    for batch_data, batch_labels in train_loader:
        
#         batch_data = batch_data.to('cuda')
#         batch_labels = batch_labels.to('cuda')
        
        optimizer.zero_grad()
        
#         print(batch_data.shape)
#         print(batch_labels.shape)
        
        for i in range(batch_data.shape[0]):
            if torch.equal(batch_labels[i], torch.tensor([0.0, 1.0], dtype=batch_labels.dtype)):
                embeddings_bp.append(batch_data[i])
            else:
                embeddings_bn.append(batch_data[i])
                
        batch_data = batch_data.view(-1, 4, 32, 2048)

        # Forward pass
        output = model(batch_data)
        
        batch_labels = batch_labels.view(-1,1)

        # Compute loss
        ce_loss = criterion(output, batch_labels,embeddings_bp,embeddings_bn)

#         l2_reg = 0.0
#         for param in model.parameters():
#             l2_reg += torch.norm(param, p=2)

#         #Combine the cross-entropy loss and L2 regularization term
#         loss = ce_loss + l2_lambda * l2_reg

        ce_loss.backward()
        
        optimizer.step()

        train_loss += ce_loss.item()

        batch_auc = roc_auc_score(batch_labels.cpu().numpy(), output.detach().cpu().numpy())
        train_auc.append(batch_auc)
        
#     scheduler.step()

    train_loss /= len(train_loader)
    score = sum(train_auc)/len(train_auc)
    train_loss_history.append(train_loss)
    
    if verbose: 
        print(f"\nEpoch [{epoch+1}/{num_epochs}] | Train Loss: {train_loss:.4f} | PR-AUC : {score:.2f}")


Let me know if any shape printouts are needed.

Without an activation function between them, the self.fc1-3 layers in RelationAwareFeatureExtractor will collapse into a single linear layer, so you might want to add a non-linearity.
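For example, a minimal sketch of your forward pass with ReLU added between the fully connected layers (everything else unchanged):

        # Flatten the tensor before fully connected layers
        x = torch.flatten(x, start_dim=1)
        # Fully connected layers with non-linearities in between
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # final projection of the feature extractor
        return x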

I would also recommend overfitting a small dataset (e.g. just 10 samples) to make sure your model is able to learn anything; see the sketch below.
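Roughly along these lines (just a sketch; it assumes `train` is index-able and reuses your criterion with empty embedding lists, which your contrastive_loss handles by returning 0):

# Sanity check: try to overfit a tiny, fixed subset of the training data
small_subset = torch.utils.data.Subset(train, list(range(10)))
small_loader = torch.utils.data.DataLoader(small_subset, batch_size=10, shuffle=True)

for epoch in range(200):
    for batch_data, batch_labels in small_loader:
        optimizer.zero_grad()
        batch_data = batch_data.view(-1, 4, 32, 2048)
        output = model(batch_data)
        loss = criterion(output, batch_labels.view(-1, 1), [], [])
        loss.backward()
        optimizer.step()
    # the loss should approach zero if the model is able to fit the data

If the loss doesn't get close to zero on those 10 samples, the issue is in the model or the loss rather than in the amount of data or the hyperparameters.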

Okay, thank you for the solution :blush:

By any chance, do you have knowledge of the TRN (temporal relational network) feature extractor?

No, unfortunately not, so let’s wait for others to chime in :slight_smile:
