Loss value stuck and parameters are not updated

My model's parameters aren't being updated. I have checked, and they stay the same after every training batch. The task of this model is multi-label classification.

This is the whole code of my model:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class GraphAttentionLayer(nn.Module):
    """Single attention head built from one learned projection matrix ``W``.

    The forward pass projects the node features with ``W``, scores every pair
    of nodes by the ReLU of their dot product, mixes the projected features
    with those scores, and finally propagates the result through ``adj``.
    The scores are not normalised (there is no softmax).

    Args:
        in_features: Number of columns of the input feature matrix.
        out_features: Number of columns produced by the projection.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Xavier-initialise the projection before wrapping it as a parameter.
        weight = torch.empty(in_features, out_features)
        nn.init.xavier_uniform_(weight, gain=1.414)
        self.W = nn.Parameter(weight)

        self.relu = nn.LeakyReLU(0.2)

    def forward(self, H, adj):
        """Return the attended, adjacency-propagated features for ``H``.

        ``H`` must be 2-D because the pairwise scores are computed with
        ``torch.mm``; ``adj`` must be matmul-compatible with the result.
        """
        projected = H @ self.W
        # Pairwise dot-product scores, clipped at zero.
        scores = torch.mm(projected, projected.t()).relu()
        mixed = self.relu(scores @ projected)
        return self.relu(adj @ mixed)

    def __repr__(self):
        return f"{type(self).__name__} ({self.in_features} -> {self.out_features})"

class GAT(nn.Module):
    """Multi-head wrapper that averages the outputs of several attention heads.

    Args:
        nfeat: Input feature size handed to every head.
        nhid: Output feature size produced by every head.
        nheads: Number of independent attention heads; must be at least 1.

    Raises:
        ValueError: If ``nheads`` is smaller than 1 (the average in
            ``forward`` would otherwise divide by zero).
    """

    def __init__(self, nfeat, nhid, nheads):
        super(GAT, self).__init__()
        if nheads < 1:
            raise ValueError('nheads must be at least 1, got {}'.format(nheads))
        self.nheads = nheads
        self.attentions = [GraphAttentionLayer(nfeat, nhid) for _ in range(nheads)]
        # A plain Python list is invisible to nn.Module, so every head is also
        # registered by name; parameters show up as ``attention_<i>.W``.
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)

    def forward(self, H, adj):
        """Return the element-wise mean of every head's output for ``(H, adj)``."""
        head_outputs = [att(H, adj) for att in self.attentions]
        # Divide by ``self.nheads``: the previous code read the undefined
        # attribute ``self.nbheads`` and raised AttributeError on every call.
        return sum(head_outputs) / self.nheads

class MAGNET(nn.Module):
    """Bidirectional LSTM encoder scored against GAT-refined node embeddings.

    The last time step of the LSTM output is multiplied with the transposed
    output of two stacked ``GAT`` modules, giving one score per row of
    ``feat`` for every sequence in the batch.

    Args:
        input_size: Feature size of each LSTM input step; also the input
            feature size of the first GAT, so ``feat`` needs that many columns.
        hidden_size: LSTM hidden size per direction.
        num_classes: Accepted but not used by this class.
        adjacency: Tensor wrapped in ``nn.Parameter`` and used as the
            adjacency matrix of both GAT modules.
        attention_heads: Number of heads in each GAT module.
        rnn: Accepted but not used by this class.
    """

    def __init__(self, input_size, hidden_size, num_classes, adjacency, attention_heads, rnn='lstm'):
        super().__init__()

        # A bidirectional LSTM concatenates both directions in its output.
        bi_hidden = hidden_size * 2

        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.gat1 = GAT(input_size, bi_hidden, attention_heads)
        self.gat2 = GAT(bi_hidden, bi_hidden, attention_heads)
        self.A = nn.Parameter(adjacency)
        self.relu = nn.LeakyReLU(0.2)

    def forward(self, x, feat):
        """Return ``last LSTM step @ node_embeddings.T`` for batch ``x``."""
        lstm_out, _ = self.lstm(x)
        last_step = lstm_out[:, -1, :].squeeze(1)

        # Two GAT passes over the node features, sharing the adjacency parameter.
        node_emb = self.relu(self.gat1(feat, self.A))
        node_emb = self.gat2(node_emb, self.A)

        return torch.matmul(last_step, node_emb.transpose(0, 1))

This is how I train the model:

# Select the compute device: GPU when CUDA is usable, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

X_train, X_test, y_train, y_test, feat, adjacency = load_data()

# Move feat and adjacency to the chosen device once, up front.
feat, adjacency = feat.to(device), adjacency.to(device)

# Hyperparameters
hidden_size = 250
batch_size = 250
num_epochs = 250
attention_heads = 4
learning_rate = 0.001
# Sizes read off the training tensors.
datalen = X_train.shape[0]
input_size = X_train.shape[2]
num_classes = y_train.shape[1]

# Initialize network
model = MAGNET(input_size, hidden_size, num_classes, adjacency, attention_heads)
model = model.to(device)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train Network
model.train()
for epoch in range(num_epochs):
    loss_list = []
    hammingloss = []
    microf1_score = []
    for start, end in indexloader(datalen, batch_size):
        # Slice out the current mini-batch and move it to the device.
        data = X_train[start:end].to(device)
        targets = y_train[start:end].to(device)

        # forward
        scores = model(data, feat)
        loss = criterion(scores, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=10)
        loss_list.append(loss.item())

        # gradient descent or adam step
        optimizer.step()
    # Report the mean batch loss of this epoch.
    print(epoch, np.mean(loss_list))

Result:

0 1.0576843857765197
1 1.0576843857765197
2 1.0576843857765197
3 1.0576843857765197
4 1.0576843857765197
5 1.0576843857765197
6 1.0576843857765197
7 1.0576843857765197

What am I doing wrong?
I really need help.

Thank you all.

Do optimizer.zero_grad() before scores = model(data, feat). I mean something like this.

...
# Reset the accumulated gradients before running the forward pass.
optimizer.zero_grad()
# forward
scores = model(data, feat)
loss = criterion(scores, targets)
# backward
loss.backward()
...

And tell me if your problem has been solved.

I still get the same result. However, if I don't use the GAT layers and instead change the output to a fully connected linear layer, the loss decreases and the model manages to learn. In that case the model learns regardless of whether optimizer.zero_grad() is placed before or after the forward pass.

I changed the code like this:

def forward(self, x, feat):  # feat.size : (N, 768)
    """Classify from the last LSTM time step with the linear head ``self.fc``.

    ``feat`` is accepted for interface compatibility but is not used here.
    Author's note: the LSTM slice has size (batch_size, hidden_size*2).
    """
    lstm_out, _ = self.lstm(x)
    last_step = lstm_out[:, -1, :].squeeze(1)
    return self.fc(last_step)

the result

0 0.266863764077425
1 0.17352587655186652
2 0.1687601201236248
3 0.16377742290496827
4 0.15879809260368347

I tried to track which parameters were not updated.
The code looks like this:

# Train Network
# Snapshot every parameter before training so later changes can be detected.
old_params = {name: params.clone() for name, params in model.named_parameters()}

for epoch in range(num_epochs):
    # perform update
    loss_list = []
    hammingloss = []
    microf1_score = []
    for start, end in indexloader(datalen, batch_size):
        # Slice out the current mini-batch and move it to the device.
        data = X_train[start:end].to(device)
        targets = y_train[start:end].to(device)

        optimizer.zero_grad()
        # forward
        scores = model(data, feat)
        loss = criterion(scores, targets)
        # backward
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=10)
        loss_list.append(loss.item())
        # gradient descent or adam step
        optimizer.step()
    print(epoch, np.mean(loss_list))

    # ' true' means the parameter still equals its pre-training snapshot.
    for name, params in model.named_parameters():
        unchanged = (old_params[name] == params).all()
        print(name, ' true' if unchanged else ' false')

When I use the GAT layers, none of the parameters are updated (" true" means the parameter is unchanged).
The result:

A  true
lstm.weight_ih_l0  true
lstm.weight_hh_l0  true
lstm.bias_ih_l0  true
lstm.bias_hh_l0  true
lstm.weight_ih_l0_reverse  true
lstm.weight_hh_l0_reverse  true
lstm.bias_ih_l0_reverse  true
lstm.bias_hh_l0_reverse  true
gat1.attention_0.W  true
gat1.attention_1.W  true
gat1.attention_2.W  true
gat1.attention_3.W  true
gat2.attention_0.W  true
gat2.attention_1.W  true
gat2.attention_2.W  true
gat2.attention_3.W  true

But if I don't use the GAT layers, the LSTM and fc layer weights are updated.
The result:

A  true
lstm.weight_ih_l0  false
lstm.weight_hh_l0  false
lstm.bias_ih_l0  false
lstm.bias_hh_l0  false
lstm.weight_ih_l0_reverse  false
lstm.weight_hh_l0_reverse  true
lstm.bias_ih_l0_reverse  false
lstm.bias_hh_l0_reverse  false
gat1.attention_0.W  true
gat1.attention_1.W  true
gat1.attention_2.W  true
gat1.attention_3.W  true
gat2.attention_0.W  true
gat2.attention_1.W  true
gat2.attention_2.W  true
gat2.attention_3.W  true
fc.weight  false
fc.bias  false

I am not making any progress