Model parameters are not updated

Hi,
I have a custom loss function, but when I train my model with it, the model's parameters do not change; when I use the built-in cross-entropy loss instead, they do change. I don't know where the problem is. Could someone help me out?
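One way to confirm that the parameters really stay the same across an optimizer step is to snapshot them before the step and compare afterwards (a minimal check against the training code below):

before = [p.detach().clone() for p in model.parameters()]
# ... forward pass, final_loss.backward(), optimizer.step() ...
changed = [not torch.equal(b, p.detach()) for b, p in zip(before, model.parameters())]
print(changed)  # all False means no parameter was updated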
This is my loss function:

import torch
import torch.nn as nn


class SkNN_loss(nn.Module):
    def __init__(self):
        super(SkNN_loss, self).__init__()

    def pairwise_euclid_distance(self, A, B):
        """Pairwise Euclidean distance between two matrices.
        :param A: a matrix.
        :param B: a matrix.

        :returns: A tensor for the pairwise Euclidean between A and B.
        """
        dist = A - B
        euc_dist = torch.norm(dist, p=2)
        return euc_dist

    def pairwise_cos_distance(self, A, B):
        """Pairwise cosine distance between two matrices.
        :param A: a matrix.
        :param B: a matrix.

        :returns: A tensor for the pairwise cosine between A and B.
        """
        # l2 normalized vectors
        norm_A = A.norm(p=2, keepdim=True)
        norm_B = B.norm(p=2, keepdim=True)
        A = A.div(norm_A)
        B = B.div(norm_B)

        # cosine similarity of the l2-normalized vectors; 1 - similarity is the distance
        prod = torch.mul(A, B).sum()
        return 1 - prod

    def forward(self, features, labels, T):
        b = len(features)
        neighbors = 0
        total = 0
        l_sn = 0
        for i, x_i in enumerate(features):
            for j, x_j in enumerate(features):
                if i != j:
                    dist = self.pairwise_euclid_distance(x_i, x_j)
                    dist = -(dist / T)
                    dist = torch.exp(dist)
                    if labels[i] == labels[j]:
                        neighbors += dist
                    total += dist
            if neighbors != 0:
                l_sn += torch.log(neighbors / total)

            neighbors = 0
            total = 0
        l_sn = l_sn * (-1 / b)
        return l_sn

and this is my training:

def train_test(self, model, criterion, epoch, phase, optimizer, args, logger, use_gpu):

        # Set Dataset loader
        if phase == 'train':
            dataset = self.train_loader
        elif phase == 'test':
            dataset = self.test_loader
        else:
            dataset = self.val_loader
        # Set model mode
        if phase == 'test' or phase == 'val':
            model.eval()
        if phase == 'train':
            model.train()
            logger.info('-' * 10)
            logger.info('Epoch {}/{}'.format(epoch, args.epochs - 1))
            logger.info('Current Learning rate: {}'.format(showLR(optimizer)))
        Temperature = 100
        running_loss = 0
        running_corrects = 0
        running_all = 0
        top_k = 0
        model_loss = model.sknn_loss
        for batch_idx, (inputs, targets) in enumerate(dataset):

            since = time.time()
            loss_CEntropy_array = []
            loss_cluster_array = []
            inputs = inputs.float().permute(0, 2, 1, 3, 4)
            if use_gpu:
                model.to(device)
                inputs, targets = inputs.to(device), targets.to(device)

            if phase != 'train':
                with torch.no_grad():
                    outputs, features = model(inputs)
            else:
                outputs, features = model(inputs)
            output_tmp = F.softmax(outputs, dim=1).data
            _, preds = torch.max(output_tmp, 1)
            _, top_k_index = torch.topk(output_tmp, self.k)
            # print('!!!!!!!!!!!!!!!!!!!!!!!',preds, targets, outputs)
            loss_CEntropy = criterion(outputs, targets)
            loss_kNN = model_loss(features, targets, Temperature)
            print(loss_kNN.grad, loss_kNN)
            loss_CEntropy_array.append(self.cross_entr_coef * loss_CEntropy.item())
            loss_cluster_array.append(self.kNN_coef * loss_kNN.item())
            # L2 Regularization
            l2 = 0
            for W in model.parameters():
                # l2 += torch.norm(W, p=2)
                # print(W.requires_grad)
                pass

            final_loss = self.kNN_coef * loss_kNN + self.cross_entr_coef * loss_CEntropy
            if phase == 'train':
                print(W)
                # print(list(model.parameters())[0].grad)
                optimizer.zero_grad()
                final_loss.backward()
                optimizer.step()
            running_loss += final_loss.item()
            for jj in range(len(preds)):
                if preds[jj] == targets.data[jj]:
                    running_corrects += 1
                if targets.data[jj] in top_k_index[jj]:
                    top_k += 1

            running_all += len(inputs)

            # Write info in logger
            cost_time = time.time() - since
            logger.info(
                'Process: [{:5.0f}/{:5.0f} ({:.0f}%), (running_corrects/running_all): ({}/{})]\t'
                'Top-k: {}, Top-k acc: {}\n'
                'Loss: {:.4f}\tAcc:{:.4f}\n'
                'Mean of Cross Entropy: {:.4f}, Mean of Cluster Entropy: {:.4f}\n'
                'Cost time:{:5.0f}s\tRemaining time for the epoch:{:5.0f}s\r'.format(
                    running_all,
                    len(dataset.dataset),
                    100. * batch_idx / len(dataset),
                    running_corrects,
                    running_all,
                    top_k,
                    top_k / running_all,
                    running_loss / (batch_idx + 1),
                    running_corrects / running_all,
                    np.mean(loss_CEntropy_array),
                    np.mean(loss_cluster_array),
                    cost_time,
                    cost_time * (len(dataset) - batch_idx)
                ))

            if batch_idx % args.interval == 0 or (batch_idx == len(dataset) - 1):
                cost_time = time.time() - since
                print(
                    'Process: [{:5.0f}/{:5.0f} ({:.0f}%), (running_corrects/running_all): ({}/{})]\t'
                    'Top-k: {}, Top-k acc: {}, '
                    'Loss: {:.4f}\tAcc:{:.4f}\tCost time:{:5.0f}s\tRemaining time for the epoch:{:5.0f}s\r'.format(
                        running_all,
                        len(dataset.dataset),
                        100. * batch_idx / len(dataset),
                        running_corrects,
                        running_all,
                        top_k,
                        top_k / running_all,
                        running_loss / (batch_idx + 1),
                        running_corrects / running_all,
                        cost_time,
                        cost_time * (len(dataset) - batch_idx)
                    ))

        final_loss = running_loss / (batch_idx + 1)
        acc = running_corrects / len(dataset.dataset)
        logger.info('*******************************Final Epoch Results*******************************')
        logger.info('{} Epoch:\t{:2}\tTotal Loss: {:.4f}\tAcc:{:.4f}'.format(
            phase,
            epoch,
            final_loss,
            acc
        ))
        logger.info(
            '****************************************************************************************************************************')
        if phase == 'train':
            torch.save(model.state_dict(), args.save_path + '/' + args.mode + '_' + str(epoch + 1) + '.pt')
            return model, final_loss, acc
        else:
            return final_loss, acc

(Without knowing much about your model:) did you try replacing features with outputs in the line where you compute loss_kNN, to see if that works?
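For example (a sketch against the training snippet above):

loss_kNN = model_loss(outputs, targets, Temperature)  # pass the logits instead of the embeddings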

Thanks, but the problem is that my loss function is based on the embeddings/features that I extract, not on the outputs.

I also checked the gradients of all the model parameters and found that the gradients of the last two layers are zero.
Why?
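One way to inspect this after calling final_loss.backward() (a minimal sketch, not necessarily the exact code I used):

for name, p in model.named_parameters():
    if p.grad is None:
        print(name, 'no grad')                  # parameter never entered the loss's graph
    else:
        print(name, p.grad.abs().sum().item())  # 0.0 means the gradient is exactly zero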

@marziehoghbaie Can you post your model here?

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import math
from collections import OrderedDict
from torchsummary import summary

# Dense3D with clustering loss
# class NLLSequenceLoss(nn.Module):
#     """
#     Custom loss function.
#     Returns a loss that is the sum of all losses at each time step.
#     """
#
#     def __init__(self):
#         super(NLLSequenceLoss, self).__init__()
#         self.criterion = nn.NLLLoss(reduction='none')
#         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#
#     def forward(self, input, length, target):
#         loss = []
#         loss = []
#         transposed = input.transpose(0, 1).contiguous()
#         for i in range(transposed.size(0)):
#             loss.append(self.criterion(transposed[i,], target).unsqueeze(1))
#         loss = torch.cat(loss, 1)
#         # print('loss:',loss)
#         mask = torch.zeros(loss.size(0), loss.size(1)).float().to(self.device)
#
#         # mask = torch.zeros(loss.size(0), loss.size(1)).float()
#
#         for i in range(length.size(0)):
#             L = min(mask.size(1), length[i])
#             mask[i, L - 1] = 1.0
#         # print('mask:',mask)
#         # print('mask * loss',mask*loss)
#         loss = (loss * mask).sum() / mask.sum()
#         return loss
#
#         # return loss/transposed.size(0)


class SkNN_loss(nn.Module):
    def __init__(self):
        super(SkNN_loss, self).__init__()

    def pairwise_euclid_distance(self, A, B):
        """Pairwise Euclidean distance between two matrices.
        :param A: a matrix.
        :param B: a matrix.

        :returns: A tensor for the pairwise Euclidean between A and B.
        """
        dist = torch.sub(A, B)
        euc_dist = torch.norm(dist, p=2)
        return euc_dist

    def pairwise_cos_distance(self, A, B):
        """Pairwise cosine distance between two matrices.
        :param A: a matrix.
        :param B: a matrix.

        :returns: A tensor for the pairwise cosine between A and B.
        """
        # l2 normalized vectors
        norm_A = A.norm(p=2, keepdim=True)
        norm_B = B.norm(p=2, keepdim=True)
        A = A.div(norm_A)
        B = B.div(norm_B)
        # cosine similarity of the l2-normalized vectors; 1 - similarity is the distance
        prod = torch.mul(A, B).sum()
        return 1 - prod

    def forward(self, features, labels, T):
        b = -1/(features.shape[0])
        b = torch.tensor(b)
        neighbors = torch.tensor(0)
        total = torch.tensor(0)
        l_sn = torch.tensor(0)
        for i, x_i in enumerate(features):
            for j, x_j in enumerate(features):
                if i != j:
                    dist = self.pairwise_euclid_distance(x_i, x_j)
                    dist = torch.div(dist, T)
                    dist = torch.mul(dist, -1)
                    dist = torch.exp(dist)
                    if labels[i] == labels[j]:
                        neighbors = torch.add(neighbors, dist)
                    total = torch.add(total, dist)
            if neighbors != 0:
                l_sn = torch.add(l_sn, torch.log(neighbors / total))
            neighbors = torch.tensor(0)
            total = torch.tensor(0)
        l_sn = torch.mul(l_sn, b)
        return l_sn


def _validate(modelOutput, length, labels, total=None, wrong=None):
    averageEnergies = torch.sum(modelOutput.data, 1)
    for i in range(modelOutput.size(0)):
        # print(modelOutput[i,:length[i]].sum(0).shape)
        averageEnergies[i] = modelOutput[i, :length[i]].sum(0)

    maxvalues, maxindices = torch.max(averageEnergies, 1)

    count = 0

    for i in range(0, labels.squeeze(1).size(0)):
        l = int(labels.squeeze(1)[i].cpu())
        if total is not None:
            if l not in total:
                total[l] = 1
            else:
                total[l] += 1
        if maxindices[i] == labels.squeeze(1)[i]:
            count += 1
        else:
            if wrong is not None:
                if l not in wrong:
                    wrong[l] = 1
                else:
                    wrong[l] += 1

    return (averageEnergies, count)


class _DenseLayer(nn.Sequential):
    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
        super(_DenseLayer, self).__init__()
        self.add_module('norm1', nn.BatchNorm3d(num_input_features)),
        self.add_module('relu1', nn.ReLU(inplace=True)),
        self.add_module('conv1', nn.Conv3d(num_input_features, bn_size *
                                           growth_rate, kernel_size=1, stride=1, bias=False)),
        self.add_module('norm2', nn.BatchNorm3d(bn_size * growth_rate)),
        self.add_module('relu2', nn.ReLU(inplace=True)),
        self.add_module('conv2', nn.Conv3d(bn_size * growth_rate, growth_rate,
                                           kernel_size=3, stride=1, padding=1, bias=False)),
        self.drop_rate = drop_rate

    def forward(self, x):
        new_features = super(_DenseLayer, self).forward(x)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
        return torch.cat([x, new_features], 1)


class _DenseBlock(nn.Sequential):
    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
        super(_DenseBlock, self).__init__()
        for i in range(num_layers):
            layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
            self.add_module('denselayer%d' % (i + 1), layer)


class _Transition(nn.Sequential):
    def __init__(self, num_input_features, num_output_features):
        super(_Transition, self).__init__()
        self.add_module('norm', nn.BatchNorm3d(num_input_features))
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module('conv', nn.Conv3d(num_input_features, num_output_features,
                                          kernel_size=1, stride=1, bias=False))
        self.add_module('pool', nn.AvgPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)))


class Dense3D(torch.nn.Module):
    def __init__(self, N_classes, growth_rate=32, num_init_features=64, bn_size=4, drop_rate=0, n_frames=29):
        super(Dense3D, self).__init__()
        # block_config = (6, 12, 24, 16)
        block_config = (4, 8, 12, 8)
        self.N_classes = N_classes
        # self.loss = nn.CrossEntropyLoss()
        self.sknn_loss = SkNN_loss()
        self.features = nn.Sequential(OrderedDict([
            ('conv0',
             nn.Conv3d(3, num_init_features, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)),
            ('norm0', nn.BatchNorm3d(num_init_features)),
            ('relu0', nn.ReLU(inplace=True)),
            ('pool0', nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))),
        ]))
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):

            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
                                bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
            self.features.add_module('denseblock%d' % (i + 1), block)

            num_features = num_features + num_layers * growth_rate
            if i != len(block_config) - 1:
                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
                self.features.add_module('transition%d' % (i + 1), trans)
                num_features = num_features // 2
        # output shape of this stage torch.Size([batch_size, 536, n_frames, 3, 3])
        # Final batch norm
        inp_channels = 536*n_frames
        out_channel = int(inp_channels/4)
        # ToDo Time distributed Conv, CRF
        self.conv2d_reduction = nn.Conv2d(in_channels=inp_channels, out_channels=out_channel, kernel_size=(1, 1))
        self.maxpooling_reduction = nn.MaxPool2d(kernel_size=(3, 3))
        self.output_FC = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(out_channel, self.N_classes)
        )

    def validator_function(self):
        return _validate

    def forward(self, x):
        """
        input shape (4, 3, 29, 112, 112)
        torch.Size([4, 536, 29, 3, 3])
        torch.Size([4, 15544, 3, 3])
        torch.Size([4, 3886, 3, 3])
        torch.Size([4, 3886, 1, 1])
        torch.Size([4, 3886])
        torch.Size([4, 10])
        """
        dense_lyr_features = self.features(x)
        dense_lyr_features = torch.flatten(dense_lyr_features, start_dim=1, end_dim=2)
        embedding = self.conv2d_reduction(dense_lyr_features)
        embedding = self.maxpooling_reduction(embedding)
        embedding = torch.flatten(embedding, start_dim=1)
        dense_lyr_features = self.output_FC(embedding)
        return dense_lyr_features, embedding


if __name__ == '__main__':
    # Batch size, channels(RGB), Number of frames per video, size of mouth region
    # data = torch.zeros((4, 3, 29, 112, 112))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = Dense3D(N_classes=10)
    print(summary(model.cuda(), (3, 29, 112, 112)))
    # model_output = model(data)

As far as I know, the weights of the network are only updated by a loss if they are actually used in computing that loss (W := W - alpha * dL/dW), so since the model's last two layers are not used in my loss function, it seems obvious that their gradients are zero. Am I right?
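If it helps to see the effect in isolation, here is a minimal sketch with a hypothetical two-layer toy model (not the Dense3D above): the loss is computed from the intermediate embedding only, so the final layer never enters the loss's graph and gets no gradient from backward().

import torch
import torch.nn as nn

backbone = nn.Linear(8, 4)   # produces the "embedding"
head = nn.Linear(4, 2)       # final classifier layer, analogous to output_FC

x = torch.randn(3, 8)
embedding = backbone(x)
logits = head(embedding)     # computed, but never used by the loss below

loss = embedding.pow(2).mean()   # the loss depends only on the embedding, like SkNN_loss
loss.backward()

print(backbone.weight.grad is None)  # False: the backbone received a gradient
print(head.weight.grad is None)      # True: the head is outside the loss's graph

Note that in the combined final_loss above, the cross-entropy term does use outputs, so output_FC should still receive a gradient from that term unless cross_entr_coef is zero.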