Model parameters are not updated

Hi,
I have a custom loss function, but when I train my model with it, the model's parameters do not change; when I use the built-in cross-entropy loss instead, they do change. I don't know where the problem is. Could someone help me out?
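One way to confirm that the parameters really stay the same across an optimizer step is to snapshot them before the step and compare afterwards (a minimal check against the training code below):

before = [p.detach().clone() for p in model.parameters()]
# ... forward pass, final_loss.backward(), optimizer.step() ...
changed = [not torch.equal(b, p.detach()) for b, p in zip(before, model.parameters())]
print(changed)  # all False means no parameter was updated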
This is my loss function:

import torch
import torch.nn as nn


class SkNN_loss(nn.Module):
    def __init__(self):
        super(SkNN_loss, self).__init__()

    def pairwise_euclid_distance(self, A, B):
        """Pairwise Euclidean distance between two matrices.
        :param A: a matrix.
        :param B: a matrix.

        :returns: A tensor for the pairwise Euclidean between A and B.
        """
        dist = A - B
        euc_dist = torch.norm(dist, p=2)
        return euc_dist

    def pairwise_cos_distance(self, A, B):
        """Pairwise cosine distance between two matrices.
        :param A: a matrix.
        :param B: a matrix.

        :returns: A tensor for the pairwise cosine between A and B.
        """
        # l2 normalized vectors
        norm_A = A.norm(p=2, keepdim=True)
        norm_B = B.norm(p=2, keepdim=True)
        A = A.div(norm_A)
        B = B.div(norm_B)

        # cosine similarity of the l2-normalized vectors; 1 - similarity is the distance
        prod = torch.mul(A, B).sum()
        return 1 - prod

    def forward(self, features, labels, T):
        b = len(features)
        neighbors = 0
        total = 0
        l_sn = 0
        for i, x_i in enumerate(features):
            for j, x_j in enumerate(features):
                if i != j:
                    dist = self.pairwise_euclid_distance(x_i, x_j)
                    dist = -(dist / T)
                    dist = torch.exp(dist)
                    if labels[i] == labels[j]:
                        neighbors += dist
                    total += dist
            if neighbors != 0:
                l_sn += torch.log(neighbors / total)

            neighbors = 0
            total = 0
        l_sn = l_sn * (-1 / b)
        return l_sn

and this is my training:

def train_test(self, model, criterion, epoch, phase, optimizer, args, logger, use_gpu):

        # Set Dataset loader
        if phase == 'train':
            dataset = self.train_loader
        elif phase == 'test':
            dataset = self.test_loader
        else:
            dataset = self.val_loader
        # Set model mode
        if phase == 'test' or phase == 'val':
            model.eval()
        if phase == 'train':
            model.train()
            logger.info('-' * 10)
            logger.info('Epoch {}/{}'.format(epoch, args.epochs - 1))
            logger.info('Current Learning rate: {}'.format(showLR(optimizer)))
        Temperature = 100
        running_loss = 0
        running_corrects = 0
        running_all = 0
        top_k = 0
        model_loss = model.sknn_loss
        for batch_idx, (inputs, targets) in enumerate(dataset):

            since = time.time()
            loss_CEntropy_array = []
            loss_cluster_array = []
            inputs = inputs.float().permute(0, 2, 1, 3, 4)
            if use_gpu:
                model.to(device)
                inputs, targets = inputs.to(device), targets.to(device)

            if phase != 'train':
                with torch.no_grad():
                    outputs, features = model(inputs)
            else:
                outputs, features = model(inputs)
            output_tmp = F.softmax(outputs, dim=1).data
            _, preds = torch.max(output_tmp, 1)
            _, top_k_index = torch.topk(output_tmp, self.k)
            # print('!!!!!!!!!!!!!!!!!!!!!!!',preds, targets, outputs)
            loss_CEntropy = criterion(outputs, targets)
            loss_kNN = model_loss(features, targets, Temperature)
            print(loss_kNN.grad, loss_kNN)
            loss_CEntropy_array.append(self.cross_entr_coef * loss_CEntropy.item())
            loss_cluster_array.append(self.kNN_coef * loss_kNN.item())
            # L2 Regularization
            l2 = 0
            for W in model.parameters():
                # l2 += torch.norm(W, p=2)
                # print(W.requires_grad)
                pass

            final_loss = self.kNN_coef * loss_kNN + self.cross_entr_coef * loss_CEntropy
            if phase == 'train':
                print(W)
                # print(list(model.parameters())[0].grad)
                optimizer.zero_grad()
                final_loss.backward()
                optimizer.step()
            running_loss += final_loss.item()
            for jj in range(len(preds)):
                if preds[jj] == targets.data[jj]:
                    running_corrects += 1
                if targets.data[jj] in top_k_index[jj]:
                    top_k += 1

            running_all += len(inputs)

            # Write info in logger
            cost_time = time.time() - since
            logger.info(
                'Process: [{:5.0f}/{:5.0f} ({:.0f}%), (running_corrects/running_all): ({}/{})]\t'
                'Top-k: {}, Top-k acc: {}\n'
                'Loss: {:.4f}\tAcc:{:.4f}\n'
                'Mean of Cross Entropy: {:.4f}, Mean of Cluster Entropy: {:.4f}\n'
                'Cost time:{:5.0f}s\tRemaining time for the epoch:{:5.0f}s\r'.format(
                    running_all,
                    len(dataset.dataset),
                    100. * batch_idx / len(dataset),
                    running_corrects,
                    running_all,
                    top_k,
                    top_k / running_all,
                    running_loss / (batch_idx + 1),
                    running_corrects / running_all,
                    np.mean(loss_CEntropy_array),
                    np.mean(loss_cluster_array),
                    cost_time,
                    cost_time * (len(dataset) - batch_idx)
                ))

            if batch_idx % args.interval == 0 or (batch_idx == len(dataset) - 1):
                cost_time = time.time() - since
                print(
                    'Process: [{:5.0f}/{:5.0f} ({:.0f}%), (running_corrects/running_all): ({}/{})]\t'
                    'Top-k: {}, Top-k acc: {}, '
                    'Loss: {:.4f}\tAcc:{:.4f}\tCost time:{:5.0f}s\tRemaining time for the epoch:{:5.0f}s\r'.format(
                        running_all,
                        len(dataset.dataset),
                        100. * batch_idx / len(dataset),
                        running_corrects,
                        running_all,
                        top_k,
                        top_k / running_all,
                        running_loss / (batch_idx + 1),
                        running_corrects / running_all,
                        cost_time,
                        cost_time * (len(dataset) - batch_idx)
                    ))

        final_loss = running_loss / (batch_idx + 1)
        acc = running_corrects / len(dataset.dataset)
        logger.info('*******************************Final Epoch Results*******************************')
        logger.info('{} Epoch:\t{:2}\tTotal Loss: {:.4f}\tAcc:{:.4f}'.format(
            phase,
            epoch,
            final_loss,
            acc
        ))
        logger.info(
            '****************************************************************************************************************************')
        if phase == 'train':
            torch.save(model.state_dict(), args.save_path + '/' + args.mode + '_' + str(epoch + 1) + '.pt')
            return model, final_loss, acc
        else:
            return final_loss, acc

(Without knowing much about your model:) did you try replacing features with outputs in the line where you compute loss_kNN, to see if that works?
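For example (a sketch against the training snippet above):

loss_kNN = model_loss(outputs, targets, Temperature)  # pass the logits instead of the embeddings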

Thanks, but the problem is that my loss function is based on the embeddings/features that I extract, not on the outputs.

I also checked the gradients of all the model parameters and found that the gradients of the last two layers are zero.
Why?
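One way to inspect this after calling final_loss.backward() (a minimal sketch, not necessarily the exact code I used):

for name, p in model.named_parameters():
    if p.grad is None:
        print(name, 'no grad')                  # parameter never entered the loss's graph
    else:
        print(name, p.grad.abs().sum().item())  # 0.0 means the gradient is exactly zero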

@marziehoghbaie Can you post your model here?

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import math
from collections import OrderedDict
from torchsummary import summary

# Dense3D with clustering loss
# class NLLSequenceLoss(nn.Module):
#     """
#     Custom loss function.
#     Returns a loss that is the sum of all losses at each time step.
#     """
#
#     def __init__(self):
#         super(NLLSequenceLoss, self).__init__()
#         self.criterion = nn.NLLLoss(reduction='none')
#         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#
#     def forward(self, input, length, target):
#         loss = []
#         loss = []
#         transposed = input.transpose(0, 1).contiguous()
#         for i in range(transposed.size(0)):
#             loss.append(self.criterion(transposed[i,], target).unsqueeze(1))
#         loss = torch.cat(loss, 1)
#         # print('loss:',loss)
#         mask = torch.zeros(loss.size(0), loss.size(1)).float().to(self.device)
#
#         # mask = torch.zeros(loss.size(0), loss.size(1)).float()
#
#         for i in range(length.size(0)):
#             L = min(mask.size(1), length[i])
#             mask[i, L - 1] = 1.0
#         # print('mask:',mask)
#         # print('mask * loss',mask*loss)
#         loss = (loss * mask).sum() / mask.sum()
#         return loss
#
#         # return loss/transposed.size(0)


class SkNN_loss(nn.Module):
    def __init__(self):
        super(SkNN_loss, self).__init__()

    def pairwise_euclid_distance(self, A, B):
        """Pairwise Euclidean distance between two matrices.
        :param A: a matrix.
        :param B: a matrix.

        :returns: A tensor for the pairwise Euclidean between A and B.
        """
        dist = torch.sub(A, B)
        euc_dist = torch.norm(dist, p=2)
        return euc_dist

    def pairwise_cos_distance(self, A, B):
        """Pairwise cosine distance between two matrices.
        :param A: a matrix.
        :param B: a matrix.

        :returns: A tensor for the pairwise cosine between A and B.
        """
        # l2 normalized vectors
        norm_A = A.norm(p=2, keepdim=True)
        norm_B = B.norm(p=2, keepdim=True)
        A = A.div(norm_A)
        B = B.div(norm_B)
        # cosine similarity of the l2-normalized vectors; 1 - similarity is the distance
        prod = torch.mul(A, B).sum()
        return 1 - prod

    def forward(self, features, labels, T):
        b = -1/(features.shape[0])
        b = torch.tensor(b)
        neighbors = torch.tensor(0)
        total = torch.tensor(0)
        l_sn = torch.tensor(0)
        for i, x_i in enumerate(features):
            for j, x_j in enumerate(features):
                if i != j:
                    dist = self.pairwise_euclid_distance(x_i, x_j)
                    dist = torch.div(dist, T)
                    dist = torch.mul(dist, -1)
                    dist = torch.exp(dist)
                    if labels[i] == labels[j]:
                        neighbors = torch.add(neighbors, dist)
                    total = torch.add(total, dist)
            if neighbors != 0:
                l_sn = torch.add(l_sn, torch.log(neighbors / total))
            neighbors = torch.tensor(0)
            total = torch.tensor(0)
        l_sn = torch.mul(l_sn, b)
        return l_sn


def _validate(modelOutput, length, labels, total=None, wrong=None):
    averageEnergies = torch.sum(modelOutput.data, 1)
    for i in range(modelOutput.size(0)):
        # print(modelOutput[i,:length[i]].sum(0).shape)
        averageEnergies[i] = modelOutput[i, :length[i]].sum(0)

    maxvalues, maxindices = torch.max(averageEnergies, 1)

    count = 0

    for i in range(0, labels.squeeze(1).size(0)):
        l = int(labels.squeeze(1)[i].cpu())
        if total is not None:
            if l not in total:
                total[l] = 1
            else:
                total[l] += 1
        if maxindices[i] == labels.squeeze(1)[i]:
            count += 1
        else:
            if wrong is not None:
                if l not in wrong:
                    wrong[l] = 1
                else:
                    wrong[l] += 1

    return (averageEnergies, count)


class _DenseLayer(nn.Sequential):
    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
        super(_DenseLayer, self).__init__()
        self.add_module('norm1', nn.BatchNorm3d(num_input_features)),
        self.add_module('relu1', nn.ReLU(inplace=True)),
        self.add_module('conv1', nn.Conv3d(num_input_features, bn_size *
                                           growth_rate, kernel_size=1, stride=1, bias=False)),
        self.add_module('norm2', nn.BatchNorm3d(bn_size * growth_rate)),
        self.add_module('relu2', nn.ReLU(inplace=True)),
        self.add_module('conv2', nn.Conv3d(bn_size * growth_rate, growth_rate,
                                           kernel_size=3, stride=1, padding=1, bias=False)),
        self.drop_rate = drop_rate

    def forward(self, x):
        new_features = super(_DenseLayer, self).forward(x)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
        return torch.cat([x, new_features], 1)


class _DenseBlock(nn.Sequential):
    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
        super(_DenseBlock, self).__init__()
        for i in range(num_layers):
            layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
            self.add_module('denselayer%d' % (i + 1), layer)


class _Transition(nn.Sequential):
    def __init__(self, num_input_features, num_output_features):
        super(_Transition, self).__init__()
        self.add_module('norm', nn.BatchNorm3d(num_input_features))
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module('conv', nn.Conv3d(num_input_features, num_output_features,
                                          kernel_size=1, stride=1, bias=False))
        self.add_module('pool', nn.AvgPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)))


class Dense3D(torch.nn.Module):
    def __init__(self, N_classes, growth_rate=32, num_init_features=64, bn_size=4, drop_rate=0, n_frames=29):
        super(Dense3D, self).__init__()
        # block_config = (6, 12, 24, 16)
        block_config = (4, 8, 12, 8)
        self.N_classes = N_classes
        # self.loss = nn.CrossEntropyLoss()
        self.sknn_loss = SkNN_loss()
        self.features = nn.Sequential(OrderedDict([
            ('conv0',
             nn.Conv3d(3, num_init_features, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)),
            ('norm0', nn.BatchNorm3d(num_init_features)),
            ('relu0', nn.ReLU(inplace=True)),
            ('pool0', nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))),
        ]))
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):

            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
                                bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
            self.features.add_module('denseblock%d' % (i + 1), block)

            num_features = num_features + num_layers * growth_rate
            if i != len(block_config) - 1:
                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
                self.features.add_module('transition%d' % (i + 1), trans)
                num_features = num_features // 2
        # output shape of this stage torch.Size([batch_size, 536, n_frames, 3, 3])
        # Final batch norm
        inp_channels = 536*n_frames
        out_channel = int(inp_channels/4)
        # ToDo Time distributed Conv, CRF
        self.conv2d_reduction = nn.Conv2d(in_channels=inp_channels, out_channels=out_channel, kernel_size=(1, 1))
        self.maxpooling_reduction = nn.MaxPool2d(kernel_size=(3, 3))
        self.output_FC = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(out_channel, self.N_classes)
        )

    def validator_function(self):
        return _validate

    def forward(self, x):
        """
        input shape (4, 3, 29, 112, 112)
        torch.Size([4, 536, 29, 3, 3])
        torch.Size([4, 15544, 3, 3])
        torch.Size([4, 3886, 3, 3])
        torch.Size([4, 3886, 1, 1])
        torch.Size([4, 3886])
        torch.Size([4, 10])
        """
        dense_lyr_features = self.features(x)
        dense_lyr_features = torch.flatten(dense_lyr_features, start_dim=1, end_dim=2)
        embedding = self.conv2d_reduction(dense_lyr_features)
        embedding = self.maxpooling_reduction(embedding)
        embedding = torch.flatten(embedding, start_dim=1)
        dense_lyr_features = self.output_FC(embedding)
        return dense_lyr_features, embedding


if __name__ == '__main__':
    # Batch size, channels(RGB), Number of frames per video, size of mouth region
    # data = torch.zeros((4, 3, 29, 112, 112))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = Dense3D(N_classes=10)
    print(summary(model.cuda(), (3, 29, 112, 112)))
    # model_output = model(data)

As far as I know, the weights of the network are only updated by a loss if they are actually used in computing that loss (W := W - alpha * dL/dW), so since the model's last two layers are not used in my loss function, it seems obvious that their gradients are zero. Am I right?
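If it helps to see the effect in isolation, here is a minimal sketch with a hypothetical two-layer toy model (not the Dense3D above): the loss is computed from the intermediate embedding only, so the final layer never enters the loss's graph and gets no gradient from backward().

import torch
import torch.nn as nn

backbone = nn.Linear(8, 4)   # produces the "embedding"
head = nn.Linear(4, 2)       # final classifier layer, analogous to output_FC

x = torch.randn(3, 8)
embedding = backbone(x)
logits = head(embedding)     # computed, but never used by the loss below

loss = embedding.pow(2).mean()   # the loss depends only on the embedding, like SkNN_loss
loss.backward()

print(backbone.weight.grad is None)  # False: the backbone received a gradient
print(head.weight.grad is None)      # True: the head is outside the loss's graph

Note that in the combined final_loss above, the cross-entropy term does use outputs, so output_FC should still receive a gradient from that term unless cross_entr_coef is zero.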