Model parameters are not updated

Hi,
I have a custom loss function, but when I use it with my model, its parameters do not change, but when I use built-in cross-entropy loss, it changes. I dont know where is my problem??? Could someone help me out???
This is my loss function:

``````class SkNN_loss(nn.Module):
def __init__(self):
super(SkNN_loss, self).__init__()

def pairwise_euclid_distance(self, A, B):
"""Pairwise Euclidean distance between two matrices.
:param A: a matrix.
:param B: a matrix.

:returns: A tensor for the pairwise Euclidean between A and B.
"""
dist = A - B
euc_dist = torch.norm(dist, p=2)
return euc_dist

def pairwise_cos_distance(self, A, B):
"""Pairwise cosine distance between two matrices.
:param A: a matrix.
:param B: a matrix.

:returns: A tensor for the pairwise cosine between A and B.
"""
# l2 normalized vectors
norm_A = A.norm(p=2, keepdim=True)
norm_B = B.norm(p=2, keepdim=True)
A = A.div(norm_A)
B = A.div(norm_B)

prod = torch.mul(A, B)
return 1 - prod

def forward(self, features, labels, T):
b = len(features)
neighbors = 0
total = 0
l_sn = 0
for i, x_i in enumerate(features):
for j, x_j in enumerate(features):
if i != j:
dist = self.pairwise_euclid_distance(x_i, x_j)
dist = -(dist / T)
dist = torch.exp(dist)
if labels[i] == labels[j]:
neighbors += dist
total += dist
if neighbors != 0:
l_sn += torch.log(neighbors / total)

neighbors = 0
total = 0
l_sn = l_sn * (-1 / b)
return l_sn
``````

and this is my training:

``````def train_test(self, model, criterion, epoch, phase, optimizer, args, logger, use_gpu):

if phase == 'train':
elif phase == 'test':
else:
# Set model mode
if phase == 'test' or phase == 'val':
model.eval()
if phase == 'train':
model.train()
logger.info('-' * 10)
logger.info('Epoch {}/{}'.format(epoch, args.epochs - 1))
logger.info('Current Learning rate: {}'.format(showLR(optimizer)))
Temperature = 100
running_loss = 0
running_corrects = 0
running_all = 0
top_k = 0
model_loss = model.sknn_loss
for batch_idx, (inputs, targets) in enumerate(dataset):

since = time.time()
loss_CEntropy_array = []
loss_cluster_array = []
inputs = inputs.float().permute(0, 2, 1, 3, 4)
if use_gpu:
model.to(device)
inputs, targets = inputs.to(device), targets.to(device)

if phase != 'train':
outputs, features = model(inputs)
else:
outputs, features = model(inputs)
output_tmp = F.softmax(outputs, dim=1).data
_, preds = torch.max(output_tmp, 1)
_, top_k_index = torch.topk(output_tmp, self.k)
# print('!!!!!!!!!!!!!!!!!!!!!!!',preds, targets, outputs)
loss_CEntropy = criterion(outputs, targets)
loss_kNN = model_loss(features, targets, Temperature)
loss_CEntropy_array.append(self.cross_entr_coef * loss_CEntropy.item())
loss_cluster_array.append(self.kNN_coef * loss_kNN.item())
# L2 Regularization
l2 = 0
for W in model.parameters():
# l2 += torch.norm(W, p=2)
pass

final_loss = self.kNN_coef * loss_kNN + self.cross_entr_coef * loss_CEntropy
if phase == 'train':
print(W)
final_loss.backward()
optimizer.step()
running_loss += final_loss.item()
for jj in range(len(preds)):
if preds[jj] == targets.data[jj]:
running_corrects += 1
if targets.data[jj] in top_k_index[jj]:
top_k += 1

running_all += len(inputs)

# Write info in logger
cost_time = time.time() - since
logger.info(
'Process: [{:5.0f}/{:5.0f} ({:.0f}%), (running_corrects/running_all): ({}/{})]\t'
'Top-k: {}, Top-2 acc: {}\n'
'Loss: {:.4f}\tAcc:{:.4f}\n'
'Mean of Cross Entropy: {:.4f}, Mean of Cluster Entropy: {:.4f}\n'
'Cost time:{:5.0f}s\tRemaining time for the epoch:{:5.0f}s\r'.format(
running_all,
len(dataset.dataset),
100. * batch_idx / len(dataset),
running_corrects,
running_all,
top_k,
top_k / running_all,
running_loss / (batch_idx + 1),
running_corrects / running_all,
np.mean(loss_CEntropy_array),
np.mean(loss_cluster_array),
cost_time,
cost_time * (len(dataset) - batch_idx)
))

if batch_idx % args.interval == 0 or (batch_idx == len(dataset) - 1):
cost_time = time.time() - since
print(
'Process: [{:5.0f}/{:5.0f} ({:.0f}%), (running_corrects/running_all): ({}/{})]\t'
'Top-k: {}, Top-k acc: {}, '
'Loss: {:.4f}\tAcc:{:.4f}\tCost time:{:5.0f}s\tRemaining time for the epoch:{:5.0f}s\r'.format(
running_all,
len(dataset.dataset),
100. * batch_idx / len(dataset),
running_corrects,
running_all,
top_k,
top_k / running_all,
running_loss / (batch_idx + 1),
running_corrects / running_all,
cost_time,
cost_time * (len(dataset) - batch_idx)
))

final_loss = running_loss / (batch_idx + 1)
acc = running_corrects / len(dataset.dataset)
logger.info('*******************************Final Epoch Results*******************************')
logger.info('{} Epoch:\t{:2}\tTotal Loss: {:.4f}\tAcc:{:.4f}'.format(
phase,
epoch,
final_loss,
acc
))
logger.info(
'****************************************************************************************************************************')
if phase == 'train':
torch.save(model.state_dict(), args.save_path + '/' + args.mode + '_' + str(epoch + 1) + '.pt')
return model, final_loss, acc
else:
return final_loss, acc
``````

(Without knowing much about your model), did you try replacing `features` with `outputs` in this line and see if that works?

Thanks, the problem is that my loss function is based on some embeddings or features that I extracted!

I also checked out the grad of all model parameters, and I found out that the grad of two last layers are zero.
Why??

@marziehoghbaie Can you post your model here?

``````import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import math
from collections import OrderedDict
from torchsummary import summary

# Dense3D with clustering loss
# class NLLSequenceLoss(nn.Module):
#     """
#     Custom loss function.
#     Returns a loss that is the sum of all losses at each time step.
#     """
#
#     def __init__(self):
#         super(NLLSequenceLoss, self).__init__()
#         self.criterion = nn.NLLLoss(reduction='none')
#         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#
#     def forward(self, input, length, target):
#         loss = []
#         loss = []
#         transposed = input.transpose(0, 1).contiguous()
#         for i in range(transposed.size(0)):
#             loss.append(self.criterion(transposed[i,], target).unsqueeze(1))
#         loss = torch.cat(loss, 1)
#         # print('loss:',loss)
#
#         # mask = torch.zeros(loss.size(0), loss.size(1)).float()
#
#         for i in range(length.size(0)):
#             mask[i, L - 1] = 1.0
#         return loss
#
#         # return loss/transposed.size(0)

class SkNN_loss(nn.Module):
def __init__(self):
super(SkNN_loss, self).__init__()

def pairwise_euclid_distance(self, A, B):
"""Pairwise Euclidean distance between two matrices.
:param A: a matrix.
:param B: a matrix.

:returns: A tensor for the pairwise Euclidean between A and B.
"""
dist = torch.sub(A, B)
euc_dist = torch.norm(dist, p=2)
return euc_dist

def pairwise_cos_distance(self, A, B):
"""Pairwise cosine distance between two matrices.
:param A: a matrix.
:param B: a matrix.

:returns: A tensor for the pairwise cosine between A and B.
"""
# l2 normalized vectors
norm_A = A.norm(p=2, keepdim=True)
norm_B = B.norm(p=2, keepdim=True)
A = A.div(norm_A)
B = A.div(norm_B)
prod = torch.mul(A, B)
return 1 - prod

def forward(self, features, labels, T):
b = -1/(features.shape[0])
b = torch.tensor(b)
neighbors = torch.tensor(0)
total = torch.tensor(0)
l_sn = torch.tensor(0)
for i, x_i in enumerate(features):
for j, x_j in enumerate(features):
if i != j:
dist = self.pairwise_euclid_distance(x_i, x_j)
dist = torch.div(dist, T)
dist = torch.mul(dist, -1)
dist = torch.exp(dist)
if labels[i] == labels[j]:
if neighbors != 0:
l_sn = torch.add(l_sn, torch.log(neighbors / total))
neighbors = torch.tensor(0)
total = torch.tensor(0)
l_sn = torch.mul(l_sn, b)
return l_sn

def _validate(modelOutput, length, labels, total=None, wrong=None):
averageEnergies = torch.sum(modelOutput.data, 1)
for i in range(modelOutput.size(0)):
# print(modelOutput[i,:length[i]].sum(0).shape)
averageEnergies[i] = modelOutput[i, :length[i]].sum(0)

maxvalues, maxindices = torch.max(averageEnergies, 1)

count = 0

for i in range(0, labels.squeeze(1).size(0)):
l = int(labels.squeeze(1)[i].cpu())
if total is not None:
if l not in total:
total[l] = 1
else:
total[l] += 1
if maxindices[i] == labels.squeeze(1)[i]:
count += 1
else:
if wrong is not None:
if l not in wrong:
wrong[l] = 1
else:
wrong[l] += 1

return (averageEnergies, count)

class _DenseLayer(nn.Sequential):
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
super(_DenseLayer, self).__init__()
growth_rate, kernel_size=1, stride=1, bias=False)),
self.drop_rate = drop_rate

def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
if self.drop_rate > 0:
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)

class _DenseBlock(nn.Sequential):
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
super(_DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
self.add_module('denselayer%d' % (i + 1), layer)

class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features):
super(_Transition, self).__init__()
kernel_size=1, stride=1, bias=False))
self.add_module('pool', nn.AvgPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)))

class Dense3D(torch.nn.Module):
def __init__(self, N_classes, growth_rate=32, num_init_features=64, bn_size=4, drop_rate=0, n_frames=29):
super(Dense3D, self).__init__()
# block_config = (6, 12, 24, 16)
block_config = (4, 8, 12, 8)
self.N_classes = N_classes
# self.loss = nn.CrossEntropyLoss()
self.sknn_loss= SkNN_loss()
self.features = nn.Sequential(OrderedDict([
('conv0',
nn.Conv3d(3, num_init_features, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)),
('norm0', nn.BatchNorm3d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))),
]))
num_features = num_init_features
for i, num_layers in enumerate(block_config):

block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
self.features.add_module('denseblock%d' % (i + 1), block)

num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# output shape of this stage torch.Size([batch_size, 536, n_frames, 3, 3])
# Final batch norm
inp_channels = 536*n_frames
out_channel = int(inp_channels/4)
# ToDo Time distributed Conv, CRF
self.conv2d_reduction = nn.Conv2d(in_channels=inp_channels, out_channels=out_channel, kernel_size=(1, 1))
self.maxpooling_reduction = nn.MaxPool2d(kernel_size=(3, 3))
self.output_FC = nn.Sequential(
nn.Dropout(0.5),
nn.Linear(out_channel, self.N_classes)
)

def validator_function(self):
return _validate

def forward(self, x):
"""
input shape (4, 3, 29, 112, 112)
torch.Size([4, 536, 29, 3, 3])
torch.Size([4, 15544, 3, 3])
torch.Size([4, 3886, 3, 3])
torch.Size([4, 3886, 1, 1])
torch.Size([4, 3886])
torch.Size([4, 10])
"""
dense_lyr_features = self.features(x)
dense_lyr_features = torch.flatten(dense_lyr_features, start_dim=1, end_dim=2)
embedding = self.conv2d_reduction(dense_lyr_features)
embedding = self.maxpooling_reduction(embedding)
embedding = torch.flatten(embedding, start_dim=1)
dense_lyr_features = self.output_FC(embedding)
return dense_lyr_features, embedding

if __name__ == '__main__':
# Batch size, channels(RGB), Number of frames per video, size of mouth region
# data = torch.zeros((4, 3, 29, 112, 112))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Dense3D(N_classes=10)
print(summary(model.cuda(), (3, 29, 112, 112)))
# model_output = model(data)

``````

As I know, the weights of the network will change according to the loss function if they are used in the loss function ( W := W - alpha . L’(W)), so the model’s 2 last layers are not used in the loss function and it is obvious that their grad is Zero?? Am I right??