Build your own loss function in PyTorch

Hi all,

I struggled with this myself, so I’ve started building a tutorial for this kind of thing in PyTorch. You can find a section on custom losses there too (Section 5). GitHub link: https://github.com/Spandan-Madan/A-Collection-of-important-tasks-in-pytorch

I wrote this up quickly in my free time, so it may have some typos etc. If there are things you would like to see there that are missing, feel free to create an issue on GitHub with suggestions. Hope this helps!


I have already implemented my own loss in Python, but it is too slow. Are there any tutorials that can teach me how to speed it up? (There is a for loop in my loss.)
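One general approach is to vectorise the per-sample loop into batched tensor operations. A minimal sketch with a hypothetical squared-error loss (not the poster’s actual loss):

import torch

# Hypothetical per-sample squared-error loss, written with a Python loop and vectorised.
def loop_loss(pred, target):
    total = 0.0
    for i in range(pred.size(0)):                      # slow: one small op per sample
        total = total + (pred[i] - target[i]).pow(2).sum()
    return total / pred.size(0)

def vectorized_loss(pred, target):
    return (pred - target).pow(2).sum(dim=1).mean()    # same result, one batched op

pred = torch.randn(128, 10, requires_grad=True)
target = torch.randn(128, 10)
assert torch.allclose(loop_loss(pred, target), vectorized_loss(pred, target), atol=1e-5)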


If the individual loss for a sample in a batch can be positive or negative depending on some conditions, how do I sum the loss over samples? It will become zero if I sum all the samples within a batch.


Excuse me, have you figured this out? Is it necessary to write a custom backward function and return the gradient yourself?

Hello,
I would like to use Euclidean loss in PyTorch. I was writing out the formula, but it is not working. Is this loss function already available in the PyTorch library? How do I use Euclidean loss in a network? Thank you.

In PyTorch it is called MSELoss: http://pytorch.org/docs/0.3.0/nn.html#torch.nn.MSELoss
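A minimal usage sketch (tensor shapes are illustrative):

import torch
import torch.nn as nn

criterion = nn.MSELoss()                 # mean of the squared element-wise differences
pred = torch.randn(8, 3, requires_grad=True)
target = torch.randn(8, 3)
loss = criterion(pred, target)
loss.backward()                          # gradients flow back into pred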

Thank you very much for the help.

In this paper, they have used Euclidean loss for translation and orientation. Can I use the same loss function, via MSELoss, for the regression problem?

Hi, for the Euclidean loss between x1 and x2,

loss = torch.norm(x1 - x2, 2)

seems like a proper implementation.
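Note that torch.norm(x1 - x2, 2) is the Euclidean distance (the square root of the sum of squared differences), while MSELoss is the mean of the squared differences, so the two are related but not identical. A small sketch:

import torch
import torch.nn.functional as F

x1 = torch.randn(5)
x2 = torch.randn(5)

euclidean = torch.norm(x1 - x2, 2)       # sqrt(sum((x1 - x2) ** 2))
mse = F.mse_loss(x1, x2)                 # mean((x1 - x2) ** 2)
assert torch.allclose(euclidean ** 2 / x1.numel(), mse)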


Hi Adam,
I have read this post several times; however, I don’t understand some of the terminology, such as "re-wrap the .data in a new Variable", ".data unpacking", and ".data repacking". Would you mind showing some examples? Thank you so much.

In addition, I have a special requirement for center_loss: I need to assign a different weight to each class. So I refined the code (https://github.com/BestSonny/examples/tree/master/center_loss) shown in the PyTorch examples and reimplemented this loss function myself.

In the examples, trainer.py:
def get_center_loss(centers, features, target, alpha, num_classes):
    batch_size = target.size(0)
    features_dim = features.size(1)

    target_expand = target.view(batch_size, 1).expand(batch_size, features_dim)
    centers_var = Variable(centers)
    centers_batch = centers_var.gather(0, target_expand)
    criterion = nn.MSELoss()
    center_loss = criterion(features, centers_batch)

    diff = centers_batch - features
    unique_label, unique_reverse, unique_count = np.unique(target.cpu().data.numpy(), return_inverse=True, return_counts=True)
    appear_times = torch.from_numpy(unique_count).gather(0, torch.from_numpy(unique_reverse))
    appear_times_expand = appear_times.view(-1, 1).expand(batch_size, features_dim).type(torch.FloatTensor)
    diff_cpu = diff.cpu().data / appear_times_expand.add(1e-6)
    diff_cpu = alpha * diff_cpu
    for i in range(batch_size):
        centers[target.data[i]] -= diff_cpu[i].type(centers.type())

    return center_loss, centers

The call of this function:

center_loss, self.model._buffers['centers'] = get_center_loss(self.model._buffers['centers'], self.model.features, target_var, self.alpha, self.model.num_classes)
softmax_loss = self.criterion(output, target_var)
loss = self.center_loss_weight*center_loss + softmax_loss

My refinement:

self.centers = torch.zeros(num_classes, embedding_size).type(torch.FloatTensor)  # 2D tensor

x = self.fc2(x)
self.features = F.relu(x)  # 2D tensor

def get_center_loss(self, target, class_weight, alpha):
    batch_size = target.size(0)
    features_dim = self.features.size(1)

    target_expand = target.view(batch_size, 1).expand(batch_size, features_dim)

    centers_var = Variable(self.centers)
    centers_batch = centers_var.gather(0, target_expand).cuda()

    abnormal_loss = Variable(torch.FloatTensor([0]), requires_grad=True)
    normal_loss = Variable(torch.FloatTensor([0]), requires_grad=True)
    for i in range(batch_size):
        if target.data[i] == 0:
            #abnormal_loss += torch.sum((self.features.data[i,:] - centers_batch.data[i,:]) **2)
            abnormal_loss = abnormal_loss.clone() + (self.features.data[i,:] - centers_batch.data[i,:]).pow(2).sum()
        else:
            #normal_loss += torch.sum((self.features.data[i,:] - centers_batch.data[i,:]) **2)
            normal_loss = normal_loss.clone() + (self.features.data[i,:] - centers_batch.data[i,:]).pow(2).sum()
    center_loss = class_weight[0] * abnormal_loss + class_weight[1] * normal_loss
    center_loss = center_loss / features_dim / batch_size

    diff = centers_batch - self.features

    unique_label, unique_reverse, unique_count = np.unique(target.cpu().data.numpy(), return_inverse=True, return_counts=True)

    appear_times = torch.from_numpy(unique_count).gather(0, torch.from_numpy(unique_reverse))

    appear_times_expand = appear_times.view(-1, 1).expand(batch_size, features_dim).type(torch.FloatTensor)

    diff_cpu = diff.cpu().data / appear_times_expand.add(1e-6)

    # ∆c_j = (sum_{i=1}^m δ(y_i = j)(c_j - x_i)) / (1 + sum_{i=1}^m δ(y_i = j))
    diff_cpu = alpha * diff_cpu

    for i in range(batch_size):
        # Update the parameters c_j for each j by c_j^(t+1) = c_j^t − α · ∆c_j^t
        self.centers[target.data[i]] -= diff_cpu[i].type(self.centers.type())

    return center_loss, self.centers, abnormal_loss/features_dim/batch_size, normal_loss/features_dim/batch_size

Here, class_weight = torch.FloatTensor([10, 100])  # two weights for center_loss (binary classification)

The call:

criterion = nn.CrossEntropyLoss().cuda()
prediction = model(data_var)

center_loss, xx, abnormal_loss, normal_loss = model.get_center_loss(target_var, class_weight, args.alpha)
classifier_loss = criterion(prediction.cuda(), target_var.cuda())

loss = center_loss.cuda() + classifier_loss
# compute gradient and update weights
optimizer.zero_grad()
loss.backward()
optimizer.step()

Is this code right? Can it backpropagate and change the weights of the model?

The .clone() here is unnecessary. The addition operation clones the Variable, so you don’t have to do so explicitly. In fact if you do, you just add an extra copy in memory. That said, if you were to do an inplace addition +=, then using .clone() might be necessary, but even then, I would wait until PyTorch complained about the inplace operation.

If I understand correctly, the actual loss that needs to be backpropagated is center_loss.
Now center_loss = weighted sum of abnormal_loss and normal_loss so gradients can flow back up to abnormal_loss and normal_loss.
But both of those are calculated from Tensors, not from Variables, so the gradients will go no further. Try this instead…

abnormal_loss = abnormal_loss + (self.features[i,:] - centers_batch[i,:]).pow(2).sum()

Same for normal_loss

normal_loss = normal_loss + (self.features[i,:] - centers_batch[i,:]).pow(2).sum()

I want to define a loss function that is defined piecewise, with both pieces "touching" each other. However, the gradient at the point where they touch is not the same from both sides. Is this possible, or must the gradient be identical from both directions? If it must, I could modify the definition of the function, but if it’s not required I would rather avoid that.
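For illustration, autograd generally handles piecewise definitions by backpropagating through whichever branch is selected (ReLU is the standard example of mismatched one-sided derivatives), so a sketch like this runs; the asymmetric L1 below and the names w_over / w_under are hypothetical:

import torch

def asymmetric_l1(pred, target, w_over=2.0, w_under=1.0):
    err = pred - target
    # Both pieces are 0 at err == 0 (they "touch"), but the slopes differ
    # (+w_over vs -w_under); torch.where backpropagates through whichever
    # branch is selected, so the one-sided gradients need not agree.
    return torch.where(err >= 0, w_over * err, -w_under * err).mean()

pred = torch.randn(16, requires_grad=True)
target = torch.randn(16)
asymmetric_l1(pred, target).backward()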

Thanks, that’s helpful. It would be nice for a beginner like me if you could show the usage of this custom loss function in an example. I wanted to cross-check how the inputs x and y are used when Regress_Loss is called.
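The Regress_Loss class itself is not reproduced in this excerpt, but the usage pattern for any nn.Module-style custom loss looks roughly like this (the class body and shapes below are illustrative placeholders):

import torch
import torch.nn as nn

class Regress_Loss(nn.Module):
    # Placeholder body; the real Regress_Loss from the tutorial may differ.
    def forward(self, x, y):
        return (x - y).pow(2).sum(dim=1).mean()

criterion = Regress_Loss()
x = torch.randn(4, 10, requires_grad=True)   # e.g. network output
y = torch.randn(4, 10)                       # e.g. ground truth
loss = criterion(x, y)                       # calls Regress_Loss.forward(x, y)
loss.backward()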

If I just define the loss function as shown here, I am not able to send it to CUDA, i.e. mse_loss.cuda() fails with the traceback: AttributeError: 'function' object has no attribute 'cuda'.
I am clearly doing something wrong… I guess I am not defining something correctly. Could anyone please help? I am quite a beginner in PyTorch, so I could learn a lot from this.

You don’t have to push it to the GPU.
That is only required for modules with internal state (parameters, buffers, tensors). The provided function is a plain Python function and works on the GPU as long as both input tensors are on (the same) GPU.
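In other words, something like this works (illustrative sketch):

import torch

def mse_loss(pred, target):
    # a plain Python function: no parameters or buffers, so there is nothing for .cuda() to move
    return (pred - target).pow(2).mean()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pred = torch.randn(8, 3, device=device, requires_grad=True)
target = torch.randn(8, 3, device=device)
loss = mse_loss(pred, target)    # runs on whatever device the inputs live on
loss.backward()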


Hello,
I am facing an issue with backprop in a custom loss function. I probably know the reason, but I am not sure how to solve it. I implemented the quadratic weighted kappa loss (https://www.kaggle.com/aroraaman/quadratic-kappa-metric-explained-in-5-simple) and ported the code to PyTorch.

class WeightedKappaLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, preds, true,nb_classes = None):
        nb_classes = preds.shape[1]
        _,pred = torch.max(preds.view(1,-1).type(torch.cuda.FloatTensor),1)
        pred = pred.type(torch.cuda.FloatTensor)
        pred.requires_grad = True
        confusion_matrix = torch.empty([nb_classes, nb_classes],requires_grad = True)
        for t, p in zip(pred.view(-1), true.view(-1)):
            confusion_matrix[p.long(), t.long()] += 1
        weights = torch.empty([nb_classes,nb_classes],requires_grad = True)
        for i in range(len(weights)):
            for j in range(len(weights)):
                weights[i][j] = float(((i-j)**2)/(len(weights)-1)**2)
        #Histograms
        true_hist= torch.empty([nb_classes],requires_grad = True)
        for item in true: 
            true_hist[item]+=1
        pred_hist=torch.empty([nb_classes],requires_grad = True)
        for item in pred: 
            pred_hist[int(item)]+=1
        E = torch.ger(true_hist,pred_hist)
        E = E/E.sum()
        confusion_matrix = confusion_matrix/confusion_matrix.sum()
        num = (confusion_matrix*weights).sum()
        den = (E*weights).sum()
        return num/den

The code gives this error: "leaf variable has been moved into the graph interior".
I think the error is because the confusion matrix has no (direct) relation to the outputs of the neural network. Any ideas how to solve it?


Hi,

The problem is that _,pred = torch.max(preds.view(1,-1).type(torch.cuda.FloatTensor),1) is not a differentiable operation. So you cannot have gradients flowing back from pred to preds.
In general, if you have to set the requires_grad=True flag by hand on an intermediate value, it means that an earlier operation was not differentiable, so you won’t get the gradients you want!

You can look around (Google or other posts on this forum) for differentiable replacements for the argmax, but they are all quite heuristic.
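A quick way to see the original problem (illustrative snippet):

import torch

preds = torch.randn(4, 5, requires_grad=True)
values, indices = torch.max(preds, dim=1)
print(values.grad_fn)    # a MaxBackward node: gradients flow back through the max values
print(indices.grad_fn)   # None: the integer indices are not differentiable w.r.t. preds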

Hey, I got that, and I found a solution which I have added to the code below, but it is still showing the same error. I think it is because the confusion-matrix part is also not differentiable. Can you help me solve this?

import torch
class SoftArgmax1D(torch.nn.Module):
    """
    Implementation of a 1d soft arg-max function as an nn.Module, so that we can differentiate through arg-max operations.
    """
    def __init__(self, base_index=0, step_size=1):
        """
        The "arguments" are base_index, base_index+step_size, base_index+2*step_size, ... and so on for
        arguments at indices 0, 1, 2, ....
        Assumes that the input to this layer will be a batch of 1D tensors (so a 2D tensor).
        :param base_index: Remember a base index for 'indices' for the input
        :param step_size: Step size for 'indices' from the input
        """
        super(SoftArgmax1D, self).__init__()
        self.base_index = base_index
        self.step_size = step_size
        self.softmax = torch.nn.Softmax(dim=1)


    def forward(self, x):
        """
        Compute the forward pass of the 1D soft arg-max function as defined below:
        SoftArgMax(x) = \sum_i (i * softmax(x)_i)
        :param x: The input to the soft arg-max layer
        :return: Output of the soft arg-max layer
        """
        smax = self.softmax(x)
        end_index = self.base_index + x.size()[1] * self.step_size
        indices = torch.arange(start=self.base_index, end=end_index, step=self.step_size).type(torch.cuda.FloatTensor)
        return torch.round(torch.matmul(smax, indices))
class WeightedKappaLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.softargmax = SoftArgmax1D()
    def forward(self, preds, true,nb_classes = None):
        nb_classes = preds.shape[1]
        pred = self.softargmax(preds.type(torch.cuda.FloatTensor))
        pred = pred.type(torch.cuda.LongTensor)
        confusion_matrix = torch.zeros([nb_classes, nb_classes],requires_grad = True)
        for t, p in zip(pred.view(-1), true.view(-1)):
            confusion_matrix[p.long(), t.long()] += 1
        weights = torch.empty([nb_classes,nb_classes],requires_grad = True)
        for i in range(len(weights)):
            for j in range(len(weights)):
                weights[i][j] = float(((i-j)**2)/(len(weights)-1)**2)
        true_hist= torch.zeros([nb_classes],requires_grad = True)
        for item in true: 
            true_hist[item]+=1
        pred_hist=torch.zeros([nb_classes],requires_grad = True)
        for item in pred: 
            pred_hist[int(item)]+=1
        E = torch.ger(true_hist,pred_hist)
        E = E/E.sum()
        confusion_matrix = confusion_matrix/confusion_matrix.sum()
        num = (confusion_matrix*weights).sum()
        den = (E*weights).sum()
        return num/den

Thanks in advance


Were you able to fix your problem? Thanks in advance.

Hello.
I am implementing a custom loss function consisting of two parts, clustering and regularization. I created a simple dataset with two clusters in order to see how this loss function works.
I use kmeans_mod, which performs only a single iteration. Here is the plot of the loss values for the two parts, clustering and regularization. As you can see, the clustering_loss value is increasing while the regularization_loss value is decreasing and unstable! I have changed many parameters, such as the architecture of the network, the learning rate, the number of epochs, the batch size and so on, but unfortunately I have not got a good result.
Moreover, to check whether the clustering part is working correctly, I set up a program that applies kmeans_mod to the dataset and computes clustering_loss for several iterations. The resulting plot shows the clustering_loss value decreasing, but when I add the regularization part and use this as the custom loss function of a network, it does not work!
Would you please help me fix this problem? If you need more information, please let me know.

class Autoencoder(nn.Module):
    def __init__(self, n1=10, n2=8, n3=6, n4=4, n5=2):
        super(Autoencoder, self).__init__()
        self.nl = nn.ReLU()
        self.n1 = n1
        # encoder
        self.enc1 = nn.Linear(n1, n2)
        self.enc2 = nn.Linear(n2, n3)
        self.enc3 = nn.Linear(n3, n4)
        self.enc4 = nn.Linear(n4, n5)

        # decoder
        self.dec1 = nn.Linear(n5, n4)
        self.dec2 = nn.Linear(n4, n3)
        self.dec3 = nn.Linear(n3, n2)
        self.dec4 = nn.Linear(n2, n1)

    def forward(self, x):
        x = self.nl(self.enc1(x))
        x = self.nl(self.enc2(x))
        x = self.nl(self.enc3(x))
        y = self.nl(self.enc4(x))

        x = self.nl(self.dec1(y))
        x = self.nl(self.dec2(x))
        x = self.nl(self.dec3(x))
        x = self.nl(self.dec4(x))

        return y, x

Training Part

def train_fn(model, device, train_loader, optimizer, epoch, log_interval, num_cluster, centroids):
    batch_size = train_loader.batch_size
    model.train()
    loss_ = 0

    for batch_idx, (data, _) in enumerate(train_loader):

        data = data.to(device)
        optimizer.zero_grad()

        enc, dec = model(data)

        cluster_ids_x, centroids = kmeans_mod(X=dec, num_clusters=num_cluster, centroid=centroids)

        loss1 = loss_fn_clustering(dec, cluster_ids_x, centroids)  # clustering part
        loss1.requires_grad = True
        near_ids = nearest(data)   # getting the indices of the nearest neighbor of each data point

        loss2 = loss_fn_reg(data, dec, near_ids)   # regularization part

        loss = loss1 + loss2    # custom loss function

        loss.backward()
        optimizer.step()
        loss_ += loss.item() * len(data)
        # loss_ += loss.item()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    return centroids, cluster_ids_x, loss1, loss2, loss_ / train_loader.dataset.__len__()

def main():
    # Training settings
    model = Autoencoder()

    num_cluster = 2
    batch_size = 128
    num_epochs = 100
    learning_rate = 0.0001
    log_interval = 2

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    ### Training set and Validation set ###

    train_dataset = Twoclusters(1000, 1500, [15, 19])
    val_dataset = Twoclusters(100, 150, [7, 4])

    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

    model.to(device)   # load the neural network onto the device
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_value_train = np.zeros((num_epochs))
    loss_value_1 = np.zeros((num_epochs))
    loss_value_2 = np.zeros((num_epochs))
    loss_value_val = np.zeros((num_epochs))

    for epoch in range(num_epochs):

        if epoch == 0:
            c, label, loss_1, loss_2, training_loss = train_fn(model, device, train_loader, optimizer, epoch,
                                                               log_interval, num_cluster, centroids=None)

            c, validation_loss, decoded_data, label = val_fn(model, device, val_loader, num_cluster, centroids=None)

        else:
            c, label, loss_1, loss_2, training_loss = train_fn(model, device, train_loader, optimizer, epoch,
                                                               log_interval, num_cluster, centroids=c)

            c, validation_loss, decoded_data, label = val_fn(model, device, val_loader, num_cluster, centroids=c)

        loss_value_train[epoch] = training_loss
        loss_value_1[epoch] = loss_1
        loss_value_2[epoch] = loss_2
        loss_value_val[epoch] = validation_loss

    plt.figure(figsize=(10, 7))
    plt.plot(loss_value_train, label='Training loss')
    plt.plot(loss_value_val, label='Validation loss')
    plt.plot(loss_value_1, label='loss1: Clustering')
    plt.plot(loss_value_2, label='loss2: Regularization')
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()

The defining characteristic of a loss is that the lower it is, the better. While arbitrary offsets don’t matter, it does matter that, for example, every negative value is viewed as better than every positive value. If your numbers are "signed discrepancies", you need to transform them so that this holds, for example by squaring them or applying the absolute value function (the first minimises the mean error, the second the median error).
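A concrete sketch of that transformation (tensor names are illustrative):

import torch

pred = torch.randn(16, requires_grad=True)
target = torch.randn(16)

signed = pred - target               # signed discrepancies can cancel when summed
mean_loss = signed.pow(2).mean()     # squaring: minimising this targets the mean
median_loss = signed.abs().mean()    # absolute value: minimising this targets the median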