Build your own loss function in PyTorch

Hey, I ran into this too and found a solution, which I have added to the code below, but it still shows the same error. I think it is because the confusion-matrix part is also not differentiable. Can you help me solve this?

import torch
class SoftArgmax1D(torch.nn.Module):
    """
    Implementation of a 1d soft arg-max function as an nn.Module, so that we can differentiate through arg-max operations.
    """
    def __init__(self, base_index=0, step_size=1):
        """
        The "arguments" are base_index, base_index+step_size, base_index+2*step_size, ... and so on for
        arguments at indices 0, 1, 2, ....
        Assumes that the input to this layer will be a batch of 1D tensors (so a 2D tensor).
        :param base_index: Remember a base index for 'indices' for the input
        :param step_size: Step size for 'indices' from the input
        """
        super(SoftArgmax1D, self).__init__()
        self.base_index = base_index
        self.step_size = step_size
        self.softmax = torch.nn.Softmax(dim=1)


    def forward(self, x):
        """
        Compute the forward pass of the 1D soft arg-max function as defined below:
        SoftArgMax(x) = \sum_i (i * softmax(x)_i)
        :param x: The input to the soft arg-max layer
        :return: Output of the soft arg-max layer
        """
        smax = self.softmax(x)
        end_index = self.base_index + x.size()[1] * self.step_size
        indices = torch.arange(start=self.base_index, end=end_index, step=self.step_size).type(torch.cuda.FloatTensor)
        # NOTE: torch.round has zero gradient almost everywhere, so this already blocks gradient flow
        return torch.round(torch.matmul(smax, indices))


class WeightedKappaLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.softargmax = SoftArgmax1D()
    def forward(self, preds, true, nb_classes=None):
        nb_classes = preds.shape[1]
        pred = self.softargmax(preds.type(torch.cuda.FloatTensor))
        # NOTE: casting to LongTensor detaches pred from the autograd graph
        pred = pred.type(torch.cuda.LongTensor)
        # NOTE: writing in place into a leaf tensor created with requires_grad=True
        # triggers the "leaf Variable that requires grad ... in-place operation" error
        confusion_matrix = torch.zeros([nb_classes, nb_classes], requires_grad=True)
        for t, p in zip(pred.view(-1), true.view(-1)):
            confusion_matrix[p.long(), t.long()] += 1
        weights = torch.empty([nb_classes, nb_classes], requires_grad=True)
        for i in range(len(weights)):
            for j in range(len(weights)):
                weights[i][j] = float(((i-j)**2)/(len(weights)-1)**2)
        true_hist = torch.zeros([nb_classes], requires_grad=True)
        for item in true:
            true_hist[item] += 1
        pred_hist = torch.zeros([nb_classes], requires_grad=True)
        for item in pred:
            pred_hist[int(item)] += 1
        E = torch.ger(true_hist,pred_hist)
        E = E/E.sum()
        confusion_matrix = confusion_matrix/confusion_matrix.sum()
        num = (confusion_matrix*weights).sum()
        den = (E*weights).sum()
        return num/den
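
One fully differentiable alternative, sketched below under the assumption that preds are raw logits of shape (N, C) and true holds integer class labels, is to drop the rounding and the in-place confusion-matrix fill entirely and build a soft confusion matrix from the softmax probabilities and one-hot targets (the class name SoftWeightedKappaLoss is purely illustrative, not from the original code):

import torch
import torch.nn.functional as F

class SoftWeightedKappaLoss(torch.nn.Module):
    """Differentiable quadratic-weighted-kappa-style loss (rough sketch).

    Assumes preds are raw logits of shape (N, C) and true holds integer
    class labels of shape (N,).
    """
    def forward(self, preds, true):
        nb_classes = preds.shape[1]
        device = preds.device
        probs = F.softmax(preds, dim=1)                    # (N, C), differentiable
        true_onehot = F.one_hot(true, nb_classes).float()  # (N, C), constant

        # Soft confusion matrix: rows = true class, columns = predicted probability mass
        confusion = true_onehot.t() @ probs
        confusion = confusion / confusion.sum()

        # Quadratic disagreement weights ((i - j)^2 / (C - 1)^2), no in-place writes needed
        idx = torch.arange(nb_classes, dtype=torch.float32, device=device)
        weights = (idx.view(-1, 1) - idx.view(1, -1)) ** 2 / (nb_classes - 1) ** 2

        # Expected matrix from the outer product of the marginal histograms
        true_hist = true_onehot.sum(dim=0)
        pred_hist = probs.sum(dim=0)
        E = torch.outer(true_hist, pred_hist)
        E = E / E.sum()

        return (weights * confusion).sum() / (weights * E).sum()

Because every operation here is a differentiable tensor op, no requires_grad flags or dtype casts are needed and backward() can flow from the loss all the way into preds.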

Thanks in advance

Were you able to fix your problem? Thanks in advance.

Hello.
I am implementing a custom loss function consisting of two parts, a clustering term and a regularization term. I created a simple dataset with two clusters in order to see how this loss function behaves.
I use kmeans_mod, which performs only a single iteration. Plotting the loss values of the two parts, the clustering_loss value is increasing while the regularization_loss value is decreasing and unstable! I have changed many parameters (the architecture of the network, learning rate, number of epochs, batch size, and so on), but unfortunately I have not obtained a good result.
Moreover, to check whether the clustering part works correctly, I set up a separate program that applies kmeans_mod to the dataset and computes clustering_loss for several iterations. The resulting plot shows the clustering_loss value decreasing, but when I add the regularization part and use the sum as the custom loss function of the network, it does not work.
Would you please help me fix this problem? If you need more information, please let me know.

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt


class Autoencoder(nn.Module):
    def __init__(self, n1=10, n2=8, n3=6, n4=4, n5=2):
        super(Autoencoder, self).__init__()
        self.nl = nn.ReLU()
        self.n1 = n1
        # encoder
        self.enc1 = nn.Linear(n1, n2)
        self.enc2 = nn.Linear(n2, n3)
        self.enc3 = nn.Linear(n3, n4)
        self.enc4 = nn.Linear(n4, n5)

        # decoder
        self.dec1 = nn.Linear(n5, n4)
        self.dec2 = nn.Linear(n4, n3)
        self.dec3 = nn.Linear(n3, n2)
        self.dec4 = nn.Linear(n2, n1)

    def forward(self, x):
        x = self.nl(self.enc1(x))
        x = self.nl(self.enc2(x))
        x = self.nl(self.enc3(x))
        y = self.nl(self.enc4(x))

        x = self.nl(self.dec1(y))
        x = self.nl(self.dec2(x))
        x = self.nl(self.dec3(x))
        x = self.nl(self.dec4(x))

        # returns (latent code, reconstruction)
        return y, x

Training Part

def train_fn(model, device, train_loader, optimizer, epoch, log_interval, num_cluster, centroids):
    batch_size = train_loader.batch_size
    model.train()
    loss_ = 0

    for batch_idx, (data, _) in enumerate(train_loader):

        data = data.to(device)
        optimizer.zero_grad()

        enc, dec = model(data)

        cluster_ids_x, centroids = kmeans_mod(X=dec, num_clusters=num_cluster, centroid=centroids)

        loss1 = loss_fn_clustering(dec, cluster_ids_x, centroids)  # clustering part
        # NOTE: if loss1 has to be forced to require grad, it has most likely been
        # detached from the graph, so backward() cannot reach the model weights
        loss1.requires_grad = True
        near_ids = nearest(data)  # indices of the nearest neighbour of each data point

        loss2 = loss_fn_reg(data, dec, near_ids)  # regularization part

        loss = loss1 + loss2  # custom loss function

        loss.backward()
        optimizer.step()
        loss_ += loss.item() * len(data)
        # loss_ += loss.item()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    return centroids, cluster_ids_x, loss1, loss2, loss_ / train_loader.dataset.__len__()

def main():
    # Training settings
    model = Autoencoder()

    num_cluster = 2
    batch_size = 128
    num_epochs = 100
    learning_rate = 0.0001
    log_interval = 2

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    ### Training set and Validation set ###

    train_dataset = Twoclusters(1000, 1500, [15, 19])
    val_dataset = Twoclusters(100, 150, [7, 4])

    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

    model.to(device)  # load the neural network onto the device
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_value_train = np.zeros((num_epochs))
    loss_value_1 = np.zeros((num_epochs))
    loss_value_2 = np.zeros((num_epochs))
    loss_value_val = np.zeros((num_epochs))

    for epoch in range(num_epochs):

        if epoch == 0:
            c, label, loss_1, loss_2, training_loss = train_fn(model, device, train_loader, optimizer, epoch,
                                                               log_interval, num_cluster, centroids=None)

            c, validation_loss, decoded_data, label = val_fn(model, device, val_loader, num_cluster, centroids=None)

        else:
            c, label, loss_1, loss_2, training_loss = train_fn(model, device, train_loader, optimizer, epoch,
                                                               log_interval, num_cluster, centroids=c)

            c, validation_loss, decoded_data, label = val_fn(model, device, val_loader, num_cluster, centroids=c)

        loss_value_train[epoch] = training_loss
        loss_value_1[epoch] = loss_1
        loss_value_2[epoch] = loss_2
        loss_value_val[epoch] = validation_loss

    plt.figure(figsize=(10, 7))
    plt.plot(loss_value_train, label='Training loss')
    plt.plot(loss_value_val, label='Validation loss')
    plt.plot(loss_value_1, label='loss1: Clustering')
    plt.plot(loss_value_2, label='loss2: Regularization')
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()
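
A likely culprit is the loss1.requires_grad = True line in train_fn: if the clustering loss has to be forced to require grad, it has almost certainly been detached from the autograd graph somewhere inside kmeans_mod or loss_fn_clustering, so backward() never reaches the autoencoder weights. A minimal sketch of a clustering term that stays attached to the decoder output, assuming cluster_ids_x are hard assignments and treating the centroids as constants (this is an illustrative replacement, not the original loss_fn_clustering):

def loss_fn_clustering(dec, cluster_ids, centroids):
    # Mean squared distance of each decoded point to its assigned centroid.
    # dec stays in the autograd graph; the centroids are detached so that only
    # the network output is pulled towards the cluster centres.
    assigned = centroids.detach()[cluster_ids]   # (N, D) centroid per sample
    return ((dec - assigned) ** 2).sum(dim=1).mean()

With a term built like this, loss1 already requires grad, and the manual loss1.requires_grad = True override can be removed.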

The defining characteristic of a loss is that the lower it is, the better. While arbitrary offsets don’t matter, it does matter that, for example, every negative value is treated as better than every positive value. If your numbers are “signed discrepancies”, you need to transform them so that this holds, for example by squaring them or by taking their absolute value (minimising the squared version drives predictions towards the mean, minimising the absolute version towards the median).
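
As a small self-contained illustration (assumed data, not from the thread), both transforms turn signed discrepancies into a valid loss, and they favour different constants:

import torch

# Signed discrepancies: positive and negative values would cancel in a raw sum,
# so the raw values are not a usable loss.
errors = torch.tensor([-3.0, -1.0, 0.5, 2.0, 10.0])
squared_loss = (errors ** 2).mean()   # lower is better; penalises outliers heavily
absolute_loss = errors.abs().mean()   # lower is better; penalises errors proportionally

# For a constant prediction c fitted to values x, the squared loss is minimised
# at c = mean(x), the absolute loss at the median of x.
x = torch.tensor([1.0, 2.0, 3.0, 100.0])
cs = torch.linspace(0.0, 110.0, 2201)
sq = ((x.unsqueeze(0) - cs.unsqueeze(1)) ** 2).mean(dim=1)
ab = (x.unsqueeze(0) - cs.unsqueeze(1)).abs().mean(dim=1)
print(cs[sq.argmin()].item(), x.mean().item())    # ~26.5 and 26.5
print(cs[ab.argmin()].item(), x.median().item())  # minimiser lies in [2, 3]; median is 2.0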