Custom loss function does not work properly

I have been struggling to implement a custom loss function, and I am not sure why it is not working. If I use one of the provided loss functions and perform a training step via

loss_fn = nn.MSELoss()
loss = loss_fn(predicted, target)
loss.backward()
optimizer.step()

everything works well, and the loss decreases as expected:

Train epoch: 0 Loss: 0.802491
Train epoch: 100 Loss: 0.010990
Train epoch: 200 Loss: 0.009987
Train epoch: 300 Loss: 0.009558
Train epoch: 400 Loss: 0.007305

However, if I change the loss calculation to

loss = torch.mean((predicted - target) ** 2)

the loss plateaus after the first hundred epochs and stops decreasing:

Train epoch: 0 Loss: 4.732191
Train epoch: 100 Loss: 0.255895
Train epoch: 200 Loss: 0.256236
Train epoch: 300 Loss: 0.253145
Train epoch: 400 Loss: 0.255263

Any suggestions for where I am going wrong would be greatly appreciated.

Thanks

That is weird; it should work. Could you post more code, please?
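
In the meantime, one thing worth ruling out (just a guess on my part): the two expressions are mathematically identical when predicted and target have exactly the same shape, but the elementwise subtraction broadcasts silently when they do not. A minimal check, with a and b standing in for your tensors:

import torch
import torch.nn as nn

a = torch.randn(100, 1)
b = torch.randn(100, 1)

# With matching shapes, the two losses agree to machine precision
print(nn.MSELoss()(a, b).item())
print(torch.mean((a - b) ** 2).item())

# With mismatched shapes, the subtraction broadcasts:
# [100, 1] - [100] -> [100, 100], so the mean runs over 10000 values
print((a - b.view(-1)).shape)  # torch.Size([100, 100])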

While creating a simplified version of my code that reproduces the problem, I think I have narrowed it down to some issue with my Dataset class.

import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.init as weight_init
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        channels = 100
        layers = 1
        # Autoencoder for the one-hot material representation
        self.autoencoder_head = nn.Linear(4, channels)
        self.autoencoder = nn.ModuleList([nn.Linear(channels, channels) for i in range(layers)])
        self.autoencoder.append(nn.Linear(channels, 2))

        self.encoded_map = nn.Linear(2, channels)
        self.vel_map = nn.Linear(2, channels)

        self.predictor = nn.ModuleList(
                [nn.Linear(channels, channels) for i in range(layers)])
        self.predictor.append(nn.Linear(channels, 1))

        # Xavier-initialize every linear layer
        for m in self.modules():
            if isinstance(m, nn.Linear):
                weight_init.xavier_normal(m.weight.data)

    def embedding(self, onehot):
        x = self.autoencoder_head(onehot)
        for layer in self.autoencoder:
            x = F.relu(x)
            x = layer(x)
        return x

    def forward(self, x):
        material = x[:, :4]
        vel_stress = x[:, 4:]

        x = self.embedding(material)
        x = self.encoded_map(x) + self.vel_map(vel_stress)
        for layer in self.predictor:
            x = F.relu(x)
            x = layer(x)
        return x

class TestDataset1(Dataset):
    def __len__(self):
        return 40000

    def __getitem__(self, idx):
        # Decode idx into a material index (0-3) and a point on a 100x100 grid
        xy, material = divmod(idx, 4)
        x, y = divmod(xy, 100)
        x = x / 100
        y = y / 100
        if material == 0:
            return np.array([1.0, 0, 0, 0, x, y]), np.array([.2 * x*y + .1 * x**2 - 10*x + 5])
        elif material == 1:
            return np.array([0, 1.0, 0, 0, x, y]), np.array([.3 * x*y - .1 * x**2 - 10*x + 5])
        elif material == 2:
            return np.array([0, 0, 1.0, 0, x, y]), np.array([-.1 * x*y - .1 * x**2  + .01 * y**2 - 10*x + 5])
        else:
            return np.array([0, 0, 0, 1.0, x, y]), np.array([.1 * x*y + .1 * x**2  - .1 * y**2 - 10*x + 5])

class TestDataset2(Dataset):
    def __init__(self):
        # Precompute the target surface for each of the four materials on a 100x100 grid
        x, y = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
        self.m1 = .2 * x*y + .1 * x**2 - 10*x + 5
        self.m2 = .3 * x*y - .1 * x**2 - 10*x + 5
        self.m3 = -.1 * x*y - .1 * x**2  + .01 * y**2 - 10*x + 5
        self.m4 = .1 * x*y + .1 * x**2  - .1 * y**2 - 10*x + 5
        
    def __len__(self):
        return 40000

    def __getitem__(self, idx):
        # Same index decoding as TestDataset1, but the targets come from the precomputed arrays
        xy, material = divmod(idx, 4)
        x_idx, y_idx = divmod(xy, 100)
        x = x_idx / 100
        y = y_idx / 100
        if material == 0:
            return np.array([1.0, 0, 0, 0, x, y]), self.m1[x_idx, y_idx]
        elif material == 1:
            return np.array([0, 1.0, 0, 0, x, y]), self.m2[x_idx, y_idx]
        elif material == 2:
            return np.array([0, 0, 1.0, 0, x, y]), self.m3[x_idx, y_idx]
        else:
            return np.array([0, 0, 0, 1.0, x, y]), self.m4[x_idx, y_idx]

def train(dataloader, model, optimizer, epoch, custom_loss=False):
    model.train()

    cum_loss = 0
    for batch_idx, (data, target) in enumerate(dataloader):
        # Move the batch to the GPU
        dtype = torch.cuda.FloatTensor
        data, target = data.type(dtype), target.type(dtype)
        data = Variable(data)
        target = Variable(target)

        optimizer.zero_grad()
        predicted = model(data)

        loss_fn = nn.MSELoss()
        if custom_loss:
            loss = torch.mean((predicted - target)**2)
        else:
            loss = loss_fn(predicted, target)
        # Update Model
        loss.backward()
        optimizer.step()
        cum_loss = cum_loss + loss.data[0]
    if epoch % 10 == 0:
        print('Train epoch: {}\tLoss: {:.6f}'.format(epoch, cum_loss / batch_idx))

Training using TestDataset1 works as expected, with either loss function. However, when I use TestDataset2, it does not train properly with the custom loss function:

dataloader = DataLoader(TestDataset2(), batch_size=100, shuffle=True, pin_memory=True)
model = Model().float().cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
for epoch in range(41):
    train(dataloader, model, optimizer, epoch, custom_loss=False)

Train epoch: 0 Loss: 6.635968
Train epoch: 10 Loss: 0.002086
Train epoch: 20 Loss: 0.000487
Train epoch: 30 Loss: 0.000299
Train epoch: 40 Loss: 0.000168

dataloader = DataLoader(TestDataset2(), batch_size=100, shuffle=True, pin_memory=True)
model = Model().float().cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
for epoch in range(41):
    train(dataloader, model, optimizer, epoch, custom_loss=True)

Train epoch: 0 Loss: 8.424415
Train epoch: 10 Loss: 8.422460
Train epoch: 20 Loss: 8.423582
Train epoch: 30 Loss: 8.423183
Train epoch: 40 Loss: 8.422752

There seems to be some problem with serving the training targets from precomputed numpy arrays, but it is not clear to me what I am doing wrong.
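
For what it is worth, printing the target shapes of one batch from each dataset shows that they differ, although I do not know whether that is relevant:

from torch.utils.data import DataLoader

loader1 = DataLoader(TestDataset1(), batch_size=100)
loader2 = DataLoader(TestDataset2(), batch_size=100)
_, target1 = next(iter(loader1))
_, target2 = next(iter(loader2))
print(target1.shape)  # torch.Size([100, 1]): each item is np.array([value])
print(target2.shape)  # torch.Size([100]): each item is a numpy scalar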

Thanks

Hi,
I have the same problem with the BCELoss.

I defined a customBCELoss using:

def customBCELoss(outputs, truths):
    # Elementwise BCE: t * log(o) + (1 - t) * log(1 - o)
    loss_BCE = (truths * torch.log(outputs) +
                torch.add(torch.neg(truths), 1.) *
                torch.log(torch.add(torch.neg(outputs), 1.)))
    avg_BCE = torch.neg(torch.mean(loss_BCE))
    return avg_BCE

This custom loss matches nn.BCELoss on every batch, up to machine precision, for the first two epochs, but diverges on the third.
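
My only guess so far is numerical: if the outputs ever saturate at exactly 0 or 1, torch.log returns -inf, and I believe nn.BCELoss clamps its log terms internally to avoid exactly that. A clamped variant of my function (the eps value is an arbitrary choice on my part) would look like:

def customBCELossClamped(outputs, truths, eps=1e-7):
    # Keep outputs strictly inside (0, 1) so torch.log stays finite
    outputs = torch.clamp(outputs, eps, 1. - eps)
    loss_BCE = truths * torch.log(outputs) + (1. - truths) * torch.log(1. - outputs)
    return -torch.mean(loss_BCE)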

Does anyone know how to solve this problem?