Negative loss for BC learning

I think the criterion and model seems to work now using a standard method now (so without using the mixing class), so thanks a lot for that! However when I do use the mixing function, the network doesn’t seem to learn anything at all.
I’m trying to follow the paper more closely so I took a batch size of 128, and mix them together to 64 images. This does not decrease the loss during training at all however.
The train function looks like this:

criterion = nn.KLDivLoss()
learning_rate = 0.1
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, nesterov=True, weight_decay=5e-4)
scheduler = MultiStepLR(optimizer, milestones=[30, 60, 90], gamma=0.1)
loss_values = []
PATH = '/content/drive/My Drive/CS4240 - Deep Learning/Reproducibility project/Results/BCplus_epoch20.pth'

for epoch in range(20):  # Number of epochs (loops over dataset)
  epoch_loss = 0.0
  running_loss = 0.0
  inputs = []
  labels = []
  for i, data in enumerate(trainloader, 0):      

      images_cifar, labels_cifar = data
      inputs = [] # clear input
      labels = [] # clear labels
      for j in range(0,len(data),2):

        images_mix, labels_mix = mix(images_cifar[j], images_cifar[j+1], labels_cifar[j], labels_cifar[j+1], False,
                                    True)
        inputs.append(images_mix)
        labels.append(labels_mix)

      inputs = torch.stack(inputs) # stack tensors
      labels = torch.stack(labels)

      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = net(inputs)
      loss = criterion(outputs, labels) # calculate loss
      optimizer.step() # update weights

      # print statistics every 200 loops
      running_loss += loss.item()

      if i % 200 == 199:
          print('[%d, %5d] loss: %.3f' %
                (epoch + 1, i + 1, running_loss / 200))
          running_loss = 0.0

  scheduler.step()

With mixing function (again)

def preprocess(image, optplus, train):
    if optplus:
        normalizer = zero_mean
        mean = np.array([4.60, 2.24, -6.84])
        std = np.array([55.9, 53.7, 56.5])
    else:
        normalizer = normalize
        mean = np.array([125.3, 123.0, 113.9])
        std = np.array([63.0, 62.1, 66.7])
    if train:
        image = normalize(image, mean, std)
        image = horizontal_flip(image)
        image = padding(image, 4)
        image = random_crop(image, 32)

    else:
        image = normalize(image, mean, std)

    return image


def mix(image1, image2, label1, label2, optplus, train):
    image1 = tensor_to_numpy(image1)
    image2 = tensor_to_numpy(image2)
    image1 = preprocess(image1, optplus, train)
    image2 = preprocess(image2, optplus, train)
    image1 = torch.from_numpy(image1).float()
    image2 = torch.from_numpy(image2).float()
    label1 = label1.to(device)
    label2 = label2.to(device)
    image1 = image1.to(device)
    image2 = image2.to(device)
    # Mix two images
    r = torch.rand(1).to(device)
    if optplus:
        g1 = torch.std(image1).to(device)
        g2 = torch.std(image2).to(device)
        p = (1.0 / (1 + g1 / g2 * (1 - r) / r)).to(device)
        image = ((image1 * p + image2 * (1 - p)) / torch.sqrt(p ** 2 + (1 - p) ** 2)).to(device)
    else:
        image = (image1 * r + image2 * (1 - r)).to(device)

    # Mix two labels
    eye = torch.eye(nClasses).to(device)
    label = (eye[label1] * r + eye[label2] * (1 - r)).to(device)

    return image, label

When training the network like this it does not reduce the loss at all, however when removing the mixing function from the trainer it does learn something, so something must be going wrong when mixing the images and labels. Is there something I am doing wrong in the mixing?