I think the criterion and model seems to work now using a standard method now (so without using the mixing class), so thanks a lot for that! However when I do use the mixing function, the network doesn’t seem to learn anything at all.
I’m trying to follow the paper more closely so I took a batch size of 128, and mix them together to 64 images. This does not decrease the loss during training at all however.
The train function looks like this:
criterion = nn.KLDivLoss()
learning_rate = 0.1
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, nesterov=True, weight_decay=5e-4)
scheduler = MultiStepLR(optimizer, milestones=[30, 60, 90], gamma=0.1)
loss_values = []
PATH = '/content/drive/My Drive/CS4240 - Deep Learning/Reproducibility project/Results/BCplus_epoch20.pth'
for epoch in range(20): # Number of epochs (loops over dataset)
epoch_loss = 0.0
running_loss = 0.0
inputs = []
labels = []
for i, data in enumerate(trainloader, 0):
images_cifar, labels_cifar = data
inputs = [] # clear input
labels = [] # clear labels
for j in range(0,len(data),2):
images_mix, labels_mix = mix(images_cifar[j], images_cifar[j+1], labels_cifar[j], labels_cifar[j+1], False,
True)
inputs.append(images_mix)
labels.append(labels_mix)
inputs = torch.stack(inputs) # stack tensors
labels = torch.stack(labels)
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels) # calculate loss
optimizer.step() # update weights
# print statistics every 200 loops
running_loss += loss.item()
if i % 200 == 199:
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 200))
running_loss = 0.0
scheduler.step()
With mixing function (again)
def preprocess(image, optplus, train):
if optplus:
normalizer = zero_mean
mean = np.array([4.60, 2.24, -6.84])
std = np.array([55.9, 53.7, 56.5])
else:
normalizer = normalize
mean = np.array([125.3, 123.0, 113.9])
std = np.array([63.0, 62.1, 66.7])
if train:
image = normalize(image, mean, std)
image = horizontal_flip(image)
image = padding(image, 4)
image = random_crop(image, 32)
else:
image = normalize(image, mean, std)
return image
def mix(image1, image2, label1, label2, optplus, train):
image1 = tensor_to_numpy(image1)
image2 = tensor_to_numpy(image2)
image1 = preprocess(image1, optplus, train)
image2 = preprocess(image2, optplus, train)
image1 = torch.from_numpy(image1).float()
image2 = torch.from_numpy(image2).float()
label1 = label1.to(device)
label2 = label2.to(device)
image1 = image1.to(device)
image2 = image2.to(device)
# Mix two images
r = torch.rand(1).to(device)
if optplus:
g1 = torch.std(image1).to(device)
g2 = torch.std(image2).to(device)
p = (1.0 / (1 + g1 / g2 * (1 - r) / r)).to(device)
image = ((image1 * p + image2 * (1 - p)) / torch.sqrt(p ** 2 + (1 - p) ** 2)).to(device)
else:
image = (image1 * r + image2 * (1 - r)).to(device)
# Mix two labels
eye = torch.eye(nClasses).to(device)
label = (eye[label1] * r + eye[label2] * (1 - r)).to(device)
return image, label
When training the network like this it does not reduce the loss at all, however when removing the mixing function from the trainer it does learn something, so something must be going wrong when mixing the images and labels. Is there something I am doing wrong in the mixing?