# How to handle tensor size problem with BCE Loss?

Hello,

I am currently working on the customization of the cDCGAN for the MNIST Database.
I found some problems with the use of the Binary Cross Entropy Loss when running the model.

It seems that the dimension of the tensors feeding the Loss are different, and I don’t really know why.
I got the following error :

ValueError: Target and input must have the same number of elements. target nelement (128) != input nelement (1280)

Here’s the code used

# -*- coding: utf-8 -*-

Automatically generated by Colaboratory.

Original file is located at
"""

from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.$$[0-9]*$$\.$$[0-9]*$$$/cu\1\2/' accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu' !pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision !pip install Pillow==4.1.1 # Commented out IPython magic to ensure Python compatibility. # %matplotlib inline import argparse import os import random import torch import torch.nn as nn import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.optim as optim import torch.utils.data import torchvision.datasets as dset import torchvision.transforms as transforms import torchvision.utils as vutils import numpy as np import matplotlib.pyplot as plt import matplotlib.animation as animation from IPython.display import HTML """## Hyperpameters Define the hyperparameters. You can play with those later. """ ## Data loading workers = 4 # Number of workers for dataloader (/!\ set to 4 when you're done debugging) ## Architecture arch = 'cDCGAN' # or cGAN nz = 100 # Size of z latent vector (i.e. size of generator input)µ ndf = 32 # Base size of feature maps in discriminator ngf = 32 # Base size of feature maps in generator ## Optimization lrD = 0.0002 # Learning rate for the discriminator lrG = 0.0002 # Learning rate for the generator beta1G = 0.5 # Momentum beta1 for the discriminator beta1D = 0.5 # Momentum beta1 for the generator ## Training batch_size = 128 # Images per batch nb_update_D = 1 # Number of sub-steps of discriminator optim. at each step nb_update_G = 1 # Number of sub-steps of generator optim. at each step #steps = 8000 # Number of global steps in the training loop nb_epochs = 20 # Number of epochs, leave "None" if you want to set the number of "steps" (i.e. batches) if nb_epochs is None: nb_epochs = (steps * batch_size) / (nb_update_D * 50000) else: steps = int(nb_epochs * nb_update_D * 50000 / batch_size) print("Doing %.1f epochs in %d steps" % (nb_epochs, steps)) steps_per_epoch = int(steps / nb_epochs) """# Dataset Download and load the dataset. Nothing to do here. """ # We can use an image folder dataset the way we have it setup. # Create the dataset dataroot = '/tmp/mnist' transform = transforms.Compose([ transforms.Pad(2), transforms.ToTensor(), transforms.Normalize(mean=(0.5,), std=(0.5,)) ]) dataset = dset.MNIST(dataroot, train=True, download=True, transform=transform) # Create the dataloader dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=workers) # Decide which device we want to run on device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Plot some training images real_batch = next(iter(dataloader)) plt.figure(figsize=(8,8)) plt.axis("off") plt.title("Training Images") plt.imshow(np.transpose(vutils.make_grid(real_batch[0].to(device)[:64], padding=2, normalize=True).cpu(),(1,2,0))) plt.show() """# Model architectures ## Discriminator Input: Image$x \in \mathbb{R}^{32\times 32\times 1}$Output: "Real" image probability$\in [0,1]$## Generator Input: Random "noise"$z \in \mathbb{R}^{\text{nz}}$Output: Generated image$\tilde x \in \mathbb{R}^{32\times 32\times 1}$""" # cDCGAN if arch == 'cDCGAN': class Discriminator(nn.Module): def __init__(self): super(Discriminator, self).__init__() self.emb_x = nn.Sequential( # TODO nn.Conv2d(1,2*ndf,kernel_size = 4, stride=2, padding=1, bias=False), nn.LeakyReLU(0.2) ) self.emb_y = nn.Sequential( # TODO nn.Conv2d(10,2*ndf, kernel_size=4, stride=2, padding=1, bias=False), nn.LeakyReLU(0.2) ) self.emb_xy = nn.Sequential( # TODO nn.Conv2d(4*ndf,8*ndf,kernel_size=4, stride=2, padding=1, bias=False), nn.BatchNorm2d(8*ndf), nn.LeakyReLU(0.2), nn.Conv2d(8*ndf, 16*ndf, kernel_size=4, stride = 2, padding=1,bias=False), nn.BatchNorm2d(16*ndf), nn.LeakyReLU(0.2), nn.Conv2d(16*ndf,1,kernel_size=4, stride=2, padding=0, bias=False), nn.Sigmoid() ) def forward(self, x, y): #TODO h1 = self.emb_x(x) h2 = self.emb_y(y.expand(-1,-1,32,32)) x = torch.cat([h1,h2],1) o = self.emb_xy(x) return o class Generator(nn.Module): def __init__(self): super(Generator, self).__init__() self.emb_z = nn.Sequential( # TODO nn.ConvTranspose2d(nz,8*ngf,kernel_size=4,stride=1,padding=0, bias=False), nn.BatchNorm2d(8*ndf), nn.ReLU() ) self.emb_y = nn.Sequential( # TODO nn.ConvTranspose2d(10,ndf*8, kernel_size=4, stride=1,padding=0,bias=False), nn.BatchNorm2d(8*ngf), nn.ReLU() ) self.emb_zy = nn.Sequential( # TODO nn.ConvTranspose2d(ngf*16,ngf*8,kernel_size=4,stride=2,padding=1,bias=False), nn.BatchNorm2d(8*ngf), nn.ReLU(), nn.ConvTranspose2d(ngf*8,ngf*4,kernel_size=4,stride=2,padding=1,bias=False), nn.BatchNorm2d(4*ngf), nn.ReLU(), nn.ConvTranspose2d(4*ngf,1,kernel_size=4,stride=2,padding=1), nn.Tanh() ) def forward(self, z, y): # TODO h1 = self.emb_z(z) h2 = self.emb_y(y) x = torch.cat([h1,h2],1) o = self.emb_zy(x) return o # cGAN if arch == 'cGAN': class Discriminator(nn.Module): def __init__(self): super(Discriminator, self).__init__() self.emb_x = nn.Sequential( # TODO nn.Linear(784,1024), # 794 ? nn.LeakyReLU(0.2) ) self.emb_y = nn.Sequential( # TODO nn.Linear(10,1024), # self.label_emb = nn.Embedding(10, 10) ? nn.LeakyReLU(0.2) ) self.emb_xy = nn.Sequential( # TODO nn.Linear(2048,1024,bias=False), nn.BatchNorm1d(1024) nn.LeakyReLU(0.2), nn.Linear(1024,512,bias=False), nn.BatchNorm1d(512) nn.LeakyReLU(0.2), nn.Linear(512,1), nn.Sigmoid() ) def forward(self, x, y): #TODO h1 = self.emb_x(x) h2 = self.emb_y(y) x = torch.cat([h1,h2],1) o = self.emb_xy(x) return o class Generator(nn.Module): def __init__(self): super(Generator, self).__init__() self.emb_z = nn.Sequential( # TODO nn.Linear(110,256, bias=False), nn.BatchNorm1d(256), nn.ReLU() ) self.emb_y = nn.Sequential( # TODO nn.Linear(10,256), nn.BatchNorm1d(256), nn.ReLU() ) self.emb_zy = nn.Sequential( # TODO nn.Linear(512,1024), nn.BatchNorm1d(1024), nn.ReLU(), nn.Linear(1024,1024, bias=False), nn.BatchNorm1d(1024), nn.ReLU(), nn.Linear(1024,1024), nn.Tanh() ) def forward(self, z, y): # Remove unused dimensions in non-conv model z = z.view(z.shape[0], -1) z[:, :, 0, 0] y = y.view(y.shape[0], -1) # TODO h1 = self.emb_z(z) h2 = self.emb_y(y) x = torch.cat([h1,h2],1) o = self.emb_zy(x) return o # Create the models netG = Generator().to(device) netD = Discriminator().to(device) # Apply the weights_init function to randomly initialize all weights to mean=0, stdev=0.2 def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv') != -1: nn.init.normal_(m.weight.data, 0.0, 0.02) elif classname.find('BatchNorm') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0) netG.apply(weights_init) netD.apply(weights_init) # Print the models print(netG) print(netD) # TODO test you models to check if they z = torch.zeros(10, nz, 1, 1).to(device) x = torch.zeros(10, 1, 32, 32).to(device) y = torch.randn(10, 10, 1, 1).to(device) #y.shape #torch.cat([x, y.expand(-1,-1,32,32)], 1).shape #y.shape #y.expand(10,10,32,32).shape #net_test = nn.Sequential(nn.Conv2d(10,ndf, kernel_size=4, stride=2, padding=1)) #net_test(y) print(netG(z, y).shape) # expected: [10, 1, 32, 32] print(netD(x, y).shape) # expected: [10, 1, 1, 1] """# Optimization and training Here we will define: * Our prior$P(z)\$ that we use to sample random "noise". We will use a Gaussian distribution.
* The criterion that will be used to train the discriminator, and indirectly the generator. We will use the binary cross-entropy.
* The optimizers of both models. We will use the Adam optimizer.
"""

# Prior P(z). Returns a Gaussian random tensor of shape (batch_size, nz, 1, 1)
def get_noise(batch_size):
noise = torch.randn((batch_size, nz, 1, 1), dtype=torch.float32) # TODO
return noise

# Returns a random vector of one-hot encoding of shape (batch_size, 10, 1, 1)
def get_fixed_y(batch_size):
y = torch.zeros(batch_size, 10, 1, 1, device=device)
n = batch_size // 10
for i in range(10):
y[i*n:i*n+n, i, :, :].fill_(1)
return y

# Create the criterion function that will take (y_hat, y) as input
criterion = nn.BCELoss() # TODO

# Setup Adam optimizers for D and G
optimizerD = optim.Adam(netD.parameters(), lr=lrD, betas=(beta1D, 0.999)) # TODO
optimizerG = optim.Adam(netG.parameters(), lr=lrG, betas=(beta1G, 0.999)) # TODO

"""### Data format / batch creation functions

r_real and r_fake are targets for the disriminator's criterion
"""

# Data format / batch creation functions

fixed_noise = get_noise(100) # Create a fixed random vector sampled from a Gaussian, will be used during train for viz
fixed_y = get_fixed_y(100)
real_label = 1
fake_label = 0

def onehot(data, nclass=10):
bsize = data.shape[0]
y = torch.zeros(bsize, nclass)
y.scatter_(1, data.unsqueeze(1), 1)
y = y.unsqueeze(2).unsqueeze(3)
return y

# returns a batch of real images from the dataset (iterates infinitely on the dataset)
def get_batch_real():
global iterator
try:
x_real, y_real = next(iterator)
except:
x_real, y_real = next(iterator)
x_real = x_real.to(device)
y_real = onehot(y_real).to(device)
r_real = torch.full((x_real.size(0),), real_label, device=device)

print('x_real_size',x_real.size())
print('y_real_size',y_real.size())
print('r_real_size',r_real.size())

return x_real, y_real, r_real

# returns a batch of generated images and training targets y_fake
# Note that the targets y_fake will be different is train_G is True or False
def get_batch_fake(y_real, train_G=False):
z = torch.randn(y_real.shape[0], nz, 1, 1, device=device)
x_fake = netG(z, y_real)
#
if train_G:
r_fake = None # TODO
else:
r_fake = torch.full((x_fake.size(0),), fake_label, device=device) # TODO

print('x_fake_size',x_fake.size())
#print('y_fake_size',y_fake.size())
print('r_fake_size',r_fake.size())

print('test', r_fake[0])

#print(r_fake)
#print(y_fake)
return x_fake, y_real, r_fake

"""### Training Loop

**Reminder:** when your training loop starts to work, change the workers variable to 4 and rerun your notebook
"""

img_list = []
G_losses = []
D_losses = []

# Commented out IPython magic to ensure Python compatibility.
for i in range(steps):

if i == steps_per_epoch * 11:
optimizerG.param_groups[0]['lr'] /= 10
optimizerD.param_groups[0]['lr'] /= 10
print("learning rate change!")

if i == steps_per_epoch * 16:
optimizerG.param_groups[0]['lr'] /= 10
optimizerD.param_groups[0]['lr'] /= 10
print("learning rate change!")

########
# Update D network: maximize log(D(x)) + log(1 - D(G(z)))
for _ in range(nb_update_D):

# Create batches
x_real, y_real, r_real = get_batch_real()
x_fake, y_fake, r_fake = get_batch_fake(y_real)

# Forward
r_hat_real = netD(x_real,y_real) # TODO
r_hat_fake = netD(x_fake,y_fake) # TODO
errD = criterion(y_real, r_hat_real) + criterion(y_fake, r_hat_fake) # TODO sum of criterion of real and fake samples

# Backward
# TODO backward & optimization step on D

errD.backward(retain_graph=True)
optimizerD.step()

# Compute / save metrics
avg_output_for_real = r_hat_real.mean().item()
avg_output_for_fake = r_hat_fake.mean().item()
D_losses.append(errD.item())

########
# Update G network: maximize log(D(G(z)))
for _ in range(nb_update_G):

# TODO: forward + backward
# NOTE: use errG as name for your loss variable, like errD above
errG = criterion(y_hat_fake, y_fake)
errG.backward(retain_graph=True)
optimizerG.step()

# Compute / save metrics
G_losses.append(errG.item())

########
# Logs
if i % 25 == 0:
print('[%5d/%5d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f'
#               % (i, steps, errD.item(), errG.item(), avg_output_for_real, avg_output_for_fake))

if i % 100 == 0:
x_fake = netG(fixed_noise, fixed_y).detach().cpu()
plt.figure(figsize=(10,10))
plt.imshow(np.transpose(img_list[-1],(1,2,0)))
plt.axis("off")
plt.show()

"""# Display training evolution"""

# Show generations
fig = plt.figure(figsize=(8,8))
plt.axis("off")
ims = [[plt.imshow(np.transpose(i,(1,2,0)), animated=True)] for i in img_list]
ani = animation.ArtistAnimation(fig, ims, interval=1000, repeat_delay=1000, blit=True)

HTML(ani.to_jshtml())

# Loss evolution
plt.figure(figsize=(10,8))
plt.subplot(2,1,1)
plt.title("Generator Trainig Loss")
plt.plot(G_losses)
plt.xlabel("iterations")
plt.ylabel("Loss")

plt.subplot(2,1,2)
plt.title("Generator Trainig Loss")
plt.plot(D_losses)
plt.xlabel("iterations")
plt.ylabel("Loss")
plt.show()


Can anyone help me ?

Hi,

This is quite a lot of code, with a lot of TODOs in it.

You would need to give the exact stack trace.
Also you can try to print the size of the inputs you give your criterion to make sure that they are the shape you expect.

Hi,

Thanks for replying so quickly !

In the code, r_real and r_hat_real are passed to the BCE loss. (same for fake).

Dimensions are [128] for r_real (or r_fake) and [128, 1, 1, 1] for r_hat_real.

I don’t really understand why it’s failing, and how to correct this. Can you help me ?

Thanks

As mentionned in the doc they should both have the exact same shape.
So you should make sure they are both [128] or both [128, 1, 1, 1] . but they should not be different.

Is there a simple way to convert [128] tensor to [128,1,1,1] ? Or the reverse process ?

torch.squeeze(r_hat_real)
https://pytorch.org/docs/stable/torch.html#torch.squeeze

Seems to change the dimension, but there is still a problem with the BCE, got :

RuntimeError: the derivative for ‘target’ is not implemented

Yes, most of our loss function do not provide gradients for the targets, only for the input.

Is there a way to bypass the problem of gradients ?

If it is the case, I would recommend using BCELossWithLogits which combines Sigmoid and BCELoss and for which the gradients for the target is implemented.