RuntimeError: Expected 4-dimensional input for 4-dimensional weight 128 256, but got 2-dimensional input of size [32, 128] instead

I am working on an image generator using a conditional GAN as the base model. I've run into an error that I don't know how to debug, even after searching for solutions online. I'm not sure whether I should change the training settings, adjust my model, or do something else. Any help on what to do would be appreciated.

The CGAN model I am using:

import numpy as np
import torch
import torch.nn as nn


class Generator(nn.Module):
    def __init__(self, classes, channels, img_size, latent_dim):
        super(Generator, self).__init__()
        self.classes = classes
        self.channels = channels
        self.img_size = img_size
        self.latent_dim = latent_dim
        self.img_shape = (self.channels, self.img_size, self.img_size)
        self.label_embedding = nn.Embedding(self.classes, self.classes) # label embedding behaves as a lookup table from class index to vector

        self.model = nn.Sequential(
            *self._create_layer_1(self.latent_dim + self.classes, 128, False),
            *self._create_layer_2(128, 256),
            *self._create_layer_2(256, 512),
            *self._create_layer_2(512, 1024),
            nn.Linear(1024, int(np.prod(self.img_shape))),
            nn.Tanh()
        )

    def _create_layer_1(self, size_in, size_out, normalize=True):
        layers = [nn.Linear(size_in, size_out)]
        if normalize:
            layers.append(nn.BatchNorm1d(size_out))
        layers.append(nn.LeakyReLU(0.2, inplace=True))
        return layers

    def _create_layer_2(self, size_in, size_out, normalize=True):
        layers = [nn.ConvTranspose2d(size_in, size_out, 4, 2, 1, bias=False)]
        if normalize:
            layers.append(nn.BatchNorm1d(size_out))
        layers.append(nn.LeakyReLU(0.2, inplace=True))
        return layers

    def forward(self, noise, labels):
        z = torch.cat((self.label_embedding(labels), noise), -1)
        x = self.model(z)
        x = x.view(x.size(0), *self.img_shape)
        return x


class Discriminator(nn.Module):
    def __init__(self, classes, channels, img_size, latent_dim):
        super(Discriminator, self).__init__()
        self.classes = classes
        self.channels = channels
        self.img_size = img_size
        self.latent_dim = latent_dim
        self.img_shape = (self.channels, self.img_size, self.img_size)
        self.label_embedding = nn.Embedding(self.classes, self.classes)
        self.adv_loss = torch.nn.BCELoss()

        self.model = nn.Sequential(
            *self._create_layer_1(self.classes + int(np.prod(self.img_shape)), 1024, False, True),
            *self._create_layer_2(1024, 512, True, True),
            *self._create_layer_2(512, 256, True, True),
            *self._create_layer_2(256, 128, False, False),
            *self._create_layer_1(128, 1, False, False),
            nn.Sigmoid()
        )

    def _create_layer_1(self, size_in, size_out, drop_out=True, act_func=True):
        layers = [nn.Linear(size_in, size_out)]
        if drop_out:
            layers.append(nn.Dropout(0.4))
        if act_func:
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        return layers

    def _create_layer_2(self, size_in, size_out, drop_out=True, act_func=True):
        layers = [nn.Conv2d(size_in, size_out, 4, 2, 1, bias=False)]
        if drop_out:
            layers.append(nn.Dropout(0.4))
        if act_func:
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        return layers

    def forward(self, image, labels):
        x = torch.cat((image.view(image.size(0), -1), self.label_embedding(labels)), -1)
        return self.model(x)

    def loss(self, output, label):
        return self.adv_loss(output, label)

Code for initializing the model:

import os
import time

import numpy as np
import torch
import torchvision.utils as vutils

from cgan import Generator as cganG, Discriminator as cganD


class Model(object):
    def __init__(self,
                 name,
                 device,
                 data_loader,
                 classes,
                 channels,
                 img_size,
                 latent_dim,
                 style_dim=3):
        self.name = name
        self.device = device
        self.data_loader = data_loader
        self.classes = classes
        self.channels = channels
        self.img_size = img_size
        self.latent_dim = latent_dim
        self.style_dim = style_dim
        self.netG = cganG(self.classes, self.channels, self.img_size, self.latent_dim)
        self.netG.to(self.device)
        self.netD = cganD(self.classes, self.channels, self.img_size, self.latent_dim)
        self.netD.to(self.device)
        self.optim_G = None
        self.optim_D = None

    @property
    def generator(self):
        return self.netG

    @property
    def discriminator(self):
        return self.netD

    def create_optim(self, lr, alpha=0.5, beta=0.999):
        self.optim_G = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        self.netG.parameters()),
                                        lr=lr,
                                        betas=(alpha, beta))
        self.optim_D = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        self.netD.parameters()),
                                        lr=lr,
                                        betas=(alpha, beta))

    def _to_onehot(self, var, dim):
        # Convert a batch of integer class labels into one-hot vectors of length `dim`.
        res = torch.zeros((var.shape[0], dim), device=self.device)
        res[range(var.shape[0]), var] = 1.
        return res

    def train(self,
              epochs,
              log_interval=100,
              out_dir='',
              verbose=True):
        self.netG.train()
        self.netD.train()
        viz_z = torch.zeros((self.data_loader.batch_size, self.latent_dim), device=self.device)
        viz_noise = torch.randn(self.data_loader.batch_size, self.latent_dim, device=self.device)
        nrows = self.data_loader.batch_size // 8
        viz_label = torch.LongTensor(np.array([num for _ in range(nrows) for num in range(8)])).to(self.device)
        viz_onehot = self._to_onehot(viz_label, dim=self.classes)
        viz_style = torch.zeros((self.data_loader.batch_size, self.style_dim), device=self.device)
        total_time = time.time()
        for epoch in range(epochs):
            batch_time = time.time()
            for batch_idx, (data, target) in enumerate(self.data_loader):
                data, target = data.to(self.device), target.to(self.device)
                batch_size = data.size(0)
                real_label = torch.full((batch_size, 1), 1., device=self.device)
                fake_label = torch.full((batch_size, 1), 0., device=self.device)

                # Train G
                self.netG.zero_grad()
                z_noise = torch.randn(batch_size, self.latent_dim, device=self.device)
                x_fake_labels = torch.randint(0, self.classes, (batch_size,), device=self.device)
                x_fake = self.netG(z_noise, x_fake_labels)
                y_fake_g = self.netD(x_fake, x_fake_labels)
                g_loss = self.netD.loss(y_fake_g, real_label)
                g_loss.backward()
                self.optim_G.step()

                # Train D
                self.netD.zero_grad()
                y_real = self.netD(data, target)
                d_real_loss = self.netD.loss(y_real, real_label)
                y_fake_d = self.netD(x_fake.detach(), x_fake_labels)
                d_fake_loss = self.netD.loss(y_fake_d, fake_label)
                d_loss = (d_real_loss + d_fake_loss) / 2
                d_loss.backward()
                self.optim_D.step()

                if verbose and batch_idx % log_interval == 0 and batch_idx > 0:
                    print('Epoch {} [{}/{}] loss_D: {:.4f} loss_G: {:.4f} time: {:.2f}'.format(
                            epoch, batch_idx, len(self.data_loader),
                            d_loss.mean().item(),
                            g_loss.mean().item(),
                            time.time() - batch_time))
                    vutils.save_image(data, os.path.join(out_dir, 'real_samples.png'), normalize=True)
                    with torch.no_grad():
                        viz_sample = self.netG(viz_noise, viz_label)
                        vutils.save_image(viz_sample, os.path.join(out_dir, 'fake_samples_{}.png'.format(epoch)), nrow=8, normalize=True)
                    batch_time = time.time()   
            
            torch.save(self.netG.state_dict(), os.path.join(out_dir, 'netG_{}.pth'.format(epoch)))
            torch.save(self.netD.state_dict(), os.path.join(out_dir, 'netD_{}.pth'.format(epoch)))

            self.save_to(path=out_dir, name=self.name, verbose=False)
        if verbose:
            print('Total train time: {:.2f}'.format(time.time() - total_time))

Code for setting up the training:

import torch
import torchvision.datasets as dset
import torchvision.transforms as transforms

from build_gan import Model


def main():
    device = torch.device("cuda:0" if FLAGS.cuda else "cpu")
    if FLAGS.train:
        dataloader = torch.utils.data.DataLoader(
            dset.ImageFolder(FLAGS.data_dir, transforms.Compose([
                transforms.Resize(FLAGS.img_size),
                transforms.CenterCrop(FLAGS.img_size),
                transforms.ToTensor()
                ])),
                batch_size=FLAGS.batch_size,
                shuffle=True, 
                num_workers=4, 
                pin_memory=True
                )
        model = Model(FLAGS.model, device, dataloader, FLAGS.classes, FLAGS.channels, FLAGS.img_size, FLAGS.latent_dim)
        model.create_optim(FLAGS.lr)

        # Train
        print("Start training...\n")
        model.train(FLAGS.epochs, FLAGS.log_interval, FLAGS.out_dir, True)

if __name__ == '__main__':
    import argparse
    from utils import boolean_string
    parser = argparse.ArgumentParser()
    parser.add_argument('--cuda', type=boolean_string, default=True, help='enable CUDA.')
    parser.add_argument('--train', type=boolean_string, default=True, help='train mode or eval mode.')
    parser.add_argument('--data_dir', type=str, default='../datasets', help='Directory for dataset.')
    parser.add_argument('--out_dir', type=str, default='output', help='Directory for output.')
    parser.add_argument('--epochs', type=int, default=800, help='number of epochs')
    parser.add_argument('--batch_size', type=int, default=32, help='size of batches')
    parser.add_argument('--lr', type=float, default=0.0002, help='learning rate')
    parser.add_argument('--latent_dim', type=int, default=62, help='latent space dimension')
    parser.add_argument('--classes', type=int, default=25, help='number of classes')
    parser.add_argument('--img_size', type=int, default=128, help='size of images')
    parser.add_argument('--channels', type=int, default=3, help='number of image channels')
    parser.add_argument('--log_interval', type=int, default=100, help='interval between logging and image sampling')
    FLAGS = parser.parse_args()
    main()

The error I got:

File "main.py", line 121, in <module>
    main()
  File "main.py", line 56, in main
    model.train(FLAGS.epochs, FLAGS.log_interval, FLAGS.out_dir, True)
  File "build_gan.py", line 123, in train
    x_fake = self.netG(z_noise, x_fake_labels)
  File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "cgan.py", line 42, in forward
    x = self.model(z)
  File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "anaconda3/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 796, in forward
    output_padding, self.groups, self.dilation)
RuntimeError: Expected 4-dimensional input for 4-dimensional weight 128 256, but got 2-dimensional input of size [32, 128] instead

I am using my own image dataset, which has 3 channels and 25 classes.

Hi,

This is a 2-D convolutional block. Convolution layers need inputs specifically in the shape [batch_size, channels, img_height, img_width], i.e. 4-D tensors. You are passing 2-D tensors into the conv block, which is what triggers the error.
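
To see this in isolation, here is a minimal standalone sketch (not taken from your code): the first ConvTranspose2d in your generator has a 4-D weight of shape [128, 256, 4, 4], accepts a 4-D input, and rejects the 2-D output of the preceding linear block with the same kind of error you are seeing:

import torch
import torch.nn as nn

# Same layer as the first conv block in the generator above.
deconv = nn.ConvTranspose2d(128, 256, 4, 2, 1, bias=False)

ok = torch.randn(32, 128, 8, 8)   # 4-D: [batch_size, channels, height, width]
print(deconv(ok).shape)           # torch.Size([32, 256, 16, 16])

bad = torch.randn(32, 128)        # 2-D: what the linear block actually produces
deconv(bad)                       # RuntimeError: expected 4-dimensional input ...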

Thank you Hmrishav for your insights. Could you be a bit more specific about how I should make the change to get it working? Thanks a bunch!

You can use a linear layer instead of a convolutional layer if you want to stick with 2-D tensors. Or you can avoid flattening your images and reshape your data into 4-D tensors that the convolutions can accept, as sketched below.
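
A minimal standalone sketch of the two options (hypothetical sizes, just to show the shapes):

import torch
import torch.nn as nn

x = torch.randn(32, 128)          # 2-D activations from a linear layer

# Option 1: stay 2-D and keep stacking linear layers.
linear = nn.Linear(128, 256)
print(linear(x).shape)            # torch.Size([32, 256])

# Option 2: reshape to 4-D so a conv layer can consume it (2 * 8 * 8 == 128).
x4d = x.view(-1, 2, 8, 8)
conv = nn.Conv2d(2, 64, 3, 1, 1)
print(conv(x4d).shape)            # torch.Size([32, 64, 8, 8])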

I can point you towards a CGAN implementation to help you understand better :slight_smile:

Hi Hmrishav, I have just double-checked my settings and the size of my input images; for your reference:

Settings:

PyTorch version: 1.1.0
CUDA version: 9.0.176

  Args        | Type   | Value
  ------------|--------|------------
  cuda        | bool   | True
  train       | bool   | True
  resume      | bool   | False
  data_dir    | str    | ../datasets
  out_dir     | str    | output
  epochs      | int    | 800
  batch_size  | int    | 32
  lr          | float  | 0.0002
  latent_dim  | int    | 62
  classes     | int    | 25
  img_size    | int    | 128
  channels    | int    | 3

Image size as input:

torch.Size([32, 3, 128, 128])

In fact, I deliberately replaced the intermediate linear layers in the original model with convolutional layers as an experiment, since training performance on my dataset was not ideal with purely linear layers.
If I want to stick with convolutional layers in between, what adjustments should I make to my current settings? Like you mentioned, I can "prevent flattening your images and convert your data into tensor shapes compatible for convolution", but what specifically should I change in my code?

Again, thank you!

I would suggest reshaping z here:

After the z = torch.cat(...) line, you can reshape z from a 2-D tensor into a 4-D one. I assume the shape of your z tensor is [32, 128], since that is what throws the error. In that case, the reshape can be done as z = z.view(-1, 2, 8, 8) to start with (insert this line right after the z = torch.cat(...) line). Your model would then be:

self.model = nn.Sequential(
            *self._create_layer_1(self.latent_dim + self.classes, 128, False),
            *self._create_layer_2(2, 64),
            *self._create_layer_2(64, 128),
            *self._create_layer_2(128, 256),
            *self._create_layer_2(256, 512),
            *self._create_layer_2(512, 1024),
            nn.Linear(1024, int(np.prod(self.img_shape))),
            nn.Tanh()
        )

Keep in mind that all these changes have to be made in the generator module.
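
To make the data flow concrete, here is a minimal standalone sketch of how that reshape can sit between the linear stem and the convolutional stack (hypothetical layer sizes, not the full generator):

import torch
import torch.nn as nn

# The linear stem keeps working on 2-D tensors.
stem = nn.Sequential(nn.Linear(62 + 25, 128), nn.LeakyReLU(0.2, inplace=True))
# The conv stack needs 4-D input; 2 channels * 8 * 8 == 128 features.
deconv = nn.ConvTranspose2d(2, 64, 4, 2, 1, bias=False)

z = torch.randn(32, 62 + 25)   # [batch, latent_dim + classes]
h = stem(z)                    # [32, 128] -- still 2-D
h = h.view(-1, 2, 8, 8)        # [32, 2, 8, 8] -- now 4-D
print(deconv(h).shape)         # torch.Size([32, 64, 16, 16])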

Hi, I have finally managed to tackle this by borrowing the generator from InfoGAN. The Generator now looks like this:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class Upsample(nn.Module):
    # Thin wrapper around F.interpolate so it can be used inside nn.Sequential.
    def __init__(self, scale_factor):
        super(Upsample, self).__init__()
        self.scale_factor = scale_factor

    def forward(self, x):
        return F.interpolate(x, scale_factor=self.scale_factor, mode='bilinear', align_corners=False)


class Generator(nn.Module):
    def __init__(self, classes, channels, img_size, latent_dim):
        super(Generator, self).__init__()
        self.classes = classes
        self.channels = channels
        self.img_size = img_size
        self.img_init_size = self.img_size // 4  # start at 1/4 resolution; two 2x upsamples restore full size
        self.latent_dim = latent_dim
        self.img_init_shape = (128, self.img_init_size, self.img_init_size)
        self.img_shape = (self.channels, self.img_size, self.img_size)
        # The linear stem maps the concatenated (noise, one-hot label) vector to a
        # flattened feature map, which forward() reshapes to 4-D before the convs.
        self.stem_linear = nn.Sequential(
            nn.Linear(latent_dim + classes,
                      int(np.prod(self.img_init_shape)))
        )
        self.model = nn.Sequential(
            nn.BatchNorm2d(128),
            *self._create_deconv_layer(128, 128, upsample=True),
            *self._create_deconv_layer(128, 64, upsample=True),
            *self._create_deconv_layer(64, self.channels, upsample=False, normalize=False),
            nn.Tanh()
        )

    def _create_deconv_layer(self, size_in, size_out, upsample=True, normalize=True):
        layers = []
        if upsample:
            layers.append(Upsample(scale_factor=2))
        layers.append(nn.Conv2d(size_in, size_out, 3, stride=1, padding=1))
        if normalize:
            layers.append(nn.BatchNorm2d(size_out, 0.8))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        return layers

    def forward(self, noise, labels):
        z = torch.cat((noise, labels), -1)        # [batch, latent_dim + classes]
        z_vec = self.stem_linear(z)               # [batch, 128 * (img_size // 4) ** 2]
        z_img = z_vec.view(z_vec.shape[0], *self.img_init_shape)  # 2-D -> 4-D
        x = self.model(z_img)                     # [batch, channels, img_size, img_size]
        return x
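
For reference, a quick sanity check of the shapes under my settings (a hypothetical standalone snippet, assuming the classes above are in scope):

import torch

# Settings from the table above: classes=25, channels=3, img_size=128, latent_dim=62.
G = Generator(classes=25, channels=3, img_size=128, latent_dim=62)

noise = torch.randn(32, 62)                # [batch, latent_dim]
labels = torch.randint(0, 25, (32,))
onehot = torch.zeros(32, 25)
onehot[range(32), labels] = 1.             # one-hot labels, as _to_onehot produces

fake = G(noise, onehot)
print(fake.shape)                          # torch.Size([32, 3, 128, 128])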

And I also needed to change my training loop so that the generator receives one-hot labels:

    def train(self,
              epochs,
              log_interval=100,
              out_dir='',
              verbose=True):
        self.netG.train()
        self.netD.train()
        for epoch in range(epochs):
            batch_time = time.time()
            for batch_idx, (data, target) in enumerate(self.data_loader):
                data, target = data.to(self.device), target.to(self.device)
                batch_size = data.size(0)
                real_label = torch.full((batch_size, 1), 1., device=self.device)
                fake_label = torch.full((batch_size, 1), 0., device=self.device)

                # Train G
                self.netG.zero_grad()
                z_noise = torch.randn(batch_size, self.latent_dim, device=self.device)
                x_fake_labels = torch.randint(0, self.classes, (batch_size,), device=self.device)
                labels_onehot = self._to_onehot(x_fake_labels, dim=self.classes)  # key change: G now consumes one-hot labels
                x_fake = self.netG(z_noise, labels_onehot)
                y_fake_g = self.netD(x_fake, x_fake_labels)
                g_loss = self.netD.loss(y_fake_g, real_label)
                g_loss.backward()
                self.optim_G.step()

                # Train D
                self.netD.zero_grad()
                y_real = self.netD(data, target)
                d_real_loss = self.netD.loss(y_real, real_label)
                y_fake_d = self.netD(x_fake.detach(), x_fake_labels)
                d_fake_loss = self.netD.loss(y_fake_d, fake_label)
                d_loss = (d_real_loss + d_fake_loss) / 2
                d_loss.backward()
                self.optim_D.step()

Everything works out now!