DenseNet for Fashion MNIST throws an error

I am trying to adapt the DenseNet tutorial from https://goosemi.wordpress.com/2018/05/30/first-blog-post/ to the Fashion MNIST dataset, which can be downloaded from https://github.com/zalandoresearch/fashion-mnist
The formats of the MNIST hand-written-digit images and the fashion-item images seem to be the same, i.e. both are 28x28 greyscale images. However, something does not match up, as the code from the tutorial throws an error. Does anyone have a clue where the problem(s) in the code might be?

Below is the code. First, I import the necessary modules/classes.

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F

Then I load the dataset (and apply normalization).

mean = 0.2860347330570221
std = 0.3530242443084717

# Normalised train set
train_set_normal = torchvision.datasets.FashionMNIST(
    root='./data'
    ,train=True
    ,download=True
    ,transform=transforms.Compose([
          transforms.ToTensor()
        , transforms.Normalize(mean, std)
    ])
)
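
For reference, the mean and std above are simply the pixel statistics of the (unnormalized) training images; a rough sketch of how such values can be computed, loading the raw data once with only ToTensor, is:

# Sketch: compute dataset mean/std from the raw (unnormalized) training images
raw_set = torchvision.datasets.FashionMNIST(
    root='./data', train=True, download=True,
    transform=transforms.ToTensor()
)
loader_all = DataLoader(raw_set, batch_size=len(raw_set))
images, _ = next(iter(loader_all))                 # [60000, 1, 28, 28], values in [0, 1]
print(images.mean().item(), images.std().item())   # roughly 0.2860 and 0.3530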

Then I define the DenseNet architecture as it is given in the tutorial.

class Dense_Block(nn.Module):
    def __init__(self, in_channels):
        super(Dense_Block, self).__init__()

        self.relu = nn.ReLU(inplace = True)
        self.bn = nn.BatchNorm2d(num_features = in_channels)

        self.conv1 = nn.Conv2d(in_channels = in_channels, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv3 = nn.Conv2d(in_channels = 64, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv4 = nn.Conv2d(in_channels = 96, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv5 = nn.Conv2d(in_channels = 128, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)

    def forward(self, x):

        bn = self.bn(x)
        conv1 = self.relu(self.conv1(bn))

        conv2 = self.relu(self.conv2(conv1))
        c2_dense = self.relu(torch.cat([conv1, conv2], 1))

        conv3 = self.relu(self.conv3(c2_dense))
        c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))

        conv4 = self.relu(self.conv4(c3_dense))
        c4_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4], 1))

        conv5 = self.relu(self.conv5(c4_dense))
        c5_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4, conv5], 1))

        return c5_dense

class Transition_Layer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Transition_Layer, self).__init__()

        self.relu = nn.ReLU(inplace = True)
        self.bn = nn.BatchNorm2d(num_features = out_channels)
        self.conv = nn.Conv2d(in_channels = in_channels, out_channels = out_channels, kernel_size = 1, bias = False)
        self.avg_pool = nn.AvgPool2d(kernel_size = 2, stride = 2, padding = 0)

    def forward(self, x):

        bn = self.bn(self.relu(self.conv(x)))
        out = self.avg_pool(bn)

        return out

class DenseNet(nn.Module):
    def __init__(self, nr_classes):
        super(DenseNet, self).__init__()

        self.lowconv = nn.Conv2d(in_channels = 3, out_channels = 64, kernel_size = 7, padding = 3, bias = False)
        self.relu = nn.ReLU()

        # Make Dense Blocks
        self.denseblock1 = self._make_dense_block(Dense_Block, 64)
        self.denseblock2 = self._make_dense_block(Dense_Block, 128)
        self.denseblock3 = self._make_dense_block(Dense_Block, 128)

        # Make transition Layers
        self.transitionLayer1 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 128)
        self.transitionLayer2 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 128)
        self.transitionLayer3 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 64)

        # Classifier
        self.bn = nn.BatchNorm2d(num_features = 64)
        self.pre_classifier = nn.Linear(64*4*4, 512)
        self.classifier = nn.Linear(512, nr_classes)

    def _make_dense_block(self, block, in_channels):
        layers = []
        layers.append(block(in_channels))
        return nn.Sequential(*layers)

    def _make_transition_layer(self, layer, in_channels, out_channels):
        modules = []
        modules.append(layer(in_channels, out_channels))
        return nn.Sequential(*modules)

    def forward(self, x):
        out = self.relu(self.lowconv(x))

        out = self.denseblock1(out)
        out = self.transitionLayer1(out)

        out = self.denseblock2(out)
        out = self.transitionLayer2(out)

        out = self.denseblock3(out)
        out = self.transitionLayer3(out)

        out = self.bn(out)
        out = out.view(-1, 64*4*4)

        out = self.pre_classifier(out)
        out = self.classifier(out)

        return out
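
Each dense block concatenates its five 32-channel convolution outputs, so a block outputs 5 * 32 = 160 channels, which is where the in_channels = 160 of the transition layers comes from. A quick sanity check of a single block, as a sketch:

# Sanity check: a dense block fed 64 channels returns 5 * 32 = 160 channels
block = Dense_Block(64)
print(block(torch.randn(1, 64, 28, 28)).shape)   # torch.Size([1, 160, 28, 28])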

Finally, I am trying to train the network.
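
The loop below also calls a small helper, get_num_correct, which is not shown above; a minimal version of it would be:

# Assumed helper (not part of the snippet above): count correct predictions in a batch
def get_num_correct(preds, labels):
    return preds.argmax(dim=1).eq(labels).sum().item()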

dense_network = DenseNet(nr_classes=10)

loader = DataLoader(train_set_normal, batch_size=10, shuffle=True, num_workers=0)
optimizer = optim.Adam(dense_network.parameters(), lr=0.01)

N_epochs = 10
for epoch in range(N_epochs):
    
    total_loss = 0
    total_correct = 0
    
    for batch in loader: # Get Batch

        images, labels = batch

        preds = dense_network(images) # Pass Batch

        loss = F.cross_entropy(preds, labels)           

        optimizer.zero_grad()
        loss.backward() # Calculate Gradients
        optimizer.step() # Update Weights
        total_loss += loss.item()
        total_correct += get_num_correct(preds, labels)
        
    print(
    "epoch", epoch, 
    "total_correct:", total_correct, 
    "loss:", total_loss
    )

As a result, I see the following error message.

RuntimeError                              Traceback (most recent call last)
<ipython-input-4-84df8dc8fb2f> in <module>
     14         images, labels = batch
     15 
---> 16         preds = dense_network(images) # Pass Batch
     17 
     18         loss = F.cross_entropy(preds, labels)

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

<ipython-input-3-417f3f93841b> in forward(self, x)
     80 
     81     def forward(self, x):
---> 82         out = self.relu(self.lowconv(x))
     83 
     84         out = self.denseblock1(out)

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

~\anaconda3\lib\site-packages\torch\nn\modules\conv.py in forward(self, input)
    351 
    352     def forward(self, input):
--> 353         return self._conv_forward(input, self.weight)
    354 
    355 class Conv3d(_ConvNd):

~\anaconda3\lib\site-packages\torch\nn\modules\conv.py in _conv_forward(self, input, weight)
    348                             _pair(0), self.dilation, self.groups)
    349         return F.conv2d(input, weight, self.bias, self.stride,
--> 350                         self.padding, self.dilation, self.groups)
    351 
    352     def forward(self, input):

RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[10, 1, 28, 28] to have 3 channels, but got 1 channels instead

Hi,

Grayscale images have 1 channel, but in the code for self.lowconv you set in_channels = 3.

The error says that the input is [10, 1, 28, 28], i.e. a batch of 10 single-channel 28x28 images, but you are trying to use 64 filters of size [3, 7, 7], which cannot match the channel size of the input.
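
As a quick check, once in_channels matches the data, a dummy grayscale batch goes through such a conv without problems, e.g.:

# Minimal check: a 1-channel input needs a first conv with in_channels = 1
x = torch.randn(10, 1, 28, 28)   # dummy batch shaped like Fashion MNIST
conv = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, padding=3, bias=False)
print(conv(x).shape)             # torch.Size([10, 64, 28, 28])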

Best

Thanks a lot for your reply! I have corrected in_channels = 3 to in_channels = 1, and now there is another error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-6-84df8dc8fb2f> in <module>
     14         images, labels = batch
     15 
---> 16         preds = dense_network(images) # Pass Batch
     17 
     18         loss = F.cross_entropy(preds, labels)

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

<ipython-input-5-d099d05e804c> in forward(self, x)
     92 
     93         out = self.bn(out)
---> 94         out = out.view(-1, 64*4*4)
     95 
     96         out = self.pre_classifier(out)

RuntimeError: shape '[-1, 1024]' is invalid for input of size 5760

It means that out in this line has a shape different from [-1, 64, 4, 4], which is why you are getting the size-mismatch error.
Try putting a print(out.shape) right after the aforementioned line and then use the printed sizes in out.view(-1, <printed_size>).

Thanks @Nikronic a lot for looking into the code.
I have added the line print(out.shape) right after the line out = self.bn(out) and got the following output just before the error:

torch.Size([10, 64, 3, 3])

That makes sense: each of the three transition layers halves the spatial size with its AvgPool2d (28 → 14 → 7 → 3, with the last step rounding down), so the final feature map is 3x3 rather than 4x4. I then changed out = out.view(-1, 64*4*4) to out = out.view(-1, 64*3*3), and the error message now reads:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-23-84df8dc8fb2f> in <module>
     14         images, labels = batch
     15 
---> 16         preds = dense_network(images) # Pass Batch
     17 
     18         loss = F.cross_entropy(preds, labels)

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

<ipython-input-22-b8522ba5d926> in forward(self, x)
     95         out = out.reshape(-1, 64*3*3)
     96 
---> 97         out = self.pre_classifier(out)
     98         out = self.classifier(out)
     99 

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

~\anaconda3\lib\site-packages\torch\nn\modules\linear.py in forward(self, input)
     85 
     86     def forward(self, input):
---> 87         return F.linear(input, self.weight, self.bias)
     88 
     89     def extra_repr(self):

~\anaconda3\lib\site-packages\torch\nn\functional.py in linear(input, weight, bias)
   1608     if input.dim() == 2 and bias is not None:
   1609         # fused op is marginally faster
-> 1610         ret = torch.addmm(bias, input, weight.t())
   1611     else:
   1612         output = input.matmul(weight.t())

RuntimeError: size mismatch, m1: [10 x 576], m2: [1024 x 512] at C:\w\b\windows\pytorch\aten\src\TH/generic/THTensorMath.cpp:41

If you trace the stack trace, it ends up at the self.pre_classifier line, and as you might guess its input size is different from the value used in .view: the flattened features are now 64*3*3 = 576, but nn.Linear(64*4*4, 512) still expects 1024.

You can find these types of issues easily by tracing the stack trace from the last line to the first. The error message is usually common to many different posts, so you can get an idea of the error from them.
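
Another quick way to catch this kind of shape issue before training is to push a single dummy batch through the model, e.g.:

# Smoke test (sketch): one dummy batch through the network
dummy = torch.randn(2, 1, 28, 28)   # (batch, channels, height, width)
with torch.no_grad():
    out = dense_network(dummy)
print(out.shape)                    # torch.Size([2, 10]) once all the sizes match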


Oh, I missed that one! Thanks heaps for pointing this out! I have corrected self.pre_classifier = nn.Linear(64*4*4, 512) to self.pre_classifier = nn.Linear(64*3*3, 512), and the script has started training! It is rather slow, but it works so far. Thanks again!