Getting runtime error of size mismatch

I am creating binary classifier to classify cat and dog images for this below is the network architecture I have for my model.

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=5, stride =1, padding=2),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size = 2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size = 2, stride=2))
        self.fc1 = nn.Linear(64 * 37 * 37, 1024)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(1024, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        out = self.relu(self.fc1(out))
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

I have created custom dataset to read images from folder below is the code for the same.

class CatDogDataset(Dataset):
    def __init__(self,root_dir, transform=transform):
        self.transform = transform
        self.root_dir = root_dir

        self.image_list = glob.glob(self.root_dir+'/*.tif')
        #print(len(self.image_list))


    def __len__(self):
        return len(self.image_list)


    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.to_list()

        img_name = self.image_list[idx]
        image = Image.open(img_name)
        img_label = img_name.split('/')[-1][:3]

        # setting 0 for cat and 1 for dog
        if (img_label == 'cat'):
            label = 0
        else:
            label = 1

        label = np.array(label)
        label = label.reshape(-1)

        if self.transform:
            image = self.transform(image)

        return (image, label)

Now the problem is when I try to run the model for certain epochs (say 3 for this example), It runs successfully till 3 epochs but after that it gives below size error. I am not getting why this error coming after training for certain epochs since if there is size issue then it should occur before even training.

Epoch: 0/3..  Training Loss: 0.606..  Validation Loss: 0.578..  Training Accuracy:  50.012..  Validation Accuracy: 50.215
Epoch: 1/3..  Training Loss: 0.538..  Validation Loss: 0.542..  Training Accuracy:  50.006..  Validation Accuracy: 49.863
Epoch: 2/3..  Training Loss: 0.495..  Validation Loss: 0.516..  Training Accuracy:  50.010..  Validation Accuracy: 50.039
Traceback (most recent call last):
  File "Classifier.py", line 153, in <module>
    outputs = model(images)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "Classifier.py", line 61, in forward
    out = self.relu(self.fc1(out))
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/modules/linear.py", line 87, in forward
    return F.linear(input, self.weight, self.bias)
  File "/usr/local/anaconda3/lib/python3.6/site-packages/torch/nn/functional.py", line 1369, in linear
    ret = torch.addmm(bias, input, weight.t())
RuntimeError: size mismatch, m1: [64 x 360000], m2: [87616 x 1024] at /opt/conda/conda-bld/pytorch_1565272279342/work/aten/src/THC/generic/THCTensorMathBlas.cu:273

Note:- Since the images are of high resolution due to which it takes lot of time to train, I have resized image to 148 * 148

transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                transforms.Resize([148,148]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.5],std=[0.5])
                            ])

Hi,

I ran your code using random numbers for 10 epochs and it did not caused any error. Maybe other parts of your code?

x = torch.randn(32, 3, 148, 148)
model = CNN()

model.train()
for epoch in range(10):
  model(x)

Something that may weird is the memory issues. On my local environment, on deep models, I got mismatch errors in size or channel but not on non-low-end laptops.
Could you try your code on google colab for instance or run on your local system with small size of dataset such as 100 images etc?

Bests, Nik

Yes I know there is some issue which I resolved it using squeeze operation. Below is the entire code for training that I am using.

for epoch in range(num_epochs):
    train_loss = 0
    val_loss = 0
    train_acc = 0
    val_acc = 0
    for images,labels in train_loader:
        images = images.to(dev)
        labels = labels.to(dev)

        outputs = model(images)
        loss = criterion(outputs, labels.type(torch.FloatTensor).to(dev))
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        total = labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        labels = torch.squeeze(labels)
        correct = (predicted == labels).sum().item()
        train_acc += correct / total
    else:
        with torch.no_grad():
            model.eval()
            for images,labels in val_loader:
                images = images.to(dev)
                labels = labels.to(dev)

                val_out = model(images)
                loss = criterion(val_out, labels.type(torch.FloatTensor).to(dev))
                val_loss += loss.item()

                total = labels.size(0)
                labels = torch.squeeze(labels)
                _, predicted = torch.max(val_out.data, 1)
                correct = (predicted == labels).sum().item()
                val_acc += correct / total
        model.train()

    print("Epoch: {}/{}.. ".format(epoch, num_epochs),
              "Training Loss: {:.3f}.. ".format(train_loss/len(train_loader)),
              "Validation Loss: {:.3f}.. ".format(val_loss/len(val_loader)),
              "Training Accuracy: {: .3f}.. ".format(train_acc/len(train_loader)*100),
              "Validation Accuracy: {:.3f}".format(val_acc/len(val_loader)*100))

    train_losses.append(train_loss/len(train_loader))
    val_losses.append(val_loss/len(val_loader))
    train_accs.append(train_acc/len(train_loader)*100)
    val_accs.append(val_acc/len(val_loader)*100)
                                                                                                                                     

I am not sure but is that the resize is not working?

I tried on my local machine as well but giving me same error.

Issue got resolved please ignore it :slight_smile:

Could you share how it was resolved at least ?

how did you resolve it? ty!