RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 224 and 244 in dimension 2

I am trying to do semantic segmentation with two classes.

I have 224x224x3 images and 224x224 binary segmentation masks. I am reshaping the masks to be 224x224x1 (I read somewhere that this is the format that I should pass to the model).

When I try to loop through the train data loader it either runs without errors or I get the following error:

Traceback (most recent call last):
  File "/Users/nikolaykolibarov/Desktop/GATE/rooftop-recognition/rooftop-edge-segmentation-ml/", line 43, in <module>
    for i, sample in enumerate(train_loader):
  File "/Users/nikolaykolibarov/opt/anaconda3/envs/dtcc/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 346, in __next__
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/Users/nikolaykolibarov/opt/anaconda3/envs/dtcc/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/Users/nikolaykolibarov/opt/anaconda3/envs/dtcc/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/Users/nikolaykolibarov/opt/anaconda3/envs/dtcc/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/Users/nikolaykolibarov/opt/anaconda3/envs/dtcc/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 224 and 244 in dimension 2 at /tmp/pip-req-build-9oilk29k/aten/src/TH/generic/THTensor.cpp:689

Most of the reports of this exception that I found online show two different numbers, whereas mine appear to match (Got 224 and 224). I don’t understand this: as far as I know, the exception should only be raised when the sizes are not equal.

The error is intermittent: when I run the script repeatedly, it is raised on some runs and not on others.

I am not sure if I messed something up with the data types and shapes or it is something else.

Here is the code:

import os

import cv2 as cv
from torch.utils.data import Dataset
from torchvision.transforms import transforms

from utils import create_binary_mask, get_labelme_shapes, plot_segmentation_dataset

class RoofEdgesDataset(Dataset):
    """Dataset pairing roof images with binary edge masks.

    Images and annotations are matched purely by position: both directory
    listings are sorted by the integer before the first ``.`` in each file
    name, and item ``i`` uses the ``i``-th entry of each list. The two
    directories must therefore contain corresponding, integer-named files.

    Samples are returned at their native size. Nothing here resizes them, so
    batching with the default collate function requires every image (and
    mask) in the dataset to have the same height and width.

    Args:
        im_path: Directory containing the image files.
        ann_path: Directory containing the annotation files (read through
            ``get_labelme_shapes``; presumably LabelMe JSON - confirm).
        transform: Optional callable applied to the loaded image only; the
            mask is never passed through it.

    Raises:
        ValueError: If a file name in either directory does not start with
            an integer followed by ``.`` (or consist of an integer only).
    """

    @staticmethod
    def _numeric_stem(file_name):
        """Return the integer that precedes the first ``.`` in *file_name*."""
        return int(file_name.split('.')[0])

    def __init__(self, im_path, ann_path, transform=None):
        self.im_path = im_path
        self.ann_path = ann_path
        self.transform = transform

        # Numeric (not lexicographic) order, so "10.png" sorts after "2.png".
        self.im_fn_list = sorted(os.listdir(im_path), key=self._numeric_stem)
        self.ann_fn_list = sorted(os.listdir(ann_path), key=self._numeric_stem)

    def __len__(self):
        """Return the number of image files found in ``im_path``."""
        return len(self.im_fn_list)

    def __getitem__(self, index):
        """Load the image and its mask at position *index*.

        Returns:
            A tuple ``(im, ann)``. ``im`` is the array returned by
            ``cv.imread`` (OpenCV loads colour images in BGR channel order),
            passed through ``self.transform`` when one was given. ``ann`` is
            the mask converted by ``ToTensor`` to a tensor with a leading
            channel dimension of size 1.

        Raises:
            OSError: If OpenCV cannot read the image file.
        """
        im_path = os.path.join(self.im_path, self.im_fn_list[index])
        im = cv.imread(im_path)
        if im is None:
            # cv.imread reports a missing or undecodable file by returning
            # None rather than raising; fail here with the offending path
            # instead of with an obscure error further down.
            raise OSError("Could not read image: {}".format(im_path))

        ann_path = os.path.join(self.ann_path, self.ann_fn_list[index])
        ann = create_binary_mask(im, get_labelme_shapes(ann_path))
        # Add an explicit trailing channel axis (H, W) -> (H, W, 1); ToTensor
        # then moves it to the front, giving a (1, H, W) tensor.
        ann = ann.reshape(ann.shape[0], ann.shape[1], 1)
        ann = transforms.ToTensor()(ann)

        if self.transform:
            im = self.transform(im)

        return im, ann

import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

from roof_edges_dataset import RoofEdgesDataset

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
in_im_shape = (3, 224, 224)
num_classes = 2  # Edge / Non-edge
learning_rate = 0.001
batch_size = 4
n_epochs = 10

# Image preprocessing. The dataset hands the transform a raw ndarray, and
# Normalize only works on tensors, so ToTensor has to come first.
# NOTE(review): these mean/std values are the usual ImageNet statistics in RGB
# order, while cv.imread yields BGR - confirm whether a channel swap is wanted.
transformations = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = RoofEdgesDataset(im_path='data/images', ann_path='data/annotations', transform=transformations)

# Data - two successive 80/20 splits: first (train + val) / test, then
# train / val. Overall that is roughly 64% train, 16% val and 20% test.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

# The default collate function stacks the samples of a batch, so every image
# (and mask) must have identical dimensions or iteration raises RuntimeError.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)


# Model

# Loss and Optimizer

# Train
for epoch in range(n_epochs):
    for batch_idx, (image, annotation) in enumerate(train_loader):
        # Move the batch to the device selected above.
        image = image.to(device)
        annotation = annotation.to(device)


# Evaluate

I would also be very grateful if you could point out anything that is done in a wrong or clumsy way, and how to do it better.

Thanks in advance!

Never mind, I misread the message. I thought it said 224 and 224, but it actually says 224 and 244 (which makes sense now).
Adding a resize transformation fixes the issue, and it is the right approach when several images have mismatched dimensions. In my case only a single image was 224x244, so I fixed that image manually and removed the resize transformation again.