I want to create and train an autoencoder to extract features and then use those features for clustering algorithms. Right now I am getting an error while calculating the loss:
RuntimeError: The size of tensor a (224) must match the size of tensor b (244) at non-singleton dimension 3
and a warning
UserWarning: Using a target size (torch.Size([1, 3, 224, 244])) that is different to the input size (torch.Size([1, 3, 224, 224])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
Can anyone tell me what is wrong here? To me the input and output sizes in the warning and error look the same, but it says they are different.
The model summary and the printed input/output image shapes are as follows:
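For reference, here is a minimal sketch (random tensors, not my real data) that reproduces the same warning and error from nn.MSELoss when the two shapes differ:

import torch
import torch.nn as nn

criterion = nn.MSELoss()

output = torch.randn(1, 3, 224, 224)  # same shape as my model output
target = torch.randn(1, 3, 224, 244)  # same shape as my transformed image
loss = criterion(output, target)      # UserWarning about the target size, then the RuntimeError above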
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 16, 112, 112] 448
ReLU-2 [-1, 16, 112, 112] 0
Conv2d-3 [-1, 32, 56, 56] 4,640
ReLU-4 [-1, 32, 56, 56] 0
Conv2d-5 [-1, 64, 18, 18] 100,416
ReLU-6 [-1, 64, 18, 18] 0
Conv2d-7 [-1, 128, 3, 3] 401,536
ReLU-8 [-1, 128, 3, 3] 0
Conv2d-9 [-1, 256, 1, 1] 295,168
ConvTranspose2d-10 [-1, 128, 3, 3] 295,040
ReLU-11 [-1, 128, 3, 3] 0
ConvTranspose2d-12 [-1, 64, 12, 12] 401,472
ReLU-13 [-1, 64, 12, 12] 0
ConvTranspose2d-14 [-1, 24, 28, 28] 75,288
ReLU-15 [-1, 24, 28, 28] 0
ConvTranspose2d-16 [-1, 16, 56, 56] 3,472
ReLU-17 [-1, 16, 56, 56] 0
ConvTranspose2d-18 [-1, 8, 111, 111] 1,160
ReLU-19 [-1, 8, 111, 111] 0
ConvTranspose2d-20 [-1, 3, 224, 224] 603
Sigmoid-21 [-1, 3, 224, 224] 0
================================================================
Total params: 1,579,243
Trainable params: 1,579,243
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 9.94
Params size (MB): 6.02
Estimated Total Size (MB): 16.54
----------------------------------------------------------------
Min Value of input Image = tensor(0.0627)
Max Value of input Image = tensor(0.5098)
Input Image shape = torch.Size([1, 3, 224, 244])
Output Image shape = torch.Size([1, 3, 224, 224])
My autoencoder class is:
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, 3, stride=2, padding=1),    # b, 16, 112, 112
            nn.ReLU(True),
            nn.Conv2d(16, 32, 3, stride=2, padding=1),   # b, 32, 56, 56
            nn.ReLU(True),
            nn.Conv2d(32, 64, 7, stride=3, padding=1),   # b, 64, 18, 18
            nn.ReLU(True),
            nn.Conv2d(64, 128, 7, stride=5, padding=1),  # b, 128, 3, 3
            nn.ReLU(True),
            nn.Conv2d(128, 256, 3, stride=5, padding=1)  # b, 256, 1, 1
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(256, 128, 3),                                        # b, 128, 3, 3
            nn.ReLU(True),
            nn.ConvTranspose2d(128, 64, 7, stride=3, padding=1, output_padding=1),  # b, 64, 12, 12
            nn.ReLU(True),
            nn.ConvTranspose2d(64, 24, 7, stride=2, padding=1, output_padding=1),   # b, 24, 28, 28
            nn.ReLU(True),
            nn.ConvTranspose2d(24, 16, 3, stride=2, padding=1, output_padding=1),   # b, 16, 56, 56
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 3, stride=2, padding=1),                      # b, 8, 111, 111
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 3, 5, stride=2, padding=1, output_padding=1),     # b, 3, 224, 224
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
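As a quick sanity check of the shapes, this sketch (assuming the autoencoder class above, on CPU with a dummy batch) matches the summary:

model = autoencoder()
dummy = torch.randn(1, 3, 224, 224)
latent = model.encoder(dummy)
recon = model.decoder(latent)
print(latent.shape)  # torch.Size([1, 256, 1, 1])
print(recon.shape)   # torch.Size([1, 3, 224, 224])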
and the training code is as follows:
dataset = DatasetLoader('E:/DAL/Dataset/Images', get_transform(train=True))
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset, indices[-50:])

data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=True, num_workers=0)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=0)

model = autoencoder().cuda()
summary(model, (3, 224, 224))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

total_loss = 0
for epoch in range(num_epochs):
    for data in data_loader:
        # print(data)
        img = data
        print("Min Value of input Image = ", torch.min(img))
        print("Max Value of input Image = ", torch.max(img))
        img = Variable(img).cuda()
        # ===================forward=====================
        output = model(img)
        print("Input Image shape = ", img.shape)
        print("Output Image shape = ", output.shape)
        loss = criterion(output, img)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    total_loss += loss.data
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, num_epochs, total_loss))
    if epoch % 10 == 0:
        pic = to_img(output.cpu().data)
        save_image(pic, './dc_img/image_{}.png'.format(epoch))

torch.save(model.state_dict(), './conv_autoencoder.pth')
The Dataset class and transform function are as follows:
def get_transform(train):
    transforms = []
    transforms.append(T.Resize((224, 244)))
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
        transforms.append(T.RandomVerticalFlip(0.5))
    transforms.append(T.ToTensor())
    return T.Compose(transforms)


class DatasetLoader(torch.utils.data.Dataset):
    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        self.imgs = list(sorted(os.listdir(root)))

    def __getitem__(self, idx):
        img_path = os.path.join(self.root, self.imgs[idx])
        img = Image.open(img_path).convert("RGB")
        if self.transforms is not None:
            img = self.transforms(img)
        return img

    def __len__(self):
        return len(self.imgs)
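For context, the clustering step I have in mind is roughly this (a sketch assuming scikit-learn's KMeans and the model/data_loader defined above; n_clusters=5 is just a placeholder):

import numpy as np
from sklearn.cluster import KMeans

model.eval()
features = []
with torch.no_grad():
    for img in data_loader:
        z = model.encoder(img.cuda())            # latent code, shape [1, 256, 1, 1]
        features.append(z.flatten(1).cpu().numpy())

features = np.concatenate(features, axis=0)      # shape [N, 256]
kmeans = KMeans(n_clusters=5, random_state=0).fit(features)  # placeholder cluster count
print(kmeans.labels_)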