RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 302 and 253 in dimension 3 at /pytorch/aten/src/TH/generic/THTensor.cpp:612

I am loading my images using DataLoader like this:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision import datasets, transforms

batch_size = 32
n_iters = 50

def load_images(image_size=224, batch_size=batch_size, root=dir):

    transform = transforms.Compose([
                    transforms.Resize(image_size),
                    transforms.ToTensor()])

    train_set = datasets.ImageFolder(root = root +'/train', transform=transform)
    valid_set = datasets.ImageFolder(root = root +'/validation', transform=transform)
    test_set = datasets.ImageFolder(root = root +'/test', transform=transform)

    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=False, num_workers=0)
    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, shuffle=False, num_workers=0)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0)


    return train_set, train_loader, valid_set, valid_loader, test_set, test_loader


train_set, train_loader, valid_set, valid_loader, test_set, test_loader = load_images(image_size = 224, batch_size = batch_size, root = dir)

This is my CNN:

class CustomConvNet(nn.Module):
    def __init__(self, num_classes):
        super(CustomConvNet, self).__init__()

        self.layer1 = self.conv_module(3, 64)
        self.layer2 = self.conv_module(64, 128)
        self.layer3 = self.conv_module(128, 256)
        self.layer4 = self.conv_module(256, 256)
        self.layer5 = self.conv_module(256, 512)
        self.gap = self.global_avg_pool(512, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.gap(out)
        out = out.view(-1, 4)

        return out

    def conv_module(self, in_num, out_num):
        return nn.Sequential(
            nn.Conv2d(in_num, out_num, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_num),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

    def global_avg_pool(self, in_num, out_num):
        return nn.Sequential(
            nn.Conv2d(in_num, out_num, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_num),
            nn.LeakyReLU(),
            nn.AdaptiveAvgPool2d((1, 1)))

model = CustomConvNet(num_classes=4).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

This is how I am training the network:

def train(epoch):
    model.train()
    tr_loss = 0
    correct = 0
    total = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        if torch.cuda.is_available():
            data = data.cuda()
            target = target.cuda()
            
        # Clearing the Gradients of the model parameters
        optimizer.zero_grad()
        output = model(data)
        pred = torch.max(output.data, 1)[1]
        correct += (pred == target).sum()
        total += len(data)
        
        # Computing the loss
        loss = criterion(output, target)
        
        # Computing the updated weights of all the model parameters
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        if (batch_idx + 1)% 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} \t Accuracy: {} %'.format(
                epoch, (batch_idx + 1) * len(data), len(train_loader.dataset),
                100. * (batch_idx + 1) / len(train_loader), loss.item(),100 * correct / total))
            torch.save(model.state_dict(), './model.pth')
            torch.save(optimizer.state_dict(), './optimizer.pth')
    train_loss.append(tr_loss / len(train_loader))
    train_accuracy.append(100 * correct / total)

def evaluate(data_loader):
    model.eval()
    loss = 0
    correct = 0
    total = 0
    for data, target in data_loader:
        data, target = Variable(data, volatile=True), Variable(target)
        if torch.cuda.is_available():
            data = data.cuda()
            target = target.cuda()
        
        output = model(data)
        loss += F.cross_entropy(output, target, size_average=False).item()
        pred = torch.max(output.data, 1)[1]
        total += len(data)
        correct += (pred == target).sum()
    loss /= len(data_loader.dataset)
    valid_loss.append(loss)    
    valid_accuracy.append(100 * correct / total)
    print('\nAverage Validation loss: {:.5f}\tAccuracy: {} %'.format(loss, 100 * correct / total))

But when I call the train and evaluate functions like this:

n_epochs = 20
train_loss = []
train_accuracy = []
valid_loss = []
valid_accuracy = []
for epoch in range(n_epochs):
    train(epoch)
    evaluate(test_loader)

This error pops up:
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 302 and 253 in dimension 3 at /pytorch/aten/src/TH/generic/THTensor.cpp:612

RuntimeError                              Traceback (most recent call last)
<ipython-input-49-3eb1f7459ac0> in <module>()
      5 valid_accuracy = []
      6 for epoch in range(n_epochs):
----> 7     train(epoch)
      8     evaluate(test_loader)

6 frames
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
     53             storage = elem.storage()._new_shared(numel)
     54             out = elem.new(storage)
---> 55         return torch.stack(batch, 0, out=out)
     56     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
     57             and elem_type.__name__ != 'string_':

The images should all be 224x224 and the batch size is 32.

What went wrong?

Based on the stack trace, it seems the image tensors returned by your Dataset do not all have the same shape, which is why the collate function raises this error: default_collate tries to torch.stack all samples of a batch into a single tensor, and that only works if every sample has the same shape.
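For illustration, here is a minimal sketch of what default_collate effectively does; the sample shapes are taken from your error message and the tensors are just random stand-ins:

import torch

# two "images" whose smaller edge was resized to 224, but whose widths differ
a = torch.randn(3, 224, 302)
b = torch.randn(3, 224, 253)

# default_collate stacks the samples into one batch tensor;
# with mismatched widths this line raises a RuntimeError like the one above
batch = torch.stack([a, b], 0)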

Could you pass the image_size argument as a tuple to Resize as:

transforms.Resize((image_size, image_size))

This will make sure the returned images have the specified height and width.
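In your load_images function that would just change the transform to:

transform = transforms.Compose([
                transforms.Resize((image_size, image_size)),
                transforms.ToTensor()])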
From the docs:

Desired output size. If size is a sequence like (h, w), output size will be matched to this. If size is an int, smaller edge of the image will be matched to this number. i.e, if height > width, then image will be rescaled to (size * height / width, size)
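As a quick sanity check, you could compare the two calls on a single non-square image; the 500x375 input size below is just a made-up example:

from PIL import Image
from torchvision import transforms

img = Image.new('RGB', (500, 375))  # hypothetical non-square image, size is (width, height)

print(transforms.Resize(224)(img).size)         # (298, 224) -> smaller edge matched, width still varies per image
print(transforms.Resize((224, 224))(img).size)  # (224, 224) -> exact shape, safe to stack into batches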
