Unet training Error: The size of tensor a (16) must match the size of tensor b (6) at non-singleton dimension 1

I’m trying to train a Unit model on LandCoverNet dataset, which is a satellite imagery dataset that contains input images and corresponding land cover type masks.
I have created a custom dataset to get my images and masks:

# Create custom dataset that accepts 4 channels images
from torch.utils.data import Dataset, DataLoader, sampler
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import os
import numpy as np
import rasterio as rio
from torchvision import transforms, datasets, models
# We have two dir: inputs(folder for each image) and tatgets

class LandCoverNetDataset(BaseDataset):
  
  CLASSES = ['otherland', 'cropland', 'pastureland', 'bare soil', 'openwater', 'forestland']

  def __init__(self, inputs_dir, targets_dir, 
               classes = None,  
               augmentation=None , 
               preprocessing = False,
               pytorch=True):
    
    super().__init__()
    self.samples = []
    self.pytorch = pytorch
    self.augmentation = augmentation
    self.preprocessing = preprocessing

    # Convert str names to class values on masks
    self.class_value = [self.CLASSES.index(cls.lower()) for cls in classes]

    # Create dictionary for images and targets
    for sub_dir in os.listdir(inputs_dir):
      files = {}
      files = {
         'img_bands' : os.path.join(inputs_dir, sub_dir),
          'target' : os.path.join(targets_dir, sub_dir[:13] + "_LC_10m.png")
        }
      self.samples.append(files)



                                       
  def __len__(self):
    return len(self.samples)
  
  def normalize(self, band):
    
    '''Notmalize a numpy array to have values between 0 and 1'''  
    band_min, band_max = band.min(), band.max()
    np.seterr(divide='ignore', invalid='ignore')
    normalized_band = ((band - band_min)/(band_max - band_min))
    #Remove any nan value and subtitute by zero
    where_are_NaNs = isnan(normalized_band)
    normalized_band[where_are_NaNs] = 0
    return normalized_band


  def open_as_array(self, idx, include_ndvi = False):
    '''
      Merge the 4 bands into one image and normalize the bands
    '''
    # List indivisual bands in each image folder
    # Stack them togather
    list_bands = []
    for img_file in os.listdir(self.samples[idx]['img_bands']):
      # Get the ndvi band
      if 'NDVI' in img_file:
        ndvi_band = os.path.join(self.samples[idx]['img_bands'], img_file)
      else:
        # Get the rgb bands
        band = rio.open(os.path.join(self.samples[idx]['img_bands'], img_file)).read(1)

        if self.preprocessing:
          # preprocess the bands before stacking them (only rgb)
          band = self.normalize(band)
        list_bands.append(band)

    # Stack the bands 
    raw_rgb = np.stack(list_bands, axis=2).astype('float32')

    if include_ndvi:
      # Include the NDVI band in the input images
      ndvi = np.expand_dims(rio.open(ndvi_band).read(1).astype('float32'), 2)
      raw_rgb = np.concatenate([raw_rgb, ndvi], axis=2)

    if self.augmentation:
      transformed = self.augmentation(image = raw_rgb)
      raw_rgb  = transformed["image"]

    if self.preprocessing:
      # transpose to tensor shape
      raw_rgb = raw_rgb.transpose((2,0,1)).astype('float32')
    
    return raw_rgb

  def open_mask(self, idx):
    # Extract certain classes from mask

    mask = cv2.imread(self.samples[idx]['target'], 0)
    masks = [(mask == v) for v in self.class_value]
    mask = np.stack(masks, axis=-1).astype('long')

    if self.augmentation:
      transformed = self.augmentation(image = mask)
      mask  = transformed["image"]
    
    if self.preprocessing:
      # preprocess the mask
      mask = self.normalize(mask)
      # transpose to tensor shape
      mask = mask.transpose((2, 0, 1)).astype('long')
      mask = mask[0, :, :]
    return mask
  
  def __getitem__(self, idx):
    x = torch.tensor(self.open_as_array(idx, include_ndvi=True), dtype=torch.float)
    y = torch.tensor(self.open_mask(idx), dtype=torch.long)

    return x, y
    
  def open_as_pil(self, idx):
    arr = 256*self.open_as_array(idx)   
    return Image.fromarray(arr.astype(np.uint8), 'RGB')

  def __repr__(self):
    s = 'Dataset class with {} files'.format(self.__len__())
    return s


The input here is 4 bands.
This is the shape of the first batch for both input/target

torch.Size([16, 4, 224, 224])
torch.Size([16, 224, 224])

I’m using a model from segmentation-models-pytorch library, and here is how I customized it for my case:


ENCODER = 'se_resnext50_32x4d'
ENCODER_WEIGHTS = 'imagenet'
ACTIVATION = 'softmax2d'
DEVICE = 'cuda'

model = smp.FPN(ENCODER, classes=len(CLASSES), activation=ACTIVATION)

# Replace the model.conv1 to accept 4 channels
# first: copy the layer's weights
weight = model.encoder.layer0.conv1.weight.clone()
model.encoder.layer0.conv1 = nn.Conv2d(4, 64,kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
with torch.no_grad():
  model.encoder.layer0.conv1.weight[:, :3] = weight
  model.encoder.layer0.conv1.weight[:, 3] = model.encoder.layer0.conv1.weight[:, 0]

loss = smp.utils.losses.NLLLoss()
metrics = [
    smp.utils.metrics.IoU(threshold=0.5),
]
optimizer = torch.optim.SGD([ 
    dict(params=model.parameters(), lr=0.001, weight_decay=1e-8, momentum=0.9),
])


# create epoch runners 
# it is a simple loop of iterating over dataloader`s samples
train_epoch = smp.utils.train.TrainEpoch(
    model, 
    loss=loss, 
    metrics=metrics, 
    optimizer=optimizer,
    device=DEVICE,
    verbose=True,
)

valid_epoch = smp.utils.train.ValidEpoch(
    model, 
    loss=loss, 
    metrics=metrics, 
    device=DEVICE,
    verbose=True,
)


# train model for 40 epochs

And here is my training loop


# train model for 40 epochs

max_score = 0

for i in range(0, 40):
    
    print('\nEpoch: {}'.format(i))
    train_logs = train_epoch.run(train_loader)
    valid_logs = valid_epoch.run(valid_loader)
    
    # do something (save model, change lr, etc.)
    if max_score < valid_logs['iou_score']:
        max_score = valid_logs['iou_score']
        torch.save(model, './best_model.pth')
        print('Model saved!')
        
    if i == 25:
        optimizer.param_groups[0]['lr'] = 1e-5
        print('Decrease decoder learning rate to 1e-5!')

At first, the target shape was [16, 6, 224, 224] but I had an error and found this thread that it should be [batch_size, height, width]
That’s why I added this line in the Dataset class : mask = mask[0, :, :]
to get ride of the number of classes dim, and here where things get confusing for me, because the output of me model is torch.Size([10, 6, 224, 224]).

This is the entire error message:

Epoch: 0
train:   0%|          | 0/157 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-215-2ae39e205dee> in <module>()
      7 
      8     print('\nEpoch: {}'.format(i))
----> 9     train_logs = train_epoch.run(train_loader)
     10     valid_logs = valid_epoch.run(valid_loader)
     11 

3 frames
/usr/local/lib/python3.6/dist-packages/segmentation_models_pytorch/utils/functional.py in iou(pr, gt, eps, threshold, ignore_channels)
     32     pr, gt = _take_channels(pr, gt, ignore_channels=ignore_channels)
     33 
---> 34     intersection = torch.sum(gt * pr)
     35     union = torch.sum(gt) + torch.sum(pr) - intersection + eps
     36     return (intersection + eps) / union

RuntimeError: The size of tensor a (16) must match the size of tensor b (6) at non-singleton dimension 1

I have been working on this for days now and would appreciate any help!
Thanks a lot!

I understand that it is a 6-class segmentation problem?
I am not sure what the initial target shape [16, 6, 224, 224] means. Is it [batch size, num classes (one hot), H, W]?
If the target was one hot, you should do mask = torch.argmax(mask, dim=1).

This is confusing now. If the batch size is 16, the output of the model should have been [16, 6,224,224].

Thanks a lot! Yes the output is [16, 6, 224, 224]. The Tensor before was just for test and I specified the batch size to be 10.
But I’m not sure where to put the mask = torch.argmax(mask, dim=1)

if you are sure that mask contains one hot encoding of classes, you can replace mask = mask[0, :, :] with mask = torch.argmax(mask, dim=1).

1 Like

Just to answer your question, the second dimension [6] represents each class. Meaning if I iterate through this dim and plot the image I’ll get 6 different images, each has the corresponding class for the input image.
I know what one hot means, but I’m not sure if in this case, it considered a one-hot encoding. At least I don’t think so.
Sorry if this doesn’t make sense.
And to give you some contact, I used this code as a base, but there, we have only 1 class while mine is 6.

By the way I performed it selecting one class and it worked fine.
Thanks a lot

OK, I changed the loss function to smp.utils.losses.DiceLoss(), and I was able to start training my model. I also removed mask = mask[0, :, :].

I also had an issue with my normalization. Here is how I did it:
for input (4 bands):

      for i in range(raw_rgb.shape[0]):
        raw_rgb[i, :, :] = self.normalize(raw_rgb[i, :, :])

And the same for the masks (3 channels)
This was after converting them to tensor.

I would also still want to know how to prepare my masks for CrossEntropyLoss.

Hi! I am trying to use smp.utils.losses.DiceLoss() for two classes and I keep encountering this error
The size of tensor a (58) must match the size of tensor b (57) at non-singleton dimension 3.

This is the entire message:

Epoch: 0
train: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s, dice_loss - 0.4144, iou_score - 0.4247]
valid:   0%|          | 0/10 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-105-0db0875a4c35> in <module>()
      7     print('\nEpoch: {}'.format(i))
      8     train_logs = train_epoch.run(train_loader)
----> 9     valid_logs = valid_epoch.run(valid_loader)
     10 
     11     # do something (save model, change lr, etc.)

6 frames
/root/.local/lib/python3.7/site-packages/segmentation_models_pytorch/fpn/decoder.py in forward(self, x, skip)
     31         x = F.interpolate(x, scale_factor=2, mode="nearest")
     32         skip = self.skip_conv(skip)
---> 33         x = x + skip
     34         return x
     35 

RuntimeError: The size of tensor a (58) must match the size of tensor b (57) at non-singleton dimension 3

Any help would be appreciated.
Many thanks

The stacktrace from the error points to the forward method of the fpn/decoder (not the loss function) and explains that the skip connection is failing with a shape mismatch in dim3 which should be a spatial size. These errors are usually raised if your model isn’t flexible enough to accept different spatial input shapes and depends on e.g. powers of two (or other commonly used shapes such as 224x224).