Help with basic semantic segmentation (2 classes - binary)

I am new to PyTorch and I am trying to do semantic segmentation.

I am trying to do semantic segmentation with two classes - Edge and Non-Edge.
I have 224x224x3 images and 224x224 binary segmentation masks. I am reshaping the masks to be 224x224x1 (I read somewhere that this is the format that I should pass to the model). Here is a sample image and mask:

I am open to any model, loss, and optimizer that lets me proceed with the training. I am currently trying torchvision.models.segmentation.fcn_resnet50, which according to the docs can be used for segmentation (I am not sure whether I have to modify it or can use it as it is).

I get the following errors when I try different loss functions:

AttributeError: 'collections.OrderedDict' object has no attribute 'size'

AttributeError: 'collections.OrderedDict' object has no attribute 'log_softmax'

AttributeError: 'collections.OrderedDict' object has no attribute 'dim'

Here is the code:

import os

import cv2
from torch.utils.data import Dataset
from torchvision.transforms import transforms

from utils import create_binary_mask, get_labelme_shapes, plot_segmentation_dataset

class RoofEdgesDataset(Dataset):
    """Paired image / binary-mask dataset for roof-edge segmentation.

    Images are listed from ``im_path`` and annotation files from ``ann_path``.
    Both listings are sorted by the integer that precedes the first ``.`` in
    each file name, and the i-th image is paired with the i-th annotation.

    Args:
        im_path: Directory containing the input images.
        ann_path: Directory containing one annotation file per image.
        transform: Optional callable applied to the image only (the mask is
            always converted with ``ToTensor``).

    Raises:
        ValueError: If the two directories contain a different number of
            files (index-based pairing would silently mismatch samples), or
            if a file name does not start with an integer.
    """

    def __init__(self, im_path, ann_path, transform=None):
        self.im_path = im_path
        self.ann_path = ann_path
        self.transform = transform

        # Numeric sort so that e.g. "10.png" comes after "2.png".
        self.im_fn_list = sorted(os.listdir(im_path), key=self._numeric_stem)
        self.ann_fn_list = sorted(os.listdir(ann_path), key=self._numeric_stem)

        # Samples are paired purely by position, so the counts must agree.
        if len(self.im_fn_list) != len(self.ann_fn_list):
            raise ValueError(
                f"Found {len(self.im_fn_list)} images in {im_path!r} but "
                f"{len(self.ann_fn_list)} annotations in {ann_path!r}"
            )

    @staticmethod
    def _numeric_stem(file_name):
        """Return the integer that precedes the first '.' in ``file_name``."""
        return int(file_name.split('.')[0])

    def __len__(self):
        """Return the number of image files found in ``im_path``."""
        return len(self.im_fn_list)

    def __getitem__(self, index):
        """Return the ``(image, mask)`` pair stored at position ``index``.

        The image is whatever ``cv2.imread`` returns, passed through
        ``self.transform`` when one was given. The mask is built by
        ``create_binary_mask`` and converted to a tensor.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        im_file = os.path.join(self.im_path, self.im_fn_list[index])
        im = cv2.imread(im_file)
        if im is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with a message naming the file.
            raise FileNotFoundError(f"Could not read image: {im_file}")

        ann_file = os.path.join(self.ann_path, self.ann_fn_list[index])
        # NOTE(review): create_binary_mask / get_labelme_shapes are project
        # helpers; presumably they rasterise the labelme shapes into an
        # (H, W) array matching the image size -- confirm in utils.
        ann = create_binary_mask(im, get_labelme_shapes(ann_file))
        # Add a trailing channel axis, (H, W) -> (H, W, 1), which ToTensor
        # then moves to the front, giving a (1, H, W) tensor.
        ann = ann.reshape(ann.shape[0], ann.shape[1], 1)
        ann = transforms.ToTensor()(ann)

        if self.transform:
            im = self.transform(im)

        return im, ann

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch import optim
from torch.utils.data import DataLoader, random_split

from roof_edges_dataset import RoofEdgesDataset

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
in_im_shape = (3, 224, 224)
num_classes = 2  # Edge / Non-edge
learning_rate = 0.001
batch_size = 4
n_epochs = 10

# Data - 60% Train - 20% Val - 20% Test
# ToTensor has to run first: the dataset hands over the NumPy array returned
# by cv2.imread, and Normalize only accepts tensors.
transformations = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = RoofEdgesDataset(im_path='data/images', ann_path='data/annotations', transform=transformations)

# Split once into three parts so the proportions really are 60/20/20
# (splitting 80/20 twice would give 64/16/20 instead).
test_size = int(0.2 * len(dataset))
val_size = int(0.2 * len(dataset))
train_size = len(dataset) - val_size - test_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Only the training data needs to be shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Model
model = torchvision.models.segmentation.fcn_resnet50(pretrained=False, progress=True, num_classes=num_classes)
model = model.to(device)  # inputs are moved to `device`, so the weights must be too

# Loss and Optimizer
# The model emits one logit per class, shape (N, num_classes, H, W), so use
# CrossEntropyLoss with an (N, H, W) tensor of integer class indices.
# BCELoss would need single-channel probabilities instead.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train
model.train()
for epoch in range(n_epochs):
    for batch_idx, (image, annotation) in enumerate(train_loader):
        image = image.to(device)
        # The dataset yields (N, 1, H, W) masks; drop the channel axis and
        # turn them into class indices (0 = non-edge, 1 = edge).
        # NOTE(review): assumes edge pixels are any non-zero mask value --
        # confirm against create_binary_mask.
        annotation = (annotation.squeeze(1) > 0).long().to(device)

        # forward
        # The torchvision segmentation models return an OrderedDict; the
        # per-pixel class logits are stored under the 'out' key.
        output = model(image)['out']
        loss = criterion(output, annotation)

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent (adam step)
        optimizer.step()

        if (batch_idx + 1) % 2 == 0:
            print(
                f'Epoch [{epoch + 1}/{n_epochs}], Step [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Evaluate

How should I proceed? What am I doing wrong? How can I fix it?
Also, any examples, guides, tutorials, references, and everything that will help me solve my issue and understand the topic better is welcome.

The model output is an OrderedDict, while tensors are expected in the loss functions.
Use ['out'] to get the class logits:

import torch
from torchvision import models

model = models.segmentation.fcn_resnet50()
# The forward pass returns an OrderedDict; index it with 'out' to get the
# class-logit tensor that the loss functions expect.
output = model(torch.randn(1, 3, 224, 224))['out']
print(output.shape)
# > torch.Size([1, 21, 224, 224])

Also note that PyTorch uses “channels-first” tensors, so the class (channel) dimension should be dim 1, i.e. the layout is [batch, channels/classes, height, width].

1 Like