Weird results in training FCN-32s in PyTorch

I’m trying to train FCN-32s in PyTorch. I followed this PyTorch implementation [pytorch-fcn] when writing my code, and I train FCN-32s with my own wrapper API. However, the results are not satisfactory: pytorch-fcn reports 63.13 IU after 90K iterations, but with my implementation the results are still very bad even after 100K iterations.

Results in tensorboard(after 100K iterations):

My implementation code:


def get_upsampling_weight(in_channels, out_channels, kernel_size):
    """Make a 2D bilinear kernel suitable for upsampling.

    Args:
        in_channels: Number of input channels of the transposed convolution.
        out_channels: Number of output channels of the transposed
            convolution. Channel ``i`` is paired with channel ``i`` by the
            indexing below, so this is expected to equal ``in_channels``.
        kernel_size: Side length of the square kernel.

    Returns:
        A float tensor of shape
        ``(in_channels, out_channels, kernel_size, kernel_size)`` that holds
        the bilinear filter on the paired channels and zeros elsewhere.
    """
    factor = (kernel_size + 1) // 2
    # The kernel centre sits on a pixel for odd sizes and between two pixels
    # for even sizes.
    if kernel_size % 2 == 1:
        center = factor - 1
    else:
        center = factor - 0.5
    og = np.ogrid[:kernel_size, :kernel_size]
    filt = (1 - abs(og[0] - center) / factor) * \
        (1 - abs(og[1] - center) / factor)
    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size),
                      dtype=np.float64)
    # Only channel pairs (i, i) receive the filter; cross-channel entries
    # stay zero so each class map is upsampled independently.
    weight[range(in_channels), range(out_channels), :, :] = filt
    return torch.from_numpy(weight).float()

class FCN32s(nn.Module):
    """FCN-32s segmentation network built on top of a VGG16 backbone.

    The VGG16 convolutional features are reused, the first two fully
    connected classifier layers are turned into convolutions (``fc6`` and
    ``fc7``), a 1x1 ``score`` convolution predicts per-class maps, and a
    stride-32 transposed convolution upsamples them.

    Args:
        pretrained: When true, VGG16 weights are loaded from
            ``./vgg16_from_caffe.pth`` before they are transplanted.
        num_classes: Number of output channels of the score maps.
    """

    def __init__(self, pretrained=False, num_classes=21):
        super(FCN32s, self).__init__()
        # vgg16 = VGG16(pretrained=True)
        vgg16 = VGG16(pretrained=False)
        if pretrained:
            state_dict = torch.load('./vgg16_from_caffe.pth')
            # Without this call the loaded weights are discarded and the
            # backbone stays randomly initialised.
            vgg16.load_state_dict(state_dict)
        self.features = vgg16.features
        self.features._modules['0'].padding = (100, 100)
        for module in self.features.modules():
            if isinstance(module, nn.MaxPool2d):
                module.ceil_mode = True

        # Fully Connected 6 -> Fully Convolution
        self.fc6 = nn.Conv2d(512, 4096, 7)
        self.relu6 = nn.ReLU(inplace=True)
        self.drop6 = nn.Dropout2d()

        # FC 7
        self.fc7 = nn.Conv2d(4096, 4096, 1)
        self.relu7 = nn.ReLU(inplace=True)
        self.drop7 = nn.Dropout2d()

        self.score = nn.Conv2d(4096, num_classes, 1)

        self.upsample = nn.ConvTranspose2d(
            num_classes, num_classes, 64, 32, bias=False)

        # Init ConvTranspose2d with the bilinear upsampling kernel.
        init_weights = get_upsampling_weight(num_classes, num_classes, 64)
        self.upsample.weight.data.copy_(init_weights)

        # Init FC6 and FC7 from the VGG16 classifier: the linear weights are
        # reshaped to the matching convolution kernel shapes.
        classifier = vgg16.classifier
        for idx, l in zip((0, 3), ('fc6', 'fc7')):
            layer = getattr(self, l)
            vgg16_layer = classifier[idx]
            layer.weight.data.copy_(
                vgg16_layer.weight.data.view(layer.weight.size()))
            layer.bias.data.copy_(
                vgg16_layer.bias.data.view(layer.bias.size()))

    def forward(self, x):
        """Return score maps cropped to the spatial size of ``x``."""
        # size()[2:] is (height, width) for an N x C x H x W input.
        height, width = x.size()[2:]
        x = self.features(x)
        x = self.drop6(self.relu6(self.fc6(x)))
        x = self.drop7(self.relu7(self.fc7(x)))

        x = self.score(x)
        x = self.upsample(x)

        # Crop the upsampled map back to the input size, starting at
        # offset 19 in both spatial dimensions.
        x = x[:, :, 19:19 + height, 19:19 + width].contiguous()

        return x

def gen_voc_dataset(phase, path='/share/datasets/VOCdevkit/VOC2012'):
    """Build the ``VOCClassSegmentation`` dataset for ``phase``.

    Args:
        phase: Dataset split name forwarded to ``VOCClassSegmentation``.
        path: Root directory of the dataset.

    Returns:
        The dataset with its input and target transforms attached.
    """
    # VOC Dataset
    voc_input_trans = T.Compose([
        ToTensor(rescale=False),  # Just ToTensor with no [0, 255] to [0, 1]
        IndexSwap(0, [2, 1, 0]),  # RGB --> BGR
        # NOTE(review): the normalization call head was lost from the paste;
        # these are the BGR means quoted for Caffe-converted VGG models --
        # confirm against the values originally used.
        T.Normalize((103.939, 116.779, 123.68),
                    (1, 1, 1)),
    ])
    voc_target_trans = ToArray()
    dataset = VOCClassSegmentation(
        path, phase,
        input_trans=voc_input_trans, target_trans=voc_target_trans)
    return dataset

def gen_sbd_dataset(phase, path='/share/datasets/SBD/dataset'):
    """Build the ``SBDClassSegmentation`` dataset for ``phase``.

    Args:
        phase: Dataset split name forwarded to ``SBDClassSegmentation``.
        path: Root directory of the dataset.

    Returns:
        The dataset with its input and target transforms attached.
    """
    sbd_input_trans = T.Compose([
        # NOTE(review): the tensor-conversion step is restored here to match
        # the VOC pipeline (unscaled [0, 255] tensor) -- confirm.
        ToTensor(rescale=False),
        IndexSwap(0, [2, 1, 0]),  # RGB --> BGR
        # NOTE(review): the normalization call head was lost from the paste;
        # these are the BGR means quoted for Caffe-converted VGG models --
        # confirm against the values originally used.
        T.Normalize((103.939, 116.779, 123.68),
                    (1, 1, 1)),
    ])
    sbd_target_trans = ToArray()
    dataset = SBDClassSegmentation(
        path, phase,
        input_trans=sbd_input_trans, target_trans=sbd_target_trans)
    return dataset

def make_dataset(phase, ignores=None):
    """Concatenate every dataset produced by a module-level ``gen_*`` factory.

    Every callable global whose name starts with ``gen_`` is treated as a
    dataset factory; the second ``_``-separated part of its name is the
    dataset name (e.g. ``gen_voc_dataset`` -> ``voc``).

    Args:
        phase: Value passed to each factory.
        ignores: Optional iterable of dataset names to skip.

    Returns:
        A ``ConcatDataset`` over all datasets that were not ignored.
    """
    datasets = []
    if ignores is None:
        ignores = []
    ignores = set(ignores)
    # Iterate over a snapshot so a factory that touches module globals cannot
    # invalidate the iteration.
    for key, val in list(globals().items()):
        if key.startswith('gen_') and callable(val):
            dataset_name = key.split('_')[1]
            if dataset_name not in ignores:
                print('Use dataset {} for phase {}'.format(dataset_name, phase))
                d = val(phase)
                # Collect the dataset; without this the concatenation below
                # would always be empty.
                datasets.append(d)
    dataset = ConcatDataset(datasets)
    return dataset

def get_params(model, bias=False):
    """Yield the weights or the biases of every ``nn.Conv2d`` in ``model``.

    Args:
        model: Module whose sub-modules are scanned.
        bias: When true yield bias parameters, otherwise weight parameters.

    Yields:
        The selected parameter of each ``nn.Conv2d`` sub-module. Layers
        created without a bias are skipped when ``bias`` is true.
    """
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            if bias:
                # Bias-free convolutions expose ``None``, which an optimizer
                # cannot accept as a parameter.
                if m.bias is not None:
                    yield m.bias
            else:
                yield m.weight

class CrossEntropyLoss2d(nn.Module):
    """Cross-entropy loss over per-pixel class scores.

    Log-softmax is taken over dimension 1 of ``inputs`` and the result is
    passed to a negative log-likelihood loss together with ``targets``.

    Args:
        weight: Optional per-class weight forwarded to the NLL loss.
        size_average: When true the loss is averaged, otherwise it is summed.
        ignore_index: Target value that does not contribute to the loss.
    """

    def __init__(self, weight=None, size_average=True, ignore_index=255):
        super(CrossEntropyLoss2d, self).__init__()
        # ``nn.NLLLoss`` accepts N x C x H x W input directly; keyword
        # arguments avoid relying on the positional order of its signature.
        self.nll_loss = nn.NLLLoss(
            weight=weight,
            ignore_index=ignore_index,
            reduction='mean' if size_average else 'sum')

    def forward(self, inputs, targets):
        """Return the loss for class scores ``inputs`` and label ``targets``."""
        # The class dimension is stated explicitly instead of relying on the
        # implicit (deprecated) dimension choice of ``log_softmax``.
        return self.nll_loss(F.log_softmax(inputs, dim=1), targets)

import argparse

import torch.optim as optim
import torch.nn as nn

from torch.utils.data import DataLoader
from torchtools.trainer import ModelTrainer
from torchtools.callbacks import ModelCheckPoint
from torchtools.callbacks import TensorBoardLogger
from torchtools.meters import FixSizeLossMeter, EpochLossMeter
from torchtools.meters import EpochIoUMeter, BatchIoUMeter, FixSizeIoUMeter
from torchtools.meters import SemSegVisualizer
from torchtools.loss import CrossEntropyLoss2d

from datasets import make_dataset
from model import FCN32s as Model
from utils import get_params


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` turns every non-empty string (including ``"False"``) into
    ``True``; this accepts the usual textual spellings instead.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


parser = argparse.ArgumentParser()
parser.add_argument('--EPOCHS', type=int, default=200)
parser.add_argument('--BATCH_SIZE', type=int, default=1)
parser.add_argument('--LR_RATE', type=float, default=1e-10)
parser.add_argument('--MOMENTUM', type=float, default=0.99)
parser.add_argument('--WEIGHT_DECAY', type=float, default=5e-4)
parser.add_argument('--NUM_WORKERS', type=int, default=4)
# NOTE(review): the original default path was lost from the paste; this is a
# placeholder -- set it to the intended output directory.
parser.add_argument('--OUTPUT_PATH', type=str,
                    default='./output')
parser.add_argument('--PIN_MEMORY', type=_str2bool, default=True)
parser.add_argument('--SHUFFLE', type=_str2bool, default=True)
parser.add_argument('--DEVICE_ID', type=int, default=0)
parser.add_argument('--USE_CUDA', type=_str2bool, default=True)
parser.add_argument('--DATA_PARALLEL', type=_str2bool, default=False)

args = parser.parse_args()

train_set = make_dataset('train', ignores=['voc'])
val_set = make_dataset('val', ignores=['sbd'])

train_loader = DataLoader(train_set, args.BATCH_SIZE, shuffle=args.SHUFFLE,
                          num_workers=args.NUM_WORKERS,
                          pin_memory=args.PIN_MEMORY)

val_loader = DataLoader(val_set, args.BATCH_SIZE, shuffle=args.SHUFFLE,
                        num_workers=args.NUM_WORKERS,
                        pin_memory=args.PIN_MEMORY)

model = Model(pretrained=True)
# The default learning rate of 1e-10 only makes sense for a loss summed over
# all pixels; with a pixel-averaged loss the updates are vanishingly small
# and training barely moves.
criterion = CrossEntropyLoss2d(size_average=False)
if args.USE_CUDA:
    model = model.cuda(args.DEVICE_ID)
    criterion = criterion.cuda(args.DEVICE_ID)
    if args.DATA_PARALLEL:
        model = nn.DataParallel(model)

# Biases get twice the learning rate and no weight decay.
optimizer = optim.SGD([
    {'params': get_params(model, bias=False)},
    {'params': get_params(model, bias=True), 'lr': args.LR_RATE * 2,
     'weight_decay': 0},
], lr=args.LR_RATE, momentum=args.MOMENTUM, weight_decay=args.WEIGHT_DECAY)

trainer = ModelTrainer(model, train_loader, criterion, optimizer, val_loader,
                       use_cuda=args.USE_CUDA, device_id=args.DEVICE_ID)

checkpoint = ModelCheckPoint(args.OUTPUT_PATH, 'val_loss', save_best_only=True)

train_loss_meter = FixSizeLossMeter('loss', 'train', 20)
val_loss_meter = EpochLossMeter('val_loss', 'validate')
val_iou_meter = EpochIoUMeter('val_IoU', 'validate', num_classes=21)
train_iou_meter = FixSizeIoUMeter('train_IoU', 'train', 20, num_classes=21)
train_epoch_iou_meter = EpochIoUMeter('train_IoU_epoch', 'train',
                                      num_classes=21)
ss_visualizer = SemSegVisualizer('Prediction', 'train', 'voc',
                                 300 // args.BATCH_SIZE)

tb_logger = TensorBoardLogger(args.OUTPUT_PATH)

trainer.register_hooks([train_loss_meter, val_loss_meter, ss_visualizer,
                        checkpoint, val_iou_meter, train_iou_meter,
                        train_epoch_iou_meter, tb_logger])

# NOTE(review): the call that starts training was missing from the paste;
# presumably the trainer is run for EPOCHS epochs -- confirm the method name
# against the torchtools version in use.
trainer.train(args.EPOCHS)


Could someone give me some hints about what I am doing wrong? :persevere: :persevere: Thanks.

This might be because of our VGG model (I have heard some reports that fine-tuning it gives lower accuracy).

It is the Caffe model converted directly into PyTorch format.

These models expect different preprocessing than the other models in the PyTorch model zoo. Images should be in BGR format in the range [0, 255], and the following BGR values should then be subtracted from each pixel: [103.939, 116.779, 123.68]

1 Like

Currently I’m using what pytorch-fcn uses: a pretrained VGG16 model in PyTorch format.

    vgg16 = VGG16(pretrained=False)
    if pretrained:
        state_dict = torch.load('./vgg16_from_caffe.pth')

But the performance is much worse than pytorch-fcn’s implementation.

Yes, I understand; that’s why I am suggesting an alternative.

Maybe I did not write this clearly: BGR input in [0, 255] with the BGR mean subtracted, together with a pretrained Caffe-converted VGG16 model, is exactly what I’m using. I also tried RGB input in [0, 1] with the pretrained VGG16 model from torchvision.models; neither of them worked well.

Oh I see. Sorry, I missed that.

Maybe this repo comes at a good time:
Dilated ResNets for Semantic Segmentation with good results.

1 Like