Changing the backbone

I’m trying to change the backbone from EfficientNet to a Mix Vision Transformer, but I got this error:

ValueError: not enough values to unpack (expected 4, got 3)

Any suggestions on how to solve it?
The relevant snippet is:

import torch
from torch import nn
import timm

from hybridnets.model import BiFPN, Regressor, Classifier, BiFPNDecoder
from utils.utils import Anchors
from hybridnets.model import SegmentationHead

from encoders import get_encoder
from utils.constants import *

class HybridNetsBackbone(nn.Module):
def init(self, num_classes=80, compound_coef=0, seg_classes=1, backbone_name=None, seg_mode=MULTICLASS_MODE, onnx_export=False, **kwargs):
super(HybridNetsBackbone, self).init()
self.compound_coef = compound_coef

    self.seg_classes = seg_classes
    self.seg_mode = seg_mode

    self.backbone_compound_coef = [0, 1, 2, 3, 4, 5, 6, 6, 7]
    self.fpn_num_filters = [64, 88, 112, 160, 224, 288, 384, 384, 384]
    self.fpn_cell_repeats = [3, 4, 5, 6, 7, 7, 8, 8, 8]
    self.input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
    self.box_class_repeats = [3, 3, 3, 4, 4, 4, 5, 5, 5]
    self.pyramid_levels = [5, 5, 5, 5, 5, 5, 5, 5, 6]
    self.anchor_scale = [1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,]
    # self.anchor_scale = [2.,2.,2.,2.,2.,2.,2.,2.,2.,]
    self.aspect_ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)])
    self.num_scales = len(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]))
    conv_channel_coef = {
        # the channels of P3/P4/P5.
        0: [40, 112, 320],
        1: [40, 112, 320],
        2: [48, 120, 352],
        3: [48, 136, 384],
        4: [56, 160, 448],
        5: [64, 176, 512],
        6: [72, 200, 576],
        7: [72, 200, 576],
        8: [80, 224, 640],

    self.onnx_export = onnx_export
    num_anchors = len(self.aspect_ratios) * self.num_scales

    self.bifpn = nn.Sequential(
                True if _ == 0 else False,
                attention=True if compound_coef < 6 else False,
                use_p8=compound_coef > 7,
          for _ in range(self.fpn_cell_repeats[compound_coef])])

    self.num_classes = num_classes
    self.regressor = Regressor(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,

    '''Modified by Dat Vu'''
    # self.decoder = DecoderModule()
    self.bifpndecoder = BiFPNDecoder(pyramid_channels=self.fpn_num_filters[self.compound_coef])

    self.segmentation_head = SegmentationHead(
        out_channels=1 if self.seg_mode == BINARY_MODE else self.seg_classes+1,

    self.classifier = Classifier(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,

    if backbone_name:
        self.encoder = timm.create_model(backbone_name, pretrained=True, features_only=True, out_indices=(2,3,4))  # P3,P4,P5
        # EfficientNet_Pytorch
        self.encoder = get_encoder(
            'efficientnet-b' + str(self.backbone_compound_coef[compound_coef]),

    self.anchors = Anchors(anchor_scale=self.anchor_scale[compound_coef],
                           pyramid_levels=(torch.arange(self.pyramid_levels[self.compound_coef]) + 3).tolist(),
    if onnx_export:
        ## TODO: timm


def freeze_bn(self):
    """Put every ``nn.BatchNorm2d`` submodule into evaluation mode.

    Walks ``self.modules()`` and calls ``eval()`` on each batch-norm layer
    only; the training flag of every other submodule is left untouched.
    """
    for m in self.modules():
        if isinstance(m, nn.BatchNorm2d):
            # Without a statement here the ``if`` has no body, so the
            # function could not even be parsed.
            m.eval()

def forward(self, inputs):
    """Run the encoder, BiFPN, decoder and the three heads on ``inputs``.

    The last four feature maps returned by ``self.encoder`` are unpacked as
    ``p2, p3, p4, p5``. ``(p3, p4, p5)`` is passed through ``self.bifpn``,
    whose result is unpacked into five maps and fed, together with ``p2``,
    to ``self.bifpndecoder``.

    Returns:
        ``(features, regression, classification, anchors, segmentation)``
        when ``self.onnx_export`` is falsy, otherwise
        ``(regression, classification, segmentation)``.

    Raises:
        ValueError: if the encoder yields fewer than four feature maps
            ("not enough values to unpack (expected 4, got 3)"), or if
            ``self.bifpn`` does not return exactly five maps.
    """
    # p1, p2, p3, p4, p5 = self.backbone_net(inputs)
    # The slice keeps at most four entries; an encoder configured to emit
    # only three feature maps makes this unpacking fail.
    p2, p3, p4, p5 = self.encoder(inputs)[-4:]  # self.backbone_net(inputs)

    features = (p3, p4, p5)

    features = self.bifpn(features)
    p3, p4, p5, p6, p7 = features
    outputs = self.bifpndecoder((p2, p3, p4, p5, p6, p7))

    segmentation = self.segmentation_head(outputs)
    regression = self.regressor(features)
    classification = self.classifier(features)
    anchors = self.anchors(inputs, inputs.dtype)

    if not self.onnx_export:
        return features, regression, classification, anchors, segmentation
    # Export path: previously unreachable because this return was nested
    # under the ``if`` above, so the function fell through and returned None.
    return regression, classification, segmentation
def initialize_decoder(self, module):
    """Re-initialise the parameters of every submodule of ``module``.

    * ``nn.Conv2d``: weight drawn with Kaiming-uniform (``fan_in``, ReLU
      gain); bias, when present, set to 0.
    * ``nn.BatchNorm2d``: weight set to 1 and bias set to 0.
    * ``nn.Linear``: bias, when present, set to 0; weight left unchanged.

    Any other layer type is skipped. Parameters are modified in place and
    nothing is returned.
    """
    for layer in module.modules():
        if isinstance(layer, nn.Conv2d):
            nn.init.kaiming_uniform_(layer.weight, mode="fan_in", nonlinearity="relu")
        elif isinstance(layer, nn.BatchNorm2d):
            nn.init.constant_(layer.weight, 1)
        elif not isinstance(layer, nn.Linear):
            # Not one of the three handled layer kinds.
            continue

        # Shared tail for all three handled kinds: clear the bias if any.
        if layer.bias is not None:
            nn.init.constant_(layer.bias, 0)

def initialize_head(self, module):
    """Zero the bias of every ``nn.Linear`` / ``nn.Conv2d`` inside ``module``.

    Layers of other types, layers without a bias, and all weights are left
    as they are. Parameters are modified in place; nothing is returned.
    """
    head_layer_types = (nn.Linear, nn.Conv2d)
    for layer in module.modules():
        if not isinstance(layer, head_layer_types):
            continue
        bias = layer.bias
        if bias is None:
            continue
        nn.init.constant_(bias, 0)

It’s unclear where the error is coming from, and your code is unfortunately not executable.
However, based on the model definition, I would recommend checking the output of HybridNetsBackbone.forward, as it returns 3 values if self.onnx_export is set (and 5 otherwise), while your training code might expect 4.

Sorry for the late reply. I tried to fix the output of HybridNetsBackbone.forward as shown below:

def forward(self, inputs):
    """Forward pass that accepts an encoder emitting three or more feature maps.

    When ``self.encoder`` returns exactly three maps they are used as
    ``p3, p4, p5`` and ``p2`` is treated as unavailable (``None``);
    otherwise the last four maps are unpacked as ``p2, p3, p4, p5``.
    ``self.bifpndecoder`` receives ``p2`` only when it exists.

    Returns:
        ``(features, regression, classification, anchors, segmentation)``
        when ``self.onnx_export`` is falsy, otherwise
        ``(regression, classification, segmentation)``.

    NOTE(review): this only adapts the *number* of feature maps. The BiFPN
    input convolutions are presumably still sized for the original
    backbone's channel widths, so a different encoder can still fail with
    a channel-mismatch error - confirm against the model constructor.
    """
    # Adjust the encoder output
    encoder_features = self.encoder(inputs)

    if len(encoder_features) == 3:
        # 3 feature maps
        p3, p4, p5 = encoder_features[-3:]
        p2 = None  # p2 is not available
    else:
        p2, p3, p4, p5 = encoder_features[-4:]

    features = (p3, p4, p5)

    features = self.bifpn(features)

    p3, p4, p5, p6, p7 = features

    if p2 is not None:
        outputs = self.bifpndecoder((p2, p3, p4, p5, p6, p7))
    else:
        # No p2 from the encoder: pass only the five BiFPN maps.
        outputs = self.bifpndecoder((p3, p4, p5, p6, p7))

    segmentation = self.segmentation_head(outputs)

    regression = self.regressor(features)
    classification = self.classifier(features)
    anchors = self.anchors(inputs, inputs.dtype)

    if not self.onnx_export:
        return features, regression, classification, anchors, segmentation
    return regression, classification, segmentation

But I got this error:

RuntimeError: Given groups=1, weight of size [160, 384, 1, 1], expected input[8, 960, 12, 20] to have 384 channels, but got 960 channels instead

Even when I tried to change the HybridNets backbone to one of the existing encoders in the code base, I got the same error message as before:

ValueError: not enough values to unpack (expected 4, got 3)