Missing key error when training with DistributedDataParallel

I trained EfficientDet with DistributedDataParallel:

    model = EfficientDet(num_classes=args.num_class,
                         network=args.network,
                         W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                         D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                         D_class=EFFICIENTDET[args.network]['D_class'])
    if args.resume is not None:
        # load the checkpoint before restoring weights
        checkpoint = torch.load(
            args.resume, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'])
        del checkpoint
    if args.distributed:
        # For multiprocessing distributed, the DistributedDataParallel
        # constructor should always set the single-device scope; otherwise
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            print('Gpu setting...', args.gpu)
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], find_unused_parameters=True)
            print('Run with DistributedDataParallel with device_ids...')

Then, for evaluation, I load the weights with:

    if args.weight is not None:
        resume_path = str(args.weight)
        print("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(
            args.weight, map_location=lambda storage, loc: storage)
        params = checkpoint['parser']
        args.num_class = params.num_class
        args.network = params.network
        model = EfficientDet(
            num_classes=args.num_class,
            network=args.network,
            W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
            D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
            D_class=EFFICIENTDET[args.network]['D_class'],
            is_training=False,
            threshold=args.threshold,
            iou_threshold=args.iou_threshold)
        model.load_state_dict(checkpoint['state_dict'])
    model = model.cuda()

Then I got this error:

Traceback (most recent call last):
  File "demokogas.py", line 150, in <module>
    detect = Detect(weights=args.weight)
  File "demokogas.py", line 74, in __init__
    self.model.load_state_dict(state_dict)
  File "/home/jake/venv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1045, in load_state_dict
    self.__class__.__name__, "\n\t".join(error_msgs)))
RuntimeError: Error(s) in loading state_dict for EfficientDet:
	Missing key(s) in state_dict: "backbone._conv_stem.weight", "backbone._bn0.weight", "backbone._bn0.bias", "backbone._bn0.running_mean", "backbone._bn0.running_var", "backbone._blocks.0._depthwise_conv.weight", ..., "neck.lateral_convs.0.conv.weight", "neck.stack_bifpn_convs.0.w1", ... (every backbone, neck, and BiFPN key in the model is reported missing; full list truncated)

It seems that you saved a state_dict from a single-GPU model and are loading it into your DDP model.
DDP models keep their submodules under .module,
e.g. self.model.module.backbone._conv_stem.
I'd recommend trying to load the state_dict with
self.model.module.load_state_dict(state_dict).

You can find more details in this thread.
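
To see the key mismatch concretely, here is a minimal illustration. It uses nn.DataParallel, which nests parameters under .module the same way DDP does but can be constructed without an initialized process group:

import torch.nn as nn

net = nn.Linear(4, 2)
print(list(net.state_dict().keys()))      # ['weight', 'bias']

wrapped = nn.DataParallel(net)            # same .module nesting as DDP
print(list(wrapped.state_dict().keys()))  # ['module.weight', 'module.bias']

A checkpoint saved from the wrapped model therefore cannot be loaded directly into the bare model, and vice versa.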

This is my EfficientDet. When I used

self.model.module.load_state_dict(state_dict)

it said: torch.nn.modules.module.ModuleAttributeError: 'EfficientDet' object has no attribute 'module'

import torch
import torch.nn as nn
import math
from models.efficientnet import EfficientNet
from models.bifpn import BIFPN
from .retinahead import RetinaHead
from models.module import RegressionModel, ClassificationModel, Anchors, ClipBoxes, BBoxTransform
from torchvision.ops import nms
from .losses import FocalLoss
MODEL_MAP = {
    'efficientdet-d0': 'efficientnet-b0',
    'efficientdet-d1': 'efficientnet-b1',
    'efficientdet-d2': 'efficientnet-b2',
    'efficientdet-d3': 'efficientnet-b3',
    'efficientdet-d4': 'efficientnet-b4',
    'efficientdet-d5': 'efficientnet-b5',
    'efficientdet-d6': 'efficientnet-b6',
    'efficientdet-d7': 'efficientnet-b6',
}


class EfficientDet(nn.Module):
    def __init__(self,
                 num_classes,
                 network='efficientdet-d0',
                 D_bifpn=3,
                 W_bifpn=88,
                 D_class=3,
                 is_training=True,
                 threshold=0.01,
                 iou_threshold=0.5):
        super(EfficientDet, self).__init__()
        self.backbone = EfficientNet.from_pretrained(MODEL_MAP[network])
        self.is_training = is_training
        self.neck = BIFPN(in_channels=self.backbone.get_list_features()[-5:],
                          out_channels=W_bifpn,
                          stack=D_bifpn,
                          num_outs=5)
        self.bbox_head = RetinaHead(num_classes=num_classes,
                                    in_channels=W_bifpn)

        self.anchors = Anchors()
        self.regressBoxes = BBoxTransform()
        self.clipBoxes = ClipBoxes()
        self.threshold = threshold
        self.iou_threshold = iou_threshold
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
        self.freeze_bn()
        self.criterion = FocalLoss()

    def forward(self, inputs):
        if self.is_training:
            inputs, annotations = inputs
        x = self.extract_feat(inputs)
        outs = self.bbox_head(x)
        classification = torch.cat([out for out in outs[0]], dim=1)
        regression = torch.cat([out for out in outs[1]], dim=1)
        anchors = self.anchors(inputs)
        if self.is_training:
            return self.criterion(classification, regression, anchors, annotations)
        else:
            transformed_anchors = self.regressBoxes(anchors, regression)
            transformed_anchors = self.clipBoxes(transformed_anchors, inputs)
            scores = torch.max(classification, dim=2, keepdim=True)[0]
            scores_over_thresh = (scores > self.threshold)[0, :, 0]

            if scores_over_thresh.sum() == 0:
                # no boxes to NMS, just return
                return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)]
            classification = classification[:, scores_over_thresh, :]
            transformed_anchors = transformed_anchors[:, scores_over_thresh, :]
            scores = scores[:, scores_over_thresh, :]
            anchors_nms_idx = nms(
                transformed_anchors[0, :, :], scores[0, :, 0], iou_threshold=self.iou_threshold)
            nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(
                dim=1)
            return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]]

    def freeze_bn(self):
        '''Freeze BatchNorm layers.'''
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.eval()

    def extract_feat(self, img):
        """
            Directly extract features from the backbone+neck
        """
        x = self.backbone(img)
        x = self.neck(x[-5:])
        return x
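
Since this plain EfficientDet is an ordinary nn.Module (so it has no .module attribute), a defensive load, shown here as a sketch rather than code from the thread, works whether or not the model is DDP-wrapped:

# Unwrap only if a DDP/DataParallel wrapper is present.
target = model.module if hasattr(model, 'module') else model
target.load_state_dict(checkpoint['state_dict'])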

How do you set up DistributedDataParallel for evaluation?

    self.model = EfficientDet(num_classes=args.num_class,
                              network=args.network,
                              W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                              D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                              D_class=EFFICIENTDET[args.network]['D_class'])

    #self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[args.gpu], find_unused_parameters=True)
    #self.model = torch.nn.parallel.DistributedDataParallel(self.model)
    self.model = torch.nn.parallel.DistributedDataParallel(
        self.model, output_device=[1])

Oh, I was mistaken. You are already loading the parameters before setting up DDP, so it is not necessary to use self.model.module.load_state_dict.

Would you mind checking whether this is the opposite case? How did you save your state_dict? Did you save the parameters of the DDP model via self.model.state_dict() while isinstance(self.model, torch.nn.parallel.DistributedDataParallel) == True? If so, you may need to wrap the model in DDP first and then load the parameters:

model = ...
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[args.gpu], find_unused_parameters=True)

checkpoint = torch.load(...)

model.load_state_dict(checkpoint['state_dict'])
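
Alternatively, you can avoid the mismatch at save time. A sketch (save_path is a placeholder name): store the unwrapped weights during DDP training so that evaluation can load them into a plain nn.Module without any wrapper:

if isinstance(model, torch.nn.parallel.DistributedDataParallel):
    weights = model.module.state_dict()  # keys carry no 'module.' prefix
else:
    weights = model.state_dict()
torch.save({'state_dict': weights}, save_path)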

I saved the checkpoint like this:

    # `state` is the checkpoint dict (it holds at least 'state_dict' and 'parser')
    torch.save(
        state,
        os.path.join(
            args.save_folder,
            args.dataset,
            args.network,
            "checkpoint_{}.pth".format(epoch)))

and the missing-key error happens with this code:

    self.model = EfficientDet(num_classes=args.num_class,
                              network=args.network,
                              W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                              D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                              D_class=EFFICIENTDET[args.network]['D_class'])

    #self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[args.gpu], find_unused_parameters=True)
    #self.model = torch.nn.parallel.DistributedDataParallel(self.model)
    #self.model = torch.nn.parallel.DistributedDataParallel(
    #    self.model, output_device=[1])

    self.model = self.model.cuda()
    if self.weights is not None:
        print('load state dict...', self.weights)
        checkpoint = torch.load(
            self.weights, map_location=lambda storage, loc: storage)
        state_dict = checkpoint['state_dict']
        self.model.load_state_dict(state_dict)
        #self.model.module.load_state_dict(state_dict)
    if torch.cuda.is_available():
        self.model = self.model.cuda()
    self.model.eval()

I'm not sure how to declare

#self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[args.gpu], find_unused_parameters=True)
#self.model = torch.nn.parallel.DistributedDataParallel(self.model)
#self.model = torch.nn.parallel.DistributedDataParallel(
#    self.model, output_device=[1])

to use model.module. Do I need to declare the model with one of these to load the trained DistributedDataParallel model?

No, what I meant is simply moving the line

model.load_state_dict(checkpoint['state_dict'])

to after the DDP wrapping, not before it. In short, you can try:

model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[args.gpu], find_unused_parameters=True)

model.load_state_dict(checkpoint['state_dict'])

When you wrap your model in DataParallel or DistributedDataParallel, your layers move under .module; for example, self.model.backbone becomes self.model.module.backbone. When you save parameters with self.model.state_dict(), this changed hierarchy is also reflected in the state_dict. Because of that difference, the state_dict will only load when your model is the same DistributedDataParallel class, not the plain nn.Module.
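
If you do want to evaluate without the DDP wrapper, a third option (a common pattern, not specific to this thread) is to rewrite the checkpoint keys so a DDP-saved state_dict loads into the plain model. A minimal sketch, assuming checkpoint['state_dict'] was saved from a DDP-wrapped model:

from collections import OrderedDict

def strip_module_prefix(state_dict):
    # Remove the leading 'module.' that DDP/DataParallel adds to every key.
    cleaned = OrderedDict()
    for key, value in state_dict.items():
        new_key = key[len('module.'):] if key.startswith('module.') else key
        cleaned[new_key] = value
    return cleaned

model.load_state_dict(strip_module_prefix(checkpoint['state_dict']))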

OK, now I understand using .module. This is the part where I construct the model:

    # multi-GPU load
    self.model = EfficientDet(num_classes=args.num_class,
                              network=args.network,
                              W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                              D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                              D_class=EFFICIENTDET[args.network]['D_class'])

    if torch.cuda.is_available():
        self.model = self.model.cuda()

    if args.distributed:
        self.model = self.model.to(args.rank)
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model,
            device_ids=[args.rank],
            output_device=[args.rank],
            find_unused_parameters=True)
        self.model = self.model.module

    if self.weights is not None:
        print('load state dict...', self.weights)
        checkpoint = torch.load(
            self.weights, map_location=lambda storage, loc: storage)
        state_dict = checkpoint['state_dict']
        self.model.load_state_dict(state_dict)
    if torch.cuda.is_available():
        self.model = self.model.cuda()
    self.model.eval()

Then it gives the following error:

  File "/home/jake/venv/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 305, in __init__
    self.process_group = _get_default_group()
  File "/home/jake/venv/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 285, in _get_default_group
    raise RuntimeError("Default process group has not been initialized, "
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

args.rank is 0

  1. The error is related to DDP initialization, not model loading.
     You should initialize a distributed process group before creating a DDP module.
     Since you said you already trained your model with DDP, perhaps the dist.init_process_group call was moved somewhere it shouldn't be.

  2. When loading model parameters after you create the DDP model, it is safest to load the parameters on all processes.

If you want to get a general feel for DDP code, you can refer to this DDP setup and model definition example; see how the state_dict, save, load, and parallelize functions are defined.
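
For reference, here is a minimal single-process sketch of that advice applied to the code above. It is an illustration under assumptions (one node, one GPU, rank 0, hard-coded MASTER_ADDR/MASTER_PORT rendezvous), not a drop-in replacement for your script:

import os
import torch
import torch.distributed as dist

# 1. Initialize the default process group *before* constructing DDP.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group(backend='nccl', rank=0, world_size=1)

# 2. Build the plain model and move it to this process's GPU.
torch.cuda.set_device(args.rank)
model = EfficientDet(num_classes=args.num_class,
                     network=args.network,
                     W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                     D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                     D_class=EFFICIENTDET[args.network]['D_class']).cuda(args.rank)

# 3. Wrap in DDP first, so the 'module.'-prefixed checkpoint keys match.
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[args.rank], find_unused_parameters=True)

# 4. Only now load the checkpoint that was saved from a DDP model.
checkpoint = torch.load(args.weight, map_location='cuda:{}'.format(args.rank))
model.load_state_dict(checkpoint['state_dict'])
model.eval()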


Can you show me a code example based on my previous code?