[Solved] Initializing network and loading weights is extremely slow

I’m running into a problem where loading weights is extremely slow.
I tried both CUDA 8.0 and CUDA 9.0; the problem occurs with both versions.
Since I load pretrained weights, I disabled the random weight initialization, but that didn’t solve the problem.

It looks similar to the problem in the following thread:
Thread 1152 - Strangely slow weight loading
However, in that thread the issue was solved by updating CUDA.

What could I be doing wrong?
Here is my code:

import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo
import torchvision


class ResNet34(nn.Module):

    def __init__(self, block, layers, initialize_weights, num_classes=1000):
        self.inplanes = 64
        super(ResNet34, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=(35, 35),
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7)
        self.fc_drop = nn.Dropout(p=0.75)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        if initialize_weights:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                    m.weight.data.normal_(0, (2. / n) ** .5)
                elif isinstance(m, nn.BatchNorm2d):
                    m.weight.data.fill_(1)
                    m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x0 = self.relu(x)
        x = self.maxpool(x0)

        x1 = self.layer1(x)
        x2 = self.layer2(x1)
        x3 = self.layer3(x2)
        x4 = self.layer4(x3)
        # x = self.fc_drop(x4)
        # x = self.avgpool(x)
        # x = x.view(x.size(0), -1)
        # x = self.fc(x)

        return x0, x1, x2, x3, x4


class HEDResNet(nn.Module):

    def __init__(self, load_pretrained_resnet_weights, initialize_weights):
        super(HEDResNet, self).__init__()
        self.score_dsn1 = nn.Conv2d(64, 1, kernel_size=1, stride=1, padding=0)
        self.score_dsn2 = nn.Conv2d(64, 1, kernel_size=1, stride=1, padding=0)
        self.score_dsn3 = nn.Conv2d(128, 1, kernel_size=1, stride=1, padding=0)
        self.score_dsn4 = nn.Conv2d(256, 1, kernel_size=1, stride=1, padding=0)
        self.score_dsn5 = nn.Conv2d(512, 1, kernel_size=1, stride=1, padding=0)

        self.upsample1 = nn.Upsample(scale_factor=2, mode='bilinear')
        self.upsample2 = nn.Upsample(scale_factor=4, mode='bilinear')
        self.upsample3 = nn.Upsample(scale_factor=8, mode='bilinear')
        self.upsample4 = nn.Upsample(scale_factor=16, mode='bilinear')
        self.upsample5 = nn.Upsample(scale_factor=32, mode='bilinear')

        self.crop = torch.nn.ReflectionPad2d(-32)

        self.drop = nn.Dropout(p=.5)

        self.cd1 = nn.Conv2d(1472, 512, kernel_size=1, stride=1, padding=0)
        self.cd2 = nn.Conv2d(512, 512, kernel_size=1, stride=1, padding=0)
        self.cd3 = nn.Conv2d(512, 1, kernel_size=1, stride=1, padding=0)

        if initialize_weights:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                    m.weight.data.normal_(0, (2. / n) ** .5)
                elif isinstance(m, nn.BatchNorm2d):
                    m.weight.data.fill_(1)
                    m.bias.data.zero_()

        self.resnet = ResNet34(torchvision.models.resnet.BasicBlock, [3, 4, 6, 3],
                               initialize_weights=False)
        if load_pretrained_resnet_weights:
            self.resnet.load_state_dict(torch.utils.model_zoo.load_url(
                torchvision.models.resnet.model_urls['resnet34']))

    def forward(self, x):
        c1, c2, c3, c4, c5 = self.resnet(x)

        s1 = self.score_dsn1(c1)
        s2 = self.score_dsn2(c2)
        s3 = self.score_dsn3(c3)
        s4 = self.score_dsn4(c4)
        s5 = self.score_dsn5(c5)

        s1 = self.upsample1(s1)
        s2 = self.upsample2(s2)
        s3 = self.upsample3(s3)
        s4 = self.upsample4(s4)
        s5 = self.upsample5(s5)

        s1 = F.sigmoid(s1)
        s2 = F.sigmoid(s2)
        s3 = F.sigmoid(s3)
        s4 = F.sigmoid(s4)
        s5 = F.sigmoid(s5)

        out = 0.2 * s1 + 0.2 * s2 + 0.2 * s3 + 0.2 * s4 + 0.2 * s5
        out = self.crop(out)

        return out

class HEDResNEtFeatures(nn.Module):

    def __init__(self, gpu_ids, model_path="hed_resnet34_pretrained-20171105-204609.pkl",
                 **kwargs):
        super(HEDResNEtFeatures, self).__init__()

        self.gpu_ids = gpu_ids
        self.hed = HEDResNet(load_pretrained_resnet_weights=False, initialize_weights=False)

        with open(model_path, 'rb') as f:
            metadata = pickle.load(f)

        self.hed.load_state_dict(metadata['param_values'])

        # disable gradient propagation for this feature network
        for param in self.hed.parameters():
            param.requires_grad = False

    def forward(self, input):
        if len(self.gpu_ids) and isinstance(input.data, torch.cuda.FloatTensor):
            return nn.parallel.data_parallel(self.hed, input, self.gpu_ids)
        else:
            return self.hed(input)

The slow part happens in the __init__ function of HEDResNEtFeatures.

I typically have to wait about 15 minutes before the weights are loaded into the network, which makes debugging impossible.

You shouldn’t use pickle to store/load a state dict. Use torch.save and torch.load instead.
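For example, something along these lines (a sketch only; here model stands for the HEDResNet instance and the .pth filename is just illustrative):

# Saving: serialize only the state dict with torch.save
torch.save(model.state_dict(), 'hed_resnet34_pretrained.pth')

# Loading: torch.load restores the tensors directly, then load_state_dict copies them in
state_dict = torch.load('hed_resnet34_pretrained.pth')
model.load_state_dict(state_dict)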

Thanks. Using torch.load solved the problem.
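
For anyone hitting the same issue, the loading part of HEDResNEtFeatures.__init__ now looks roughly like this (assuming the checkpoint was re-saved with torch.save as suggested above; the .pth filename is just an example):

self.hed = HEDResNet(load_pretrained_resnet_weights=False, initialize_weights=False)
# torch.load replaces the slow pickle.load; the state dict now loads quickly
self.hed.load_state_dict(torch.load('hed_resnet34_pretrained-20171105-204609.pth'))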