I’m running into a problem where loading weights is quite slow.
I tried both CUDA 8.0 and CUDA 9.0; the problem occurs with both versions.
I disabled the random initialization of the weights because I load pretrained weights, but that didn’t solve the problem.
It looks similar to the problem described in the following thread:
Thread 1152 - Strangely slow weight loading
However, in that thread the problem was solved by updating CUDA.
What could I be doing wrong?
Here is my code:
class ResNet34(nn.Module):
    """ResNet-style backbone that exposes five intermediate feature maps.

    ``forward`` returns the post-ReLU stem output followed by the outputs of
    ``layer1`` .. ``layer4``.  The ``avgpool``, ``fc_drop`` and ``fc`` modules
    are constructed but never applied in ``forward``.

    Args:
        block: Residual block factory.  It is called as
            ``block(inplanes, planes, stride, downsample)`` for the first
            block of a stage and ``block(inplanes, planes)`` for the rest,
            and must expose an ``expansion`` attribute.
        layers: Sequence of four ints, the number of blocks per stage.
        initialize_weights: When true, re-initialise every ``Conv2d`` and
            ``BatchNorm2d`` in the network (see ``_init_conv_and_bn``).
        num_classes: Output size of the (unused in ``forward``) ``fc`` layer.
    """

    def __init__(self, block, layers, initialize_weights, num_classes=1000):
        super(ResNet34, self).__init__()
        # Channel count feeding the next stage; _make_layer reads and updates it.
        self.inplanes = 64

        # Stem.
        # NOTE(review): padding of 35 is far larger than the kernel needs;
        # presumably it provides a border that is cropped later -- confirm.
        self.conv1 = nn.Conv2d(
            3, 64, kernel_size=7, stride=2, padding=(35, 35), bias=False
        )
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first uses stride 2.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Classification head: defined here but not applied in forward().
        # Presumably kept so that a standard ResNet state dict (which contains
        # fc.* keys) still loads -- TODO confirm.
        self.avgpool = nn.AvgPool2d(7)
        self.fc_drop = nn.Dropout(p=0.75)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        if initialize_weights:
            self._init_conv_and_bn()

    def _init_conv_and_bn(self):
        """Re-initialise conv weights (zero-mean normal) and BN affine params."""
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan_out = kernel_h * kernel_w * module.out_channels
                # Standard deviation is sqrt(2 / fan_out).
                module.weight.data.normal_(0, (2. / fan_out) ** .5)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        Only the first block receives ``stride`` and the optional projection
        shortcut; ``self.inplanes`` is updated to the stage's output width.
        """
        out_channels = planes * block.expansion

        # A 1x1 conv + BN shortcut is needed whenever the first block changes
        # the spatial resolution or the channel count.
        shortcut = None
        if stride != 1 or self.inplanes != out_channels:
            shortcut = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [block(self.inplanes, planes, stride, shortcut)]
        self.inplanes = out_channels
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return ``(stem, layer1, layer2, layer3, layer4)`` feature maps."""
        stem = self.relu(self.bn1(self.conv1(x)))
        stage1 = self.layer1(self.maxpool(stem))
        stage2 = self.layer2(stage1)
        stage3 = self.layer3(stage2)
        stage4 = self.layer4(stage3)
        # The fc_drop / avgpool / fc head is intentionally not applied here.
        return stem, stage1, stage2, stage3, stage4
class HEDResNet(nn.Module):
    """Fuse five ResNet34 feature maps into one single-channel sigmoid map.

    Each of the five feature maps returned by the ``ResNet34`` backbone is
    reduced to one channel by a 1x1 convolution, upsampled (factors 2, 4, 8,
    16, 32), passed through a sigmoid, and the five results are averaged with
    equal weights of 0.2.  The average is then passed through ``self.crop``.

    ``drop`` and ``cd1`` .. ``cd3`` are constructed but not used in
    ``forward``; they are left in place because they contribute parameters to
    the module's state dict.

    Args:
        load_pretrained_resnet_weights: When true, download the torchvision
            ``resnet34`` weights and load them into the backbone.
        initialize_weights: When true, re-initialise the weights of the
            ``Conv2d`` layers defined directly on this module (not the
            backbone, which is created afterwards).
    """

    def __init__(self, load_pretrained_resnet_weights, initialize_weights):
        super(HEDResNet, self).__init__()

        # One 1x1 scoring head per backbone feature map.
        self.score_dsn1 = nn.Conv2d(64, 1, kernel_size=1, stride=1, padding=0)
        self.score_dsn2 = nn.Conv2d(64, 1, kernel_size=1, stride=1, padding=0)
        self.score_dsn3 = nn.Conv2d(128, 1, kernel_size=1, stride=1, padding=0)
        self.score_dsn4 = nn.Conv2d(256, 1, kernel_size=1, stride=1, padding=0)
        self.score_dsn5 = nn.Conv2d(512, 1, kernel_size=1, stride=1, padding=0)

        # One upsampler per head, bringing every score map to a common size.
        self.upsample1 = nn.Upsample(scale_factor=2, mode='bilinear')
        self.upsample2 = nn.Upsample(scale_factor=4, mode='bilinear')
        self.upsample3 = nn.Upsample(scale_factor=8, mode='bilinear')
        self.upsample4 = nn.Upsample(scale_factor=16, mode='bilinear')
        self.upsample5 = nn.Upsample(scale_factor=32, mode='bilinear')

        # Negative padding: used here to trim 32 pixels from every border.
        self.crop = torch.nn.ReflectionPad2d(-32)

        # Not used in forward(); see class docstring.
        self.drop = nn.Dropout(p=.5)
        self.cd1 = nn.Conv2d(1472, 512, kernel_size=1, stride=1, padding=0)
        self.cd2 = nn.Conv2d(512, 512, kernel_size=1, stride=1, padding=0)
        self.cd3 = nn.Conv2d(512, 1, kernel_size=1, stride=1, padding=0)

        if initialize_weights:
            # Runs before the backbone is attached, so it only touches the
            # Conv2d layers above.  No BatchNorm2d exists on this module yet,
            # which is why no BatchNorm branch is needed here.
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                    m.weight.data.normal_(0, (2. / n) ** .5)

        # The backbone keeps its default initialisation unless pretrained
        # weights are requested below.
        self.resnet = ResNet34(torchvision.models.resnet.BasicBlock, [3, 4, 6, 3], initialize_weights=False)
        if load_pretrained_resnet_weights:
            self.resnet.load_state_dict(torch.utils.model_zoo.load_url(torchvision.models.resnet.model_urls['resnet34']))

    def forward(self, x):
        """Return the cropped, equally weighted average of the five side outputs."""
        # Call the module itself rather than .forward() so that any hooks
        # registered on the backbone are executed.
        c1, c2, c3, c4, c5 = self.resnet(x)

        score_heads = (self.score_dsn1, self.score_dsn2, self.score_dsn3,
                       self.score_dsn4, self.score_dsn5)
        upsamplers = (self.upsample1, self.upsample2, self.upsample3,
                      self.upsample4, self.upsample5)

        # score -> upsample -> sigmoid, independently for each feature map.
        s1, s2, s3, s4, s5 = [
            torch.sigmoid(upsample(score(feature)))
            for feature, score, upsample
            in zip((c1, c2, c3, c4, c5), score_heads, upsamplers)
        ]

        out = 0.2 * s1 + 0.2 * s2 + 0.2 * s3 + 0.2 * s4 + 0.2 * s5
        return self.crop(out)
class HEDResNEtFeatures(nn.Module):
    """Frozen ``HEDResNet`` whose weights are read from a pickle file.

    Args:
        gpu_ids: Device ids handed to ``nn.parallel.data_parallel``.  ``None``
            or an empty sequence means "never use data_parallel".
        model_path: Path to a pickle file containing a mapping with a
            ``'param_values'`` entry that is passed to ``load_state_dict``.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    def __init__(self, gpu_ids, model_path="hed_resnet34_pretrained-20171105-204609.pkl",
                 **kwargs):
        super(HEDResNEtFeatures, self).__init__()
        self.gpu_ids = gpu_ids
        # Both flags are off: every weight is overwritten by the checkpoint below.
        self.hed = HEDResNet(load_pretrained_resnet_weights=False, initialize_weights=False)

        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file.  Only point model_path at checkpoints from a trusted source.
        with open(model_path, 'rb') as f:
            metadata = pickle.load(f)
        self.hed.load_state_dict(metadata['param_values'])

        # disable gradient propagation for this feature network
        for param in self.hed.parameters():
            param.requires_grad = False
        # NOTE(review): .eval() is never called here, so the module is left in
        # its default (training) mode unless a caller switches it -- confirm
        # whether that is intended for a frozen feature extractor.

    def forward(self, input):
        """Run the wrapped network, via data_parallel for CUDA float input."""
        # Treat None like an empty list instead of crashing on len(None).
        has_gpu_ids = self.gpu_ids is not None and len(self.gpu_ids) > 0
        if has_gpu_ids and isinstance(input.data, torch.cuda.FloatTensor):
            return nn.parallel.data_parallel(self.hed, input, self.gpu_ids)
        return self.hed(input)
The main work happens in the `__init__` method of `HEDResNEtFeatures`.
I typically have to wait 15 minutes before the weights are loaded into the network, which makes debugging impossible.