Hi,
I’m struggling with the CPU version of pytorch.
I trained my network on a GPU, and when I run inference on the GPU I have no problem.
I would like to execute it on the CPU. My problem is that sometimes I get a response in about 30 s, while for the same network at a later training checkpoint the time increases to 3 min or more.
I checked the different layers, and the slowdown seems to come from the fully connected layers, whose execution time goes from 10 s up to 2 min.
My problem is not really the execution time itself, but the difference in time between two checkpoints of the same model.
For information:
pytorch 0.3.1 (install with pip)
python 3.5.2
The part of the code that create/apply the network is this one, extracted from https://github.com/ZijunDeng/pytorch-semantic-segmentation :
class FCN8s(nn.Module):
    """FCN-8s fully convolutional network for semantic segmentation.

    Built on a VGG-16 backbone: the VGG classifier's fully connected layers
    are converted into convolutions (fc6 -> 7x7 conv, fc7 -> 1x1 conv), and
    three transposed convolutions fuse and upsample score maps tapped after
    the pool3 / pool4 / pool5 stages back to the input resolution.
    """

    def __init__(self, num_classes):
        # num_classes: number of segmentation classes (channel count of the
        # final score map returned by forward()).
        super(FCN8s, self).__init__()
        # Randomly initialized VGG-16; presumably the trained checkpoint is
        # loaded into this module afterwards — TODO confirm with caller.
        vgg = models.vgg16()
        features, classifier = list(vgg.features.children()), list(vgg.classifier.children())
        # Pad the first conv by 100 so arbitrary input sizes survive the
        # repeated downsampling; the extra border is cropped off in forward().
        features[0].padding = (100, 100)
        for f in features:
            if 'MaxPool' in f.__class__.__name__:
                f.ceil_mode = True  # round partial pooling windows up
            elif 'ReLU' in f.__class__.__name__:
                f.inplace = True  # in-place ReLU to reduce memory use
        # Split the backbone so intermediate feature maps can be tapped for
        # the skip connections (split points presumably land on the pool3 and
        # pool4 outputs, matching the 256/512 channels scored below).
        self.features3 = nn.Sequential(*features[: 17])
        self.features4 = nn.Sequential(*features[17: 24])
        self.features5 = nn.Sequential(*features[24:])
        # 1x1 convs producing per-class scores from the skip feature maps;
        # zero-initialized so the skip paths contribute nothing at the start.
        self.score_pool3 = nn.Conv2d(256, num_classes, kernel_size=1)
        self.score_pool4 = nn.Conv2d(512, num_classes, kernel_size=1)
        self.score_pool3.weight.data.zero_()
        self.score_pool3.bias.data.zero_()
        self.score_pool4.weight.data.zero_()
        self.score_pool4.bias.data.zero_()
        # "Convolutionalize" the VGG classifier: the fc6 linear weights
        # (4096 x 512*7*7) are reshaped into a 7x7 conv, fc7 into a 1x1 conv.
        fc6 = nn.Conv2d(512, 4096, kernel_size=7)
        fc6.weight.data.copy_(classifier[0].weight.data.view(4096, 512, 7, 7))
        fc6.bias.data.copy_(classifier[0].bias.data)
        fc7 = nn.Conv2d(4096, 4096, kernel_size=1)
        fc7.weight.data.copy_(classifier[3].weight.data.view(4096, 4096, 1, 1))
        fc7.bias.data.copy_(classifier[3].bias.data)
        # Final zero-initialized 1x1 scoring layer on top of fc7.
        score_fr = nn.Conv2d(4096, num_classes, kernel_size=1)
        score_fr.weight.data.zero_()
        score_fr.bias.data.zero_()
        self.score_fr = nn.Sequential(
            fc6, nn.ReLU(inplace=True), nn.Dropout(), fc7, nn.ReLU(inplace=True), nn.Dropout(), score_fr
        )
        # Learnable upsampling layers (x2, x2, x8), initialized with
        # interpolation kernels produced by get_upsampling_weight.
        self.upscore2 = nn.ConvTranspose2d(num_classes, num_classes, kernel_size=4, stride=2, bias=False)
        self.upscore_pool4 = nn.ConvTranspose2d(num_classes, num_classes, kernel_size=4, stride=2, bias=False)
        self.upscore8 = nn.ConvTranspose2d(num_classes, num_classes, kernel_size=16, stride=8, bias=False)
        self.upscore2.weight.data.copy_(get_upsampling_weight(num_classes, num_classes, 4))
        self.upscore_pool4.weight.data.copy_(get_upsampling_weight(num_classes, num_classes, 4))
        self.upscore8.weight.data.copy_(get_upsampling_weight(num_classes, num_classes, 16))

    def forward(self, x):
        # x: input image batch, indexed as (N, C, H, W) below — the crops use
        # dims 2 and 3, so a 4-D input is assumed; verify against caller.
        x_size = x.size()
        pool3 = self.features3(x)
        pool4 = self.features4(pool3)
        pool5 = self.features5(pool4)
        # Coarse per-class scores from the deepest features, upsampled x2.
        score_fr = self.score_fr(pool5)
        upscore2 = self.upscore2(score_fr)
        # Skip path from pool4, scaled by 0.01 (constant presumably taken from
        # the reference FCN implementation — TODO confirm), cropped to match
        # upscore2's spatial size, then fused and upsampled x2 again.
        score_pool4 = self.score_pool4(0.01 * pool4)
        upscore_pool4 = self.upscore_pool4(
            score_pool4[:, :, 5: (5 + upscore2.size()[2]), 5: (5 + upscore2.size()[3])]
            + upscore2)
        # Skip path from pool3 (scaled by 0.0001), fused and upsampled x8.
        score_pool3 = self.score_pool3(0.0001 * pool3)
        upscore8 = self.upscore8(
            score_pool3[:, :, 9: (9 + upscore_pool4.size()[2]), 9: (9 + upscore_pool4.size()[3])]
            + upscore_pool4)
        # Crop away the border introduced by the (100, 100) input padding so
        # the output matches the input's H and W. The offsets 5/9/31 encode
        # the exact network geometry — do not change them independently.
        return upscore8[:, :, 31: (31 + x_size[2]), 31: (31 + x_size[3])].contiguous()
Is there something wrong with the way I apply the network?
Thanks