Out of memory when using pretrained models

When I replace the feature encoder layers of my semantic segmentation models with the pretrained VGG16 from torchvision, I always run out of CUDA memory (12 GB).

I wonder how this can happen when the models should be equivalent (I have no CUDA problems when I hard-code the complete network definition myself).

Could it be that PyTorch does not free the memory of the unused layers of models.vgg16()?

Source code can be found here.

@apaszke I found out that this does not occur when I inherit directly from nn.Module. So maybe we should not use class inheritance here?

PyTorch definitely won’t free up memory from layers that you defined in __init__ but didn’t use in forward, because the Parameters those layers use will still be in scope. That should explain the behavior you observe with inheritance.
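As a minimal sketch of that effect (the layer and its sizes are made up, not taken from the code in this thread): a layer assigned to self in __init__ is registered as a submodule, so its Parameters are moved to the GPU together with the rest of the model even if forward never touches it.

import torch
import torch.nn as nn

class WithUnusedLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.used = nn.Linear(512, 512)
        self.unused = nn.Linear(512, 512)  # defined here but never called in forward

    def forward(self, x):
        return self.used(x)

model = WithUnusedLayer().cuda()
# Both layers' weights are registered and now sit on the GPU,
# regardless of whether forward ever uses them.
print(sum(p.numel() for p in model.parameters()))  # counts used *and* unused parameters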


Even though I do not save it as a class property?

import torch.nn as nn
import torch.nn.functional as F
from torchvision import models


class FCN(nn.Module):

    def __init__(self, num_classes):
        super().__init__()

        feat = list(models.vgg16(pretrained=True).features.children())

        self.feat1 = nn.Sequential(*feat[0:4])
        self.feat2 = nn.Sequential(*feat[5:9])
        self.feat3 = nn.Sequential(*feat[10:16])
        self.feat4 = nn.Sequential(*feat[17:23])
        self.feat5 = nn.Sequential(*feat[24:30])
        self.fconn = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace=True),
            nn.Dropout(),
        )
        self.score_fconn = nn.Conv2d(4096, num_classes, 1)

    def forward(self, x):
        x = self.feat1(x)
        x = self.feat2(x)
        x = self.feat3(x)

        return x


class FCN8(FCN):

    def __init__(self, num_classes):
        super().__init__(num_classes)

        self.score_feat3 = nn.Conv2d(256, num_classes, 1)
        self.score_feat4 = nn.Conv2d(512, num_classes, 1)

    def forward(self, x):
        feat3 = super().forward(x)
        feat4 = self.feat4(feat3)
        feat5 = self.feat5(feat4)
        fconn = self.fconn(feat5)

        score_feat3 = self.score_feat3(feat3)
        score_feat4 = self.score_feat4(feat4)
        score_fconn = self.score_fconn(fconn)

        score = F.upsample_bilinear(score_fconn, score_feat4.size()[2:])
        score += score_feat4
        score = F.upsample_bilinear(score, score_feat3.size()[2:])
        score += score_feat3

        return F.upsample_bilinear(score, x.size()[2:])

This, however, works:

class FCN8(nn.Module):

    def __init__(self, num_classes):
        super().__init__()

        feats = list(models.vgg16(pretrained=True).features.children())

        self.feats = nn.Sequential(*feats[0:9])
        self.feat3 = nn.Sequential(*feats[10:16])
        self.feat4 = nn.Sequential(*feats[17:23])
        self.feat5 = nn.Sequential(*feats[24:30])

        self.fconn = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace=True),
            nn.Dropout(),
        )
        self.score_feat3 = nn.Conv2d(256, num_classes, 1)
        self.score_feat4 = nn.Conv2d(512, num_classes, 1)
        self.score_fconn = nn.Conv2d(4096, num_classes, 1)

    def forward(self, x):
        feats = self.feats(x)
        feat3 = self.feat3(feats)
        feat4 = self.feat4(feat3)
        feat5 = self.feat5(feat4)
        fconn = self.fconn(feat5)

        score_feat3 = self.score_feat3(feat3)
        score_feat4 = self.score_feat4(feat4)
        score_fconn = self.score_fconn(fconn)

        score = F.upsample_bilinear(score_fconn, score_feat4.size()[2:])
        score += score_feat4
        score = F.upsample_bilinear(score, score_feat3.size()[2:])
        score += score_feat3

        return F.upsample_bilinear(score, x.size()[2:])

I can’t see anything that might be causing that right now. Can you please write a small script that uses these networks and leaks memory so we can investigate it?
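A minimal driver along those lines could look like the sketch below (num_classes, the input size, and the use of torch.cuda.max_memory_allocated() are arbitrary choices for illustration, not something from this thread): build one of the FCN8 variants above, run a forward/backward pass, and compare the peak GPU memory between the two variants.

import torch

# Sketch of a small repro script: instantiate one of the FCN8 variants defined
# above, push a batch through it, run backward, and report peak CUDA memory.
# Running it once per variant shows whether one of them uses far more memory.
model = FCN8(num_classes=21).cuda()
x = torch.randn(1, 3, 224, 224, device='cuda')
out = model(x)
out.sum().backward()
print(out.size(), torch.cuda.max_memory_allocated() / 2**20, 'MiB')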

@bodokaiser it turns out your first implementation actually has a bug (@colesbury found this).

self.feat1 = nn.Sequential(*feat[0:4])
self.feat2 = nn.Sequential(*feat[5:9])

You are dropping layer 4 here.
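The same happens at indices 9, 16 and 23, and every skipped index (4, 9, 16, 23, 30 in torchvision's VGG16 features) is one of the five pooling layers, so in that version the feature maps are never downsampled, which on its own would blow up the activation memory. If the intent is to keep all layers, the slices need to be contiguous, for example:

feat = list(models.vgg16(pretrained=True).features.children())

self.feat1 = nn.Sequential(*feat[0:5])    # conv1_1 .. pool1
self.feat2 = nn.Sequential(*feat[5:10])   # conv2_1 .. pool2
self.feat3 = nn.Sequential(*feat[10:17])  # conv3_1 .. pool3
self.feat4 = nn.Sequential(*feat[17:24])  # conv4_1 .. pool4
self.feat5 = nn.Sequential(*feat[24:31])  # conv5_1 .. pool5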

Sorry, I cannot reduce this to a minimal example. Maybe I was already scratching the memory limit of the GPU and the second example just needed one small allocation more, which hit the limit.