GPU OOM with Simple VGG net

Hi, I’m facing a GPU out-of-memory problem.
Here’s the test script with VGG net and dummy data, attached below.
I run it on a K40 GPU with 12 GB of memory, using the latest version of PyTorch. It throws an out-of-memory error after several iterations (fewer than 10).
The error occurs while computing the conv1 layer.

Can anyone help me figure out what the problem is?
Is it a problem with my implementation?

Any suggestions are appreciated.


import os
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np

def set_trainable(model, requires_grad):
    """Set the ``requires_grad`` flag on every parameter of ``model``.

    Args:
        model: Object exposing ``parameters()`` that yields the parameters
            to update (e.g. an ``nn.Module``).
        requires_grad: Value assigned to each parameter's ``requires_grad``
            attribute; ``False`` freezes the parameters.
    """
    parameters = model.parameters()
    for weight in parameters:
        setattr(weight, 'requires_grad', requires_grad)

class VGG16(nn.Module):
    """VGG16-style classifier: five conv stages followed by three linear layers.

    Every conv stage is a run of 3x3 / stride 1 / padding 1 convolutions, each
    followed by a ReLU, and ends with a max-pool.  The first two stages are
    frozen via ``set_trainable(..., requires_grad=False)``.  The first linear
    layer expects ``512 * 7 * 7`` flattened features and the last one emits
    150 scores.

    Args:
        bn: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, bn=False):
        super(VGG16, self).__init__()

        def stage(in_channels, out_channels, num_convs, pool):
            # Lay out [Conv, ReLU] * num_convs + [MaxPool] so sub-module
            # indices inside the Sequential match a hand-written listing.
            layers = []
            channels = in_channels
            for _ in range(num_convs):
                layers.append(nn.Conv2d(channels, out_channels,
                                        kernel_size=3, stride=1, padding=1))
                layers.append(nn.ReLU())
                channels = out_channels
            layers.append(nn.MaxPool2d(pool))
            return nn.Sequential(*layers)

        self.conv1 = stage(3, 64, 2, 2)
        self.conv2 = stage(64, 128, 2, 2)
        # The two earliest stages are kept fixed during training.
        for frozen in (self.conv1, self.conv2):
            set_trainable(frozen, requires_grad=False)

        self.conv3 = stage(128, 256, 3, 2)
        self.conv4 = stage(256, 512, 3, 2)
        # The last stage pools by 4 instead of 2.
        self.conv5 = stage(512, 512, 3, 4)

        self.fc6_new = nn.Linear(512 * 7 * 7, 4096)
        self.fc7_new = nn.Linear(4096, 4096)
        self.score_fc = nn.Linear(4096, 150)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, im_data):
        """Return the ``score_fc`` output for the batch ``im_data``."""
        features = im_data
        for conv_stage in (self.conv1, self.conv2, self.conv3,
                           self.conv4, self.conv5):
            features = conv_stage(features)
        # Flatten everything except the leading (batch) dimension.
        flat = features.view(features.size(0), -1)
        hidden = self.relu(self.fc6_new(flat))
        hidden = self.relu(self.fc7_new(hidden))
        return self.score_fc(hidden)

def train():
    """Run 1000 SGD iterations of ``VGG16`` on random dummy data on the GPU.

    Each iteration draws a fresh random batch of 32 images (3 x 448 x 448,
    float32) and 32 integer labels in ``[0, 150)``, computes the
    cross-entropy loss, back-propagates, takes one optimizer step and prints
    the loss.  Requires a CUDA device.
    """
    net = VGG16()
    net = net.cuda()
    lr = 0.0001
    # conv1/conv2 are left out of the optimizer: VGG16 freezes them.  The
    # linear layers train with a 10x larger learning rate than conv3-conv5.
    optimizer = torch.optim.SGD([
        {'params': net.conv3.parameters()},
        {'params': net.conv4.parameters()},
        {'params': net.conv5.parameters()},
        {'params': net.fc6_new.parameters(), 'lr': lr * 10.0},
        {'params': net.fc7_new.parameters(), 'lr': lr * 10.0},
        {'params': net.score_fc.parameters(), 'lr': lr * 10.0},
    ], lr=lr, momentum=0.9, weight_decay=0.005)
    # `range` rather than the Python-2-only `xrange`, so the script runs
    # unchanged under both Python 2 and Python 3.
    for i in range(1000):
        # NOTE(review): at this batch size a single 64-channel 448x448 float32
        # activation is 32 * 64 * 448 * 448 * 4 bytes (about 1.6 GB); this is
        # presumably relevant to the reported OOM -- confirm with a smaller batch.
        im_data = np.random.rand(32, 3, 448, 448).astype(np.float32)
        label = np.floor(np.random.rand(32) * 150).astype(np.int64)
        out = net(Variable(torch.from_numpy(im_data).cuda()))
        loss = F.cross_entropy(out, Variable(torch.from_numpy(label).cuda()), size_average=True)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('Iteration %d finished.  Loss: %.4f' % (i, loss.data.cpu().numpy()))


if __name__ == '__main__':
    train()


If you can reduce your issue to a 25–40 line reproduction, it will be much easier for others to investigate.

Thank you for your reply. I am not sure what you mean.
But I found that the formatting of the code was messed up. I’ve fixed it now.
The script can be directly copied and run now.

How did you revise your code?