Hey there,
I built a very small network architecture for a somewhat specific regression problem. While testing I noticed that the network used much more GPU memory than during training, which was surprising given that I always use volatile variables when testing. I created a small code snippet to demonstrate my problem:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

NEG_SLOPE = 1e-2
INPLACE = True
BIAS = False
AFFINE = False
def init_weights(module):
    # He initialization matching the leaky ReLU slope; nn.init works in-place,
    # so there is no need to assign the return value back to the parameter
    if isinstance(module, nn.Conv3d):
        nn.init.kaiming_normal(module.weight, a=NEG_SLOPE)
        if module.bias is not None:
            nn.init.constant(module.bias, 0)
class ConvConvPool(nn.Module):
    def __init__(self, base_filters, num_input_channels):
        super(ConvConvPool, self).__init__()
        self.conv1 = nn.Conv3d(num_input_channels, base_filters, kernel_size=3,
                               stride=1, padding=1, bias=BIAS)
        self.in1 = nn.InstanceNorm3d(base_filters, affine=AFFINE)
        self.lrelu1 = nn.LeakyReLU(NEG_SLOPE, inplace=INPLACE)
        self.conv2 = nn.Conv3d(base_filters, base_filters, kernel_size=3,
                               stride=1, padding=1, bias=BIAS)
        self.in2 = nn.InstanceNorm3d(base_filters, affine=AFFINE)
        self.lrelu2 = nn.LeakyReLU(NEG_SLOPE, inplace=INPLACE)
        self.pool1 = nn.AvgPool3d(2)

    def forward(self, x):
        x = self.lrelu1(self.in1(self.conv1(x)))
        x = self.lrelu2(self.in2(self.conv2(x)))
        x = self.pool1(x)
        return x
class ConvConvPoolConvConv(nn.Module):
    def __init__(self, base_filters, num_input_channels):
        super(ConvConvPoolConvConv, self).__init__()
        self.convconvpool = ConvConvPool(base_filters, num_input_channels)
        self.conv3 = nn.Conv3d(base_filters, base_filters * 2, kernel_size=3,
                               stride=1, padding=1, bias=BIAS)
        self.in3 = nn.InstanceNorm3d(base_filters * 2, affine=AFFINE)
        self.lrelu3 = nn.LeakyReLU(NEG_SLOPE, inplace=INPLACE)
        self.conv4 = nn.Conv3d(base_filters * 2, base_filters * 2, kernel_size=3,
                               stride=1, padding=1, bias=BIAS)
        self.in4 = nn.InstanceNorm3d(base_filters * 2, affine=AFFINE)
        self.lrelu4 = nn.LeakyReLU(NEG_SLOPE, inplace=INPLACE)

    def forward(self, x):
        x = self.convconvpool(x)
        x = self.lrelu3(self.in3(self.conv3(x)))
        x = self.lrelu4(self.in4(self.conv4(x)))
        return x
class MyNetwork(nn.Module):
    def __init__(self, base_filters, num_input_channels):
        super(MyNetwork, self).__init__()
        # one feature extractor per input patch
        self.net_1 = ConvConvPoolConvConv(base_filters, num_input_channels)
        self.net_2 = ConvConvPoolConvConv(base_filters, num_input_channels)
        self.net_3 = ConvConvPoolConvConv(base_filters, num_input_channels)
        self.net_4 = ConvConvPoolConvConv(base_filters, num_input_channels)
        self.convconvpool = ConvConvPool(base_filters * 2 * 8, base_filters * 2 * 4)
        self.convconvpoolconvconv = ConvConvPoolConvConv(base_filters * 2 * 8, base_filters * 2 * 8)
        self.dense = nn.Linear(base_filters * 2 * 16, 1)
        self.apply(init_weights)

    def forward(self, inp_1, inp_2, inp_3, inp_4):
        # all shapes are shown for a (16, 5, 32, 32, 32) input
        # input: 4x (16, 5, 32, 32, 32), output: (16, 128, 16, 16, 16)
        x = torch.cat((self.net_1(inp_1), self.net_2(inp_2), self.net_3(inp_3), self.net_4(inp_4)), 1)
        # output: (16, 256, 8, 8, 8)
        x = self.convconvpool(x)
        # output: (16, 512, 4, 4, 4)
        x = self.convconvpoolconvconv(x)
        # global average pooling over the spatial dims, output: (16, 512)
        x = x.mean(4).mean(3).mean(2)
        # output: (16, 1)
        x = self.dense(x)
        return x
if __name__ == "__main__":
    a = b = c = d = torch.rand((16, 5, 32, 32, 32)).cuda()
    net = MyNetwork(16, 5).cuda()
    a = Variable(a, volatile=True)
    b = Variable(b, volatile=True)
    c = Variable(c, volatile=True)
    d = Variable(d, volatile=True)
    res = net(a, b, c, d)
The network takes 4 patches as input, each of shape (16, 5, 32, 32, 32). Each patch has its own feature extractor (conv-conv-pool-conv-conv), and the extracted features are then concatenated. These concatenated features are processed together with some more convolutions, and finally a single output value is returned (regression problem). When I run the code above with volatile=True, the GPU uses ~10 GB of memory. With volatile=False it only uses 6 GB. That does not make any sense to me. It would be great if someone could help me out here!
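
For reproducing the comparison, here is a minimal sketch of how the two modes can be measured (gpu_mem_mb is a made-up helper that just polls nvidia-smi, and net is the model instantiated in the snippet above; the numbers are approximate since nvidia-smi also counts whatever PyTorch's caching allocator holds on to):

import subprocess

def gpu_mem_mb():
    # hypothetical helper: memory used on GPU 0 in MiB, as reported by nvidia-smi
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used",
         "--format=csv,noheader,nounits"])
    return int(out.decode().strip().split("\n")[0])

# run the script once per setting; the caching allocator never returns memory
# to the driver, so measuring both modes in a single process is misleading
VOLATILE = True  # flip to False for the second run
inputs = [Variable(torch.rand(16, 5, 32, 32, 32).cuda(), volatile=VOLATILE)
          for _ in range(4)]
res = net(*inputs)
print("volatile=%s: %d MiB used" % (VOLATILE, gpu_mem_mb()))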
Cheers,
Fabian