torch.cat memory explosion

I tried to replace Conv2d with this module in ResNet50.

class SubtractedConv(nn.Module):
    """Conv2d replacement built from two branches concatenated on channels.

    The input is first projected by a 1x1 convolution, which also applies
    ``stride``.  Two branches are then computed from that projection:

    * a depthwise convolution of the projection, minus the projection itself;
    * a regular (dense) convolution of the projection.

    The branch outputs are concatenated along dimension 1.

    Args:
        input_ch: Number of input channels.
        output_ch: Number of output channels (at least 2).  An odd value is
            split unevenly between the branches so the concatenated output
            still has exactly ``output_ch`` channels.
        kernels: Kernel size of both branch convolutions.  Padding is
            ``kernels // 2``, which keeps the spatial size unchanged only
            for odd kernel sizes.
        stride: Stride of the 1x1 projection.

    Raises:
        ValueError: If ``output_ch`` is smaller than 2.
    """

    def __init__(self, input_ch, output_ch, kernels, stride=1):
        if output_ch < 2:
            raise ValueError(
                "output_ch must be at least 2, got {}".format(output_ch)
            )
        super().__init__()
        # The depthwise branch needs matching in/out channels, so it gets
        # the floor half; the dense branch takes whatever remains so that
        # the two always add up to output_ch (identical split when even).
        high_ch = output_ch // 2
        low_ch = output_ch - high_ch
        pad = kernels // 2
        self.point_wise = nn.Conv2d(input_ch, high_ch, 1, bias=False, stride=stride)
        self.depth_wise = nn.Conv2d(high_ch, high_ch, kernels, groups=high_ch, bias=False, padding=pad)
        self.low_pass = nn.Conv2d(high_ch, low_ch, kernels, bias=False, padding=pad)

    def forward(self, x):
        """Return the channel-wise concatenation of the two branches."""
        p = self.point_wise(x)
        d = self.depth_wise(p)
        # In-place subtraction reuses the depthwise output buffer instead of
        # allocating another activation tensor.
        d -= p
        l = self.low_pass(p)
        return torch.cat((d, l), 1)

The output should have the same number of channels as a normal Conv2d, but I get a CUDA out-of-memory error at torch.cat().
I wonder why? And how to deal with this?

I use PyTorch 0.4.0 and run on a Tesla P100, with image size 224*224 and batch size 16.

Since the image size is small, and the batch size is small, and you are using a suitable GPU, this is probably due to the number of filters that you are sending to this Module.

(assuming you are on a *nix system) can you try starting watch nvidia-smi in the terminal before you run this?

Also try various values for your kernels param. Start with 1, work up till it crashes, etc.

This is nvidia-smi result before I run anything.

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.37                 Driver Version: 396.37                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla P100-PCIE...  Off  | 00000000:37:00.0 Off |                    0 |
| N/A   58C    P0    37W / 250W |   1327MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
+-----------------------------------------------------------------------------+

I just replaced the Conv2d layers in Bottleneck with this SubtractedConv, so it looks like this.


class Bottleneck(nn.Module):
    """Residual block made of three SubtractedConv + BatchNorm stages.

    The first two stages are each followed by ReLU; the third stage output
    is added to the shortcut (optionally transformed by ``downsample``) and
    passed through a final ReLU.

    Args:
        inplanes: Number of channels of the block input.
        planes: Channel width of the first two stages; the third stage
            produces ``planes * 4`` channels.
        stride: Stride handed to the second SubtractedConv.
        downsample: Optional module applied to the input to produce the
            shortcut; when ``None`` the input itself is used.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        wide = planes * 4

        # Stage 1
        self.conv1 = SubtractedConv(inplanes, planes, 1)
        self.bn1 = nn.BatchNorm2d(planes)
        # Stage 2 (the only stage that receives the stride)
        self.conv2 = SubtractedConv(planes, planes, 3, stride=stride)
        self.bn2 = nn.BatchNorm2d(planes)
        # Stage 3 (widens the channels)
        self.conv3 = SubtractedConv(planes, wide, 1)
        self.bn3 = nn.BatchNorm2d(wide)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the three stages and add the shortcut before the last ReLU."""
        out = x
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            out = self.relu(norm(conv(out)))

        out = self.bn3(self.conv3(out))

        # The shortcut is computed after the main path, from the block input.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

I tried a small kernels param, and it works.
This is inside ResNet module.
This one works.

class ResNet(nn.Module):
    """ResNet constructor excerpt with stage widths 16 / 32 / 64 / 128.

    Only ``__init__`` is shown; ``_make_layer`` is called below but is not
    part of this excerpt.

    Args:
        block: Block class forwarded to ``_make_layer``.
        layers: Sequence with four entries, one per stage, forwarded to
            ``_make_layer`` as its third argument.
        num_classes: Not used in the visible part of the constructor.
    """

    def __init__(self, block, layers, num_classes=1000):
        # Set before super().__init__(); presumably consumed by
        # _make_layer -- TODO confirm against its definition.
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Per-stage counts come from the ``layers`` argument, the only
        # per-stage sequence available in this scope.
        self.layer1 = self._make_layer(block, 16, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 128, layers[3], stride=2)

But this one explodes.

class ResNet(nn.Module):
    """ResNet constructor excerpt with stage widths 32 / 64 / 128 / 256.

    Only ``__init__`` is shown; ``_make_layer`` is called below but is not
    part of this excerpt.

    Args:
        block: Block class forwarded to ``_make_layer``.
        layers: Sequence with four entries, one per stage, forwarded to
            ``_make_layer`` as its third argument.
        num_classes: Not used in the visible part of the constructor.
    """

    def __init__(self, block, layers, num_classes=1000):
        # Set before super().__init__(); presumably consumed by
        # _make_layer -- TODO confirm against its definition.
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Per-stage counts come from the ``layers`` argument, the only
        # per-stage sequence available in this scope.
        self.layer1 = self._make_layer(block, 32, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 64, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 128, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 256, layers[3], stride=2)

Well memory is finite. And cat is trying to put everything that you pass to it into one contiguous chunk. If self._make_layer is creating your SubtractedConv layers from above, then it seems to me it would be making (2 * filters) + 1 filters every time.

With all of that loaded into GPU memory, it is then trying to allocate enough space to concatenate your d and l tensors from above.

I’m not totally sure how cat works in allocating memory but a quick test seems to indicate that it is copying the concatenated tensors.

Maybe you can try taking the concatenation out of your SubtractedConv, return the d and l from each layer and then cat a bunch of them all at once?

I wonder if this has something to do with the dynamic-graph nature of PyTorch?
But I want to put them together before the next convolution layer processes them, so I still need to use this cat even if it’s outside the module.