ResNet memory consumption || lists of layers

Hi
I’ve seen a custom implementation of ResNet:

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable

from .revnet import possible_downsample

CUDA = torch.cuda.is_available()


class Block(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1,
                 no_activation=False):
        super(Block, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride
        self.no_activation = no_activation

        self.bn1 = nn.BatchNorm2d(in_channels)

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               padding=1, stride=stride)

        self.bn2 = nn.BatchNorm2d(out_channels)

        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               padding=1)

    def forward(self, x):
        orig_x = x

        out = x

        if not self.no_activation:
            out = F.relu(self.bn1(out))

        out = self.conv1(out)

        out = self.conv2(F.relu(self.bn2(out)))

        out += possible_downsample(orig_x, self.in_channels,
                                   self.out_channels, self.stride)

        return out


class Bottleneck(nn.Module):
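    # Bottleneck residual unit: left as an unimplemented stub in this snippet.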
    def __init__(self):
        pass

    def forward(self, x):
        pass


class ResNet(nn.Module):
    def __init__(self,
                 units,
                 filters,
                 strides,
                 classes,
                 bottleneck=False):
        """
        Parameters
        ----------
        units: list-like
            Number of residual units in each group
        filters: list-like
            Number of filters in each unit, including the input layer, so it
            is one item longer than units
        strides: list-like
            Strides to use for the first unit in each group, same length as
            units
        classes: int
            Number of output classes
        bottleneck: boolean
            Whether to use the bottleneck residual unit or the basic residual
            unit
        """
        super(ResNet, self).__init__()
        self.name = self.__class__.__name__

        if bottleneck:
            self.Residual = Bottleneck
        else:
            self.Residual = Block

        self.layers = nn.ModuleList()

        # Input layers
        self.layers.append(nn.Conv2d(3, filters[0], 3, padding=1))
        self.layers.append(nn.BatchNorm2d(filters[0]))
        self.layers.append(nn.ReLU())

        for i, group in enumerate(units):
            self.layers.append(self.Residual(filters[i], filters[i + 1],
                                             stride=strides[i],
                                             no_activation=True))

            for unit in range(1, group):
                self.layers.append(self.Residual(filters[i + 1],
                                                 filters[i + 1]))

        self.bn_last = nn.BatchNorm2d(filters[-1])

        self.fc = nn.Linear(filters[-1], classes)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)

        x = F.relu(self.bn_last(x))
        x = F.avg_pool2d(x, x.size(2))
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

def resnet32():
    model = ResNet(
            units=[5, 5, 5],
            filters=[16, 16, 32, 64],
            strides=[1, 2, 2],
            classes=10
            )
    model.name = "resnet32"
    return model

I’ve noticed that, for a 40x3x224x224 input, this implementation requires almost 5 times more memory than the torchvision implementation.
Even though the architectures do not match exactly, 5 times more memory is a lot.
The most evident a priori difference is the use of a list of layers (nn.ModuleList) to build the architecture.
Is there any drawback related to that?
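
To quantify the difference, this is roughly how the peak GPU memory of both models could be measured. It is only a minimal sketch: it assumes a recent PyTorch (for torch.cuda.reset_peak_memory_stats) and uses torchvision.models.resnet34 as the reference model, which is just a guess at what "the torchvision implementation" refers to; the 40x3x224x224 input matches the numbers above.

import torch
import torchvision

def peak_memory_mb(model, batch_size=40):
    # Reset the allocator's peak statistics, then run one forward/backward
    # pass and report the maximum memory allocated during it, in MB.
    torch.cuda.reset_peak_memory_stats()
    model = model.cuda()
    x = torch.randn(batch_size, 3, 224, 224, device="cuda")
    model(x).sum().backward()
    torch.cuda.synchronize()
    return torch.cuda.max_memory_allocated() / 1024 ** 2

print("custom resnet32:", peak_memory_mb(resnet32()))  # resnet32 as defined above
print("torchvision resnet34:", peak_memory_mb(torchvision.models.resnet34(num_classes=10)))

The backward pass is included on purpose: during training, most of the memory goes to the activations saved for the backward pass, which is where the two implementations are most likely to diverge.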


Hi, have you found the reason for the issue you mentioned above? I think I have the same problem as you.