Out of Memory Issues

I am running a neural network (a variant of ResNet) to optimize a bunch of images. In my outer loop, I am simply doing a forward pass and optimizing the weights of the network. In parallel, I am running separate forward passes during and between every weight update. What I have found so far is that after a couple of episodes of calling forward in the inner loop, the inner loop breaks with an "out of memory" error. I am on PyTorch 0.3.1. Is there a way to free GPU memory during training?

I have the network constructed like so:

import hashlib
import torch, time
import random, math
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple
from torch.autograd import Variable

use_cuda = torch.cuda.is_available()
Tensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm3d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=36, name=None, device_id=None):
        """
        Name helps in controlling the hash value of this class
        """
        self.name = name
        self.inplanes = 64
        self.value = None
        self.prior_prob = None
        self.num_classes = num_classes
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv3d(36, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.loss_value_term = []   # (z - v)^2
        self.loss_param_term = []   # pi^T log(p)
        self.loss_log_prob_term = []  # c||\theta||^2
        self.device_id = device_id


        # this for logit probs head for angle probabilities
        self.probhead = self._make_layer(block, num_classes, layers[4], stride=1)

        for m in self.modules():
            with torch.cuda.device(self.device_id):
                m = m.cuda()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv3d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm3d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # define probability distribution over state-action pairs
        px = self.probhead(x)
        px = px.view(px.size(0), -1)
        s1, s2 = px.size()
        linear_layer = nn.Linear(s1*s2, self.num_classes)
        linear_layer = linear_layer.cuda(self.device_id) if use_cuda else linear_layer
        probs = linear_layer(px)
        probs = probs.cuda(self.device_id) if use_cuda else probs
        probs = F.softmax(probs, dim=1)

        valuehead = nn.Sequential(
                            nn.Linear(s1 * s2, 512),
                            nn.ReLU(inplace=True),
                            nn.Linear(512, 256),
                            nn.ReLU(inplace=True),
                            nn.Linear(256, 1),
                            )
        valuehead = valuehead.cuda(self.device_id) if use_cuda else valuehead
        value = F.tanh(valuehead(px))


        probs_tune_head = nn.Sequential(
                            nn.Linear(s1 * s2, 512),
                            nn.ReLU(inplace=True),
                            nn.Linear(512, 256),
                            nn.ReLU(inplace=True),
                            nn.Linear(256, 3),
                            )
        probs_tune_head = probs_tune_head.cuda(self.device_id) if use_cuda else probs_tune_head
        probs_tune = F.tanh(probs_tune_head(px))

        del valuehead, probs_tune_head, linear_layer

        self.value = value
        self.prior_prob = probs
        self.probs_tune = probs_tune

        return probs, value, probs_tune

    def __hash__(self):
        return int(hashlib.md5(self.name.encode('utf-8')).hexdigest(),16)

    def __eq__(self,other):
        if hash(self)==hash(other):
            return True
        return False

And it runs like so:

player = ResNet(BasicBlock, [3, 4, 6, 3, 1], num_classes=5, name='player', device_id=11) # use 10 deg resolution

for episode in range(500):
    # obtain input planes to be fed to the neural network
    probs, value, value_deg = player(Variable(torch.randn([1, 36, 122, 64, 64])).cuda(player.device_id))
    # turn player's strategy from a pure strategy to a mixed strategy
    # construct root node for tree search
    state.device_id = player_gpu
    root_node = Node(parent=None, state_obj=state, prior_prob=probs, value=value,
                     value_deg=value_deg, device_id=player.device_id)
    prev_net.append(player)

    if len(prev_net) > 2: prev_net.pop(0)

    # player pool is an instance of Python's multiprocessing pool
    best_node = player_pool.apply_async(mcts.run_tree_search, (root_node, prev_net[-2]))
    best_node = best_node.get()
    # best_node = mcts.run_tree_search(root_node, player)   # in this inner loop, I am continually calling the forward method of player

The inner loop runs the network like so:

planes = Variable(torch.randn([1, 36, 122, 64, 64]))
prior_prob, value, value_deg = self.player(planes)  # player is an instance of the network

After a couple of iterations, I get a CUDA runtime error:

THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1523242347739/work/torch/lib/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
  File "varian_main.py", line 409, in <module>
    neural_fsp.train_nets(mask.rsplit(sep=".")[0])
  File "varian_main.py", line 311, in train_nets
    best_node_oppo = mcts.run_tree_search(root_node_oppo, self.player_oppo)
  File "/mnt/md0/lex/RadOncol/beam_optim/scripts/monte_carlo/mcts.py", line 123, in run_tree_search
    new_node      = self.tree_policy(root_node)
  File "/mnt/md0/lex/RadOncol/beam_optim/scripts/monte_carlo/mcts.py", line 141, in tree_policy
    return self.expand(node)
  File "/mnt/md0/lex/RadOncol/beam_optim/scripts/monte_carlo/mcts.py", line 160, in expand
    prior_prob, value, value_deg = self.player(planes)
  File "/home/lekan/anaconda3/envs/py35/lib/python3.5/site-packages/torch/nn/modules/module.py", line 357, in __call__
    result = self.forward(*input, **kwargs)
  File "/mnt/md0/lex/RadOncol/beam_optim/scripts/models/varian_model.py", line 109, in forward
    x = self.conv1(x)
  File "/home/lekan/anaconda3/envs/py35/lib/python3.5/site-packages/torch/nn/modules/module.py", line 357, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/lekan/anaconda3/envs/py35/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 388, in forward
    self.padding, self.dilation, self.groups)
  File "/home/lekan/anaconda3/envs/py35/lib/python3.5/site-packages/torch/nn/functional.py", line 126, in conv3d
    return f(input, weight, bias)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1523242347739/work/torch/lib/THC/generic/THCStorage.cu:58

I can't seem to find what I am doing wrong. Do we have to manually free CUDA memory every time we repeatedly call forward?

I am on version 0.3.1.

When I implemented the ResNet architecture, I ran into the same out-of-memory problem. I realized that my GPU memory was filling up because I was using a batch size of 64. Reducing the batch size to 4 fixed it. This is just one such scenario.
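
For illustration, here is a minimal sketch of that idea: feed fewer samples per forward pass so less activation memory is resident on the GPU at once. The shapes and the model name are hypothetical, not from the code above:

import torch
from torch.autograd import Variable

# Minimal sketch (hypothetical shapes and model): process a large batch a few
# samples at a time instead of all at once to lower peak GPU memory usage.
big_batch = torch.randn(64, 3, 224, 224)          # too large to push through in one go
for chunk in torch.split(big_batch, 4, dim=0):    # 4 samples per forward pass
    out = model(Variable(chunk).cuda())
    # ... accumulate or consume `out` here ...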

Just wanted to point out an obvious (possibly!) anomaly in your code:

You are creating new layers (probs_tune_head, valuehead, and linear_layer) in the forward() method of ResNet.
If you want their parameters to be optimized, create the layers in the __init__() method and reuse them in forward().
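
A minimal sketch of that idea (hypothetical names; feat_dim stands for the flattened size s1*s2, which would have to be known or computed once up front, and the backbone is elided):

import torch.nn as nn
import torch.nn.functional as F

class ResNetHeads(nn.Module):
    """Sketch: heads are built once in __init__ and reused in forward."""
    def __init__(self, feat_dim, num_classes):
        super(ResNetHeads, self).__init__()
        # ... backbone (conv1, bn1, layer1..layer4, probhead) as before ...
        self.linear_layer = nn.Linear(feat_dim, num_classes)
        self.valuehead = nn.Sequential(
            nn.Linear(feat_dim, 512), nn.ReLU(inplace=True),
            nn.Linear(512, 256), nn.ReLU(inplace=True),
            nn.Linear(256, 1),
        )
        self.probs_tune_head = nn.Sequential(
            nn.Linear(feat_dim, 512), nn.ReLU(inplace=True),
            nn.Linear(512, 256), nn.ReLU(inplace=True),
            nn.Linear(256, 3),
        )

    def forward(self, px):
        # px: already-flattened backbone features of shape (N, feat_dim)
        probs = F.softmax(self.linear_layer(px), dim=1)
        value = F.tanh(self.valuehead(px))
        probs_tune = F.tanh(self.probs_tune_head(px))
        return probs, value, probs_tune

This way the head parameters are registered with the module (so the optimizer actually sees them), and no new layers are allocated on every call to forward().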