I am running a neural network ( a variant of resnet) to optimize a bunch of images. In my outer loop, I am simply doing a forward and optimizing the weights of the network. In parallel, I am running a separate forward during and between every weight update. What I have found so far is that after a couple of episodes of calling forward in the inner loop, the inner loop breaks due to “out of memory” issues. I am on pytorch 0.3.1. Is there a way to free gpu memory during training?
I have the network constructed like so:
import hashlib
import torch, time
import random, math
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple
from torch.autograd import Variable
use_cuda = torch.cuda.is_available()
Tensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm3d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm3d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=36, name=None, device_id=None):
"""
Name helps in controlling the hash value of this class
"""
self.name = name
self.inplanes = 64
self.value = None
self.prior_prob = None
self.num_classes = num_classes
super(ResNet, self).__init__()
self.conv1 = nn.Conv3d(36, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm3d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.loss_value_term = [] # (z - v)^2
self.loss_param_term = [] # pi^T log(p)
self.loss_log_prob_term = [] # c||\theta||^2
self.device_id = device_id
# this for logit probs head for angle probabilities
self.probhead = self._make_layer(block, num_classes, layers[4], stride=1)
for m in self.modules():
with torch.cuda.device(self.device_id):
m = m.cuda()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv3d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm3d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
# define probability distribution over state-action pairs
px = self.probhead(x)
px = px.view(px.size(0), -1)
s1, s2 = px.size()
linear_layer = nn.Linear(s1*s2, self.num_classes)
linear_layer = linear_layer.cuda(self.device_id) if use_cuda else linear_layer
probs = linear_layer(px)
probs = probs.cuda(self.device_id) if use_cuda else probs
probs = F.softmax(probs, dim=1)
valuehead = nn.Sequential(
nn.Linear(s1 * s2, 512),
nn.ReLU(inplace=True),
nn.Linear(512, 256),
nn.ReLU(inplace=True),
nn.Linear(256, 1),
)
valuehead = valuehead.cuda(self.device_id) if use_cuda else valuehead
value = F.tanh(valuehead(px))
probs_tune_head = nn.Sequential(
nn.Linear(s1 * s2, 512),
nn.ReLU(inplace=True),
nn.Linear(512, 256),
nn.ReLU(inplace=True),
nn.Linear(256, 3),
)
probs_tune_head = probs_tune_head.cuda(self.device_id) if use_cuda else probs_tune_head
probs_tune = F.tanh(probs_tune_head(px))
del valuehead, probs_tune_head, linear_layer
self.value = value
self.prior_prob = probs
self.probs_tune = probs_tune
return probs, value, probs_tune
def __hash__(self):
return int(hashlib.md5(self.name.encode('utf-8')).hexdigest(),16)
def __eq__(self,other):
if hash(self)==hash(other):
return True
return False
And it runs like so:
player = ResNet(BasicBlock, [3, 4, 6, 3, 1], num_classes=5, name='player', device_id=11) # use 10 deg resolution
for episode in range(500):
# obtain input planes to be fed to the neural network
probs, value, value_deg = player(Variable(torch.randn([1, 36, 122, 64, 64])).cuda(player.device_id))
# turn player's strategy from a pure strategy to a mixed strategy
# construct root node for tree search
state.device_id = player_gpu
root_node = Node(parent=None, state_obj=state, prior_prob=probs, value=value,
value_deg=value_deg, device_id=player.device_id)
prev_net.append(self.player)
if len(prev_net) > 2: prev_net.pop(0)
# player pool is an instance of python's multiprocessing
best_node = player_pool.apply_async(mcts.run_tree_search, (root_node, prev_net[-2]))
best_node = best_node.get()
#best_node = mcts.run_tree_search(root_node, player) # in this inner node, I am continually calling the forward method of player
The inner loop runs the network like so
planes =Variable(torch.randn([1, 36, 122, 64, 64]))
prior_prob, value, value_deg = self.player(planes) # player is an instance of the network
After a couple of iterations, I get a cuda runtime error
:
THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1523242347739/work/torch/lib/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
File "varian_main.py", line 409, in <module>
neural_fsp.train_nets(mask.rsplit(sep=".")[0])
File "varian_main.py", line 311, in train_nets
best_node_oppo = mcts.run_tree_search(root_node_oppo, self.player_oppo)
File "/mnt/md0/lex/RadOncol/beam_optim/scripts/monte_carlo/mcts.py", line 123, in run_tree_search
new_node = self.tree_policy(root_node)
File "/mnt/md0/lex/RadOncol/beam_optim/scripts/monte_carlo/mcts.py", line 141, in tree_policy
return self.expand(node)
File "/mnt/md0/lex/RadOncol/beam_optim/scripts/monte_carlo/mcts.py", line 160, in expand
prior_prob, value, value_deg = self.player(planes)
File "/home/lekan/anaconda3/envs/py35/lib/python3.5/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/mnt/md0/lex/RadOncol/beam_optim/scripts/models/varian_model.py", line 109, in forward
x = self.conv1(x)
File "/home/lekan/anaconda3/envs/py35/lib/python3.5/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/lekan/anaconda3/envs/py35/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 388, in forward
self.padding, self.dilation, self.groups)
File "/home/lekan/anaconda3/envs/py35/lib/python3.5/site-packages/torch/nn/functional.py", line 126, in conv3d
return f(input, weight, bias)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1523242347739/work/torch/lib/THC/generic/THCStorage.cu:58
I can’t seem to find what I am doing wrong. Do we have to manually free cuda memory everytime we repeatedly call forward?
I am on 0.3.1
version