GPU memory usage increases on every iteration until it runs out

I wrote a NASNet-style network in PyTorch that essentially defines new layers inside the forward method, so there are no predefined layers in __init__; each layer is only created once the input size is known.
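For illustration, here is a stripped-down toy version of the pattern I mean (not my real network; the names are made up):

import torch.nn as nn

class ToyCell(nn.Module):
    def __init__(self):
        super(ToyCell, self).__init__()
        self.hidden_out = []

    def forward(self, x):
        # The conv can only be built here, because in_channels is
        # unknown until the first input arrives
        conv = nn.Conv2d(x.size(1), 50, 3, padding=1)
        if x.is_cuda:
            conv = conv.cuda()
        self.hidden_out.append(x)  # also keeps a handle on every input
        return conv(x)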

Here is the debug output; the tensor shapes are identical on every iteration, yet extra GPU memory keeps being consumed:

x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
X OUT torch.Size([100, 100, 28, 28])
at epoch 1: consuming extra 1019.2 MB
x torch.Size([100, 1, 28, 28])
x torch.Size([100, 1, 28, 28])
x torch.Size([100, 1, 28, 28])
x torch.Size([100, 1, 28, 28])
x torch.Size([100, 1, 28, 28])
X OUT torch.Size([100, 50, 28, 28])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
X OUT torch.Size([100, 100, 28, 28])
at epoch 1: consuming extra 1355.3 MB
x torch.Size([100, 1, 28, 28])
x torch.Size([100, 1, 28, 28])
x torch.Size([100, 1, 28, 28])
x torch.Size([100, 1, 28, 28])
x torch.Size([100, 1, 28, 28])
X OUT torch.Size([100, 50, 28, 28])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
x torch.Size([100, 50, 14, 14])
X OUT torch.Size([100, 100, 28, 28])

What should I do?

I create the layers like this:

def forward(self, x, x_prev):
    # Note: these lists grow on every forward pass
    self.hidden_out.append(x)
    self.hidden_out.append(x_prev)

    # Left branch: build an op for the current input size and apply it
    x_comb_iter_0_left = x
    (_, CL, HL, WL) = x_comb_iter_0_left.size()
    (x_op_left, filter_left) = self.apply_conv_op(self.op_left[0], CL, self.out_channels_left)
    x_output_left = x_op_left(x_comb_iter_0_left)
    #print('x_output_left0', x_output_left.size())

    # Right branch: same for the previous cell's output
    x_comb_iter_0_right = x_prev
    (_, CR, HR, WR) = x_comb_iter_0_right.size()
    (x_op_right, filter_right) = self.apply_conv_op(self.op_right[0], CR, self.out_channels_right)
    x_output_right = x_op_right(x_comb_iter_0_right)
    #print('x_output_right0', x_output_right.size())

    # Zero-pad whichever branch is spatially smaller so the two match
    (_, CL, HL, WL) = x_output_left.size()
    (_, CR, HR, WR) = x_output_right.size()
    pad = abs(HL - HR)
    if HL > HR:
        x_output_right = nn.ZeroPad2d((pad, 0, pad, 0))(x_output_right)
    else:
        # ... (symmetric padding of the left branch; rest of forward elided) ...
def apply_conv_op(self, op, in_channels, out_channels):
    # Builds (and returns) a brand-new module on every call
    if 'twoconv' in op:
        return TwoConv(in_channels, out_channels), out_channels
    if 'conv3' in op:
        return Conv(in_channels, out_channels, 3), out_channels
    if 'conv5' in op:
        return Conv(in_channels, out_channels, 5), out_channels
    if 'conv7' in op:
        return Conv(in_channels, out_channels, 7), out_channels
    if 'sep' in op:
        return TwoSeparables(in_channels, out_channels, 3, 1, bias=False), out_channels
    if 'max' in op:
        return nn.Sequential(nn.MaxPool2d(3, stride=1, padding=1), nn.ReLU()), in_channels
    if 'avg' in op:
        return nn.Sequential(nn.AvgPool2d(3, stride=1, padding=1), nn.ReLU()), in_channels
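
Is part of the problem that apply_conv_op constructs a brand-new module (with fresh GPU parameters) on every forward pass? Would it help to cache and register each op once, e.g. something like this untested sketch (the key scheme here is made up for illustration)?

def apply_conv_op(self, op, in_channels, out_channels):
    # Hypothetical cached variant: build each op once, then reuse it
    key = '{}_{}_{}'.format(op, in_channels, out_channels)
    if not hasattr(self, key):
        if 'conv3' in op:
            module = Conv(in_channels, out_channels, 3)
        # ... remaining op types as in the original ...
        self.add_module(key, module)  # registers the parameters exactly once
    return getattr(self, key), out_channels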

ERROR:

x_output_right2 torch.Size([100, 1, 28, 28])
x_comb_iter_2 torch.Size([100, 128, 28, 28])
x torch.Size([100, 320, 14, 14])
x_output_left3 torch.Size([100, 128, 14, 14])
x_output_right3 torch.Size([100, 1, 28, 28])
x_comb_iter_3 torch.Size([100, 128, 28, 28])
x torch.Size([100, 320, 14, 14])
x_output_left4 torch.Size([100, 128, 14, 14])
x_output_right4 torch.Size([100, 1, 28, 28])
x_comb_iter_4 torch.Size([100, 128, 28, 28])
X OUT torch.Size([100, 640, 28, 28])
THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1503968623488/work/torch/lib/THC/generic/THCStorage.cu line=66 error=2 : out of memory
Traceback (most recent call last):
  File "standalone_lowmem.py", line 530, in <module>
    train(epoch,model)
  File "standalone_lowmem.py", line 469, in train
    optimizer.step()
  File "/home/ubuntu/anaconda/envs/rl/lib/python3.5/site-packages/torch/optim/adam.py", line 68, in step
    denom = exp_avg_sq.sqrt().add_(group['eps'])
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1503968623488/work/torch/lib/THC/generic/THCStorage.cu:66

This happens during training, while the controller is trying to find the best conv cell. It runs on CUDA. Please help!