How to fix nn.GRU memory leak?

Hi,

This simple example of using nn.GRU produces a memory leak on the GPU. Does anybody know how to fix it? Thanks in advance!

import numpy as np
import torch
from torch import optim
from torch import Tensor
from torch import nn
from torch.autograd import Variable

from warpctc_pytorch import CTCLoss

class Model(nn.Module):
    def __init__(self,
                 num_inputs=512,
                 num_classes=11,
                 rnn_hidden_size=128,
                 rnn_num_layers=2,
                 rnn_dropout=0.0):
        super().__init__()
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn_num_layers = rnn_num_layers
        self.rnn = nn.GRU(input_size=num_inputs,
                          hidden_size=self.rnn_hidden_size,
                          num_layers=self.rnn_num_layers,
                          batch_first=False,
                          dropout=rnn_dropout,
                          bidirectional=True)
        self.linear = nn.Linear(self.rnn_hidden_size * 2, num_classes)

    def init_hidden(self, batch_size, gpu=False):
        # 2 * rnn_num_layers because the GRU is bidirectional
        h0 = Variable(torch.zeros(self.rnn_num_layers * 2, batch_size, self.rnn_hidden_size))
        if gpu:
            h0 = h0.cuda()
        return h0

    def forward(self, x, hidden):
        # compact the RNN weights into one contiguous chunk so cuDNN can use fast code paths
        self.rnn.flatten_parameters()
        out, hidden = self.rnn(x, hidden)
        return self.linear(out)


num_inputs = 512
net = Model(num_inputs=num_inputs).cuda()
criterion = CTCLoss()
optimizer = optim.Adam(net.parameters(), lr=1e-5, weight_decay=0.0001)

batch_size = 256
seq_len = 4
num_classes = 10
for i in range(10000):
    x = Variable(torch.randn(seq_len, batch_size, num_inputs)).cuda()
    hidden = net.init_hidden(batch_size, gpu=True)
    acts = net(x, hidden).cpu()
    labels = Variable(torch.from_numpy(np.random.randint(1, num_classes + 1, (batch_size, seq_len))).int()).view(-1)
    act_lens = Variable(Tensor([seq_len] * batch_size).int())
    label_lens = Variable(Tensor([seq_len] * batch_size).int())
    loss = criterion(acts, labels, act_lens, label_lens) / batch_size
    loss.backward()
    optimizer.step()
    print("loss: {}".format(loss.data[0]))

What are the symptoms of your memory leak?

So, the script allocates more memory on each iteration, and after some iterations it crashes with an "out of memory" error. In nvidia-smi I can see the memory usage growing.
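
A small helper like the one below can confirm that growth from inside the script rather than from nvidia-smi. This is only a sketch, and it assumes a PyTorch version that provides torch.cuda.memory_allocated() and torch.cuda.max_memory_allocated(); older releases may not have them.

import torch

def log_gpu_memory(step):
    # Assumes torch.cuda.memory_allocated()/max_memory_allocated() exist in this PyTorch version.
    allocated_mb = torch.cuda.memory_allocated() / (1024 ** 2)
    peak_mb = torch.cuda.max_memory_allocated() / (1024 ** 2)
    print("step {}: allocated {:.1f} MB (peak {:.1f} MB)".format(step, allocated_mb, peak_mb))

# Call it once per iteration, e.g. right after optimizer.step():
#     log_gpu_memory(i)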

Hi, I've run into the same problem. Has this been solved?

You call init_hidden in every iteration, and that function creates a Variable whose gradients need to be tracked. I think this is the reason for the "leak". Try removing init_hidden from the loop, or move it outside the loop, and watch the memory usage again.
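
For reference, here is a rough, untested sketch of that suggestion against the script above (it assumes the model, criterion, and optimizer setup stay unchanged): the hidden state is created once before the loop and reused every step, and the loop also clears old gradients each iteration, which the original snippet omits.

# Sketch: create the initial hidden state once, outside the training loop.
hidden = net.init_hidden(batch_size, gpu=True)

for i in range(10000):
    x = Variable(torch.randn(seq_len, batch_size, num_inputs)).cuda()
    acts = net(x, hidden).cpu()
    labels = Variable(torch.from_numpy(
        np.random.randint(1, num_classes + 1, (batch_size, seq_len))).int()).view(-1)
    act_lens = Variable(Tensor([seq_len] * batch_size).int())
    label_lens = Variable(Tensor([seq_len] * batch_size).int())

    optimizer.zero_grad()  # clear gradients from the previous step (missing in the original loop)
    loss = criterion(acts, labels, act_lens, label_lens) / batch_size
    loss.backward()
    optimizer.step()
    print("loss: {}".format(loss.data[0]))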