High GPU memory demand for Seq2Seq (compared to TF)

I am building a chatbot using a Seq2Seq model. During training, my GPU runs out of memory with fairly small batch sizes (anything >= 20), while the TensorFlow implementations I have seen can comfortably handle larger models.

Here are some specs for context (my PyTorch implementation / the TensorFlow implementations); a rough memory estimate follows the list:

  • number of layers (encoder and decoder) = 1 / 2
  • hidden size (encoder and decoder) = 256 / 512
  • batch size = 20 / 64
  • vocab size (shared between encoder and decoder) = 20,000 / 20,000
  • GPU total memory = 11 GB (NVIDIA GTX 1080 Ti)
  • longest seq len = 686 words (Cornell Movie-Dialogs Corpus)
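
As a rough back-of-envelope check (assuming float32 activations): even at batch size 20, a single decoder output tensor of shape batch_size x longest_seq_len x vocab_size is already about 1 GiB, and the decoder below materializes at least two tensors of that size (the linear output and the log-softmax result) that stay alive until backward() runs.

# back-of-envelope estimate for one decoder output tensor, float32
batch_size, seq_len, vocab_size = 20, 686, 20000
bytes_per_float = 4
print batch_size * seq_len * vocab_size * bytes_per_float / 1024.0 ** 3, 'GiB'  # ~1.02 GiB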

I tried playing around with the code a bit, and it appears that GPU memory is not freed after each training iteration.
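
One quick way to check this between iterations (just a sketch, assuming nvidia-smi is on the PATH; gpu_mem_used_mib is a made-up helper name):

import subprocess

def gpu_mem_used_mib():
    # ask nvidia-smi for the used memory of GPU 0, in MiB, without header or units
    out = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'])
    return int(out.strip().split('\n')[0])

Printing gpu_mem_used_mib() at the end of each training iteration makes it easy to see whether usage keeps climbing or plateaus after the first iteration.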

Here is the model code:

import time

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.optim import SGD

from data import TRAIN_FILE_NAME
from data import VAL_FILE_NAME
from data import pad_seqs
from dataset import Dataset
from data import PAD_TOKEN
from data import load_vocab
from data import VOCAB_FILE_NAME

from numpy import prod


class Seq2Seq(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(Seq2Seq, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        self.encoder = EncoderRNN(vocab_size, hidden_size)
        self.decoder = DecoderRNN(vocab_size, hidden_size)

    def _get_loss(self, batch):
        questions = [example['question'] for example in batch]
        answers = [example['answer'] for example in batch]

        answer_lens = [len(a) for a in answers]

        # print max(len(el) for el in questions), max(len(el) for el in answers)

        questions = Variable(torch.LongTensor(pad_seqs(questions))).cuda()
        answers = Variable(torch.LongTensor(pad_seqs(answers))).cuda()
        output = self(questions, answers)

        # print questions.size(), answers.size()

        loss = 0
        loss_fn = torch.nn.NLLLoss()
        batch_size = len(batch)
        for i in xrange(batch_size):
            loss += loss_fn(output[i, :answer_lens[i] - 1], answers[i, 1:answer_lens[i]])

        return loss / batch_size

    def forward(self, input_seqs, target_seqs):
        _, encoder_hidden = self.encoder(input_seqs)
        decoder_output, _ = self.decoder(target_seqs, encoder_hidden)
        return decoder_output

    def train(self, lr=1e-3, batch_size=1, iters=7500, print_iters=100):
        optimizer = SGD(self.parameters(), lr=lr)

        train_losses = []
        val_losses = []

        train = Dataset(TRAIN_FILE_NAME)
        val = Dataset(VAL_FILE_NAME)

        start_time = time.time()
        for i in xrange(1, iters + 1):
            train_batch = [train.get_random_example() for _ in xrange(batch_size)]
            val_batch = [val.get_random_example() for _ in xrange(batch_size)]

            train_loss = self._get_loss(train_batch)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            val_loss = self._get_loss(val_batch)

            train_losses.append(train_loss.data[0])
            val_losses.append(val_loss.data[0])

            if i % print_iters == 0:
                end_time = time.time()
                string = 'epoch: {}, iters: {}, train loss: {:.2f}, val loss: {:.2f}, time: {:.2f} s'
                print string.format(i / len(train), i, train_loss.data[0], val_loss.data[0], end_time - start_time)
                start_time = time.time()

        return train_losses, val_losses


class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(EncoderRNN, self).__init__()

        self.num_layers = 1
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, self.num_layers, batch_first=True)

    def init_hidden(self, batch_size):
        return Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda()

    def forward(self, input_seqs):
        input_seqs = self.embedding(input_seqs)
        batch_size = input_seqs.size()[0]
        output, hidden = self.gru(input_seqs, self.init_hidden(batch_size))
        return output, hidden


class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(DecoderRNN, self).__init__()

        self.num_layers = 1
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, self.num_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax()

    def init_hidden(self, batch_size):
        return Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda()

    @staticmethod
    def create_rnn_input(embedded, thought):
        # reorder axes to be (seq_len, batch_size, hidden_size)
        embedded = embedded.permute(1, 0, 2)

        seq_len, batch_size, hidden_size = embedded.size()
        rnn_input = Variable(torch.zeros((seq_len, batch_size, 2 * hidden_size))).cuda()
        for i in xrange(seq_len):
            for j in xrange(batch_size):
                rnn_input[i, j] = torch.cat((embedded[i, j], thought[0, j]))

        # make batch first
        return rnn_input.permute(1, 0, 2)

    def softmax_batch(self, linear_output):
        result = Variable(torch.zeros(linear_output.size())).cuda()
        batch_size = linear_output.size()[0]
        for i in xrange(batch_size):
            result[i] = self.softmax(linear_output[i])
        return result

    def forward(self, target_seqs, thought):
        target_seqs = self.embedding(target_seqs)
        rnn_input = self.create_rnn_input(target_seqs, thought)
        batch_size = target_seqs.size()[0]
        output, hidden = self.gru(rnn_input, self.init_hidden(batch_size))
        output = self.softmax_batch(self.out(output))
        return output, hidden

And here is the training code:

import matplotlib.pyplot as plt

from data import VOCAB_SIZE
from models import Seq2Seq


def plot_loss(train_losses, val_losses):
    plt.plot(train_losses, color='red', label='train')
    plt.plot(val_losses, color='blue', label='val')
    plt.legend(loc='upper right', frameon=False)
    plt.show()

# VOCAB_SIZE is 20,000
model = Seq2Seq(VOCAB_SIZE, 256).cuda()
train_losses, val_losses = model.train(iters=1000, batch_size=100, print_iters=100)
plot_loss(train_losses, val_losses)

Can you run this before and after you start training:

import torch
from subprocess import call

call(["nvcc", "--version"])
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])

Before training:

(/home/jkarimi91/Apps/anaconda2/envs/torch) jkarimi91@jkarimi91-desktop:~/Projects/chatbot$ python call.py 
('__CUDNN VERSION:', 6021)
('__Number CUDA Devices:', 1L)
index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
0, GeForce GTX 1080 Ti, 375.82, 11169 MiB, 569 MiB, 10600 MiB

During training (after the input and target seqs have been pushed to the GPU, but before feeding them through the model):

(/home/jkarimi91/Apps/anaconda2/envs/torch) jkarimi91@jkarimi91-desktop:~/Projects/chatbot$ python chatbot.py 
('__CUDNN VERSION:', 6021)
('__Number CUDA Devices:', 1L)
index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
0, GeForce GTX 1080 Ti, 375.82, 11169 MiB, 1063 MiB, 10106 MiB

('__CUDNN VERSION:', 6021)
('__Number CUDA Devices:', 1L)
index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
0, GeForce GTX 1080 Ti, 375.82, 11169 MiB, 4985 MiB, 6184 MiB

('__CUDNN VERSION:', 6021)
('__Number CUDA Devices:', 1L)
index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
0, GeForce GTX 1080 Ti, 375.82, 11169 MiB, 4985 MiB, 6184 MiB

('__CUDNN VERSION:', 6021)
('__Number CUDA Devices:', 1L)
index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
0, GeForce GTX 1080 Ti, 375.82, 11169 MiB, 5765 MiB, 5404 MiB

('__CUDNN VERSION:', 6021)
('__Number CUDA Devices:', 1L)
index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
0, GeForce GTX 1080 Ti, 375.82, 11169 MiB, 8881 MiB, 2288 MiB

THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1503966894950/work/torch/lib/THC/generic/THCStorage.cu line=66 error=2 : out of memory
Traceback (most recent call last):
  File "chatbot.py", line 16, in <module>
    train_losses, val_losses = model.train(iters=1000, batch_size=100, print_iters=100)
  File "/home/jkarimi91/Projects/chatbot/models.py", line 84, in train
    train_loss.backward()
  File "/home/jkarimi91/Apps/anaconda2/envs/torch/lib/python2.7/site-packages/torch/autograd/variable.py", line 156, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
  File "/home/jkarimi91/Apps/anaconda2/envs/torch/lib/python2.7/site-packages/torch/autograd/__init__.py", line 98, in backward
    variables, grad_variables, retain_graph)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1503966894950/work/torch/lib/THC/generic/THCStorage.cu:66

My best guess is that there is a flaw or memory leak in init_hidden or create_rnn_input.
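
In case it helps narrow this down, create_rnn_input can be written without the nested Python loops (every iteration of those loops adds nodes to the autograd graph). A rough equivalent, assuming embedded is (batch, seq_len, hidden) and thought is (1, batch, hidden) as in the code above:

import torch

def create_rnn_input(embedded, thought):
    # embedded: (batch, seq_len, hidden); thought: (1, batch, hidden)
    batch_size, seq_len, hidden_size = embedded.size()
    # repeat the thought vector along the time dimension...
    thought = thought[0].unsqueeze(1).expand(batch_size, seq_len, hidden_size)
    # ...and concatenate it with the embeddings along the feature dimension
    return torch.cat((embedded, thought), 2)

The softmax_batch loop could likewise be replaced by applying the log-softmax once to a 2D view of the whole output.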

Two solutions/ideas come to mind to address this problem (a sketch of both follows the list):

  1. When computing the val loss, set volatile=True so that autograd does not retain the graph.
  2. Many chatbot implementations that use Seq2Seq with the Cornell Movie-Dialogs Corpus limit the sequence length to ~20 words.
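
A rough sketch of both ideas together, reusing pad_seqs and the batch format from the post; get_val_loss and MAX_LEN are names made up for this sketch, and with volatile=True the forward pass keeps no buffers for backward:

import torch
from torch.autograd import Variable

from data import pad_seqs  # same helper as in the original code

MAX_LEN = 20  # hypothetical cap, in line with other Cornell-corpus implementations

def get_val_loss(model, batch):
    # truncate long dialog turns before padding so activations stay bounded
    questions = [example['question'][:MAX_LEN] for example in batch]
    answers = [example['answer'][:MAX_LEN] for example in batch]
    answer_lens = [len(a) for a in answers]

    # volatile=True marks the whole forward pass as inference-only,
    # so autograd retains no intermediate buffers
    questions = Variable(torch.LongTensor(pad_seqs(questions)), volatile=True).cuda()
    answers = Variable(torch.LongTensor(pad_seqs(answers)), volatile=True).cuda()
    output = model(questions, answers)

    loss = 0
    loss_fn = torch.nn.NLLLoss()
    for i in xrange(len(batch)):
        loss += loss_fn(output[i, :answer_lens[i] - 1], answers[i, 1:answer_lens[i]])
    return loss / len(batch)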

I am willing to run it locally and debug it; please upload the dataset and the full code to a git repository.