Stack overflow exception using LSTM

Joshua_Xie · May 1, 2018, 10:48pm

Hi,

I’m trying to learn and play with PyTorch, but I ran into a stack overflow exception. Below are my code snippets. The last line of the code below throws a “Windows fatal exception: stack overflow” at some point during training. However, if I switch to torch.nn.RNN, everything works just fine. Any help will be appreciated.


class VanillaRNNModule(torch.nn.Module):
    """Single-layer tanh RNN followed by a linear layer with one output unit.

    The recurrent layer is unidirectional, uses ``batch_first=True`` and no
    dropout. The linear layer is applied to the final hidden state returned
    by the RNN, not to the per-step outputs.
    """

    def __init__(self, input_size, hidden_size):
        """Build the recurrent layer and the output projection.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the RNN hidden state.
        """
        super(VanillaRNNModule, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnn = torch.nn.RNN(
            input_size,
            hidden_size,
            num_layers=1,
            nonlinearity='tanh',
            batch_first=True,
            dropout=0,
            bidirectional=False,
        )
        self.output_layer = torch.nn.Linear(hidden_size, 1)

    def forward(self, input, input_lengths):
        """Run the RNN over a padded batch and project the final hidden state.

        Args:
            input: Padded batch, batch dimension first (it is packed with
                ``batch_first=True``).
            input_lengths: Length of each sequence in ``input``, as accepted
                by ``pack_padded_sequence``.

        Returns:
            The output layer applied to the RNN's final hidden state. The
            leading dimension is the one the RNN puts on its hidden state.
        """
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(
            input, input_lengths, batch_first=True
        )
        # No initial state is passed, so the RNN uses its own default. The
        # previous version also built one with get_init_hidden() here but
        # never handed it to the RNN, so that allocation was removed.
        _, final_hidden = self.rnn(packed_input)
        return self.output_layer(final_hidden)

    def get_init_hidden(self):
        """Return a zero tensor of shape ``(1, hidden_size)``.

        NOTE(review): forward() does not use this, and the shape carries no
        batch dimension -- confirm the expected shape before passing it to
        ``self.rnn`` as an initial state.
        """
        return torch.zeros(1, self.hidden_size)

class LSTMRNNModule(torch.nn.Module):
    """Single-layer LSTM followed by a linear layer with one output unit.

    The LSTM is unidirectional, uses ``batch_first=True`` and no dropout.
    Only the final hidden state is fed to the output layer; the final cell
    state and the per-step outputs are discarded.
    """

    def __init__(self, input_size, hidden_size):
        """Create the LSTM and the output projection.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the LSTM hidden state.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(
            input_size,
            hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=0,
            bidirectional=False,
        )
        self.output_layer = torch.nn.Linear(hidden_size, 1)

    def forward(self, input, input_lengths):
        """Pack the padded batch, run the LSTM, and project its final hidden state.

        Args:
            input: Padded batch, batch dimension first.
            input_lengths: Length of each sequence in ``input``, as accepted
                by ``pack_padded_sequence``.

        Returns:
            The output layer applied to the LSTM's final hidden state.
        """
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            input, input_lengths, batch_first=True
        )
        _, final_state = self.lstm(packed)
        # final_state is the (hidden, cell) pair; only the hidden part is used.
        final_hidden = final_state[0]
        return self.output_layer(final_hidden)

    def get_init_hidden(self):
        """Return a zero tensor of shape ``(1, hidden_size)``; forward() does not call it."""
        return torch.zeros(1, self.hidden_size)

class RNNBinaryClassifierTrainer(object):
    """Trains an ``LSTMRNNModule`` with SGD and a BCE-with-logits loss.

    This snippet relies on names that are not shown here: the methods
    ``self.predict`` and ``self.evaluate_auc``, and the module-level
    ``time``, ``timeSince`` and ``time_start``.
    """

    def __init__(self, data_set, hidden_size, batch_size=1):
        """Set up the model, loss, and optimizer.

        Args:
            data_set: Data source; must expose ``n_words`` (used as the
                model's input size) and ``next_train_batch(batch_size)``.
            hidden_size: Hidden size passed to the recurrent module.
            batch_size: Batch size requested from ``data_set``.
        """
        self.data_set = data_set
        self.batch_size = batch_size
        self.all_losses = []
        # Swapping this for VanillaRNNModule(data_set.n_words, hidden_size)
        # is the variant the post reports as working.
        self.rnn = LSTMRNNModule(data_set.n_words, hidden_size)
        # NOTE(review): size_average/reduce are legacy arguments; newer
        # PyTorch releases express this via `reduction` -- confirm against
        # the installed version before changing.
        self.criterion = torch.nn.BCEWithLogitsLoss(size_average=True, reduce=True)
        self.optimizer = torch.optim.SGD(
            self.rnn.parameters(), lr=0.001, momentum=0.9
        )

    def train(self, max_iter, evl_every):
        """Run ``max_iter`` passes over the training batches.

        Every ``evl_every`` passes, the loss accumulated since the last
        report is divided by ``evl_every``, appended to ``self.all_losses``,
        and printed together with the AUC from ``self.evaluate_auc``.

        Args:
            max_iter: Number of passes to run.
            evl_every: Reporting interval, in passes.
        """
        running_loss = 0
        for epoch in range(1, max_iter + 1):
            epoch_start = time.time()
            batches = self.data_set.next_train_batch(self.batch_size)
            for X, L, Y in batches:
                running_loss += self.train_iter(X, L, Y)
            train_done = time.time()

            if epoch % evl_every != 0:
                continue

            avg_loss = running_loss / evl_every
            self.all_losses.append(avg_loss)
            auc = self.evaluate_auc(self.batch_size)
            eval_done = time.time()
            # NOTE(review): "Eval Used" is measured from the start of the
            # pass, so it includes the training time as well.
            print("Epoch Used: {0:0.3f}, Eval Used: {1:0.3f}".format(
                train_done - epoch_start, eval_done - epoch_start))
            print("Elapsed: {3}, Iter: {0}, Loss: {1:8.3f}, AUC: {2:0.3f}".format(
                epoch, avg_loss, auc, timeSince(time_start)))
            running_loss = 0

    def train_iter(self, input, input_lengths, target):
        """Perform one optimization step on a single batch.

        Args:
            input: Batch passed to ``self.predict``.
            input_lengths: Sequence lengths passed to ``self.predict``.
            target: Targets compared against the first element of the
                prediction.

        Returns:
            The batch loss as a Python float.
        """
        self.optimizer.zero_grad()
        # Only element 0 of the prediction is scored against the target.
        logits = self.predict(input, input_lengths)[0]
        loss = self.criterion(logits, target)
        loss.backward()
        self.optimizer.step()
        return float(loss)

sairam.pillai · June 28, 2018, 4:02pm

I am facing the same issue while using LSTM. I used the Visual Studio 2017 debugger to find this error message:

Unhandled exception at 0x00007FFF493FD7B5 (ucrtbase.dll) in python3.exe: 0xC00000FD: Stack overflow (parameters: 0x0000000000000001, 0x000000C094C13FE8). occurred

OS: Windows 10
Python: 3.6
PyTorch: 0.4.0

Is this something that was fixed in PR #6873?

czs · July 25, 2018, 1:48pm

Hey, I think I’m having the same issue.
I’ve attached a simple example that reproduces the error. I would appreciate any help.

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable



class LSTM(nn.Module):
    """Minimal LSTM wrapper used to reproduce the crash.

    NOTE(review): forward() runs the LSTM but discards its result and
    returns the reshaped input instead, so the loss never depends on the
    LSTM parameters. This is kept as-is because the snippet is a
    reproduction case.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        """Store the sizes and build a unidirectional ``nn.LSTM``.

        Args:
            input_size: Number of features per step.
            hidden_size: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=False)

    def forward(self, x):
        """Run the LSTM on ``x``, then return ``x`` viewed as ``(x.size(0), 1)``.

        The returned value is wrapped in a ``Variable`` with
        ``requires_grad=True``; the LSTM outputs are not part of it.
        """
        outputs, _ = self.rnn(x)
        n_rows = x.size(0)
        reshaped = x.view(n_rows, 1)
        return Variable(reshaped, requires_grad=True)


# Reproduction script: a one-layer LSTM with 1 input feature and 5 hidden
# units, trained for a few steps on random data.
model = LSTM(input_size=1, hidden_size=5, num_layers=1)
criterion = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

# The LSTM above is built without batch_first, so the leading dimension of
# trainX is what it iterates over as time steps.
SEQ_LEN = 5000
NUM_STEPS = 5

trainX = torch.randn(SEQ_LEN, 1, 1)
trainY = torch.randn(SEQ_LEN, 1)

for i in range(NUM_STEPS):
    # Forward pass and loss.
    outTrain = model(trainX)
    loss = criterion(outTrain, trainY)

    # Clear old gradients, backpropagate, and update.
    model.zero_grad()
    loss.backward()
    optimizer.step()

Produces error: 0xc00000fd