A short script to reproduce a stack overflow exception with LSTM

Hi folks,

I’m a fairly new PyTorch user and am trying to switch to PyTorch for my work. While playing with it recently, I ran into this stack overflow exception. If anybody can help me here, I’d really appreciate it. Thanks a lot!

The script below throws a stack overflow exception when “loss.item()” is called; it can be consistently reproduced on the first mini-batch.
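
In case it helps narrow things down, here is a reduced sketch of the same pattern: a single long one-hot sequence, one forward/backward pass through the LSTM, then item(). I haven’t verified that this minimal form fails in exactly the same place, so treat it only as a starting point; the sizes are copied from the full script further down.

import random
import torch

random.seed(1)
torch.manual_seed(1)

FEATURE_DIM = 1000  # same sizes as in the full repro script below
SEQ_LEN = 1500

lstm = torch.nn.LSTM(FEATURE_DIM, 32, batch_first=True)
head = torch.nn.Linear(32, 1)
criterion = torch.nn.BCEWithLogitsLoss()

# One padded "batch" holding a single long one-hot sequence.
x = torch.zeros(1, SEQ_LEN, FEATURE_DIM)
for t in range(SEQ_LEN):
    x[0, t, random.randint(0, FEATURE_DIM - 1)] = 1.0
packed = torch.nn.utils.rnn.pack_padded_sequence(x, [SEQ_LEN], batch_first=True)

_, (hidden, _) = lstm(packed)
loss = criterion(head(hidden)[0], torch.FloatTensor([[1.0]]))
loss.backward()
print(loss.item())  # the full script overflows around the equivalent call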

My environment information:

PyTorch version: 0.4.0
Is debug build: No
CUDA used to build PyTorch: 8.0

OS: Microsoft Windows 10 Enterprise
GCC version: Could not collect
CMake version: Could not collect

Python version: 3.6
Is CUDA available: Yes
CUDA runtime version: 8.0.60
GPU models and configuration: Could not collect
Nvidia driver version: Could not collect
cuDNN version: Could not collect

Versions of relevant libraries:
[pip] numpy (1.13.1)
[pip] numpydoc (0.7.0)
[pip] torch (0.4.0)
[pip] torchvision (0.2.0)
[conda] pytorch 0.4.0 py36_cuda80_cudnn7he774522_1 pytorch
[conda] torchvision 0.2.0


import torch
import torch.autograd

import random

random.seed(1)
torch.manual_seed(1)

class DataGenerator:
    def __init__(self, feature_dim, max_seq_len, num_train_samples):
        self.feature_dim = feature_dim
        self.max_seq_len = max_seq_len
        self.num_train_samples = num_train_samples
    
    def get_next_batch(self, batch_size=1, num_batches=10):                
        for b in range(num_batches):
            samples = []
            for i in range(batch_size):
                length = random.randint(self.max_seq_len - 100, self.max_seq_len)
                features = [self.create_feature() for j in range(length)]
                target = random.randint(0, 1)
                samples.append((features, target))
            yield self.create_padded_batch(samples)
    
    def create_feature(self):
        tensor = torch.zeros(1, self.feature_dim)
        d = random.randint(0, self.feature_dim - 1)
        tensor[0][d] = 1.0
        return tensor

    def create_padded_batch(self, samples):
        # Sort by sequence length (required by pack_padded_sequence later).
        sorted_samples = sorted(samples, key=lambda x: len(x[0]), reverse=True)
        inputs = [torch.cat(x[0], dim=0) for x in sorted_samples]
        input_lengths = [len(x) for x in inputs]
        targets = [x[1] for x in sorted_samples]
        target_batch = torch.FloatTensor(targets).view(-1, 1)
        # pad_sequence returns a padded (batch, max_len, feature_dim) tensor, not a PackedSequence.
        padded_input = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True)
        return padded_input, input_lengths, target_batch


class LSTMRNNModule(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTMRNNModule, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers=1, bidirectional=False, batch_first=True, dropout=0)
        self.output_layer = torch.nn.Linear(hidden_size, 1)
    
    def forward(self, input, input_lengths):
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input, input_lengths, batch_first=True)
        out, (hidden, cstate) = self.lstm(packed_input)
        # hidden has shape (num_layers * num_directions, batch, hidden_size) = (1, batch, hidden_size).
        output = self.output_layer(hidden)
        return output

class Trainer(object):

    def __init__(self, feature_dim, hidden_size, data_set):
        self.data_set = data_set
        self.rnn = LSTMRNNModule(feature_dim, hidden_size)
        self.criterion = torch.nn.BCEWithLogitsLoss(size_average=True, reduce=True)
        self.optimizer = torch.optim.SGD(self.rnn.parameters(), lr=0.001, momentum=0.9)
    
    def train(self, max_iter, eval_every, batch_size, num_train_samples):

        for it in range(max_iter):            
            batch_num = 0
            for X,L,Y in self.data_set.get_next_batch(batch_size, num_train_samples):
                batch_num += 1
                loss = self.train_iter(X,L,Y)                
                print("{3}/{0}/{1}, Loss: {2:0.3f}".format(it + 1, max_iter, loss, batch_num))
    
    def train_iter(self, input, input_lengths, target):
        self.optimizer.zero_grad()
        output = self.rnn(input, input_lengths)
        output = output[0]  # drop the (num_layers * num_directions) dimension -> (batch, 1)
        loss = self.criterion(output, target)
        loss.backward()
        self.optimizer.step()
        return loss.item()  # <-- the stack overflow is raised here on the first mini-batch


FEATURE_DIM = 1000
HIDDEN_SIZE = 32
MAX_SEQ_LEN = 1500
NUM_TRAIN_SAMPLES = 2000
MAX_ITER = 30
EVAL_EVERY = 1
BATCH_SIZE = 1

data_set = DataGenerator(FEATURE_DIM, MAX_SEQ_LEN, NUM_TRAIN_SAMPLES)

trainer = Trainer(FEATURE_DIM, HIDDEN_SIZE, data_set)

trainer.train(MAX_ITER, EVAL_EVERY, BATCH_SIZE, NUM_TRAIN_SAMPLES)

See https://github.com/pytorch/pytorch/issues/7198