LSTM for text generation

Hello everyone. I am trying to train an LSTM network for text generation, but it doesn't seem to train very well: the loss decreases very slowly, or not at all, and the accuracy doesn't seem to improve. What am I doing wrong? Sorry if it looks like a lot of code; it's actually only about 100 lines and just looks longer because of the docstrings.

import time

import torch
import torch.nn as nn
from torch.autograd import Variable


class LSTM(nn.Module):

    """LSTM neural network

    Args:
         params (dict): holds the program hyperparameters
    """

    def __init__(self, params):
        super(LSTM, self).__init__()

        self.hidden_dim = params['h_dim']
        self.n_layers = params['n_layers']
        self.batch = params['batch']
        self.seq = params['seq']
        alphabet_size = output_size = params['alphabet_size']

        self.i2h = nn.Linear(alphabet_size, self.hidden_dim)
        self.lstm = nn.LSTM(self.hidden_dim, self.hidden_dim, self.n_layers,
                            batch_first=True, dropout=True)

        self.h2O = nn.Linear(self.hidden_dim, output_size)
        self.hidden = self.init_hidden(params['type'])

    def init_hidden(self, type):
        """Initialize the LSTM hidden and cell state

        Args:
            type: the tensor type, e.g. torch.FloatTensor or torch.cuda.FloatTensor

        Returns:
            h_0, c_0 (Variable, Variable): Tensors of size (L, B, H) where:
            L: number of LSTM layers
            B: batch size
            H: hidden dimension of the LSTM
        """
        h_0 = Variable(
            torch.zeros(self.n_layers, self.batch, self.hidden_dim).type(type))

        c_0 = Variable(
            torch.zeros(self.n_layers, self.batch, self.hidden_dim).type(type))

        return h_0, c_0

    def forward(self, sequence):
        """Computes the neural net forward pass

        Args:
            sequence (Variable): one-hot Tensor of size (B,SL-1,AS) where:
            B: batch size
            SL: sequence length
            AS: alphabet size

        Returns:
            out (Variable): Tensor of unnormalized scores of size (B*(SL-1), AS)

        """

        out = self.i2h(sequence)
        lstm_out, self.hidden = self.lstm(
            out.view(self.batch, self.seq - 1, -1), self.hidden)
        out = self.h2O(lstm_out.contiguous().view(-1, self.hidden_dim))
        return out
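
# (Illustrative shape check only, with tiny placeholder sizes rather than my real
# hyperparameters.) With B=2 sequences of SL=5 chars over an alphabet of AS=4
# symbols, forward() receives a one-hot tensor of size (2, 4, 4) and should return
# (2 * 4, 4) = (8, 4) unnormalized scores, one row per predicted character;
# here I just pass zeros to check the shapes.
_shape_check_params = {'h_dim': 8, 'n_layers': 1, 'batch': 2, 'seq': 5,
                       'alphabet_size': 4, 'type': torch.FloatTensor}
_shape_check_net = LSTM(_shape_check_params)
print(_shape_check_net(Variable(torch.zeros(2, 4, 4))).size())  # torch.Size([8, 4])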

def sequence_to_one_hot(sequence, char_to_ix, params):
    """Turns a sequence of chars into a o ne-hot Tensor

    Args:
        sequence (str): sequence of chars
        char_to_ix (dict): mapping from chars to integers (indexes)
        params (dict): holds the program hyperparameters

    Returns:
        tensor (Tensor): one-hot tensor of size (B,SL,AS) where:
        B: batch size
        SL: sequence length
        AS: alphabet size
    """

    batch_size = params['batch'] * params['seq']
    assert len(sequence) == batch_size, 'Sequence must be a batch'

    tensor = torch.zeros(len(sequence), params['alphabet_size']).type(params['type'])

    for i, c in enumerate(sequence):
        tensor[i][char_to_ix[c]] = 1

    return tensor.view(params['batch'], params['seq'], params['alphabet_size'])
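
# (Illustrative example with a tiny made-up alphabet, not my real data.)
# 'abcabc' is batch * seq = 2 * 3 characters, so it becomes a (2, 3, 3) tensor
# whose rows along the last dimension are the one-hot vectors of the characters.
_one_hot_params = {'batch': 2, 'seq': 3, 'alphabet_size': 3, 'type': torch.FloatTensor}
_one_hot_demo = sequence_to_one_hot('abcabc', {'a': 0, 'b': 1, 'c': 2}, _one_hot_params)
print(_one_hot_demo.size())  # torch.Size([2, 3, 3])
print(_one_hot_demo[0, 1])   # the one-hot vector for 'b': 0 1 0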

def train(dataloaders, char_to_ix, model, optimizer, criterion, params):
    """Trains the neural net

    Args:
        dataloaders (dict): holds PyTorch Dataloaders for training and validation
        char_to_ix: (dict): mapping from chars to integers (indexes)
        model (LSTM): the model to be trained
        optimizer (Optimizer): PyTorch optimizer
        criterion: Loss function
        params (dict): holds the program hyperparameters

    Returns:
        model (LSTM): the trained model
    """

    assert len(dataloaders['train']) != 0, 'Not enough data for training'
    assert len(dataloaders['val']) != 0, 'Not enough data for validation'

    since = time.time()

    best_loss = float('inf')
    epoch = 1
    bad_epochs = 0

    dataset_size = {x: len(dataloaders[x]) * dataloaders[x].batch_size
                    for x in ['train', 'val']}

    while True:

        print('Epoch {}'.format(epoch))
        print('=' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:

            if phase == 'train':
                model.train(True)   # training mode
            else:
                model.train(False)  # evaluation mode

            running_loss = 0
            running_corrects = 0

            # Iterate over the data
            for batch in dataloaders[phase]:

                model.zero_grad()
                model.hidden = model.init_hidden(params['type'])

                inputs = Variable(sequence_to_one_hot(batch, char_to_ix, params))

                out = model(inputs[:, :-1, :])
                _, preds = out.max(1)

                # Get the targets (indexes where the one-hot vector is 1)
                _, target = inputs[:, 1:, :].topk(1)

                loss = criterion(out, target.view(-1))

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                running_loss += loss.data[0]
                # preds is flat with shape (B*(SL-1),), so flatten target to match
                running_corrects += torch.sum(preds == target.view(-1)).data[0]

            # Compute mean epoch loss and accuracy
            epoch_loss = running_loss / len(dataloaders[phase])
            epoch_acc = running_corrects / dataset_size[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':

                # Save best weights
                if epoch_loss < best_loss:
                    bad_epochs = 0
                    best_loss = epoch_loss
                    torch.save(model.state_dict(), 'rnn.pkl')

                else:
                    bad_epochs += 1

        # Hara-kiri: stop if the validation loss hasn't improved for 10 epochs
        if bad_epochs == 10:
            break

        epoch += 1

    time_elapsed = time.time() - since

    print('\nTraining completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    print('Best Loss: {:.4f}\n\n'.format(best_loss))

    # Load best wts
    model.load_state_dict(torch.load('rnn.pkl'))

    return model
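
For completeness, this is roughly how I build everything and call train(). The hyperparameter values, the file name, and the use of string.printable as the alphabet below are just placeholders for the sake of the example, not my real settings:

import string

from torch.utils.data import DataLoader

params = {
    'h_dim': 256,                      # placeholder value
    'n_layers': 2,                     # placeholder value
    'batch': 32,                       # placeholder value
    'seq': 100,                        # chars per training sequence, placeholder
    'alphabet_size': len(string.printable),
    'type': torch.FloatTensor,         # torch.cuda.FloatTensor when on the GPU
}

char_to_ix = {c: i for i, c in enumerate(string.printable)}

with open('input.txt') as f:           # placeholder corpus file
    text = [c for c in f.read() if c in char_to_ix]

# Each DataLoader batch is a list of batch * seq consecutive characters,
# which is exactly what sequence_to_one_hot() expects.
split = int(0.9 * len(text))
dataloaders = {
    phase: DataLoader(chars,
                      batch_size=params['batch'] * params['seq'],
                      drop_last=True)
    for phase, chars in [('train', text[:split]), ('val', text[split:])]
}

model = LSTM(params).type(params['type'])
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model = train(dataloaders, char_to_ix, model, optimizer, criterion, params)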