How to train LSTM with GPU

Hi everybody,

I am replying to this topic since I am facing a problem similar to @Probe's, but his solution of using a custom collate function in the DataLoader is not working for me.

I have a recurrent autoencoder whose encoding capability I need to gauge, so my net is composed of two layers (code below):

  1. an encoding layer, i.e. the LSTM;
  2. a decoding layer, which is nothing but a dense layer that tries to reconstruct the input from the LSTM output.

class RnnLSTMAutoEncoder(nn.Module):
    """ Rnn based on the LSTM model

        Args:
              input_length (int): input dimension
              code_length (int): LSTM output dimension
              num_layers (int): number of LSTM layers
    """

    ##  Constructor
    def __init__(self, input_length, code_length, num_layers=1):
        super(RnnLSTMAutoEncoder, self).__init__()

        #  Attributes
        self.input_length = input_length
        self.code_length = code_length
        self.num_layers = num_layers

        #  Nets
        self.encodeLayer = nn.LSTM(self.input_length, self.code_length, num_layers=self.num_layers, batch_first=True)
        self.decodeLayer = nn.Linear(self.code_length, self.input_length)

        # Decode layer parameters' initialization
        torch.nn.init.uniform_(self.decodeLayer.weight)
        self.decodeLayer.bias = nn.Parameter(torch.zeros_like(self.decodeLayer.bias))


    ##  Encode function
    def encode(self, x):
        # CODING
        output, _ = self.encodeLayer(x)
        return output

    ##  Decode function
    def decode(self, x):
        # DECODING (linear dense layer followed by an activation function [identity in this case, so none])
        x = self.decodeLayer(x)
        return x

    ##  Forward function
    def forward(self, x):
        encoded = self.encode(x)
        if isinstance(encoded, torch.Tensor):
            decoded = self.decode(encoded)
        else:
            # Packed input: unpack it and decode only the last valid timestep of each sequence
            unpacked, unpacked_length = nn.utils.rnn.pad_packed_sequence(encoded, batch_first=True)
            vectors = list()
            for i, vector in enumerate(unpacked):
                vectors.append(unpacked[i, unpacked_length[i] - 1, :].view(1, -1))
            decoded = self.decode(torch.cat(vectors, 0))
        return decoded
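
For clarity, this is roughly how the two forward paths behave (a small sketch using the same dimensions as in my training parameters below): a plain padded tensor is decoded at every timestep, while a PackedSequence is reduced to the last valid timestep of each sequence.

model = RnnLSTMAutoEncoder(input_length=625, code_length=100)

padded = torch.randn(5, 25, 625)                    # (batch, seq_len, features)
out_all = model(padded)                             # (5, 25, 625): every timestep reconstructed

lengths = torch.tensor([25, 20, 15, 10, 5])         # true lengths, sorted descendingly
packed = nn.utils.rnn.pack_padded_sequence(padded, lengths, batch_first=True)
out_last = model(packed)                            # (5, 625): only the last valid timestep reconstructed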

Following Probe’s suggestion, I wrote my custom collate function as follows:

def my_collate(batch):
    data = [item[0] for item in batch]
    x = torch.stack(data)

    # Lengths vector for the correct packing of the input:
    # a timestep counts as valid only if all of its features are non-zero
    # (all-zero rows are the padding added by the Dataset)
    lengths = torch.zeros(x.size(0))
    for i in range(x.size(0)):
        for j in range(seq_length):
            if sum(1 for k in x[i, j, :] if k != 0) == x.size(2):
                lengths[i] += 1

    # Both the padded sequences and the lengths must be sorted by descending sequence length
    lengths, indices = torch.sort(lengths, descending=True)
    lengths = lengths.long()
    x = x[indices, :, :]

    # Reconstruction target: the last valid (non-padded) element of each sequence
    y = torch.zeros(x.size(0), x.size(2))
    for i in range(x.size(0)):
        seq_el_idx = lengths[i].item() - 1
        y[i, :] = x[i, seq_el_idx, :]

    # Packing the data
    x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)

    return [x, y]
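
Just to make the collate's output explicit, here is a minimal sketch on a fake batch (the values match the parameters further below, and the 1-tuple items are only for this illustration):

seq_length, input_length, train_batch_size = 25, 625, 5   # same values as in the parameters below

fake_batch = []
for n in range(train_batch_size):
    seq = torch.zeros(seq_length, input_length)
    valid = seq_length - n                                   # pretend each sequence has a different true length
    seq[:valid, :] = torch.rand(valid, input_length) + 0.1   # strictly non-zero "frames"
    fake_batch.append((seq,))

x, y = my_collate(fake_batch)
print(type(x))      # <class 'torch.nn.utils.rnn.PackedSequence'>
print(y.shape)      # torch.Size([5, 625]) -> last valid frame of each sequence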

My dataset is made of feature vectors extracted from video frames, so what I feed the LSTM is a sequence of vectors going back in time from step t to step t-seq_length.
Obviously, for the first time steps (e.g. step 1, the first video frame) there is nothing to go back to, so I wrote a custom Dataset class that in this case pads the sequence with zeros until it reaches seq_length, while my collate function converts it into a PackedSequence object (the x element returned in the batch).
To evaluate the net's performance, instead, I just need to compute the loss between the last element of the sequence (rearranged into the y element returned in the batch) and the last element of the packed sequence I receive as output.
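
To make the padding concrete, the relevant part of my Dataset class works more or less like this (a simplified sketch, not the real code: the actual class reads the frames from a directory and extracts the features, here I just assume an already-built feature tensor):

from torch.utils.data import Dataset

class CoOccurrencesDatasetRnnTime(Dataset):
    # Simplified sketch: self.features is assumed to be an already-extracted
    # (n_frames, input_length) tensor of per-frame feature vectors
    def __init__(self, features, seq_length, transform=None):
        self.features = features
        self.seq_length = seq_length
        self.transform = transform

    def __len__(self):
        return self.features.size(0)

    def __getitem__(self, t):
        # Sequence of feature vectors going back in time from frame t,
        # zero-padded at the end when fewer than seq_length past frames exist
        start = max(0, t - self.seq_length + 1)
        frames = self.features[start:t + 1]
        seq = torch.zeros(self.seq_length, self.features.size(1))
        seq[:frames.size(0), :] = frames              # valid frames first, zero rows after
        if self.transform is not None:
            seq = self.transform(seq)
        return seq, seq[frames.size(0) - 1, :]        # last valid frame, used as the target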

As Probe did in his code, with the custom collate function the DataLoader feeds PackedSequences to the autoencoder, while the unpacking of the LSTM output is handled in the forward function.
Everything works fine, but nonetheless my code is not running on the GPU.

I have debugged my code with PyCharm, and everything seems to be on the GPU: the input sequences, the LSTM output, the final autoencoder output, etc. I can indeed see the data uploaded to GPU memory, but still, the whole training procedure runs on the CPU.
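
For reference, a quick check like this (a sketch reusing model, bx and by from the training routine below) is consistent with what I see in the debugger, i.e. everything reports a CUDA device:

print(next(model.parameters()).device)                                          # cuda:0
print(bx.data.device if isinstance(bx, nn.utils.rnn.PackedSequence) else bx.device)
print(by.device)
print(model(bx).device)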

I am currently managing the whole training procedure with Ignite, and my training code is the following:

##  Data loader helper

def get_data_loaders(train_batch_size, val_batch_size, num_workers, train_dir, val_dir, seq_length):
    #  Custom data transformation
    #  example: data_transform = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])
    data_transform = transforms.Lambda(lambda x: normalize_feature_vector(x))

    #  Dataset instantiation

    co_t_set = CoOccurrencesDatasetRnnTime(train_dir, seq_length, data_transform)
    co_v_set = CoOccurrencesDatasetRnnTime(val_dir, seq_length, data_transform)

    #  Training set DataLoader

    train_loader = Data.DataLoader(co_t_set, train_batch_size, collate_fn=my_collate, shuffle=False,
                                   num_workers=num_workers)

    #  Validation set DataLoader

    val_loader = Data.DataLoader(co_v_set, val_batch_size, collate_fn=my_collate, shuffle=False,
                                 num_workers=num_workers)

    return train_loader, val_loader

##  Batch preparation

def autoencoder_batch(batch, device, non_blocking=False):
    # Sends the input (a PackedSequence) and the target tensor to the GPU, if available
    x, y = batch

    if device == 'cuda':
        x = x.to(device)
        y = y.to(device)

    return x, y


##  Training routine

def autoencoder_training(trainer, batch):
    # Extract the input and "label"
    bx, by = autoencoder_batch(batch, device)

    # Send the model to GPU (if available); after the first iteration this is a no-op,
    # since the parameters are already on the device
    if device == 'cuda':
        model.to(device)

    # Forwarding
    model.train()
    optimizer.zero_grad()
    decoded = model(bx)

    # Compute the loss
    loss = loss_func(decoded, by)

    # Optimize
    loss.backward()
    optimizer.step()

    return loss.item()


###     Model training    ###

##  Dataset loading parameters

train_path = 'training_set_path_on_my_machine'
val_path = 'validation_set_path_on_my_machine'
num_workers = 4

##  Training parameters

epochs = 30
train_batch_size = 5
val_batch_size = 5
LR = 0.005  # learning rate
input_length = 625
code_length = 100
seq_length = 25
es_patience = 10
exp_decay = 0.95
log_dir = 'logging_directory_on_my_machine'
log_interval = 10000  # number of batches for each log on the console

##  Logging configuration

logging.basicConfig(filename='logging_directory_on_my_machine',
                    filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)

if __name__ == '__main__':

    #  Dataloaders instantiation
    print('Loading the datasets and extracting the features...')
    logging.info('Loading the datasets and extracting the features...')
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size,
                                                num_workers, train_path, val_path, seq_length)
    print('Features extracted!')
    logging.info('Features extracted!')

    #  Model instantiation
    model = RnnLSTMAutoEncoder(input_length, code_length)

    #  Writer instantiation for TensorboardX
    writer = create_summary_writer(model, train_loader, log_dir)  # creates a summary write with tensorboardX

    #  GPU loading (if available)
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'

    #  Optimizer, trainer and evaluator instantiation
    optimizer = optim.Adam(model.parameters(), lr=LR)
    loss_func = nn.MSELoss()
    trainer = Engine(autoencoder_training)
    evaluator = create_supervised_evaluator(model,
                                            metrics={'MSE': Loss(nn.MSELoss())},
                                            device=device,
                                            prepare_batch=autoencoder_batch)

    ##          EVENTS HANDLER FOR IGNITE          ##

    #  HANDLER FOR EACH COMPLETED ITERATION
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.5f}"
                  "".format(engine.state.epoch, iter, len(train_loader), engine.state.output))
            writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)
            logging.info("Epoch[{}] Iteration[{}/{}] Loss: {:.5f}"
                         "".format(engine.state.epoch, iter, len(train_loader), engine.state.output))


    #  HANDLERS FOR EACH COMPLETED EPOCH

    # Early stopping implementation
    def score_function(engine):
        val_loss = engine.state.metrics['MSE']
        return -val_loss


    handler = EarlyStopping(patience=es_patience, score_function=score_function, trainer=trainer)
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler)

    # training results logging
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_MSE = metrics['MSE']
        print("Training Results - Epoch: {}, Avg loss: {:.5f}"
              .format(engine.state.epoch, avg_MSE))
        writer.add_scalar("training/avg_loss", avg_MSE, engine.state.epoch)
        logging.info('Training Results - Epoch: {}, Avg loss: {:.5f}'.format(engine.state.epoch, avg_MSE))


    # validation results logging
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_MSE = metrics['MSE']
        print("Validation Results - Epoch: {}, Avg loss: {:.5f}"
              .format(engine.state.epoch, avg_MSE))
        writer.add_scalar("validation/avg_loss", avg_MSE, engine.state.epoch)
        logging.info('Validation Results - Epoch: {}, Avg loss: {:.5f}'.format(engine.state.epoch, avg_MSE))


    ##     RUNNING

    print('Training...')
    trainer.run(train_loader, max_epochs=epochs)

    writer.close()

Any suggestion on what the problem might be? Any help or hint is greatly appreciated.

Thanks for your time!