Improve my training iteration


I have created the following model:

def __init__(self, input_size, output_size, hidden_dim, n_layers, n_feats, drop_prob=0.5):
    """Create the layers of the GRU-based speech-recognition model.

    Args:
        input_size: Feature count per time step fed to the GRU.
        output_size: Number of output classes; kept as ``self.output_dim``.
        hidden_dim: Width of the GRU hidden state.
        n_layers: Number of stacked GRU layers.
        n_feats: Normalized shape handed to ``nn.LayerNorm``.
        drop_prob: Dropout probability used between stacked GRU layers.
    """
    super(MySpeechRecognition, self).__init__()

    # Remember the hyper-parameters on the instance.
    # NOTE(review): the original note describes the classes as the alphabet
    # plus apostrophe and space — confirm against the label encoding.
    self.output_dim = output_size
    self.drop_prob = drop_prob
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    self.input_size = input_size

    # Recurrent encoder. With batch_first=True the GRU consumes tensors laid
    # out as (batch, sequence, feature).
    self.gru = nn.GRU(
        input_size=input_size,
        hidden_size=hidden_dim,
        num_layers=n_layers,
        batch_first=True,
        dropout=drop_prob,
    )

    # Normalization over the trailing dimension of size n_feats.
    self.layer_norm = nn.LayerNorm(normalized_shape=n_feats)

    # Classifier head: hidden_dim -> 512 -> output_dim.
    self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=512)
    self.fc2 = nn.Linear(in_features=512, out_features=self.output_dim)
    self.dropout = nn.Dropout(p=0.2)
# Hyper-parameters for the experiment (presumably passed to the model
# constructor and the data loaders — verify against the calling code).
batch_size = 32
n_feats = 128
input_size = 128
hidden_dim = 250
n_layers = 2
n_classes = 29

And this is my training loop:

def _validation_loss(model, valid_loader, criterion):
    """Return the per-sample mean of ``criterion`` over ``valid_loader``.

    The model is switched to eval mode (so dropout is disabled) for the
    duration of the pass and restored to its previous mode afterwards.
    Batches whose size differs from the module-level ``batch_size`` are
    skipped because the hidden state is shaped for that batch size.
    Returns ``nan`` when no batch could be evaluated.
    """
    was_training = model.training
    model.eval()
    total, seen = 0.0, 0
    # A dedicated hidden state, so validation never touches the training one.
    val_h = model.init_hidden(batch_size)
    with torch.no_grad():
        for specs, labels, input_lengths, label_lengths in valid_loader:
            if len(specs) != batch_size:
                continue
            if train_on_gpu:
                specs, labels = specs.cuda(), labels.cuda()
            output, val_h = model(specs, val_h)
            output = F.log_softmax(output, dim=2).transpose(0, 1)
            val_loss = criterion(output, labels.float(), input_lengths, label_lengths)
            total += val_loss.item() * specs.size(0)
            seen += specs.size(0)
    model.train(was_training)
    return total / seen if seen else float('nan')


def train(n_epochs, train_loader, valid_loader, model, optimizer, criterion, clip, save_path):
    """Train ``model`` for ``n_epochs`` and record per-epoch losses.

    Relies on the module-level names ``batch_size`` and ``train_on_gpu``.

    Args:
        n_epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of ``(specs, labels, input_lengths,
            label_lengths)`` batches; must expose ``.dataset``.
        valid_loader: Iterable of batches with the same layout.
        model: Module called as ``model(specs, h)`` returning ``(output, h)``
            and providing ``init_hidden(batch_size)``.
        optimizer: Optimizer over ``model.parameters()``.
        criterion: Loss called as ``criterion(log_probs, labels,
            input_lengths, label_lengths)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        save_path: Where the model's ``state_dict`` is written after each
            epoch.

    Returns:
        ``(model, loss_values, test_loss_values)`` where the two lists hold
        one per-sample mean loss per epoch (training and validation).
    """
    data_len = len(train_loader.dataset)
    # Created once, so the history of every epoch is returned.
    loss_values = []
    test_loss_values = []
    t0 = time.time()
    for e in range(n_epochs):
        model.train()
        running_losses = 0.0
        seen = 0

        # Initialize hidden state
        h = model.init_hidden(batch_size)

        for batch_idx, (specs, labels, input_lengths, label_lengths) in enumerate(train_loader):
            # The hidden state is shaped for ``batch_size``; skip any batch of
            # a different size (normally only the last one).
            if len(specs) != batch_size:
                continue
            if train_on_gpu:
                specs, labels = specs.cuda(), labels.cuda()

            # Detach the hidden state, otherwise we'd backprop through the
            # entire training history.
            h = h.detach()

            # zero accumulated gradients
            optimizer.zero_grad()

            output, h = model(specs, h)
            output = F.log_softmax(output, dim=2)
            # Swap the first two dimensions before handing the log-probs to
            # the criterion.
            output = output.transpose(0, 1)

            # NOTE(review): labels are cast to float as in the original code;
            # confirm the criterion accepts float targets.
            loss = criterion(output, labels.float(), input_lengths, label_lengths)
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem
            # in RNNs; it must run after backward() and before step().
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            # loss stats, weighted by batch size so the epoch mean is per sample
            running_losses += loss.item() * specs.size(0)
            seen += specs.size(0)

            if batch_idx % 100 == 0:
                # Progress report. This is a full pass over valid_loader, so
                # it is a noticeable share of the run time.
                val_loss = _validation_loss(model, valid_loader, criterion)
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tValid Loss: {:.6f}'.format(
                    e + 1, batch_idx * len(specs), data_len,
                    100. * batch_idx / len(train_loader), loss.item(), val_loss))

        # End-of-epoch bookkeeping.
        test_losses = _validation_loss(model, valid_loader, criterion)
        loss_values.append(running_losses / seen if seen else float('nan'))
        test_loss_values.append(test_losses)

        # NOTE(review): the original statement using save_path was garbled;
        # saving the state_dict once per epoch is the assumed intent — confirm.
        if save_path is not None:
            torch.save(model.state_dict(), save_path)

        print('Epoch {} + took {} seconds'.format(e+1, time.time() - t0))
    return (model, loss_values, test_loss_values)

I am not sure if I can improve the training iteration in any way. It is taking too long even for 10 epochs…

I tried with both CPU and GPU on Kaggle.


You could try to profile the code and determine where the bottleneck is at the moment.
E.g. if the data loading is the bottleneck, you could potentially increase the number of workers, if that’s not already done. I don’t know what kind of systems are used on Kaggle, so I cannot comment on the data loading speed etc.

Unrelated to this issue, but you should not use the .data attribute, but instead call .detach() on the tensor you would like to detach from the computation graph.

Thank you so much!

I didn’t know about .detach().

Sir, is it necessary to use the previous batch’s hidden state as the initial hidden state for the next batch?
Shouldn’t we typically use all zeros as the initial hidden state?

something like this -->

output, h = model( specs, None) 

Thank You.

I initialized the hidden weights with zeros at first:

h = model.init_hidden(batch_size)

The init_hidden function is inside the model’s class
# Zero-filled initial hidden state with one slice per layer and direction.
# NOTE(review): self.n_direction must be set by the model's constructor —
# it is not visible in this snippet.
if (train_on_gpu):
    hidden = torch.zeros(self.n_layers * self.n_direction, batch_size, self.hidden_dim).cuda()
else:
    hidden = torch.zeros(self.n_layers * self.n_direction, batch_size, self.hidden_dim)

I have another question about .detach() and .data:

For initializing weights, should I also use detach() rather than data?



You should initialize the parameters directly via xavier(m.weight). There is no need to call detach, as this operation shouldn’t be tracked by Autograd.