Packed padded sequences and batched input made my model slower?

Hello,

I modified my LSTM-based network so that its input is a packed padded sequence, thinking that batch processing might be faster or better parallelized (I am a noob at optimization), and I modified the training loop accordingly. But now my model is about 3 times slower than before… any idea why that might be?
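
Just so we're on the same page about what I think packing buys, here is a toy example of the calls I'm using (a minimal sketch with made-up shapes, not my real data):

import torch
import torch.nn as nn

seqs = [torch.randn(5, 8), torch.randn(3, 8)]    # two sequences of lengths 5 and 3, feature size 8
lengths = torch.tensor([5, 3])
padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=-1)    # (2, 5, 8)
packed = nn.utils.rnn.pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
lstm = nn.LSTM(8, 16, batch_first=True)
out_packed, _ = lstm(packed)    # the LSTM is supposed to skip the padded steps
out, out_lens = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)    # back to (2, 5, 16)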

The RNN before:

import torch
import torch.nn as nn

class myLSTM(nn.Module):
    def __init__(self,pitch_size,pos_size,util_size,chord_size,hidden_size):
        super().__init__()
        self.input_size = pitch_size + pos_size + util_size + chord_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(self.input_size, hidden_size, batch_first = True)
        self.notes_layer = nn.Linear(hidden_size,pitch_size)
        self.pos_layer = nn.Linear(hidden_size,pos_size)
        self.utils_layer = nn.Linear(hidden_size,util_size - 1)
        self.tanh = nn.Tanh()
        self.tmp_pos = pitch_size + pos_size
        self.softmax = nn.LogSoftmax(dim = 2)
        self.sigmoid = nn.Sigmoid()
        self.drop_layer = nn.Dropout(p = 0.5)
    
    def forward(self, input, hidden=None):
        # nn.LSTM handles hidden=None itself, so a single path covers both cases
        out, hidden = self.lstm(input, hidden)
        out = self.drop_layer(self.sigmoid(out))
        out_notes = self.softmax(self.notes_layer(out))
        out_pos = self.sigmoid(self.pos_layer(out))
        out_utils = self.softmax(self.utils_layer(out))
        out = torch.cat((out_notes, out_pos, out_utils), 2)
        return out, hidden
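
Before the change, the training loop fed the network one sequence at a time, roughly like this (a simplified sketch, not the exact old loop):

for b in range(len(dataSet)):
    # one (1, seq_len, input_size) sequence per forward call, no padding involved
    output, hidden = model(dataSet[b]["inputTensor"].unsqueeze(0))
    # ... same losses, backward and optimizer step as in the loop below ...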

And after:

class myLSTM(nn.Module):
    def __init__(self,pitch_size,pos_size,util_size,chord_size,hidden_size):
        super().__init__()
        self.input_size = pitch_size + pos_size + util_size + chord_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(self.input_size, hidden_size, batch_first = True)
        self.notes_layer = nn.Linear(hidden_size,pitch_size)
        self.pos_layer = nn.Linear(hidden_size,pos_size)
        self.tempo_layer = nn.Linear(hidden_size,1)
        self.utils_layer = nn.Linear(hidden_size,util_size - 1)
        self.tanh = nn.Tanh()
        self.tmp_pos = pitch_size + pos_size
        self.softmax = nn.LogSoftmax(dim = 2)
        self.sigmoid = nn.Sigmoid()
        self.drop_layer = nn.Dropout(p = 0.5)
    
    def forward(self, input, lengths, hidden=None):
        # pack the padded batch so the LSTM can skip the padded timesteps
        packed = nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=True, enforce_sorted=False)
        out, hidden = self.lstm(packed, hidden)    # nn.LSTM handles hidden=None itself
        # unpack back to a padded (batch, max_len, hidden_size) tensor
        out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, padding_value=-1)[0]
        out = self.drop_layer(self.sigmoid(out))
        out_notes = self.softmax(self.notes_layer(out))
        out_pos = self.sigmoid(self.pos_layer(out))
        out_utils = self.softmax(self.utils_layer(out))
        out = torch.cat((out_notes, out_pos, out_utils), 2)
        return out, hidden

The training loop now looks something like this:

for iter in range(1, n_iters + 1):
    batch = np.random.randint(0, 10, 10)    # 10 random sequence indices (with replacement)
    lengths = torch.as_tensor([dataSet[b]["inputTensor"].size(0) for b in batch], dtype=torch.int64, device='cpu')
    inputTensor = nn.utils.rnn.pad_sequence([dataSet[b]["inputTensor"] for b in batch], batch_first=True, padding_value=-1)
    target = [dataSet[b]["target"] for b in batch]
    optimizer.zero_grad()
    loss = 0
    output, hidden = model(inputTensor, lengths)

    for i in range(len(batch)):    # i = position within the batch (lengths, output and target are all in batch order)
        masked_out = output[i, :lengths[i], :]    # drop the padded timesteps of this sequence
        ln = criterion(masked_out[:, 0:n_pitch], utilities.targetTensor(target[i][:, :n_pitch]))
        lp = pos_criterion(masked_out[:, n_pitch:n_pitch + n_pos], target[i][:, n_pitch:n_pitch + n_pos])
        lu = criterion(masked_out[:, n_pitch + n_pos:n_pitch + n_pos + n_util - 1], utilities.targetTensor(target[i][:, n_pitch + n_pos:]))
        loss += 5*lp + ln + lu

        if iter == n_iters - 1:
            print("ln : %.3f, lp : %.3f, lu : %.3f" % (ln, lp, lu))

    loss.backward()
    optimizer.step()

Both versions of the code learn properly, but I was hoping for a speedup of something that was already pretty slow, not the other way around :confused:
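
In case the measurement itself matters: this is the kind of quick wall-clock comparison I run on the two forwards (a rough sketch; model_old and model_new are just placeholder names for the two versions above, and on GPU you would want torch.cuda.synchronize() around the timed region):

import time

def time_forward(fn, n=50):
    # crude wall-clock average over n calls
    start = time.perf_counter()
    for _ in range(n):
        fn()
    return (time.perf_counter() - start) / n

t_old = time_forward(lambda: model_old(inputTensor))             # old forward: padded batch only
t_new = time_forward(lambda: model_new(inputTensor, lengths))    # new forward: packs/unpacks internally
print("old: %.4f s per forward, new: %.4f s per forward" % (t_old, t_new))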