Using LSTM with Mel Spectrograms as input

I am trying to use LSTM networks with Mel spectrograms as input, but I do not understand the two parameters ‘Input_Size’ and ‘Hidden_Size’. I have a tensor of size [16, 1, 64, 626], where 16 is the batch size, 1 is the number of channels, 64 is the number of mel frequency bins, and 626 is the number of time frames.

Question 1:
Am I correct that I can freely select the “Hidden_Size” parameter? Is there a recommended way to define this parameter?

Question 2:
According to the PyTorch documentation, the input must have the shape [batch, sequence, input_size] (when batch_first=True). In my case this would be [16, 626, 64]? If so, I have to swap the mel-frequency and time dimensions of my tensor, and the “Input_Size” parameter of the LSTM would be 64?

 class RNN(nn.Module):
     """Sequence classifier: a stacked LSTM followed by a three-layer MLP head.

     The LSTM is built with ``batch_first=True``, so ``forward`` expects input
     of shape ``(batch, seq_len, input_size)``. Only the LSTM output at the
     last time step is passed to the fully connected layers.

     Args:
         input_size: Number of features per time step.
         hidden_size: Number of features in the LSTM hidden state. The first
             two linear layers use ``hidden_size // 2`` units.
         num_layers: Number of stacked LSTM layers.
         num_classes: Number of scores produced per sequence.
         device: Stored as ``self.device`` for callers. ``forward`` does not
             rely on it; the initial states are allocated on the device of
             the input instead.
     """

     def __init__(self, input_size, hidden_size, num_layers, num_classes, device):
         super().__init__()
         self.hidden_size = hidden_size
         self.num_layers = num_layers
         self.device = device
         # A bidirectional LSTM would double the feature size seen by fc1,
         # so it is intentionally left disabled here.
         self.lstm = nn.LSTM(
             input_size,
             self.hidden_size,
             self.num_layers,
             batch_first=True,
             dropout=0.4,
         )
         # Integer floor division avoids the float round trip of int(x / 2).
         half_size = hidden_size // 2
         self.fc1 = nn.Linear(hidden_size, half_size)
         self.relu = nn.ReLU()
         self.fc2 = nn.Linear(half_size, half_size)
         self.fc3 = nn.Linear(half_size, num_classes)

     def forward(self, x):
         """Compute class scores for a batch of sequences.

         Args:
             x: Tensor of shape ``(batch, seq_len, input_size)``. It is cast
                 to float32 before being fed to the LSTM.

         Returns:
             Tensor of shape ``(batch, num_classes)`` with unnormalized scores.
         """
         x = x.float()
         # Allocate the zero initial states from the input itself so they
         # always share its device and dtype, even if the module was moved
         # after construction and ``self.device`` is stale.
         state_shape = (self.num_layers, x.size(0), self.hidden_size)
         h0 = x.new_zeros(state_shape)
         c0 = x.new_zeros(state_shape)
         out, _ = self.lstm(x, (h0, c0))
         # Classify from the LSTM output at the final time step only.
         out = self.relu(self.fc1(out[:, -1, :]))
         out = self.relu(self.fc2(out))
         return self.fc3(out)
# Create the model and put it on the GPU if available
# input_size is the number of features per time step; with inputs arranged as
# [batch, time, mel] that is the number of mel bins (64).
Input_Size = 64
# hidden_size is a free hyperparameter (width of the LSTM hidden state).
# 128 is only a starting point and should be tuned on validation data.
Hidden_Size = 128
Num_Layers = 2
Num_Classes = 4

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = RNN(Input_Size, Hidden_Size, Num_Layers, Num_Classes, device)
# Move the parameters to the selected device.
myModel = myModel.to(device)
# Check that it is on Cuda
 # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        #model.train()     # Optional when not using Model Specific layer
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # The LSTM is batch-first and expects [batch, time, features].
        # Assumes each batch arrives as [batch, 1, 64, 626]
        # (channel, mel bins, time frames) -- TODO confirm against the dataset.
        # Drop the channel axis, then swap mel and time with permute;
        # a reshape to [batch, 626, 64] would scramble the values.
        inputs = inputs.squeeze(1).permute(0, 2, 1)

        # Normalize the inputs to zero mean and unit variance over the batch
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        # NOTE(review): no zero_grad()/backward()/step() calls are visible in
        # this excerpt; without them the weights are never updated -- confirm
        # they exist in the full training function.

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs, 1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

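  1. Yes, the hidden_size defines the number of features in the hidden state h, and you can pick it freely. There is no fixed rule for it: treat it as a hyperparameter and tune it on validation data.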

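  2. Yes, your suggestion looks correct: with batch_first=True the input should be [16, 626, 64] and input_size is 64. You should permute the input instead of using reshape, as the latter will interleave the values, e.g. `inputs = inputs.squeeze(1).permute(0, 2, 1)`.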

