Hey everyone,
I'm trying to use an LSTM network with Mel spectrograms as input, but I can't quite wrap my head around the two parameters `input_size` and `hidden_size`. I have a tensor of size [16, 1, 64, 626], where 16 is the batch size, 1 is the number of channels, 64 is the number of mel frequency bins, and 626 is the number of time frames.
Question 1:
Am I correct that I can choose the `hidden_size` parameter freely? Is there a recommended way to set it?
Question 2:
According to the PyTorch documentation, with `batch_first=True` (which I use) the input must have the shape [batch, sequence, input_size]. In my case that would be [16, 626, 64]? So I would have to swap the mel and time axes, and the `input_size` parameter of the LSTM would be 64?
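To illustrate what I mean, this is how I would rearrange the tensor (assuming my reading of the docs is correct):

```python
# x has shape [16, 1, 64, 626] = [batch, channel, mel, time]
x = x.squeeze(1)        # drop the channel dim -> [16, 64, 626]
x = x.permute(0, 2, 1)  # swap mel and time    -> [16, 626, 64]
# i.e. batch=16, sequence length=626, input_size=64
```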
This is my first attempt at deep learning, so I'm really grateful for any help. Here is my current code:
```python
import torch
import torch.nn as nn


class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, device):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device
        # batch_first=True -> expects input of shape [batch, sequence, input_size]
        self.lstm = nn.LSTM(input_size, self.hidden_size, self.num_layers,
                            batch_first=True, dropout=0.4)  # , bidirectional=True)
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size // 2, hidden_size // 2)
        self.fc3 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = x.float()
        batch_size = x.size(0)
        # Zero-initialise the hidden and cell states
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(self.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(self.device)
        out, _ = self.lstm(x, (h0, c0))
        # Classify based on the output of the last time step only
        out = self.relu(self.fc1(out[:, -1, :]))
        out = self.relu(self.fc2(out))
        out = self.fc3(out)
        return out
```
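To sanity-check the shapes, I pushed a dummy batch through the model (the `input_size=64` and `hidden_size=128` here are just guesses for the test, not final values):

```python
# Hypothetical shape check -- 64 and 128 are assumptions for the test only
test_model = RNN(input_size=64, hidden_size=128, num_layers=2, num_classes=4, device="cpu")
dummy = torch.randn(16, 626, 64)  # [batch, sequence, input_size]
print(test_model(dummy).shape)    # -> torch.Size([16, 4])
```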
```python
# Create the model and put it on the GPU if available
Input_Size = ?   # see Question 2 -- should this be 64?
Hidden_Size = ?  # see Question 1 -- how do I pick this?
Num_Layers = 2
Num_Classes = 4

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = RNN(Input_Size, Hidden_Size, Num_Layers, Num_Classes, device)
myModel = myModel.to(device)

# Check that the model is on CUDA
print(next(myModel.parameters()).device)
print(myModel)
```
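The criterion, optimizer, and scheduler used in the loop below aren't shown above; roughly, my setup looks like this (the loss/optimizer/scheduler choices here are just what I picked, not necessarily right):

```python
# Assumed training setup -- not shown in the snippet above
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(myModel.parameters(), lr=1e-3)
# scheduler.step() is called once per batch below, which matches a
# per-batch scheduler such as OneCycleLR
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=1e-3, steps_per_epoch=len(train_dl), epochs=10)  # epochs: placeholder
```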
```python
# Stats for the epoch
running_loss = 0.0
correct_prediction = 0
total_prediction = 0

# Repeat for each batch in the training set
for i, data in enumerate(train_dl):
    # myModel.train()  # optional only if the model has no dropout/batchnorm layers

    # Get the input features and target labels, and put them on the GPU
    inputs, labels = data[0].to(device), data[1].to(device)

    # Reshape: drop the channel dimension -> [batch, 64, 626]
    inputs = inputs.reshape(-1, 64, 626)

    # Normalize the inputs
    inputs_m, inputs_s = inputs.mean(), inputs.std()
    inputs = (inputs - inputs_m) / inputs_s

    # Zero the parameter gradients
    optimizer.zero_grad()

    # Forward + backward + optimize
    outputs = myModel(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Keep stats for loss and accuracy
    running_loss += loss.item()

    # Get the predicted class with the highest score
    _, prediction = torch.max(outputs, 1)

    # Count predictions that matched the target label
    correct_prediction += (prediction == labels).sum().item()
    total_prediction += prediction.shape[0]
```
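Not shown above, but just for completeness: after the loop I summarise the epoch stats roughly like this:

```python
# Average loss and accuracy over the epoch
avg_loss = running_loss / len(train_dl)
accuracy = correct_prediction / total_prediction
print(f"Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2%}")
```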