Hello everyone!
I am trying to solve a 3-class classification problem on speech spectrograms with a CNN-BiLSTM model. The input to my model is a spectrogram split into N splits. A common base 1D-CNN model extracts features from each split and feeds them to a BiLSTM model for classification. Here's my code for the same:
#IMPORTS
import torch
from torch import nn
#MODELS
class SpeechCNN(nn.Module):
    """
    Base 1-D CNN that extracts a 64-dim feature vector from each spectrogram split.

    The splits are folded into the batch dimension so the same CNN weights are
    applied to every split (time-distributed behaviour).

    Input:  (batch, time_steps, in_channels, length)
            length must be >= 16 so it survives the four stride-2 max-pools.
    Output: (batch * time_steps, 64), non-negative (final ReLU + dropout).

    Args:
        p_drop: dropout probability applied after the final dense layer.
        in_channels: number of input channels per split (default 128 keeps
            backward compatibility with the original hard-coded value,
            e.g. 128 mel bins — confirm against the feature extractor).
    """
    def __init__(self, p_drop=0.1, in_channels=128):
        super(SpeechCNN, self).__init__()
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2)
        n_kernels = [64, 128, 256, 512]
        # Four Conv-BN stages; each is followed by ReLU + MaxPool in forward().
        self.conv1 = nn.Conv1d(in_channels, n_kernels[0], kernel_size=3, stride=1, padding=1, bias=True)
        self.bn1 = nn.BatchNorm1d(n_kernels[0], eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv2 = nn.Conv1d(n_kernels[0], n_kernels[1], kernel_size=3, stride=1, padding=1, bias=True)
        self.bn2 = nn.BatchNorm1d(n_kernels[1], eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv3 = nn.Conv1d(n_kernels[1], n_kernels[2], kernel_size=3, stride=1, padding=1, bias=True)
        self.bn3 = nn.BatchNorm1d(n_kernels[2], eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv4 = nn.Conv1d(n_kernels[2], n_kernels[3], kernel_size=3, stride=1, padding=1, bias=True)
        self.bn4 = nn.BatchNorm1d(n_kernels[3], eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        # Global average pool collapses the temporal axis to 1.
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        # Project the last conv stage's channels down to a 64-dim feature.
        self.fc1 = nn.Linear(n_kernels[-1], 64)

    def forward(self, x):
        """Apply the CNN to every split; returns (batch*time_steps, 64)."""
        bs, ts, channels, length = x.size()
        # reshape (not view) so non-contiguous inputs are also accepted.
        cnn_in = x.reshape(bs * ts, channels, length)
        out = self.maxpool(self.relu(self.bn1(self.conv1(cnn_in))))
        out = self.maxpool(self.relu(self.bn2(self.conv2(out))))
        out = self.maxpool(self.relu(self.bn3(self.conv3(out))))
        out = self.maxpool(self.relu(self.bn4(self.conv4(out))))
        out = self.avgpool(out)          # (bs*ts, n_kernels[-1], 1)
        out = out.flatten(1)             # drop the pooled length-1 axis
        out = self.dropout(self.relu(self.fc1(out)))
        return out
class CNN_BiLSTM(nn.Module):
    """
    Time-distributed SpeechCNN followed by a single-layer bidirectional LSTM.

    Each split of the input spectrogram is encoded by the shared CNN; the
    resulting per-split features form a sequence that the BiLSTM models.
    The BiLSTM outputs are mean-pooled over the splits and projected to 64-d.

    Input:  (batch, time_steps, channels, length)
    Output: (batch, 64)
    """
    def __init__(self, p_drop=0.1):
        super(CNN_BiLSTM, self).__init__()
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.cnn = SpeechCNN(p_drop=p_drop)
        self.bilstm = nn.LSTM(input_size=64, hidden_size=64,
                              num_layers=1, batch_first=True, bidirectional=True)
        # 128 = 2 * hidden_size (forward + backward directions concatenated).
        self.fc1 = nn.Linear(128, 64)

    def forward(self, x):
        batch_size, n_splits = x.size(0), x.size(1)
        # Per-split CNN features, flattened across the batch by SpeechCNN.
        features = self.cnn(x)                             # (batch*splits, 64)
        # Restore the sequence axis for the recurrent stage.
        sequence = features.view(batch_size, n_splits, -1)  # (batch, splits, 64)
        sequence_out, _ = self.bilstm(sequence)             # (batch, splits, 128)
        # Average over the split axis to get one vector per utterance.
        pooled = sequence_out.mean(dim=1)
        return self.dropout(self.relu(self.fc1(pooled)))
class SpeechClassifier(nn.Module):
    """
    3-class speech classifier: CNN-BiLSTM backbone + 2-layer classification head.

    Returns raw logits of shape (batch, 3) — pair with nn.CrossEntropyLoss,
    which applies log-softmax internally (do NOT add a softmax here).
    """
    def __init__(self, p_drop=0.05):
        super(SpeechClassifier, self).__init__()
        self.CNN_BiLSTM = CNN_BiLSTM(p_drop=p_drop)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(64, 64)
        self.fc2 = nn.Linear(64, 3)

    def forward(self, x):
        out = self.CNN_BiLSTM(x)
        # BUG FIX: the original chained fc1 directly into fc2 with no
        # nonlinearity between them; two consecutive Linear layers collapse
        # into a single affine map, so fc1 added parameters but no capacity.
        out = self.relu(self.fc1(out))
        out = self.fc2(out)
        return out
This model fails to learn anything at all. The same architecture achieves good convergence in Keras with its TimeDistributed CNN models. Can someone kindly point out any bugs in my code that might be affecting my learning process?