Issues with CNN-BiLSTM Speech Classification Model

Hello everyone!
I am trying to classify (3-class classification problem) speech spectrograms with a CNN-BiLSTM model. The input to my model is a spectrogram split into N splits. Here, a common base 1D-CNN model extracts features from the splits and feeds them to a BiLSTM model for classification. Here’s my code for the same:

import torch
from torch import nn

class SpeechCNN(nn.Module):
    """Base 1-D CNN that extracts a 64-dim feature vector from each spectrogram split.

    Expects input of shape (batch, time_steps, 128, length). The batch and
    time-step axes are flattened so every split is processed independently
    (time-distributed CNN); the output has shape (batch * time_steps, 64).
    """
    # NOTE: the original listing had bare unquoted "docstring" lines, which are
    # a SyntaxError in Python — they must be triple-quoted strings.

    def __init__(self, p_drop=0.1):
        super(SpeechCNN, self).__init__()

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2)
        n_kernels = [64, 128, 256, 512]

        # Four conv->BN stages; input is assumed to have 128 channels
        # (e.g. 128 mel/frequency bins) — TODO confirm against the data loader.
        self.conv1 = nn.Conv1d(128, n_kernels[0], kernel_size=3, stride=1, padding=1, bias=True)
        self.bn1 = nn.BatchNorm1d(n_kernels[0], eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv2 = nn.Conv1d(n_kernels[0], n_kernels[1], kernel_size=3, stride=1, padding=1, bias=True)
        self.bn2 = nn.BatchNorm1d(n_kernels[1], eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv3 = nn.Conv1d(n_kernels[1], n_kernels[2], kernel_size=3, stride=1, padding=1, bias=True)
        self.bn3 = nn.BatchNorm1d(n_kernels[2], eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv4 = nn.Conv1d(n_kernels[2], n_kernels[3], kernel_size=3, stride=1, padding=1, bias=True)
        self.bn4 = nn.BatchNorm1d(n_kernels[3], eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

        # Global average pooling collapses the temporal dimension to 1.
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(512, 64)

    def forward(self, x):
        bs, ts, channels, length = x.size()
        # reshape (not view) tolerates non-contiguous inputs, e.g. after a permute.
        cnn_in = x.reshape(bs * ts, channels, length)
        out = self.maxpool(self.relu(self.bn1(self.conv1(cnn_in))))
        out = self.maxpool(self.relu(self.bn2(self.conv2(out))))
        out = self.maxpool(self.relu(self.bn3(self.conv3(out))))
        out = self.maxpool(self.relu(self.bn4(self.conv4(out))))

        out = self.avgpool(out)          # (bs*ts, 512, 1)
        out = out.reshape(bs * ts, 512)  # flatten for the fully-connected layer
        out = self.dropout(self.relu(self.fc1(out)))
        return out

class CNN_BiLSTM(nn.Module):
    CNN-BiLSTM Model for one split
    def __init__(self, p_drop=0.1):
        super(CNN_BiLSTM, self).__init__()

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p_drop)

        self.cnn = SpeechCNN(p_drop=p_drop)
        self.bilstm = nn.LSTM(input_size=64, hidden_size=64,
                            num_layers=1, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(128, 64)

    def forward(self, x):
        #Time-Distributed CNN
        bs, ts, channels, length = x.size()    
        cnn_out = self.cnn(x)

        bilstm_in = cnn_out.view(bs, ts, -1)
        bilstm_out, (h_n, c_n) = self.bilstm(bilstm_in)

        final_bilstm_output = torch.mean(bilstm_out, dim=1)
        out = self.dropout(self.relu(self.fc1(final_bilstm_output)))

        return out

class SpeechClassifier(nn.Module):
    Classification layer appended
    def __init__(self, p_drop=0.05):
        super(SpeechClassifier, self).__init__()

        self.CNN_BiLSTM = CNN_BiLSTM(p_drop=p_drop)
        self.fc1 = nn.Linear(64, 64)
        self.fc2 = nn.Linear(64, 3)
    def forward(self, x):
        out = self.CNN_BiLSTM(x)
        out = self.fc1(out)
        out = self.fc2(out)

        return out

This model fails to learn anything at all. The same architecture achieves good convergence in Keras with its TimeDistributed CNN models. Can someone kindly point out any bugs in my code that might be affecting my learning process?