How to modify the Network to combine CNN and LSTM Model

Hi, I’m trying to put together a CNN and a LSTM for a regression problem. I want to estimate BPM from video frame.
I am quite a new gamer of torch so I will appriciate an explanation and a solution.
Here is the code:

class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.conv2D_1 = nn.Conv2d(3, 8, kernel_size=1, stride=1, padding=0)
        self.conv2D_2 = nn.Conv2d(8, 16, kernel_size=2, stride=2, padding=0)
        self.conv2D_3 = nn.Conv2d(16, 16, kernel_size=1, stride=1, padding=0)
        self.conv2D_4 = nn.Conv2d(16, 32, kernel_size=2, stride=2, padding=0)
        self.conv2D_5 = nn.Conv2d(32, 32, kernel_size=1, stride=1, padding=0)
        self.conv2D_6 = nn.Conv2d(32, 64, kernel_size=2, stride=2, padding=0)
        self.conv2D_7 = nn.Conv2d(64, 64, kernel_size=1, stride=1, padding=0)
        self.conv2D_8 = nn.Conv2d(64, 128, kernel_size=2, stride=2, padding=0)
        self.conv2D_9 = nn.Conv2d(128, 128, kernel_size=1, stride=1, padding=0)
        self.conv2D_10 = nn.Conv2d(128, 256, kernel_size=2, stride=2, padding=0)
        self.conv2D_11 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
        self.conv2D_12 = nn.Conv2d(256, 384, kernel_size=2, stride=2, padding=0)
        self.conv2D_13 = nn.Conv2d(384, 384, kernel_size=1, stride=1, padding=0)
        self.conv2D_14 = nn.Conv2d(384, 512, kernel_size=1, stride=2, padding=0)
        self.conv2D_15 = nn.Conv2d(512, 512, kernel_size=1, stride=2, padding=0)

        self.reshape_conv3D = nn.Flatten()
        self.lstm_1 = nn.LSTM(512, 128, batch_first=True)
        self.dropout_1 = nn.Dropout(0.3)
        self.lstm_2 = nn.LSTM(128, 32, batch_first=True)
        self.dropout_2 = nn.Dropout(0.3)
        self.lstm_3 = nn.LSTM(32, 1, batch_first=True)
        self.reshape_lstm = nn.Flatten()
        self.dense = nn.Linear(1, 1)

    def forward(self, x):
        x = self.conv2D_1(x)
        x = self.conv2D_2(x)
        x = self.conv2D_3(x)
        x = self.conv2D_4(x)
        x = self.conv2D_5(x)
        x = self.conv2D_6(x)
        x = self.conv2D_7(x)
        x = self.conv2D_8(x)
        x = self.conv2D_9(x)
        x = self.conv2D_10(x)
        x = self.conv2D_11(x)
        x = self.conv2D_12(x)
        x = self.conv2D_13(x)
        x = self.conv2D_14(x)
        x = self.conv2D_15(x)

        x = self.reshape_conv3D(x)
        x, _ = self.lstm_1(x)
        x = self.dropout_1(x)
        x, _ = self.lstm_2(x)
        x = self.dropout_2(x)
        x, _ = self.lstm_3(x)
        x = self.reshape_lstm(x)
        x = self.dense(x)

        return x

I know that LSTM has a different input shape with respect to CNN output but it’s not clear how to adapt the whole thing.
Thank you for the answer.