Next frame prediction using CNN-LSTM

Thabang_Lukhetho · April 15, 2021, 9:29am

I’m doing next frame prediction from static images extracted from video and save into disk. I’m using CNN-LSTM, during training feed the model 5 frames and predict the 6th frame, but during evaluation I want the CNN-LSTM model to take it’s prediction and use it as input to predict the next future frame it should repeat until predict the 6th frame.

How do you pass the predicted frame from then CNN-LSTM as an input in the next step for 6 iterations? Here is the sample code for my CNN-LSTM model:

class CRNN(nn.Module):
def __init__(self, in_channels=3, sample_size=64, num_classes=100,
            hidden_size=512, num_layers=1, rnn_unit='LSTM'):
    super(CRNN, self).__init__()
    self.in_channels=in_channels
    self.sample_size = sample_size
    self.num_classes = num_classes
    self.rnn_unit=rnn_unit

    # network params
    self.ch1, self.ch2= 64, 128
    self.k1, self.k2 = (5, 5), (5, 5)
    self.s1, self.s2 = (1, 1), (1, 1)
    self.p1, self.p2 = (0, 0), (0, 0)
    self.d1, self.d2 = (1, 1), (1, 1)
    self.input_size  = self.ch2
    self.hidden_size = hidden_size
    self.num_layers  = num_layers

    # network architecture
    # in_channels=3 for rgb
    self.conv1 = nn.Sequential(
        nn.Conv2d(in_channels=self.in_channels, out_channels=self.ch1, kernel_size=self.k1, stride=self.s1, padding=self.p1, dilation=self.d1),
        nn.BatchNorm2d(self.ch1, momentum=0.01),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.5),
        nn.Conv2d(in_channels=self.ch1, out_channels=self.ch1, kernel_size=1, stride=1),
        nn.MaxPool2d(kernel_size=2),
    )
    self.conv2 = nn.Sequential(
        nn.Conv2d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2, stride=self.s2, padding=self.p2, dilation=self.d2),
        nn.BatchNorm2d(self.ch2, momentum=0.01),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.5),
        nn.Conv2d(in_channels=self.ch2, out_channels=self.ch2, kernel_size=1, stride=1),
        nn.MaxPool2d(kernel_size=2),
    )

    if self.rnn_unit=='LSTM':
        self.lstm = nn.LSTM(
        input_size=self.input_size,
        hidden_size=self.hidden_size,
        dropout= 0.5 if self.num_layers > 1 else 0,
        num_layers=self.num_layers,
        batch_first=True,
    )
    if self.rnn_unit=='GRU':
        self.lstm = nn.GRU(
        input_size=self.input_size,
        hidden_size=self.hidden_size,
        dropout= 0.5 if self.num_layers > 1 else 0,
        num_layers=self.num_layers,
        batch_first=True,
    )
    if self.rnn_unit=='RNN':
        self.lstm = nn.RNN(
        input_size=self.input_size,
        hidden_size=self.hidden_size,
        dropout= 0.5 if self.num_layers > 1 else 0,
        num_layers=self.num_layers,
        batch_first=True,
    )

  
    self.fc1 = nn.Linear(self.hidden_size, self.num_classes)
    self.act = nn.Sigmoid()

def forward(self, X, n_steps=None):
    # CNN
    # x: (batch_size, channel, t, h, w)
    cnn_embed_seq = []
    if n_steps is None: # Code for training
        for t in range(X.size(2)):
            # Conv
            x = self.conv1(X[:, :, t, :, :])
            x = self.conv2(x)

            x = x.view(x.size(0), -1)
            cnn_embed_seq.append(x)

        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0)
        
        # batch first
        cnn_embed_seq = cnn_embed_seq.transpose_(0, 1)

        # LSTM
        hidden=None
        # use faster code paths
        self.lstm.flatten_parameters()
        out, hidden = self.lstm(cnn_embed_seq, hidden)
        # MLP
        # out: (batch, seq, feature), choose the last time step
        out = self.fc1(out[:, -1, :])
        out = self.act(out)

    if n_steps is not None: # Code for prediction
        for t in range(n_steps): 
            # Conv
            x = self.conv1(X[:, :, 0, :, :])
            x = self.conv2(x)
  
            x = x.view(x.size(0), -1)
            # cnn_embed_seq.append(x)

            # cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0)
            # batch first
            # x = x.transpose_(0, 1)
            # print('pred:',x.shape, t)

            x = x[ :, np.newaxis, :]
            # LSTM
            hidden=None
            # use faster code paths
            self.lstm.flatten_parameters()
            out, hidden = self.lstm(x, hidden)
            # print('LSTM out', out.shape)
            # MLP
            # out: (batch, seq, feature), choose the last time step
            out = self.fc1(out[:, -1, :])
            # X[:, 1, :, :, :] = out.view(-1, 1, self.sample_size,self.sample_size)
            out = self.act(out)
            X[:, 1, :, :, :] = out.view(-1, 1, self.sample_size,self.sample_size)

    return out