I need to build a CNN-LSTM model in PyTorch for video classification. However, before using the actual video data, I am supposed to build a test model on the FashionMNIST dataset.
I first tested both models separately and they both worked. But when I try to combine them, I can't get the output of the CNN into the shape the LSTM expects. I tried many different techniques, but none of them worked. Below you can find my model. I get the error when calling `outputs = model(outputs)`, which calls the LSTM model with the outputs from the CNN model.
My initial shape of images is: [12, 1, 28, 28]
I then reshape it to: [12, 1, 28, 28] where 12 is batch_size * seq_dim (but I set seq_dim=1 for now)
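The output of the last pooling layer of the CNN has shape [12, 32, 7, 7] (the final `view` in `forward` flattens it to [12, 32 * 7 * 7] = [12, 1568]), which I now need to reshape to [batch_size, seq_dim, input_dim]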
class CNNModel(nn.Module):
    """Two-stage convolutional feature extractor.

    Each stage is a 5x5 convolution (stride 1, padding 2) followed by a ReLU
    and a 2x2 max pool.  ``forward`` returns the pooled feature maps flattened
    to one row per input sample.  The ``fc1`` readout layer is registered on
    the module but is not applied in ``forward``.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 1 -> 16 channels.  padding=2 with a 5x5 kernel and stride 1
        # leaves the spatial size unchanged; the pool then halves it.
        self.cnn1 = nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)

        # Stage 2: 16 -> 32 channels, same kernel/padding/pool layout.
        self.cnn2 = nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)

        # Readout sized for 32 feature maps of 7x7 (what 28x28 inputs produce).
        self.fc1 = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        """Return flattened conv features of shape ``(x.size(0), -1)``."""
        stages = (
            self.cnn1, self.relu1, self.maxpool1,
            self.cnn2, self.relu2, self.maxpool2,
        )
        features = x
        for stage in stages:
            features = stage(features)

        # e.g. (100, 32, 7, 7) -> (100, 32 * 7 * 7)
        return features.view(features.size(0), -1)
class LSTMModel(nn.Module):
    """LSTM sequence classifier with an attached CNN frame encoder.

    Args:
        input_dim: Feature size of each time step fed to the LSTM.  When raw
            frames are passed to ``forward`` this must equal the number of
            features ``self.cnn`` emits per frame.
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.cnn = CNNModel()

        # Kept as attributes so callers can inspect the configuration.
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # batch_first=True makes input/output tensors
        # (batch_dim, seq_dim, feature_dim).
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer applied to the last time step.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Either pre-computed features of shape
                ``(batch, seq, input_dim)``, or raw frames of shape
                ``(batch, seq, C, H, W)``, which are first encoded frame by
                frame with ``self.cnn``.

        Returns:
            Logits of shape ``(batch, output_dim)`` computed from the LSTM
            output at the last time step.
        """
        if x.dim() == 5:
            batch_size, seq_len = x.size(0), x.size(1)
            # Fold time into the batch axis so the 2-D CNN sees ordinary
            # (N, C, H, W) input, then unfold back into sequences.
            frames = x.reshape(batch_size * seq_len, *x.shape[2:])
            x = self.cnn(frames).reshape(batch_size, seq_len, -1)

        # With no initial state given, nn.LSTM starts from zero hidden and
        # cell states created on x's device and dtype.  This replaces the
        # hand-built torch.zeros(...) states, which always lived on the
        # default device/dtype and broke on GPU or float64 input.
        out, _ = self.lstm(x)

        return self.fc(out[:, -1, :])
model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model1 = CNNModel()

# Number of steps to unroll
seq_dim = 1

# NOTE(review): `iter` shadows the builtin of the same name; it is left as-is
# because code outside this section may refer to it.
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # The loader yields frames stacked along dim 0, i.e.
        # (batch_size * seq_dim, C, H, W), so the number of sequences is the
        # frame count divided by the sequence length.
        num_frames, C, H, W = images.size()
        batch_size = num_frames // seq_dim
        images = images.view(batch_size * seq_dim, C, H, W)

        # Per-frame CNN features: (batch_size * seq_dim, feature_dim)
        outputs = model1(images)
        print(outputs.shape)

        # Regroup frames into sequences.  The feature size is inferred (-1)
        # from the CNN output rather than assumed to equal input_dim.
        outputs = outputs.view(batch_size, seq_dim, -1)
        if outputs.size(-1) != input_dim:
            raise ValueError(
                f"CNN produced {outputs.size(-1)} features per frame but the "
                f"LSTM was built with input_dim={input_dim}; construct "
                "LSTMModel with input_dim equal to the CNN feature size."
            )

        # Forward pass to get output/logits
        outputs = model(outputs)