Hi!
I am designing a model to search for patterns in videos and trace these patterns behaviour over time.
I’m using Conv3D+LSTM layers (model is present below).
The idea is to track patterns of different size over different periods of time.
Is my model architecture too complex for that task?
I am facing serious lack of memory to train the model.
I’d really appreciate a profeccional look at my case.
class VideoModel(nn.Module):
def __init__(self, num_frames, num_channels, num_classes):
super(VideoModel, self).__init__()
self.conv1 = nn.Conv3d(num_channels, 64, kernel_size=3, stride=1, padding=1)
self.conv2 = nn.Conv3d(64, 128, kernel_size=5, stride=1, padding=2)
self.conv3 = nn.Conv3d(128, 256, kernel_size=10, stride=1, padding=5)
self.conv4 = nn.Conv3d(256, 512, kernel_size=25, stride=1, padding=12)
self.conv5 = nn.Conv3d(512, 1024, kernel_size=25, stride=1, padding=12) # Add this line
self.lstm1 = nn.LSTM(512, 512, num_layers=2, batch_first=True, bidirectional=True)
self.lstm2 = nn.LSTM(512, 512, num_layers=2, batch_first=True, bidirectional=True)
self.fc = nn.Linear(512, num_classes)
def forward(self, x):
x = self.conv1(x)
x = self.conv2(x)
x = self.conv3(x)
x = self.conv4(x)
x = self.conv5(x)
x = x.view(x.size(0), x.size(1), -1)
x, _ = self.lstm1(x)
x, _ = self.lstm2(x)
x = self.fc(x[:, -1, :])
return x
model = VideoModel(num_frames=151, num_channels=3, num_classes=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)