My ConvLSTM network requires the following input shape: (64, 10, 3, 32, 32) [batch size, sequence length, color channels, image height, image width]. Previously I have only worked with the convolutional part of the network, which required the input shape (64, 3, 32, 32). Can I now simply stack my tensors, or is another procedure better suited?
class CNN(nn.Module):
    """Per-frame feature extractor for video clips shaped (batch, seq, 3, 32, 32).

    Folds the batch and sequence dimensions together so the 2-D conv layers
    see a plain image batch, then unfolds back to per-frame feature vectors
    of size 480 (30 channels * 4 * 4 after the two poolings).
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 10, 5)
        self.conv2 = nn.Conv2d(10, 20, 5)
        self.conv3 = nn.Conv2d(20, 30, 5)
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, i):
        """Map (batch, seq, 3, 32, 32) -> (batch, seq, 480)."""
        b, t = i.shape[0], i.shape[1]
        # reshape (not view): tolerates non-contiguous inputs, e.g. tensors
        # built with torch.stack or permute — view would raise on those.
        x = i.reshape(-1, i.shape[2], i.shape[3], i.shape[4])
        x = F.relu(self.conv1(x))             # 32 -> 28
        x = self.pool(F.relu(self.conv2(x)))  # 28 -> 24, pool -> 12
        x = self.pool(F.relu(self.conv3(x)))  # 12 -> 8,  pool -> 4
        return x.reshape(b, t, -1)            # (batch, seq, 30*4*4 = 480)
class LSTM(nn.Module):
    """Sequence classifier over per-frame 480-dim features.

    Expects input shaped (batch, seq=10, 480) and returns logits (batch, 2).
    The sequence length is fixed at 10 by the fully connected layer.
    """

    def __init__(self):
        super(LSTM, self).__init__()
        # batch_first=True: the CNN feeds (batch, seq, feat). The default
        # layout (seq, batch, feat) would silently treat the batch axis as
        # time and mix samples across the sequence.
        self.lstm = nn.LSTM(480, 100, batch_first=True)
        self.fc = nn.Linear(100 * 10, 2)  # flattens hidden states of all 10 steps

    def forward(self, x):
        """Map (batch, 10, 480) -> (batch, 2)."""
        x, _ = self.lstm(x)            # (batch, 10, 100)
        x = x.reshape(x.shape[0], -1)  # (batch, 1000)
        return self.fc(x)
EDIT: I think what I am looking for is a data loader for video that takes a list of images as input.