I am working on a CNN model that classifies videos. I extract frames from each video and stack all of the frames into a single variable that I use as input.

Number of frames extracted from each video: 10
Batch size: 32
Frame size: 100x100
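
For reference, here is a simplified sketch of how my input batch is assembled (random dummy data standing in for the real extracted frames; each frame has 2 channels in my pipeline, matching in_channels=2 of the first conv layer):

import torch

batch_size = 32
num_frames = 10   # frames extracted per video
channels = 2      # channels per frame
height, width = 100, 100

# dummy stand-in for the real stacked frames
X = torch.randn(batch_size, num_frames, channels, height, width)
print(X.shape)  # torch.Size([32, 10, 2, 100, 100])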
My model is as shown below:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self, img_x=100, img_y=100, fc_hidden1=1024, fc_hidden2=768, drop_p=0.5, output=8):
        super().__init__()
        self.img_x = img_x
        self.img_y = img_y
        self.output = output

        # CNN architecture
        self.ch1, self.ch2, self.ch3, self.ch4 = 32, 64, 128, 256
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)      # 2d kernel sizes
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)      # 2d strides
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)  # 2d padding

        # conv2D output shapes
        self.conv1_outshape = self.conv2D_output_size((self.img_x, self.img_y), self.pd1, self.k1, self.s1)
        self.conv2_outshape = self.conv2D_output_size(self.conv1_outshape, self.pd2, self.k2, self.s2)
        self.conv3_outshape = self.conv2D_output_size(self.conv2_outshape, self.pd3, self.k3, self.s3)
        self.conv4_outshape = self.conv2D_output_size(self.conv3_outshape, self.pd4, self.k4, self.s4)

        # fully connected layer hidden nodes
        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=2, out_channels=self.ch1, kernel_size=self.k1,
                      stride=self.s1, padding=self.pd1),
            nn.BatchNorm2d(self.ch1, momentum=0.5),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2,
                      stride=self.s2, padding=self.pd2),
            nn.BatchNorm2d(self.ch2, momentum=0.5),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3,
                      stride=self.s3, padding=self.pd3),
            nn.BatchNorm2d(self.ch3, momentum=0.5),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch3, out_channels=self.ch4, kernel_size=self.k4,
                      stride=self.s4, padding=self.pd4),
            nn.BatchNorm2d(self.ch4, momentum=0.5),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.drop = nn.Dropout2d(self.drop_p)
        self.pool = nn.MaxPool2d(kernel_size=3)
        self.fc1 = nn.Linear(self.ch4 * self.conv4_outshape[0] * self.conv4_outshape[1], self.fc_hidden1)
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.output)  # fully connected output layer, one node per class

    def forward(self, x):
        # CNN layers
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1)  # flatten the conv output

        # FC layers
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = F.relu(self.fc1(x))
        # x = F.dropout(x, p=self.drop_p, training=self.training)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def conv2D_output_size(self, img_size, padding, kernel_size, stride):
        # compute the output shape of a Conv2d layer
        outshape = (np.floor((img_size[0] + 2 * padding[0] - (kernel_size[0] - 1) - 1) / stride[0] + 1).astype(int),
                    np.floor((img_size[1] + 2 * padding[1] - (kernel_size[1] - 1) - 1) / stride[1] + 1).astype(int))
        return outshape
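
The error can be reproduced with a minimal forward pass using a dummy batch of the same shape as my real input (random data here, just to illustrate the shapes):

model = Net()
X = torch.randn(32, 10, 2, 100, 100)  # (batch, frames, channels, height, width)
output = model(X)                     # fails inside self.conv1(x)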
Running this, I get the error below:
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [32, 2, 5, 5], but got 5-dimensional input of size [32, 10, 2, 100, 100] instead
Could you help me solve this issue? Thanks in advance.