I have implemented this model
import torch
import torch.nn as nn
import torchvision.models as models
class SimpleVideoClassifier(nn.Module):
def __init__(self, num_classes=4):
super(SimpleVideoClassifier, self).__init__()
# Use ResNet18 as a feature extractor (without the final fully connected layer)
self.resnet = models.resnet18(pretrained=True)
self.resnet.fc = nn.Identity() # Remove final fully connected layer
self.feature_size = 512 # ResNet-18 output feature size
# Fully connected layer to classify the aggregated features
self.fc = nn.Linear(self.feature_size, num_classes)
def forward(self, x):
# x shape: (batch_size, sequence_length, channels, height, width)
print(f"Original input shape: {x.shape}") # Debugging print
batch_size, seq_len = x.size(0), x.size(1)
# Process each frame through ResNet
frame_features = []
for i in range(seq_len):
frame = x[:, i, :, :, :] # Extract each frame (shape: [batch_size, 3, 224, 224])
print(f"Frame shape (batch {i}): {frame.shape}") # Debugging print
features = self.resnet(frame) # Pass frame through ResNet
# Stack frame features along the sequence dimension
frame_features = torch.stack(frame_features, dim=1) # Shape: (batch_size, seq_len, feature_size)
print(f"Stacked frame features shape: {frame_features.shape}") # Debugging print
# Average the features over the sequence
avg_features = frame_features.mean(dim=1) # Shape: (batch_size, feature_size)
# Final classification layer
output = self.fc(avg_features) # Shape: (batch_size, num_classes)
print(f"Output shape: {output.shape}") # Debugging print
return output
# Initialize the model
model = SimpleVideoClassifier(num_classes=4)
There is not problem while i train it
import torch.optim as optim
import torch.nn.functional as F
# Setup device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
epochs = 10
# Training loop
for epoch in range(epochs):
for inputs, labels in train_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = F.cross_entropy(outputs, labels)
print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")
But when I try to evaluate it using this code:
with torch.no_grad():
for inputs, labels in test_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
# Apply softmax to get probabilities, then use torch.argmax to get predicted class
_, predicted = torch.max(F.softmax(outputs, dim=1), 1) # Shape: [batch_size]
# Compare the predicted class with the actual class (from one-hot encoded labels)
correct += torch.sum(torch.argmax(labels, dim=1) == predicted).item()
total += labels.size(0) # Update total samples
all_labels.extend(torch.argmax(labels, dim=1).cpu().numpy()) # Get the index of true class
all_preds.extend(predicted.cpu().numpy()) # Store predicted class indices
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")
i get the following error:
RuntimeError Traceback (most recent call last)
in <cell line: 1>()
3 inputs, labels = inputs.to(device), labels.to(device)
----> 5 outputs = model(inputs)
6 # Apply softmax to get probabilities, then use torch.argmax to get predicted class
7 _, predicted = torch.max(F.softmax(outputs, dim=1), 1) # Shape: [batch_size]
10 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py in _conv_forward(self, input, weight, bias)
547 self.groups,
548 )
→ 549 return F.conv2d(
550 input, weight, bias, self.stride, self.padding, self.dilation, self.groups
551 )
RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[8, 224, 224, 3] to have 3 channels, but got 224 channels instead
The original and frame shapes at this point are:
Original input shape: torch.Size([8, 30, 224, 224, 3])
Frame shape (batch 0): torch.Size([8, 224, 224, 3])
Is there anything in the architecture that changes during evaluation ?