Hi! I’m currently trying to implement a video classification model in PyTorch using a CNN-BiLSTM. I have tried this specific architecture before on UCF-101, where it reached around 50% accuracy after 50 or so epochs and was still slowly converging.

However, when I started working with my own dataset, which is made up of 48 videos for each of the 30 available classes, the first epoch only reached around 3% accuracy with a loss of 3.5, compared to the UCF-101 run, which got around 12% accuracy even in the first epoch.

I’ve already tried augmenting my data from 48 → 96 → 384 videos per class, but the loss in the first epoch follows the same trend. Training takes quite a while, so I’m trying to squeeze out whatever I can, since I’m running this on my local 3060 and each epoch takes around an hour.

Is there anything I’m doing wrong in the model architecture or in my loops? Thank you!

```
class CNN_BiLSTM(nn.Module):
    """Video classifier: a per-frame CNN encoder followed by a bidirectional LSTM.

    Each frame is encoded independently into a 128-dimensional feature vector,
    the per-video sequence of frame features is fed through a BiLSTM, and the
    BiLSTM output at the last time step is classified by a linear layer.

    The first convolution takes 1 input channel, and the flatten/linear stage
    is sized for a 29 x 29 x 64 feature map, which is what the conv/pool stack
    produces for 128 x 128 frames.

    Args:
        num_classes: Number of output logits.
        lstm_hidden_dim: Hidden size of each LSTM direction.
        lstm_num_layers: Number of stacked LSTM layers.
        dropout_prob: Dropout probability used between LSTM layers and before
            the classifier.
    """

    def __init__(self, num_classes, lstm_hidden_dim, lstm_num_layers, dropout_prob):
        super().__init__()
        # CNN part (spatial sizes below assume 128 x 128 input frames)
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3),  # 126 x 126 x 32
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, kernel_size=3),  # 124 x 124 x 32
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 62 x 62 x 32
            nn.Conv2d(32, 64, kernel_size=3),  # 60 x 60 x 64
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, kernel_size=3),  # 58 x 58 x 64
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=2),  # 29 x 29 x 64
            nn.Flatten(),
            nn.Linear(29 * 29 * 64, 64),
            nn.ReLU(),
            nn.Dropout(0.35),
            nn.Linear(64, 128),
            nn.ReLU(),
        )
        # BiLSTM part.
        # batch_first=True is essential: forward() feeds [batch, frames, features].
        # With the default (batch_first=False) the LSTM would treat the batch
        # axis as time and run its recurrence across different videos.
        self.bilstm = nn.LSTM(
            128,
            lstm_hidden_dim,
            num_layers=lstm_num_layers,
            dropout=dropout_prob,
            bidirectional=True,
            batch_first=True,
        )
        # Final layers
        self.dropout = nn.Dropout(dropout_prob)
        self.hidden2label = nn.Linear(lstm_hidden_dim * 2, num_classes)

    def forward(self, x):
        """Classify a batch of videos.

        Args:
            x: Tensor of shape [batch_size, channels, frames, height, width].

        Returns:
            Tensor of shape [batch_size, num_classes] with unnormalised logits.
        """
        batch_size, channels, frames, height, width = x.shape
        # Bring the frame axis next to the batch axis before merging them, so
        # every row of the merged tensor is exactly one frame. A plain view()
        # on the [B, C, F, H, W] layout is only correct when channels == 1.
        x = x.permute(0, 2, 1, 3, 4).reshape(
            batch_size * frames, channels, height, width
        )
        # CNN processing: all frames of all videos in one pass
        cnn_out = self.cnn(x)  # [batch_size * frames, features]
        cnn_out = cnn_out.view(batch_size, frames, -1)  # [batch_size, frames, features]
        # BiLSTM processing over the frame axis
        lstm_out, _ = self.bilstm(cnn_out)  # [batch_size, frames, 2 * hidden]
        # Take the last time step.
        # NOTE(review): at this position the reverse direction has only seen
        # the final frame; concatenating the final hidden states of both
        # directions may summarise the clip better. Left unchanged here.
        lstm_out = lstm_out[:, -1, :]
        # Dropout and classifier
        out = self.dropout(lstm_out)
        return self.hidden2label(out)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# One output class per entry in label_dict.
num_labels = len(label_dict)

# Build the network and move its parameters to the selected device.
model = CNN_BiLSTM(
    num_classes=num_labels,
    lstm_hidden_dim=16,
    lstm_num_layers=2,
    dropout_prob=0.1,
).to(device)
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one epoch over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; must expose ``.dataset``
            and support ``len()``.
        model: Module being trained.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        Tuple ``(average_train_loss, train_accuracy)``: the mean of the
        per-batch losses and the fraction of correctly classified samples.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # Training mode matters here: it makes BatchNorm use batch statistics and
    # turns Dropout on, both of which CNN_BiLSTM contains.
    model.train()
    train_loss, correct = 0, 0
    # Running sample count for the progress line. Counting what was actually
    # seen avoids assuming every batch has a fixed, globally configured size.
    seen = 0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.float().to(device), y.long().to(device)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        # Read the scalar once; reused for both the running sum and the log.
        batch_loss = loss.item()
        train_loss += batch_loss
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        seen += len(X)
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if batch % 4 == 0:
            print(f"loss: {batch_loss:>7f} [{seen:>5d}/{size:>5d}]")
    average_train_loss = train_loss / num_batches
    train_accuracy = correct / size
    print(
        f"Training Error: \n Accuracy: {(100*train_accuracy):>0.1f}%, Avg loss: {average_train_loss:>8f} \n"
    )
    return average_train_loss, train_accuracy
def val_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; must expose ``.dataset``
            and support ``len()``.
        model: Module to evaluate.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.

    Returns:
        Tuple ``(average_val_loss, val_accuracy)``: the mean of the per-batch
        losses and the fraction of correctly classified samples.
    """
    # Evaluation mode: BatchNorm uses its running statistics and Dropout is
    # disabled, so results do not depend on batch composition.
    model.eval()
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    loss_sum = 0
    n_correct = 0
    # Gradients are not needed for validation; skipping them saves memory
    # and compute.
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.float().to(device)
            targets = targets.long().to(device)
            logits = model(inputs)
            loss_sum += loss_fn(logits, targets).item()
            hits = logits.argmax(1) == targets
            n_correct += hits.type(torch.float).sum().item()
    average_val_loss = loss_sum / n_batches
    val_accuracy = n_correct / n_samples
    print(
        f"Validation Error: \n Accuracy: {(100*val_accuracy):>0.1f}%, Validation loss: {average_val_loss:>8f} \n"
    )
    return average_val_loss, val_accuracy
def evaluate_model(loader, model, loss_fn):
    """Evaluate ``model`` on ``loader`` and collect per-sample results.

    Args:
        loader: Iterable of ``(X, y)`` batches that supports ``len()``.
        model: Module to evaluate.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.

    Returns:
        Tuple ``(y_true, y_pred, average_loss, accuracy, correct_examples,
        incorrect_examples)``. ``y_true`` and ``y_pred`` are lists of class
        indices, ``average_loss`` is the mean of the per-batch losses, and
        ``accuracy`` is the fraction of matching predictions. Each example is
        an ``(input_cpu_tensor, true_label, predicted_label)`` tuple.
    """
    model.eval()
    y_true = []
    y_pred = []
    total_loss = 0
    correct_examples = []
    incorrect_examples = []
    with torch.no_grad():
        for X, y in loader:
            X, y = X.float().to(device), y.long().to(device)
            outputs = model(X)
            total_loss += loss_fn(outputs, y).item()
            _, predicted = torch.max(outputs, 1)
            # Move everything to the host once per batch. Indexing device
            # tensors sample-by-sample forces one sync/transfer per sample.
            labels = y.tolist()
            guesses = predicted.tolist()
            inputs_cpu = X.cpu()
            y_true.extend(labels)
            y_pred.extend(guesses)
            for clip, label, guess in zip(inputs_cpu, labels, guesses):
                example = (clip, label, guess)  # CPU tensor, plain int labels
                if guess == label:
                    correct_examples.append(example)
                else:
                    incorrect_examples.append(example)
    average_loss = total_loss / len(loader)
    accuracy = (np.array(y_true) == np.array(y_pred)).mean()
    return y_true, y_pred, average_loss, accuracy, correct_examples, incorrect_examples
# Optimisation setup: cross-entropy objective, Adam with a fixed learning rate.
learning_rate = 0.0001
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Per-epoch history. The test_* lists are created here but not filled in
# this section.
train_losses, train_accs = [], []
val_losses, val_accs = [], []
test_losses, test_accs = [], []

# Best validation accuracy seen so far; a checkpoint is written whenever it
# is beaten.
max_acc = 0

for t in range(CFG.EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")

    train_loss, train_acc = train_loop(train_loader, model, loss_fn, optimizer)
    val_loss, val_acc = val_loop(val_loader, model, loss_fn)

    train_losses.append(train_loss)
    train_accs.append(train_acc)
    val_losses.append(val_loss)
    val_accs.append(val_acc)

    if val_acc > max_acc:
        print(
            f"[SAVING] Validation Accuracy Increased({(100*max_acc):>0.1f}% ---> {(100*val_acc):>0.1f}%)"
        )
        max_acc = val_acc
        # Persist only the weights (state dict) of the best model so far.
        torch.save(model.state_dict(), CFG.OUTPUT_MODEL)

print("Done!")
```