High loss and low accuracy with CNN-BiLSTM for Video Classification

Hi! I’m currently trying to implement a video classification model in PyTorch using a CNN-BiLSTM. I have tried this exact architecture before on UCF-101, where it reached around 50% accuracy after 50 or so epochs and was still slowly converging.

However, when I moved to my own dataset, which has 48 videos for each of its 30 classes, the first epoch only reached around 3% accuracy with a loss of about 3.5, whereas the UCF-101 run was already at around 12% accuracy after its first epoch.
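
As a quick sanity check (plain arithmetic, not taken from my training code): with 30 classes, random guessing corresponds to roughly 3.3% accuracy and a cross-entropy loss of ln(30) ≈ 3.4, so my first epoch seems to be sitting at chance level.

import math
num_classes = 30
print(1 / num_classes)        # ~0.033 -> ~3.3% accuracy from random guessing
print(math.log(num_classes))  # ~3.40 -> cross-entropy loss of a uniform prediction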

I have already tried augmenting my data from 48 → 96 → 384 videos per class, but the first-epoch loss follows the same trend. Training takes quite a while (around an hour per epoch on my local RTX 3060), so I’m trying to squeeze out whatever I can.

Is there anything I’m doing wrong in the model architecture or in my loops? Thank you!

import torch
import torch.nn as nn
import numpy as np

class CNN_BiLSTM(nn.Module):
    def __init__(self, num_classes, lstm_hidden_dim, lstm_num_layers, dropout_prob):
        super(CNN_BiLSTM, self).__init__()
        
        # CNN part
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3),  # 126 x 126 x 32
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, kernel_size=3), # 124 x 124 x 32
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 62 x 62 x 32
            
            nn.Conv2d(32, 64, kernel_size=3), # 60 x 60 x 64
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, kernel_size=3), # 58 x 58 x 64
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=2),  # 29 x 29 x 64
 
            nn.Flatten(),
            nn.Linear(29*29*64, 64),
            nn.ReLU(),
            nn.Dropout(0.35),
            nn.Linear(64, 128),
            nn.ReLU(),
        )
 
        # BiLSTM part (batch_first=True so the LSTM sees [batch, frames, features] as produced in forward)
        self.bilstm = nn.LSTM(128, lstm_hidden_dim, num_layers=lstm_num_layers,
                              dropout=dropout_prob, bidirectional=True, batch_first=True)
 
        # Final layers
        self.dropout = nn.Dropout(dropout_prob)
        self.hidden2label = nn.Linear(lstm_hidden_dim * 2, num_classes)
 
    def forward(self, x):
        # x is of shape [batch_size, channels, frames, height, width]
        batch_size, channels, frames, height, width = x.shape
 
        # Process all frames in parallel using CNN
        x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channels, height, width)  # Merge batch and frames (permute keeps frames and channels separate if channels > 1)
 
        # CNN processing
        cnn_out = self.cnn(x)  # [batch_size * frames, features]
        cnn_out = cnn_out.view(batch_size, frames, -1)  # Reshape to [batch_size, frames, features]
 
        # BiLSTM processing
        lstm_out, _ = self.bilstm(cnn_out)
        lstm_out = lstm_out[:, -1, :]  # Take the last time step
 
        # Dropout and classifier
        out = self.dropout(lstm_out)
        out = self.hidden2label(out)
 
        return out
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = len(label_dict)
model = CNN_BiLSTM(
    num_classes=num_labels, lstm_hidden_dim=16, lstm_num_layers=2, dropout_prob=0.1
)
model.to(device)
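
A quick shape check I run to make sure the forward pass behaves (the 16 frames and the 128x128 frame size below are just placeholders matching the CNN comments above, not necessarily my loader settings):

# Dummy clip of shape [batch_size, channels, frames, height, width]
dummy = torch.randn(2, 1, 16, 128, 128).to(device)
with torch.no_grad():
    print(model(dummy).shape)  # expected: torch.Size([2, num_labels])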
 
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # Set the model to training mode - important here because the model uses batch normalization and dropout layers
    model.train()
    train_loss, correct = 0, 0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.float().to(device), y.long().to(device)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
 
        train_loss += loss.item()
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
 
        if batch % 4 == 0:
            loss, current = loss.item(), batch * CFG.BATCH_SIZE + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
 
    average_train_loss = train_loss / num_batches
    train_accuracy = correct / size
    print(
        f"Training Error: \n Accuracy: {(100*train_accuracy):>0.1f}%, Avg loss: {average_train_loss:>8f} \n"
    )
    return average_train_loss, train_accuracy
 
 
def val_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important here because the model uses batch normalization and dropout layers
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    val_loss, correct = 0, 0
 
    # torch.no_grad() disables gradient tracking during evaluation,
    # avoiding unnecessary gradient computation and reducing memory usage
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.float().to(device), y.long().to(device)
            pred = model(X)
            val_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
 
    average_val_loss = val_loss / num_batches
    val_accuracy = correct / size
    print(
        f"Validation Error: \n Accuracy: {(100*val_accuracy):>0.1f}%, Validation loss: {average_val_loss:>8f} \n"
    )
    return average_val_loss, val_accuracy
 
 
def evaluate_model(loader, model, loss_fn):
    model.eval()
    y_true = []
    y_pred = []
    total_loss = 0
    correct_examples = []
    incorrect_examples = []
    with torch.no_grad():
        for X, y in loader:
            X, y = X.float().to(device), y.long().to(device)
            outputs = model(X)
            loss = loss_fn(outputs, y)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            y_true.extend(y.tolist())
            y_pred.extend(predicted.tolist())
            matches = predicted == y
            for i in range(len(matches)):
                example = (
                    X[i].cpu(),
                    y[i].item(),
                    predicted[i].item(),
                )  # Store tensor as CPU tensor, labels as items
                if matches[i]:
                    correct_examples.append(example)
                else:
                    incorrect_examples.append(example)
 
    average_loss = total_loss / len(loader)
    accuracy = (np.array(y_true) == np.array(y_pred)).mean()
    return y_true, y_pred, average_loss, accuracy, correct_examples, incorrect_examples
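
For completeness, this is roughly how I use evaluate_model once training has finished (test_loader stands in for my held-out loader; the confusion matrix is only there for inspection):

from sklearn.metrics import confusion_matrix

y_true, y_pred, test_loss, test_acc, correct_ex, incorrect_ex = evaluate_model(
    test_loader, model, loss_fn
)
print(f"Test accuracy: {100*test_acc:.1f}%, Test loss: {test_loss:.4f}")
print(confusion_matrix(y_true, y_pred))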
 
learning_rate = 0.0001
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
 
train_losses = []
test_losses = []
val_losses = []
train_accs = []
test_accs = []
val_accs = []
max_acc = 0
for t in range(CFG.EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loss, train_acc = train_loop(train_loader, model, loss_fn, optimizer)
    val_loss, val_acc = val_loop(val_loader, model, loss_fn)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    val_losses.append(val_loss)
    val_accs.append(val_acc)
    if max_acc < val_acc:
        print(
            f"[SAVING] Validation Accuracy Increased({(100*max_acc):>0.1f}% ---> {(100*val_acc):>0.1f}%)"
        )
        max_acc = val_acc
 
        # Saving State Dict
        torch.save(model.state_dict(), CFG.OUTPUT_MODEL)
print("Done!")