Video classification network hanging on loss.backward()

Hey all,

I am currently using a neural network to perform binary classification of video data. However, the model seems to hang on loss.backward(). I expected this could take a while as I am running on the CPU, but I let it run for 2-3 hours and it was not able to complete one pass.

Below is my model structure

class VestibularNetwork(nn.Module):
def init(self, input_shape: int, hidden_units: int, output_shape: int, num_classes: int, batch_size: int = 1):
self.video_block = nn.Sequential(
kernel_size=3, # how big is the square that’s going over the image?
stride=1, # default
padding=1),# options = “valid” (no padding) or “same” (output has same shape as input) or int for specific number
stride=2) # default stride value is same as kernel_size
#self.label_block = nn.Sequential(
# NOTE: The batch_size parameter is set to 1 by default, but it's also used as an argument in the method signature. This could lead to confusion about which value is actually being used for the batch size during the forward pass.
# nn.Linear(in_features=1, out_features=1),
# nn.ReLU()

    self.common_block = nn.Sequential(
        nn.Conv2d(in_channels=hidden_units, out_channels=hidden_units, kernel_size=3, padding=1),
        nn.Conv2d(hidden_units, hidden_units, 3, padding=1),

    # Define LSTM layer
    self.lstm = nn.LSTM(input_size=hidden_units, hidden_size=hidden_units, batch_first=True)

    self.classifier = nn.Linear(in_features=hidden_units, out_features=1)

def forward(self, video_frames, batch_size=None):
    """Classify a batch of video clips.

    Each clip is run frame-by-frame through ``video_block`` and
    ``common_block``, averaged over dims 2 and 3 of the resulting feature
    maps, fed through the LSTM as a sequence, and the LSTM output at the
    last time step is passed to the linear classifier.

    Args:
        video_frames: Tensor unpacked as
            ``(batch, sequence_length, channels, height, width)``.
        batch_size: Ignored. The batch size is always read from
            ``video_frames``; the parameter is kept (now optional) so that
            existing ``model(x, batch_size=...)`` calls keep working.

    Returns:
        The output of ``self.classifier`` for the last time step of each
        clip (one row per clip).
    """
    video_frames = video_frames.float()
    batch_size, sequence_length, channels, height, width = video_frames.size()

    # reshape() instead of view(): identical for contiguous input, but it
    # also accepts non-contiguous tensors (e.g. after a permute).
    video_frames_reshaped = video_frames.reshape(
        batch_size * sequence_length, channels, height, width
    )
    print("Video size:", video_frames_reshaped.shape)

    x_list = []
    for i in range(batch_size):
        # Process each clip of the batch separately.
        clip = video_frames_reshaped[i * sequence_length:(i + 1) * sequence_length]
        x_batch = self.video_block(clip)
        x_batch = self.common_block(x_batch)
        # Average over dims 2 and 3 -> one feature vector per frame.
        x_batch = torch.mean(x_batch, dim=[2, 3])
        # The posted snippet never stored the per-clip features, so the
        # concatenation below had nothing to work on.
        x_list.append(x_batch)

    # Concatenate the outputs of all clips.
    x = torch.cat(x_list, dim=0)
    print(f"after cat: {x.shape}")

    # Reshape for LSTM input: (batch, sequence_length, features).
    x = x.view(batch_size, sequence_length, -1)

    # Apply LSTM.
    lstm_out, _ = self.lstm(x)

    # Aggregate predictions using the output at the final time step.
    lstm_out = lstm_out[:, -1, :]

    # Apply classifier.
    x = self.classifier(lstm_out)
    print(f"classifier: {x.shape}")

    return x

Any tips or insight would be great as I am not sure how long it should take or if there is some issue in my underlying code causing inefficiencies.

Below is some additional code for my dataloader, train loop, and Bayesian optimization.

Dataloader code

class VideoDataset(Dataset):
def __init__(self, video_dir, video_transform=None, clip_length=10):
    """Index the ``.mp4`` files in ``video_dir`` and parse their labels.

    Args:
        video_dir: Directory that is scanned (non-recursively) for files
            whose name ends with ``.mp4``.
        video_transform: Optional callable applied to every frame when an
            item is loaded.
        clip_length: Maximum number of frames read per video.
    """
    # NOTE: the forum paste showed this as ``def init`` (the dunder
    # underscores were swallowed by markdown); without the real name the
    # constructor is never invoked.
    self.video_dir = video_dir
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # index -> file mapping stable between runs.
    self.video_files = sorted(
        file for file in os.listdir(video_dir) if file.endswith('.mp4')
    )
    # Parse each filename once and derive the other views from the result.
    self.labels = [self.extract_label(file) for file in self.video_files]
    self.outcomes = [label[2] for label in self.labels]
    self.label_features = [label[:2] for label in self.labels]  # Ignore outcome label
    self.video_transform = video_transform
    self.clip_length = clip_length

def __len__(self):
    """Return how many video files this dataset holds."""
    video_count = len(self.video_files)
    return video_count

def __getitem__(self, index):
    """Load the leading frames of one video together with its outcome label.

    NOTE(review): the posted snippet was missing the statements that read
    a frame, leave the loop, store the frame and release the capture.
    They are restored here as ``cap.read()`` / ``break`` /
    ``frames.append`` / ``cap.release()`` -- confirm against the original
    file (e.g. for any per-frame colour conversion that was lost).

    Args:
        index: Position of the video in ``self.video_files``.

    Returns:
        ``(frames, outcome)`` where ``frames`` is the stacked tensor of at
        most ``self.clip_length`` frames and ``outcome`` is the third
        label component as a tensor.

    Raises:
        RuntimeError: If no frame could be read from the video file.
    """
    video_file = self.video_files[index]
    # Only the outcome (third label component) is returned; the other
    # two label features were computed but never used, so they are not
    # converted to tensors here.
    outcome = torch.tensor(self.labels[index][2])

    video_path = os.path.join(self.video_dir, video_file)

    # Open video file.
    cap = cv2.VideoCapture(video_path)

    # Read frames dynamically until the clip is full or the video ends.
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)

            # Check if enough frames are read.
            if len(frames) >= self.clip_length:
                break
    finally:
        # Release the video capture object even if reading fails.
        cap.release()

    if not frames:
        # torch.stack() on an empty list fails with an unhelpful message.
        raise RuntimeError(f"No frames could be read from {video_path}")

    # Apply transformations if specified.
    if self.video_transform:
        frames = [self.video_transform(frame) for frame in frames]

    # Convert frames to tensor. as_tensor() leaves tensors untouched and
    # converts raw arrays, which torch.stack() would otherwise reject.
    frames = torch.stack([torch.as_tensor(frame) for frame in frames])

    return frames, outcome

def extract_label(self, filename):
    """Parse ``(eye_side, test_pos, outcome)`` flags from a clip filename.

    The filename is expected to start with ``clip<digits>_<a>_<b>_<c>``.

    NOTE(review): the regex call and the group look-ups were missing from
    the posted snippet; they are restored as ``re.match`` and
    ``match.group(1..3)`` -- confirm against the original file.

    Args:
        filename: Name of the video file, e.g. ``clip12_L_LHPD_N.mp4``.

    Returns:
        A tuple ``(eye_side, test_pos, outcome)`` where ``eye_side`` is 1
        for ``'L'`` else 0, ``test_pos`` is 1 for ``'LHPD'`` else 0 and
        ``outcome`` is 0 for ``'N'`` else 1; ``(-1, -1, -1)`` when the
        filename does not match the pattern.
    """
    import re  # local import: the file's import block is not visible here

    match = re.match(r'clip\d+_(\w+)_(\w+)_(\w+)', filename)
    if not match:
        # Default labels if not found. In the pasted code this return sat
        # directly after the other one and could never be reached.
        return (-1, -1, -1)

    eye_side = 1 if match.group(1) == 'L' else 0
    test_pos = 1 if match.group(2) == 'LHPD' else 0
    outcome = 0 if match.group(3) == 'N' else 1
    return (eye_side, test_pos, outcome)

def adjust_clip_length(self, frames):
    """Trim ``frames`` to ``self.clip_length`` items taken from the middle.

    Sequences that are not longer than ``self.clip_length`` are returned
    unchanged.
    """
    surplus = len(frames) - self.clip_length
    if surplus <= 0:
        return frames
    # Drop half of the surplus (rounded down) from the front and keep
    # the following ``clip_length`` items.
    first = surplus // 2
    return frames[first:first + self.clip_length]

def create_dataloaders(
data_dir: str,
transform: transforms.Compose,
test_size: int,
batch_size: int,
num_workers: int=NUM_WORKERS,
random_seed: int = 33

Here is the train loop

Utilise device agnostic code

device = torch.device(‘cuda’ if torch.cuda.is_available() else ‘cpu’)

botorch_model = BotorchModel(acquisition_function_type=‘qExpectedImprovement’)


Set up BotorchModel for Bayesian optimization

botorch_model = BotorchModel(acquisition_function_type=‘qExpectedImprovement’)

ax_client = AxClient()
parameters = [
“name”: “batch_size”,
“type”: “range”,
“bounds”: [16, 128]
“name”: “max_epochs”,
“type”: “range”,
“bounds”: [10, 100]
“name”: “learning_rate”,
“type”: “range”,
“bounds”: [1e-4, 1e-1],
“log_scale”: True
“name”: “weight_decay”,
“type”: “range”,
“bounds”: [1e-6, 1e-2],
“log_scale”: True
objective_name= “auc_score_fold”,
minimize= False,

for i in range(5):
parameters, trial_index = ax_client.get_next_trial()
ax_client.complete_trial(trial_index=trial_index, raw_data=evaluate(parameters=parameters,
cv = cross_validate(model=ax_client.generation_strategy.model, folds=-1)

model = ax_client.generation_strategy.model
render(interact_contour(model=model, metric_name=“auc_score_fold”))


And lastly, the bayesian optimisation

Define the evaluation function for Bayesian optimization

def evaluate(parameters: str,
model: torch.nn.Module,
loss_fn: torch.nn.Module,
optimizer: torch.optim.Optimizer,
device: torch.device) → Tuple[float, float]:
auc_values =
loss_values =
X_train_list =
y_train_list =

max_norm = 1.0

model =

# Your existing code for setting up the optimizer

for batch_idx, (video_frames_batch, outcomes) in enumerate(dataloader):
    X_train_tensor = torch.tensor(video_frames_batch, dtype=torch.float).to(device)
    y_train_tensor = outcomes.float().unsqueeze(1).to(device)
    if torch.any(torch.isnan(X_train_tensor)) | (torch.any(torch.isnan(y_train_tensor))):
        print("nan values found")

    print(f"show the train tensor {X_train_tensor}")

    # Perform forward pass
    outputs = model(X_train_tensor, batch_size = 1)
    print(f"show the outputs {outputs}")
    #print(f"show the train tensor {X_train_tensor}")
    #print(f"show the true outcomes {y_train_tensor}")
    # Compute loss
    print("Output shape:", outputs.shape)
    #print("Target shape:", y_train_tensor.shape)
    loss = loss_fn(outputs, y_train_tensor)

    print("calculate loss")
    # Perform backward pass and optimization
    print("zero grad")
    print(f"show the loss {loss}")
    print(f"clip grads")
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    print(f"start backwards")
    print("backward loss")
    # Compute ROC AUC score
    auc_value = roc_auc_score(y_train_tensor.cpu().numpy(), outputs.detach().cpu().numpy())
    print("auc complete")

# Compute average loss
avg_loss = sum(loss_values) / len(loss_values)

# Compute average ROC AUC score
avg_auc = sum(auc_values) / len(auc_values)

return avg_loss, avg_auc

Thank you and please let me know if there’s anything else needed.

A little update: for some reason it is now running through multiple passes, but it still hangs after a few passes at the same point, loss.backward().