Profiling time spent during DP training

I ran a small-scale comparison: the same model trained normally and with Opacus DP-SGD, profiling each epoch with torch.profiler.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time
from opacus import PrivacyEngine
import copy
from torch.profiler import profile, record_function, ProfilerActivity

import warnings
warnings.filterwarnings("ignore")

class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def train_epoch(model, train_loader, optimizer, device, privacy_engine=None, delta=None):
    model.train()
    total_loss = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        output = model(data)
        loss = nn.functional.cross_entropy(output, target)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)

    if privacy_engine:
        # Report the privacy budget actually spent so far, not the target
        return avg_loss, privacy_engine.get_epsilon(delta)
    else:
        return avg_loss, None

def create_dummy_data(num_samples, input_size, num_classes):
    data = torch.randn(num_samples, input_size)
    labels = torch.randint(0, num_classes, (num_samples,))
    return data, labels

def main():
    # Hyperparameters
    batch_size = 64
    epochs = 5
    lr = 0.01
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # DP parameters
    max_grad_norm = 1.0
    delta = 1e-5
    epsilon = 8.0

    # Create dummy data
    num_samples = 10000
    input_size = 20
    hidden_size = 50
    num_classes = 5
    
    data, labels = create_dummy_data(num_samples, input_size, num_classes)
    dummy_dataset = TensorDataset(data, labels)
    train_loader = DataLoader(dummy_dataset, batch_size=batch_size, shuffle=True)

    # Initialize models and optimizers
    model_normal = SimpleNet(input_size, hidden_size, num_classes).to(device)
    model_dp = copy.deepcopy(model_normal)
    
    optimizer_normal = optim.SGD(model_normal.parameters(), lr=lr)
    optimizer_dp = optim.SGD(model_dp.parameters(), lr=lr)

    # Initialize PrivacyEngine
    privacy_engine = PrivacyEngine()

    # Make model and optimizer compatible with Opacus
    model_dp, optimizer_dp, train_loader_dp = privacy_engine.make_private_with_epsilon(
        module=model_dp,
        optimizer=optimizer_dp,
        data_loader=train_loader,
        epochs=epochs,
        max_grad_norm=max_grad_norm,
        target_epsilon=epsilon,
        target_delta=delta,
    )
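    # The returned optimizer clips each per-sample gradient to max_grad_norm
    # and adds Gaussian noise calibrated to reach (epsilon, delta) after the
    # given number of epochs; train_loader_dp uses Poisson sampling by
    # default, so batch sizes vary from step to step.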

    print("Epoch | Normal Loss | DP Loss | Epsilon")
    print("-" * 40)

    # Profile only the activities the current device supports
    activities = [ProfilerActivity.CPU]
    if device.type == "cuda":
        activities.append(ProfilerActivity.CUDA)

    # Training loop with profiling
    for epoch in range(1, epochs + 1):
        # Normal training
        with profile(activities=activities, record_shapes=True) as prof_normal:
            with record_function("normal_training"):
                normal_results = train_epoch(model_normal, train_loader, optimizer_normal, device)

        # DP training
        with profile(activities=activities, record_shapes=True) as prof_dp:
            with record_function("dp_training"):
                dp_results = train_epoch(model_dp, train_loader_dp, optimizer_dp, device, privacy_engine, delta)

        print(f"{epoch:5d} | {normal_results[0]:11.4f} | {dp_results[0]:7.4f} | {dp_results[1]:7.2f}")

        # Print profiling results
        print("\nNormal Training Profiling:")
        print(prof_normal.key_averages().table(sort_by="cpu_time_total", row_limit=10))
        
        print("\nDP Training Profiling:")
        print(prof_dp.key_averages().table(sort_by="cpu_time_total", row_limit=10))

if __name__ == '__main__':
    main()

It seems to me that this has less to do with what you mentioned and more to do with the per-sample gradient computation itself. Given that per-sample gradients are the bottleneck, should we design each layer differently?
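
To make that cost concrete, here is a minimal, self-contained sketch (separate from the script above; assumes PyTorch >= 2.0 for torch.func) of what per-sample gradients mean for memory: every example gets its own full gradient before clipping, so gradient storage grows from num_params to batch_size x num_params.

import torch
import torch.nn as nn
from torch.func import functional_call, grad, vmap

model = nn.Linear(20, 5)
params = {k: v.detach() for k, v in model.named_parameters()}

def loss_fn(params, x, y):
    # functional_call runs the module with the given parameter dict
    out = functional_call(model, params, (x.unsqueeze(0),))
    return nn.functional.cross_entropy(out, y.unsqueeze(0))

# vmap over the batch dimension: one full gradient per sample
per_sample_grads = vmap(grad(loss_fn), in_dims=(None, 0, 0))(
    params, torch.randn(64, 20), torch.randint(0, 5, (64,))
)

# The weight gradient is now (64, 5, 20): 64x the storage of a normal .grad
print(per_sample_grads["weight"].shape)  # torch.Size([64, 5, 20])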

There is a nice recent blog post discussing how to reduce this overhead.
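
For a sense of what such a reduction can look like: a well-known trick (from Goodfellow's 2015 note, which ghost clipping builds on) computes the per-sample gradient norms of a linear layer analytically from the activations and output gradients, so the clipping factors never require materializing the per-sample gradients at all. A minimal sketch of the idea, not tied to Opacus internals:

import torch

B, d_in, d_out = 64, 20, 5
x = torch.randn(B, d_in)   # layer inputs (activations)
g = torch.randn(B, d_out)  # gradients w.r.t. layer outputs

# Direct: materialize per-sample grads g_i x_i^T, then take their norms.
# Memory: O(B * d_out * d_in)
per_sample = torch.einsum("bo,bi->boi", g, x)
norms_direct = per_sample.flatten(1).norm(dim=1)

# Trick: ||g_i x_i^T||_F = ||g_i|| * ||x_i||, from two small vectors.
# Memory: O(B * (d_out + d_in))
norms_cheap = g.norm(dim=1) * x.norm(dim=1)

print(torch.allclose(norms_direct, norms_cheap, atol=1e-5))  # True

This is one answer to "design each layer differently": the norm computation depends on the layer type, so each layer needs its own formula, but the expensive per-sample gradient tensor is never built.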