Profiling time spent during DP training

Hi, I've noticed that in a differential privacy setting the backward pass takes much more time than its counterpart in standard (non-DP) training. Which operation is the main contributor to this slowdown? Is it the L2 norm calculation, memory movement, norm clipping, or adding noise?

Does anyone have a good way to profile these details? Here is how I am currently timing each phase:

        # Forward pass
        start_time = time.time()
        outputs = model(inputs)
        torch.cuda.synchronize()
        forward_time += time.time() - start_time

        # Compute loss
        loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        
        # Backward pass
        start_time = time.time()
        loss.backward()
        torch.cuda.synchronize()
        backward_time += time.time() - start_time

        # Parameter update
        start_time = time.time()
        optimizer.step()
        torch.cuda.synchronize()
        step_time += time.time() - start_time
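
One thing I'm unsure about: since time.time() starts on the CPU, kernels still queued from the previous phase can get attributed to the wrong bucket. A minimal sketch of the same loop body using CUDA events instead (same model, loss_fn, optimizer, inputs and labels as above) is what I had in mind as an alternative measurement:

        fwd_start = torch.cuda.Event(enable_timing=True); fwd_end = torch.cuda.Event(enable_timing=True)
        bwd_start = torch.cuda.Event(enable_timing=True); bwd_end = torch.cuda.Event(enable_timing=True)
        step_start = torch.cuda.Event(enable_timing=True); step_end = torch.cuda.Event(enable_timing=True)

        # Forward pass
        fwd_start.record()
        outputs = model(inputs)
        fwd_end.record()

        # Compute loss
        loss = loss_fn(outputs, labels)
        optimizer.zero_grad()

        # Backward pass
        bwd_start.record()
        loss.backward()
        bwd_end.record()

        # Parameter update
        step_start.record()
        optimizer.step()
        step_end.record()

        # Events are recorded on the GPU stream, so each interval covers only the
        # kernels launched between its start/end events; elapsed_time() is in ms.
        torch.cuda.synchronize()
        forward_time += fwd_start.elapsed_time(fwd_end) / 1000.0
        backward_time += bwd_start.elapsed_time(bwd_end) / 1000.0
        step_time += step_start.elapsed_time(step_end) / 1000.0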

Here is something small-scale that I ran:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time
from opacus import PrivacyEngine
import copy
from torch.profiler import profile, record_function, ProfilerActivity

import warnings
warnings.filterwarnings("ignore")

class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def train_epoch(model, train_loader, optimizer, device, privacy_engine=None, epsilon=None):
    model.train()
    total_loss = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        output = model(data)
        loss = nn.functional.cross_entropy(output, target)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)

    if privacy_engine:
        return avg_loss, epsilon
    else:
        return avg_loss, None

def create_dummy_data(num_samples, input_size, num_classes):
    data = torch.randn(num_samples, input_size)
    labels = torch.randint(0, num_classes, (num_samples,))
    return data, labels

def main():
    # Hyperparameters
    batch_size = 64
    epochs = 5
    lr = 0.01
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # DP parameters
    max_grad_norm = 1.0
    delta = 1e-5
    epsilon = 8.0

    # Create dummy data
    num_samples = 10000
    input_size = 20
    hidden_size = 50
    num_classes = 5
    
    data, labels = create_dummy_data(num_samples, input_size, num_classes)
    dummy_dataset = TensorDataset(data, labels)
    train_loader = DataLoader(dummy_dataset, batch_size=batch_size, shuffle=True)

    # Initialize models and optimizers
    model_normal = SimpleNet(input_size, hidden_size, num_classes).to(device)
    model_dp = copy.deepcopy(model_normal)
    
    optimizer_normal = optim.SGD(model_normal.parameters(), lr=lr)
    optimizer_dp = optim.SGD(model_dp.parameters(), lr=lr)

    # Initialize PrivacyEngine
    privacy_engine = PrivacyEngine()

    # Make model and optimizer compatible with Opacus
    model_dp, optimizer_dp, train_loader_dp = privacy_engine.make_private_with_epsilon(
        module=model_dp,
        optimizer=optimizer_dp,
        data_loader=train_loader,
        epochs=epochs,
        max_grad_norm=max_grad_norm,
        target_epsilon=epsilon,
        target_delta=delta,
    )

    print("Epoch | Normal Loss | DP Loss | Epsilon")
    print("-" * 40)

    # Training loop with profiling
    for epoch in range(1, epochs + 1):
        # Normal training
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof_normal:
            with record_function("normal_training"):
                normal_results = train_epoch(model_normal, train_loader, optimizer_normal, device)
        
        # DP training
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof_dp:
            with record_function("dp_training"):
                dp_results = train_epoch(model_dp, train_loader_dp, optimizer_dp, device, privacy_engine, epsilon)

        print(f"{epoch:5d} | {normal_results[0]:11.4f} | {dp_results[0]:7.4f} | {dp_results[1]:7.2f}")

        # Print profiling results
        print("\nNormal Training Profiling:")
        print(prof_normal.key_averages().table(sort_by="cpu_time_total", row_limit=10))
        
        print("\nDP Training Profiling:")
        print(prof_dp.key_averages().table(sort_by="cpu_time_total", row_limit=10))

if __name__ == '__main__':
    main()

It seems to me that it has less to do with the operations you mentioned and more to do with computing the per-sample gradients themselves. For per-sample gradients, doesn't each layer type have to be handled differently?

Here is a nice recent blog post discussing how to reduce this overhead.
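
To make that concrete, here is a minimal sketch (dimensions are arbitrary, just mirroring the toy model above) of what per-sample gradients cost for a single linear layer. The usual gradient is one (out, in) matrix for the whole batch, while the per-sample version materializes one such matrix per sample, so memory and compute scale with the batch size. Opacus computes these with a dedicated grad-sampler formula for each supported layer type, which is why each layer effectively needs its own treatment.

import torch

B, d_in, d_out = 64, 20, 5
x = torch.randn(B, d_in)
layer = torch.nn.Linear(d_in, d_out)

out = layer(x)
loss = out.sum()

# Gradient of the loss w.r.t. the layer output, one row per sample
grad_out = torch.autograd.grad(loss, out)[0]                 # (B, d_out)

# Standard (summed) weight gradient: a single (d_out, d_in) matrix for the batch
grad_w = grad_out.T @ x                                      # (d_out, d_in)

# Per-sample weight gradients: one outer product per sample, B times the storage
per_sample_grad_w = torch.einsum("bo,bi->boi", grad_out, x)  # (B, d_out, d_in)

# Summing the per-sample gradients recovers the standard gradient
assert torch.allclose(per_sample_grad_w.sum(dim=0), grad_w, atol=1e-5)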

Hi Soumya,

Thanks for your solution and the link; they are very useful.

I've tried your code, but it seems the profiler records the whole training run under a single row in the output table (the record_function label wraps the entire epoch). What I want is to profile each step of DP training separately: the forward pass, the first backward pass, the per-sample gradient computation, norm clipping, the second backward pass, and adding noise.

Do I need to wrap profiling code around each of those parts inside the Opacus source code?
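
For example, would something like the sketch below be a reasonable approach instead of editing the source? It monkeypatches the clipping and noise steps on the DPOptimizer returned by make_private_with_epsilon with record_function labels so they show up as separate rows. This assumes the optimizer exposes clip_and_accumulate() and add_noise() methods (as recent Opacus 1.x versions do; the names may differ in other versions), and that the per-sample gradient work itself happens inside loss.backward() via the hooks GradSampleModule attaches to each layer. (As far as I can tell, with the default hooks-based approach there is only one backward pass; clipping and noise happen inside optimizer.step().)

from torch.profiler import record_function

def wrap_with_label(method, label):
    # Return a callable that runs the original bound method inside a profiler label
    def wrapped(*args, **kwargs):
        with record_function(label):
            return method(*args, **kwargs)
    return wrapped

# optimizer_dp is the DPOptimizer returned by privacy_engine.make_private_with_epsilon
optimizer_dp.clip_and_accumulate = wrap_with_label(optimizer_dp.clip_and_accumulate, "dp/clip_and_accumulate")
optimizer_dp.add_noise = wrap_with_label(optimizer_dp.add_noise, "dp/add_noise")

# Inside the training loop, label the remaining phases explicitly:
with record_function("dp/forward"):
    output = model_dp(data)
loss = nn.functional.cross_entropy(output, target)
with record_function("dp/backward_and_per_sample_grads"):
    loss.backward()
with record_function("dp/optimizer_step"):
    optimizer_dp.step()

With labels like these in place, I would expect each phase to appear as its own row in prof.key_averages(), instead of one row for the whole epoch. Is that the intended way to do it, or is there a cleaner hook in Opacus for this?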