I ran a small-scale experiment.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time
from opacus import PrivacyEngine
import copy
from torch.profiler import profile, record_function, ProfilerActivity
import warnings
warnings.filterwarnings("ignore")
class SimpleNet(nn.Module):
    """Small fully connected classifier: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the two linear layers.

        Args:
            input_size: Number of features in each input vector.
            hidden_size: Width of the hidden layer.
            output_size: Number of values produced per input.
        """
        super().__init__()
        # Layers are created in this order so parameter initialisation
        # consumes the RNG in the same sequence as before.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the output of the second layer; no final activation is applied."""
        return self.fc2(torch.relu(self.fc1(x)))
def train_epoch(
    model,
    train_loader,
    optimizer,
    device,
    privacy_engine=None,
    epsilon=None,
    delta=None,
):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Module to train; it is switched to training mode.
        train_loader: Iterable yielding ``(data, target)`` batches. It does
            not need to support ``len()``.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.
        privacy_engine: When not ``None``, a privacy value is returned as the
            second element of the result.
        epsilon: Value reported as the privacy budget when ``delta`` is not
            supplied (for example the caller's target epsilon).
        delta: When given together with ``privacy_engine``, the budget spent
            so far is read from ``privacy_engine.get_epsilon(delta)`` instead
            of echoing ``epsilon`` back.

    Returns:
        ``(avg_loss, eps)``. ``avg_loss`` is the mean of the per-batch
        cross-entropy losses, or NaN when the loader yielded no batches.
        ``eps`` is ``None`` when no privacy engine was passed.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = nn.functional.cross_entropy(output, target)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        num_batches += 1

    # Average over the batches actually processed: this avoids a
    # ZeroDivisionError on an empty loader and works for loaders that do not
    # define __len__.
    avg_loss = total_loss / num_batches if num_batches else float("nan")

    if privacy_engine is None:
        return avg_loss, None
    if delta is not None:
        # Report the budget actually spent rather than the requested target.
        return avg_loss, privacy_engine.get_epsilon(delta)
    return avg_loss, epsilon
def create_dummy_data(num_samples, input_size, num_classes):
    """Generate a random classification dataset.

    Args:
        num_samples: Number of examples to generate.
        input_size: Number of features per example.
        num_classes: Labels are drawn from ``0 .. num_classes - 1``.

    Returns:
        ``(features, targets)``: a ``(num_samples, input_size)`` tensor of
        standard-normal values and a ``(num_samples,)`` tensor of integer
        labels.
    """
    # Features are drawn before labels so the global RNG is consumed in a
    # fixed order.
    features = torch.randn(num_samples, input_size)
    targets = torch.randint(low=0, high=num_classes, size=(num_samples,))
    return features, targets
def main():
    """Train a plain model and a DP-SGD copy side by side and profile both.

    Both models start from identical weights and see the same synthetic
    dataset. After every epoch the two mean losses are printed together with
    the privacy budget the DP run has spent so far; after the last epoch the
    profiler tables for both runs are printed.
    """
    # Hyperparameters
    batch_size = 64
    epochs = 5
    lr = 0.01
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # DP parameters
    max_grad_norm = 1.0
    delta = 1e-5
    epsilon = 8.0  # target budget for the whole run, not the per-epoch spend

    # Create dummy data
    num_samples = 10000
    input_size = 20
    hidden_size = 50
    num_classes = 5
    data, labels = create_dummy_data(num_samples, input_size, num_classes)
    dummy_dataset = TensorDataset(data, labels)
    train_loader = DataLoader(dummy_dataset, batch_size=batch_size, shuffle=True)

    # Initialize models and optimizers; the DP model is a deep copy so both
    # runs start from the same weights.
    model_normal = SimpleNet(input_size, hidden_size, num_classes).to(device)
    model_dp = copy.deepcopy(model_normal)
    optimizer_normal = optim.SGD(model_normal.parameters(), lr=lr)
    optimizer_dp = optim.SGD(model_dp.parameters(), lr=lr)

    # Initialize PrivacyEngine
    privacy_engine = PrivacyEngine()

    # Make model and optimizer compatible with Opacus
    model_dp, optimizer_dp, train_loader_dp = privacy_engine.make_private_with_epsilon(
        module=model_dp,
        optimizer=optimizer_dp,
        data_loader=train_loader,
        epochs=epochs,
        max_grad_norm=max_grad_norm,
        target_epsilon=epsilon,
        target_delta=delta,
    )

    # Only ask the profiler for CUDA activity when training runs on a GPU.
    activities = [ProfilerActivity.CPU]
    if device.type == "cuda":
        activities.append(ProfilerActivity.CUDA)

    print("Epoch | Normal Loss | DP Loss | Epsilon")
    print("-" * 40)

    # Training loop with profiling
    for epoch in range(1, epochs + 1):
        # Normal training
        with profile(activities=activities, record_shapes=True) as prof_normal:
            with record_function("normal_training"):
                normal_loss, _ = train_epoch(
                    model_normal, train_loader, optimizer_normal, device
                )

        # DP training
        with profile(activities=activities, record_shapes=True) as prof_dp:
            with record_function("dp_training"):
                dp_loss, _ = train_epoch(
                    model_dp, train_loader_dp, optimizer_dp, device, privacy_engine, epsilon
                )

        # Report the budget actually spent so far, not the constant target.
        spent_epsilon = privacy_engine.get_epsilon(delta)
        print(f"{epoch:5d} | {normal_loss:11.4f} | {dp_loss:7.4f} | {spent_epsilon:7.2f}")

    # Print profiling results. The profiler objects are rebound every epoch,
    # so these tables describe the final epoch only.
    print("\nNormal Training Profiling:")
    print(prof_normal.key_averages().table(sort_by="cpu_time_total", row_limit=10))
    print("\nDP Training Profiling:")
    print(prof_dp.key_averages().table(sort_by="cpu_time_total", row_limit=10))
if __name__ == "__main__":
    # Run the comparison only when executed as a script, not on import.
    main()
It seems to me that this has less to do with what you mentioned and more to do with the per-sample gradient computation itself. If per-sample gradients are the cause, should we design each layer differently?
Here is a nice recent blog post discussing how to reduce this.