Relation between Batch_size and Gradients

zoher · April 22, 2024, 10:32am

Hello Guys!

I have this code, which applies DP-SGD with max_grad_norm = 1:

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from opacus import PrivacyEngine

# A minimal one-layer network: 784 input features -> 10 outputs
class SimpleNN(nn.Module):
    """Single bias-free linear layer (784 -> 10) followed by a ReLU."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 10, bias=False)

    def forward(self, x):
        """Flatten every dimension after the first, then apply ``fc1`` and ReLU."""
        flat = x.flatten(start_dim=1)
        return self.fc1(flat).relu()

# Load MNIST dataset
BATCH_SIZE = 100  # expected batch size; change this value to compare noise levels
# Delta must be tied to the number of training *samples*, not the number of
# batches: the usual guidance is delta < 1 / len(train_dataset).  The original
# 1 / len(train_loader) was far too large and silently changed with the batch
# size.  1e-5 is below 1 / 60000 for the MNIST training split.
TARGET_DELTA = 1e-5

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
model = SimpleNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Wrap the model, optimizer and data loader for DP-SGD.  Opacus picks the noise
# multiplier needed to reach (target_epsilon, target_delta) after `epochs` epochs.
privacy_engine = PrivacyEngine(accountant='rdp')
model_dp, optimizer_dp, data_loader_dp = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    target_epsilon=1,
    target_delta=TARGET_DELTA,
    epochs=1,
    max_grad_norm=1,  # clipping bound applied to each per-sample gradient
)
# Training loop
epochs = 1


def _print_grad_norms(epoch, step, label):
    """Print the gradient norm of every ``model_dp`` parameter that has a gradient.

    ``epoch`` and ``step`` are zero-based indices; ``label`` is the text placed
    in front of the printed norm.
    """
    for name, param in model_dp.named_parameters():
        if param.grad is not None:
            print(f'Epoch [{epoch+1}/{epochs}], Step [{step+1}/{len(train_loader)}], Parameter: {name}, {label}: {param.grad.norm()}')


for epoch in range(epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(data_loader_dp):
        # inputs: (B, 1, 28, 28) MNIST images.  B is not fixed: the Opacus
        # data loader samples batches randomly around the configured batch size.
        optimizer_dp.zero_grad()

        # Forward pass through the Opacus-wrapped module returned by
        # make_private_with_epsilon (not the bare `model`).
        outputs = model_dp(inputs)

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # At this point p.grad is the ordinary batch-averaged gradient:
        # nothing has been clipped or noised yet, so its norm is NOT bounded
        # by max_grad_norm.
        if (i + 1) % 10 == 0:
            _print_grad_norms(epoch, i, 'Gradient Magnitude')

        # DP step.  Per the Opacus DPOptimizer, each per-sample gradient is
        # clipped to max_grad_norm, the clipped gradients are summed, Gaussian
        # noise with std = noise_multiplier * max_grad_norm is added to the
        # sum, and the result is divided by the expected batch size.
        optimizer_dp.step()

        # p.grad now holds (clipped sum + noise) / batch_size.  The signal part
        # has norm <= max_grad_norm, but the noise part is unbounded and scales
        # like 1 / batch_size, so small batches can give norms well above
        # max_grad_norm.
        if (i + 1) % 10 == 0:
            _print_grad_norms(epoch, i, 'Gradient Magnitude After Noise')

        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss / len(train_loader)}')

print('Finished Training')

here is a sample from the output

Epoch [1/1], Step [10/600], Parameter: _module.fc1.weight, Gradient Magnitude: 2.7776570320129395
Epoch [1/1], Step [10/600], Parameter: _module.fc1.weight, Gradient Magnitude After Noise: 0.6592360138893127
Epoch [1/1], Step [20/600], Parameter: _module.fc1.weight, Gradient Magnitude: 2.715763568878174
Epoch [1/1], Step [20/600], Parameter: _module.fc1.weight, Gradient Magnitude After Noise: 0.6330609321594238

For each step I print the norm of the model's gradients before and after the noise is added. When batch_size = 100, the gradient norm after noise is always less than 1, which makes sense because I set max_grad_norm = 1. But when I change batch_size to 8,
I get the output below, where the gradient norm after noise exceeds max_grad_norm, e.g. it becomes 7.058923721313477:

Epoch [1/1], Step [10/7500], Parameter: _module.fc1.weight, Gradient Magnitude: 10.976356506347656
Epoch [1/1], Step [10/7500], Parameter: _module.fc1.weight, Gradient Magnitude After Noise: 7.058923721313477
Epoch [1/1], Step [20/7500], Parameter: _module.fc1.weight, Gradient Magnitude: 2.659587860107422
Epoch [1/1], Step [20/7500], Parameter: _module.fc1.weight, Gradient Magnitude After Noise: 7.106380939483643
Epoch [1/1], Step [30/7500], Parameter: _module.fc1.weight, Gradient Magnitude: 0.4995321035385132
Epoch [1/1], Step [30/7500], Parameter: _module.fc1.weight, Gradient Magnitude After Noise: 7.189002990722656

I would like to know how batch_size affects the amount of noise added, and why the gradient norms after Opacus clips the original gradients and adds noise exceed max_grad_norm, which is set to 1 (does this make sense?). Or is the way I calculate the gradient norm after the noise is added incorrect?

Can anyone please explain why a smaller batch size makes the gradient norm after clipping and adding noise larger, and vice versa?