“Per sample gradient is not initialized. Not updated in backward pass?”
Hi I get the above error message at optimizer.step() when training with opacus. I have already read other forum entries with the same or similar error messages. Usually the problem seemed to be that the models parameters were reinitialized (e.g. with ModuleValidator.fix(net)) after the creation of the optimizer, leading to a mismatch of net and optimizers parameters that causes the problem.
To my understanding, I do not have such a missmatch in my code and I am already trying for days to figure out the problem. Would really appreciate if someone has an idea about what causes this problem thanks
Here is the code of my train function:
def train(net, trainloader, epochs: int, verbose=True):
net = ModuleValidator.fix(net)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr = 0.001)
privacy_engine = opacus.PrivacyEngine()
net, optimizer, trainloader = privacy_engine.make_private(
module=net,
optimizer=optimizer,
data_loader=trainloader,
noise_multiplier=1.1,
max_grad_norm=1.1,
)
for epoch in range(epochs):
correct, total, epoch_loss = 0, 0, 0.0
with BatchMemoryManager(
data_loader=trainloader,
max_physical_batch_size=8,
optimizer=optimizer
) as memory_safe_data_loader:
for images, labels in memory_safe_data_loader:
images, labels = images.to(device), labels.to(device)
optimizer.zero_grad()
net.train()
outputs = net(images)
loss = criterion(net(images), labels)
loss.backward()
optimizer.step()
epoch_loss += loss
total += labels.size(0)
correct += (torch.max(outputs.data, 1)[1] == labels).sum().item()
epoch_loss /= len(memory_safe_data_loader.dataset)
epoch_acc = correct / total
if verbose:
print(f"epoch {epoch+1}: train loss {epoch_loss}, accuracy {epoch_acc}")