Hi,
I am trying to implement a minimalistic federated SGD algorithm for MNIST classification. For that, I accumulate the gradients from all clients on the server and then call optimizer.step().
Here is my gradient accumulation code:
import torch

def server_aggregate_gradients(n_clients, device):  # For FedSGD
    # Load the cached gradient dicts produced by each client.
    grads_info = []
    for c in range(n_clients):
        grads_info.append(torch.load(f'./model_cache/client_{c}.pkl', map_location=device))
    total_grads = {}
    n_total_samples = 0
    # acc_gradient = summation[(number_samples_per_client / total_samples) * gradient_per_client]
    for info in grads_info:
        n_samples = info['n_samples']
        for name, grad_value in info['named_grads'].items():
            if name not in total_grads:
                total_grads[name] = grad_value * n_samples
            else:
                total_grads[name] += grad_value * n_samples
        n_total_samples += n_samples
    # Divide by the total sample count to get the weighted average.
    gradients = {}
    for name, grad_value in total_grads.items():
        gradients[name] = grad_value / n_total_samples
    return gradients
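For reference, each client_{c}.pkl file is a dict with 'n_samples' and 'named_grads'. A simplified sketch of how a client could produce it (the loss and data handling here are placeholders for my actual client code):

import torch
import torch.nn.functional as F

def client_compute_gradients(c, model, data, target, device):
    # Compute gradients on the client's batch and cache them in the
    # format that server_aggregate_gradients expects.
    model.train()
    model.zero_grad()
    output = model(data.to(device))
    loss = F.cross_entropy(output, target.to(device))
    loss.backward()
    named_grads = {name: p.grad.detach().clone()
                   for name, p in model.named_parameters()}
    torch.save({'n_samples': data.size(0), 'named_grads': named_grads},
               f'./model_cache/client_{c}.pkl')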
After accumulating the gradients (a weighted average across clients), I set them on the main model as below and then perform the gradient descent step:
def server_step(model, optimizer, gradients=None, weights=None):
    model.train()
    optimizer.zero_grad()
    # Assign the aggregated gradients to the corresponding parameters.
    for name, parameter in model.named_parameters():
        print(gradients[name].shape)  # torch.Size([32, 1, 5, 5])
        print(parameter.shape)        # torch.Size([32, 1, 5, 5])
        print(gradients[name].dtype)  # torch.float32
        print(parameter.dtype)        # torch.float32
        parameter.grad = gradients[name]
    optimizer.step()
    torch.save(model.state_dict(), './model_cache/global_model_state.pkl')
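For context, the part of fedSGD that calls these functions looks roughly like this (simplified sketch; Net and the learning rate stand in for my actual model and settings):

def fedSGD(n_clients, n_server_epochs, batch_size, device):
    model = Net().to(device)  # Net stands in for my MNIST CNN
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    for epoch in range(n_server_epochs):
        # ... each client computes its gradients and caches them here ...
        gradients = server_aggregate_gradients(n_clients, device)
        server_step(model, optimizer, gradients=gradients)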
But I get an error at the line parameter.grad = gradients[name]. Here is the error I get:
Traceback (most recent call last):
  File "/Users/muhammadwaseem/Documents/GitHub/CV-projects/federated-learning/server.py", line 86, in <module>
    fedSGD(n_clients, n_server_epochs, batch_size, device=device)
  File "/Users/muhammadwaseem/Documents/GitHub/CV-projects/federated-learning/server.py", line 73, in fedSGD
    server_step(model, optimizer, gradients=gradients)
  File "/Users/muhammadwaseem/Documents/GitHub/CV-projects/federated-learning/server.py", line 41, in server_step
    parameter.grad = gradients[name]
RuntimeError: assigned grad has data of a different type
Why: the shape and dtype of gradients[name] and parameter are the same (see the prints above), so why do I get this error?
Edit: Also, parameter.grad seems to have the value None before the assignment. Does that have anything to do with the error?
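If it helps, here is a quick check I can add inside server_step to rule out a device mismatch; as far as I understand, this same RuntimeError can also be raised when the parameter and the assigned grad live on different devices (e.g. CUDA vs CPU), even when shape and dtype match:

for name, parameter in model.named_parameters():
    # Shape and dtype already match, so compare devices next:
    print(name, parameter.device, gradients[name].device)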
@ptrblck , any thoughts on this issue?