Hello,
I seem to have an issue with the input type and the weight type. I’ve checked every input and model parameter multiple times, but I can’t find the discrepancy. It’s possible I’m overlooking the error, so I would appreciate any suggestions for a solution. As we know, a second pair of eyes often helps. I’m using Flower for Federated Learning and have implemented a Scaffold Strategy. However, during training, I encounter an error. Sometimes the process works for two training rounds, and other times it only works for one round, which is quite strange. At the start, I’m using ResNet34, implemented and provided by PyTorch, as I’m trying to get it running on this model first.
model = models.resnet34(pretrained=False)
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
model.fc = nn.Linear(in_features=512, out_features=25, bias=True)
model = model.to(DEVICE)
Example that it works for the first round after that come the evalaute_fit it also works. But in Round 2 it doesnt work
INFO : Flower ECE: gRPC server running (20 rounds), SSL is disabled
INFO : [INIT]
INFO : Requesting initial parameters from one random client
Number of availbale Clients 1
INFO : Received initial parameters from one random client
INFO : Evaluating initial global parameters
INFO :
INFO : [ROUND 1]
INFO : configure_fit: strategy sampled 2 clients (out of 2)
Number of availbale Clients 2
INFO : aggregate_fit: received 2 results and 0 failures
Here is a cutout of my error in the client.py file
File ".../multisensor_data_preparation_federated_learning/examples/flowers/testtt/testscaffold/client_scaffold.py", line 562, in fit
train(net=self.model, trainloader=self.train, c_local=self.client_cvalue, c_global=server_cv, epochs=1)
File ".../multisensor_data_preparation_federated_learning/examples/flowers/testtt/testscaffold/client_scaffold.py", line 322, in train
outputs = net(x_reshaped).to(DEVICE)
File ".../miniconda/envs/f1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File ".../miniconda/envs/f1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File ".../miniconda/envs/f1/lib/python3.10/site-packages/torchvision/models/resnet.py", line 285, in forward
return self._forward_impl(x)
File ".../miniconda/envs/f1/lib/python3.10/site-packages/torchvision/models/resnet.py", line 268, in _forward_impl
x = self.conv1(x)
File ".../miniconda/envs/f1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File ".../miniconda/envs/f1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File ".../miniconda/envs/f1/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 460, in forward
return self._conv_forward(input, self.weight, self.bias)
File ".../miniconda/envs/f1/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 456, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Input type (torch.cuda.DoubleTensor) and weight type (torch.cuda.FloatTensor) should be the same
And the Server Error. Its resulting from the client error
File ".../multisensor_data_preparation_federated_learning/examples/flowers/testtt/testscaffold/server_scaffold.py", line 527, in <module>
flwr.server.start_server(server_address="0.0.0.0:8080", server=server, config=flwr.server.ServerConfig(num_rounds=20))
File ".../miniconda/envs/f1/lib/python3.10/site-packages/flwr/server/app.py", line 171, in start_server
hist = run_fl(
File ".../miniconda/envs/f1/lib/python3.10/site-packages/flwr/server/server.py", line 483, in run_fl
hist, elapsed_time = server.fit(
File ".../miniconda/envs/f1/lib/python3.10/site-packages/flwr/server/server.py", line 113, in fit
res_fit = self.fit_round(
File ".../multisensor_data_preparation_federated_learning/examples/flowers/testtt/testscaffold/server_scaffold.py", line 237, in fit_round
aggregated_result_combined = parameters_to_ndarrays(aggregated_result[0])
File ".../miniconda/envs/f1/lib/python3.10/site-packages/flwr/common/parameter.py", line 34, in parameters_to_ndarrays
return [bytes_to_ndarray(tensor) for tensor in parameters.tensors]
AttributeError: 'NoneType' object has no attribute 'tensors'
I have also tried my from scratch implemented models, but it also dont work. In the following i share my train method. PLEASE DON’T BE CONFUSED THE CODE IS QUITE MESSY, SINCE I HAVE MADE MANY TRYES TO FIND THE ERROR
def train( net, trainloader,c_local, c_global,epochs):
"""Train the network on the training set."""
criterion = torch.nn.CrossEntropyLoss()
#optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
for epoch in range(epochs):
running_loss = 0.0
correct = 0
total = 0
prebatch_params = [param.detach().clone() for param in net.parameters()]
for param in prebatch_params:
print('param', param.dtype)
for batch in trainloader:
images = batch[0]
labels = batch[1]
images, labels = images.to(DEVICE, dtype=torch.float), labels.to(DEVICE)
x_reshaped = images.permute(0, 3, 1, 2).to(DEVICE)
print(f"Input type: {images.dtype}, Weight type: {net.conv1.weight.dtype}")
#print('images', x_reshaped.dtype)
#print('labels', labels.dtype)
optimizer.zero_grad()
outputs = net(x_reshaped).to(DEVICE)
#print('outputs', outputs.dtype)
labels = labels.argmax(dim=1)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
_, predicted = outputs.max(1)
total += labels.size(0)
correct += predicted.eq(labels).sum().item()
print('len_clocal', len(c_local))
print('g_local', len(c_global))
with open('c_local.pkl', 'wb') as file:
pickle.dump(c_local, file)
with open('c_global.pkl', 'wb') as file2:
pickle.dump(c_global, file2)
skip = 0
for param, y_i, c_l, c_g in zip(net.parameters(), prebatch_params, c_local, c_global):
if param.requires_grad:
if param.grad is not None:
#print('param_data', param.grad.data)
#print('param_data', param.grad.data)
c_l = c_l.to(DEVICE)
c_g = c_g.to(DEVICE)
print('c_l', c_l.dtype)
print('c_g', c_g.dtype)
print('param', param.dtype)
c_l = c_l.to(param.dtype).to(DEVICE)
c_g = c_g.to(param.dtype).to(DEVICE)
print('c_g22', c_g.dtype)
if param.shape == c_g.shape == c_l.shape:
# print('param_data', param.grad.data)
#print(f"Processing layer: {param.shape}")
#print(f"Grad: {param.shape}, c_l: {c_l.shape}, c_g: {c_g.shape}")
correction = 0.01 * (param.grad.data - c_l + c_g)
param.data = y_i - correction
skip += 1
else:
pass
else:
pass
print('skip', skip)
epoch_loss = running_loss / len(trainloader)
epoch_acc = correct / total
print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')
Here are some types that are important to know. Everything is in the requirred dtype.
param torch.float32
Input type: torch.float32, Weight type: torch.float32
also the c_g and c_l parameter
Here is my fit function in the FlowerClient1 class.
def fit(self, parameters, config):
model_parameter = parameters[len(parameters) // 2 :]
server_C = parameters[: len(parameters) // 2]
self.set_parameters(model_parameter)
self.client_cvalue = []
#for param in self.model.parameters():
# self.client_cvalue.append(param.clone().detach())
#for param in model.parameters():
# self.client_cvalue.append(torch.zeros_like(param))
self.client_cvalue = [torch.zeros_like(val) for _, val in self.model.state_dict().items()]
self.client_cvalue = [cv.cpu().numpy() for cv in self.client_cvalue]
print(len(self.client_cvalue))
print('model_parameter',len(model_parameter))
if os.path.exists(f"{self.dir}/client_cv_{self.client_index}.pkl"):
with open(f"{self.dir}/client_cv_{self.client_index}.pkl", 'rb') as f:
self.client_cvalue = pickle.load(f)
else:
with open(f"{self.dir}/client_cv_{self.client_index}.pkl", 'wb') as f:
pickle.dump(self.client_cvalue, f)
'''
if os.path.exists(f"{self.dir}/client_cv_{self.client_index}.pt"):
# Load the data if the file exists
self.client_cvalue = torch.load(f"{self.dir}/client_cv_{self.client_index}.pt")
else:
# Save the data if the file does not exist
torch.save(self.client_cvalue, f"{self.dir}/client_cv_{self.client_index}.pt")
'''
server_cv = [torch.Tensor(sv) for sv in server_C]
print('server_cv_len', len(server_cv))
#for i in range(len(server_cv)):
#print('server_cv',server_cv[i].dtype)
with open('server_cvfit.pkl', 'wb') as f:
pickle.dump(server_cv, f)
print('len_get_parameters', len(self.get_parameters(config={})))
print('client_cvalue', len(self.client_cvalue))
self.client_cvalue = [torch.Tensor(cv) for cv in self.client_cvalue]
with open('client_cvaluefit.pkl', 'wb') as f:
pickle.dump(self.client_cvalue, f)
#for i in range(len(self.client_cvalue)):
#print('client_cvalue', self.client_cvalue[i].dtype)
print('torch_client_cvalue', len(self.client_cvalue))
train(net=self.model, trainloader=self.train, c_local=self.client_cvalue, c_global=server_cv, epochs=1)
c_i_n = []
x = model_parameter
y_i = self.get_parameters(config={})
for ci, c, x_model, local_model in zip(self.client_cvalue, server_cv, x, y_i):
c_i_n.append(ci - c + (1.0 / (0.001 * 1 * len(self.train)))* (x_model - local_model) # 1 = num_epochs
)
with open(f"{self.dir}/client_cv_{self.client_index}.pkl", 'wb') as f:
pickle.dump(c_i_n, f)
print('c_i_n', len( c_i_n))
with open(f"c_in.pkl", 'wb') as f:
pickle.dump(c_i_n, f)
c_delta = []
y_delta = []
for param_yi, param_x in zip(y_i, x):
y_delta.append(param_yi - param_x)
for c_new, c_old in zip(c_i_n, self.client_cvalue):
c_delta.append(c_new - c_old)
combined_parameters = y_delta + c_delta
return combined_parameters, len(self.train.dataset), {}
I check the input types and the weight types with print statetsments. I also tried it to load in pickle file. To check after it a model parameter and input is in a different type. But every type is correct. Can me someone give a adivise what is possible wrong and how to change it