I am training a GRU/LSTM for time-series prediction. I already tested the model in the classic way by splitting the data into sequences and labels, and everything works fine; the shapes look like the following inside the forward pass and the main training loop:
x in forward torch.Size([32, 90, 7])
h in forward torch.Size([2, 32, 128])
x in train torch.Size([32, 90, 7])
label in train torch.Size([32, 90, 7])
When I tried to distribute the training set across different clients/gateways to train the model in a federated way like this:
federated_train_loader = sy.FederatedDataLoader(federated_train_dataset, shuffle=True, batch_size=batch_size, drop_last=True)
federated_test_loader = sy.FederatedDataLoader(federated_test_dataset, shuffle=False, batch_size=batch_size, drop_last=True)
I checked the federated data loader using PySyft and it has the same input and label shapes as the previous experiments, but when I try to train and print out the tensors I get the following shapes:
x :torch.Size([32, 90, 7])
labels :torch.Size([32, 1])
Obviously my label shape is no longer the correct size, but I don't know where the change happened. I am providing the training function for more information:
def train(federated_train_loader, learn_rate, hidden_dim=256, EPOCHS=1, model_type="GRU"):
    """Train a GRU or LSTM model over a PySyft FederatedDataLoader.

    Args:
        federated_train_loader: sy.FederatedDataLoader yielding (inputs, labels)
            batches that live on remote workers.
        learn_rate: SGD learning rate.
        hidden_dim: hidden-state size of the recurrent layers.
        EPOCHS: number of passes over the federated dataset.
        model_type: "GRU" selects GRUNet; anything else selects LSTMNet.

    Returns:
        The trained model.
    """
    # Peek at one batch to discover the feature dimension; batches are
    # (batch, seq, features) per the shapes printed above — TODO confirm.
    input_dim = next(iter(federated_train_loader))[0].shape[2]
    print('input dim {}'.format(input_dim))
    output_dim = 1
    n_layers = 2
    # Instantiating the model
    if model_type == "GRU":
        model = GRUNet(input_dim, hidden_dim, output_dim, n_layers)
    else:
        model = LSTMNet(input_dim, hidden_dim, output_dim, n_layers)
    # Defining loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate)
    model.train()
    print("Starting Training of {} model".format(model_type))
    epoch_times = []
    # Start training loop
    for epoch in range(1, EPOCHS + 1):
        start_time = time.perf_counter()
        avg_loss = 0.
        counter = 0
        for inputs, labels in federated_train_loader:
            counter += 1
            # NOTE(review): the original code called labels.view(-1, 2) and
            # discarded the result — torch.Tensor.view returns a new tensor,
            # it does not reshape in place, so that line was a no-op. If labels
            # arrive as [batch, 1] while the non-federated run had
            # [batch, seq, features], the reshaping happened when the federated
            # dataset was built (the `targets` passed to sy.BaseDataset), not
            # inside this loop — inspect that construction.
            worker = inputs.location
            # Size the fresh hidden state from the batch itself instead of a
            # global `batch_size` (safe: the loader uses drop_last=True, so
            # every batch has the same size).
            h = model.init_hidden(inputs.shape[0]).send(worker)
            if model_type == "GRU":
                h = h.data
            else:
                # LSTM hidden state is a (h, c) tuple; detach both parts.
                h = tuple([e.data for e in h])
            # Ship the model to the worker that holds this batch.
            model.send(worker)
            model.zero_grad()
            out, h = model(inputs.to(device).float(), h)
            # Bug fix: the original referenced an undefined name `label`.
            loss = criterion(out, labels.to(device).float())
            loss.backward()
            optimizer.step()
            # Retrieve the updated model so it can be sent to the next worker,
            # and fetch the remote loss before calling .item() on it
            # (standard PySyft federated-training pattern).
            model.get()
            loss = loss.get()
            avg_loss += loss.item()
            if counter % 200 == 0:
                print("Epoch {}......Step: {}/{}....... Average Loss for Epoch: {}".format(
                    epoch, counter, len(federated_train_loader), avg_loss / counter))
        # time.clock() was removed in Python 3.8 and never matched the
        # perf_counter() start timestamp; use perf_counter() at both ends.
        current_time = time.perf_counter()
        print("Epoch {}/{} Done, Total Loss: {}".format(epoch, EPOCHS, avg_loss / len(federated_train_loader)))
        print("Time Elapsed for Epoch: {} seconds".format(str(current_time - start_time)))
        epoch_times.append(current_time - start_time)
    print("Total Training Time: {} seconds".format(str(sum(epoch_times))))
    return model
I have been trying to solve this for a while, and I would appreciate it if anyone had a clue.