I am trying to broadcast a tensor (in practice a scalar) from rank 0 to the other ranks on the same machine, but I found that the tensor on ranks 1, 2 and 3 is not equal to the one on rank 0.
Code
if ddp.get_rank() == 0:
model.eval()
fit_accurary = utils.AverageMeter()
for batch_idx, (inputs, targets) in enumerate(testLoader):
inputs, targets = inputs.to(device), targets.to(device)
with torch.no_grad():
outputs = model(inputs)
predicted = utils.accuracy(outputs, targets,topk=(1,5))
fit_accurary.update(predicted[1], inputs.size(0))
if fit_accurary.avg == 0:
fit_accurary.avg = 0.01
if fit_accurary.avg > best_honey.fitness:
best_honey_state = copy.deepcopy(model.module.state_dict())
best_honey.code = copy.deepcopy(honey)
best_honey.fitness = fit_accurary.avg
avg_acc = float(fit_accurary.avg)
else:
avg_acc = float(0)
avg_acc = (torch.tensor(avg_acc, dtype=torch.float)).to(device)
dist.broadcast(avg_acc, src=0)
#avg_acc = utils.send_and_wait(r=0, data=[avg_acc])
avg_acc = avg_acc.cpu().detach().numpy().item()
print(avg_acc)
return avg_acc
Results
0.009999999776482582 rank0
-0.0011623682221397758 rank1
-0.0011623682221397758 rank2
-0.0011623682221397758 rank3