I have error when use profile on evaluate.py(1 node 8 gpus) https://github.com/microsoft/Megatron-DeepSpeed/blob/main/tasks/eval_harness/evaluate.py .
from torch.profiler import profile, record_function, ProfilerActivity
def main():
start = time.time()
model = load_ds_checkpoint_and_setup_megatron(extra_args_provider=tasks_args)
args = get_args()
if args.deepspeed and args.adaptive_seq_len:
# adaptive_seq_len hack #1:
# CL automatically enables reset_activation_shape() which allows us to change input shapes
# and it also reshapes the attenion scores in attention_mask_func
args.curriculum_learning_legacy = 1
task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',')
task_dict = tasks.get_task_dict(task_list)
model.module.activation_checkpoint_interval = 0
model._compute_loss = False
model.fwd_outputs = []
tokenizer = get_tokenizer()
adaptor = EvalHarnessAdaptor(model, tokenizer)
with profile(activities=[
ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
results = evaluator.evaluate(adaptor, task_dict, False, args.num_fewshot, None)
prof.export_chrome_trace("trace.json")
if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0:
print(json.dumps(results, indent=2))
with open(args.results_path, 'w') as outfile:
json.dump(results, outfile, indent = 4)
end = time.time()
print("evaluation of {} ends in {:.2f} sec, or {:.2f} min, or {:.2f} hr".format(args.task_list, end-start, (end-start)/60.0, (end-start)/3600.0))
But i get a error.
output_json.cpp:468 failed to rename trace.json.tmp to trace.json
How should I use profile tools to fix this.