PyTorch Error: Compiler: cl is not found

I've started getting an error whenever I try to compile a function with torch.compile:

BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Compiler: cl is not found.

I cannot figure out what changed.

To test this on a machine running Windows 11 Education, I created a Conda environment with the following packages:

I know cl is on the PATH in Windows PowerShell:
PS C:\Users\croda> which cl
/c/Program Files/Microsoft Visual Studio/2022/Enterprise/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl
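
To rule out a shell-versus-interpreter mismatch, it is worth checking whether the Python process itself can see cl; per the traceback below, inductor probes for the compiler by spawning cl /help (check_compiler_exist_windows). A minimal sanity check:

import shutil

# The PATH that matters is the one this Python process sees, not the one in
# the shell that launched it; inductor looks up cl with a subprocess call.
print(shutil.which("cl"))  # None here predicts "Compiler: cl is not found."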

This is the code I tested:

import torch
def foo(x, y):
    a = torch.sin(x)
    b = torch.cos(y)
    return a + b
opt_foo1 = torch.compile(foo)
print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10)))

This is the generated error:

---------------------------------------------------------------------------
BackendCompilerFailed                     Traceback (most recent call last)
Cell In[1], line 8
      6     return a + b
      7 opt_foo1 = torch.compile(foo)
----> 8 print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10)))

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\eval_frame.py:574, in _TorchDynamoContext.__call__.<locals>._fn(*args, **kwargs)
    569 saved_dynamic_layer_stack_depth = (
    570     torch._C._functorch.get_dynamic_layer_stack_depth()
    571 )
    573 try:
--> 574     return fn(*args, **kwargs)
    575 finally:
    576     # Restore the dynamic layer stack depth if necessary.
    577     torch._C._functorch.pop_dynamic_layer_stack_and_undo_to_depth(
    578         saved_dynamic_layer_stack_depth
    579     )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\convert_frame.py:1380, in CatchErrorsWrapper.__call__(self, frame, cache_entry, frame_state)
   1374             return hijacked_callback(
   1375                 frame, cache_entry, self.hooks, frame_state
   1376             )
   1378 with compile_lock, _disable_current_modes():
   1379     # skip=1: skip this frame
-> 1380     return self._torchdynamo_orig_callable(
   1381         frame, cache_entry, self.hooks, frame_state, skip=1
   1382     )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\convert_frame.py:1164, in ConvertFrame.__call__(self, frame, cache_entry, hooks, frame_state, skip)
   1162 counters["frames"]["total"] += 1
   1163 try:
-> 1164     result = self._inner_convert(
   1165         frame, cache_entry, hooks, frame_state, skip=skip + 1
   1166     )
   1167     counters["frames"]["ok"] += 1
   1168     return result

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\convert_frame.py:547, in ConvertFrameAssert.__call__(self, frame, cache_entry, hooks, frame_state, skip)
    544     dynamo_tls.traced_frame_infos.append(info)
    546 with compile_context(CompileContext(compile_id)):
--> 547     return _compile(
    548         frame.f_code,
    549         frame.f_globals,
    550         frame.f_locals,
    551         frame.f_builtins,
    552         frame.closure,
    553         self._torchdynamo_orig_callable,
    554         self._one_graph,
    555         self._export,
    556         self._export_constraints,
    557         hooks,
    558         cache_entry,
    559         cache_size,
    560         frame,
    561         frame_state=frame_state,
    562         compile_id=compile_id,
    563         skip=skip + 1,
    564     )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\convert_frame.py:986, in _compile(code, globals, locals, builtins, closure, compiler_fn, one_graph, export, export_constraints, hooks, cache_entry, cache_size, frame, frame_state, compile_id, skip)
    984 guarded_code = None
    985 try:
--> 986     guarded_code = compile_inner(code, one_graph, hooks, transform)
    988     # NB: We only put_code_state in success case.  Success case here
    989     # does include graph breaks; specifically, if a graph break still
    990     # resulted in a partially compiled graph, we WILL return here.  An
   (...)    995     # to upload for graph break though, because this can prevent
    996     # extra graph break compilations.)
    997     put_code_state()

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\convert_frame.py:715, in _compile.<locals>.compile_inner(code, one_graph, hooks, transform)
    713     stack.enter_context(torch._dynamo.callback_handler.install_callbacks())
    714     stack.enter_context(CompileTimeInstructionCounter.record())
--> 715     return _compile_inner(code, one_graph, hooks, transform)
    717 return None

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_utils_internal.py:95, in compile_time_strobelight_meta.<locals>.compile_time_strobelight_meta_inner.<locals>.wrapper_function(*args, **kwargs)
     92     kwargs["skip"] = skip + 1
     94 if not StrobelightCompileTimeProfiler.enabled:
---> 95     return function(*args, **kwargs)
     97 return StrobelightCompileTimeProfiler.profile_compile_time(
     98     function, phase_name, *args, **kwargs
     99 )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\convert_frame.py:750, in _compile.<locals>._compile_inner(code, one_graph, hooks, transform)
    748 CompileContext.get().attempt = attempt
    749 try:
--> 750     out_code = transform_code_object(code, transform)
    751     break
    752 except exc.RestartAnalysis as e:

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\bytecode_transformation.py:1361, in transform_code_object(code, transformations, safe)
   1358 instructions = cleaned_instructions(code, safe)
   1359 propagate_line_nums(instructions)
-> 1361 transformations(instructions, code_options)
   1362 return clean_and_assemble_instructions(instructions, keys, code_options)[1]

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\convert_frame.py:231, in preserve_global_state.<locals>._fn(*args, **kwargs)
    229 exit_stack.enter_context(torch_function_mode_stack_state_mgr)
    230 try:
--> 231     return fn(*args, **kwargs)
    232 finally:
    233     cleanup.close()

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\convert_frame.py:662, in _compile.<locals>.transform(instructions, code_options)
    660 try:
    661     with tracing(tracer.output.tracing_context), tracer.set_current_tx():
--> 662         tracer.run()
    663 except exc.UnspecializeRestartAnalysis:
    664     speculation_log.clear()

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\symbolic_convert.py:2868, in InstructionTranslator.run(self)
   2867 def run(self):
-> 2868     super().run()

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\symbolic_convert.py:1052, in InstructionTranslatorBase.run(self)
   1050 try:
   1051     self.output.push_tx(self)
-> 1052     while self.step():
   1053         pass
   1054 except TensorifyScalarRestartAnalysis:

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\symbolic_convert.py:962, in InstructionTranslatorBase.step(self)
    959 self.update_block_stack(inst)
    961 try:
--> 962     self.dispatch_table[inst.opcode](self, inst)
    963     return not self.output.should_exit
    964 except TensorifyScalarRestartAnalysis:

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\symbolic_convert.py:3048, in InstructionTranslator.RETURN_VALUE(self, inst)
   3047 def RETURN_VALUE(self, inst):
-> 3048     self._return(inst)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\symbolic_convert.py:3033, in InstructionTranslator._return(self, inst)
   3028 _step_logger()(
   3029     logging.INFO,
   3030     f"torchdynamo done tracing {self.f_code.co_name} ({inst.opname})",
   3031 )
   3032 log.debug("%s triggered compile", inst.opname)
-> 3033 self.output.compile_subgraph(
   3034     self,
   3035     reason=GraphCompileReason(
   3036         "return_value", [self.frame_summary()], graph_break=False
   3037     ),
   3038 )
   3039 return_inst = (
   3040     create_instruction("RETURN_VALUE")
   3041     if inst.opname == "RETURN_VALUE"
   3042     else create_instruction("RETURN_CONST", argval=inst.argval)
   3043 )
   3044 self.output.add_output_instructions([return_inst])

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\output_graph.py:1101, in OutputGraph.compile_subgraph(self, tx, partial_convert, reason)
   1098 append_prefix_insts()
   1099 # optimization to generate better code in a common case
   1100 self.add_output_instructions(
-> 1101     self.compile_and_call_fx_graph(
   1102         tx, list(reversed(stack_values)), root, output_replacements
   1103     )
   1104     + [create_instruction("UNPACK_SEQUENCE", arg=len(stack_values))]
   1105 )
   1106 # restore all the live local vars
   1107 self.add_output_instructions(
   1108     [
   1109         PyCodegen(tx, overridden_sources=overridden_sources).create_store(
   (...)   1113     ]
   1114 )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\output_graph.py:1382, in OutputGraph.compile_and_call_fx_graph(self, tx, rv, root, replaced_outputs)
   1379     self.tracing_context.fake_mode = backend_fake_mode
   1381 with self.restore_global_state():
-> 1382     compiled_fn = self.call_user_compiler(gm)
   1384 from torch.fx._lazy_graph_module import _LazyGraphModule
   1386 if isinstance(compiled_fn, _LazyGraphModule) or (
   1387     isinstance(getattr(compiled_fn, "__self__", None), _LazyGraphModule)
   1388     and compiled_fn.__name__ == "_lazy_forward"  # type: ignore[attr-defined]
   (...)   1392     # this is a _LazyGraphModule. This makes it easier for dynamo to
   1393     # optimize a _LazyGraphModule.

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\output_graph.py:1432, in OutputGraph.call_user_compiler(self, gm)
   1425 def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
   1426     with dynamo_timed(
   1427         "OutputGraph.call_user_compiler",
   1428         phase_name="backend_compile",
   1429         log_pt2_compile_event=True,
   1430         dynamo_compile_column_us="aot_autograd_cumulative_compile_time_us",
   1431     ):
-> 1432         return self._call_user_compiler(gm)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\output_graph.py:1483, in OutputGraph._call_user_compiler(self, gm)
   1481     raise e
   1482 except Exception as e:
-> 1483     raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
   1484         e.__traceback__
   1485     ) from None
   1487 signpost_event(
   1488     "dynamo",
   1489     "OutputGraph.call_user_compiler",
   (...)   1495     },
   1496 )
   1498 return compiled_fn

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\output_graph.py:1462, in OutputGraph._call_user_compiler(self, gm)
   1460 if config.verify_correctness:
   1461     compiler_fn = WrapperBackend(compiler_fn)
-> 1462 compiled_fn = compiler_fn(gm, self.example_inputs())
   1463 _step_logger()(logging.INFO, f"done compiler function {name}")
   1464 assert callable(compiled_fn), "compiler_fn did not return callable"

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\repro\after_dynamo.py:130, in WrapBackendDebug.__call__(self, gm, example_inputs, **kwargs)
    128             raise
    129 else:
--> 130     compiled_gm = compiler_fn(gm, example_inputs)
    132 return compiled_gm

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\__init__.py:2340, in _TorchCompileInductorWrapper.__call__(self, model_, inputs_)
   2337 def __call__(self, model_, inputs_):
   2338     from torch._inductor.compile_fx import compile_fx
-> 2340     return compile_fx(model_, inputs_, config_patches=self.config)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\compile_fx.py:1863, in compile_fx(model_, example_inputs_, inner_compile, config_patches, decompositions)
   1856         return inference_compiler(unlifted_gm, example_inputs_)
   1858 with V.set_fake_mode(fake_mode), torch._guards.tracing(
   1859     tracing_context
   1860 ), compiled_autograd._disable(), functorch_config.patch(
   1861     unlift_effect_tokens=True
   1862 ):
-> 1863     return aot_autograd(
   1864         fw_compiler=fw_compiler,
   1865         bw_compiler=bw_compiler,
   1866         inference_compiler=inference_compiler,
   1867         decompositions=decompositions,
   1868         partition_fn=partition_fn,
   1869         keep_inference_input_mutations=True,
   1870         cudagraphs=cudagraphs,
   1871     )(model_, example_inputs_)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\backends\common.py:83, in AotAutograd.__call__(self, gm, example_inputs, **kwargs)
     80 try:
     81     # NB: NOT cloned!
     82     with enable_aot_logging(), patch_config:
---> 83         cg = aot_module_simplified(gm, example_inputs, **self.kwargs)
     84         counters["aot_autograd"]["ok"] += 1
     85         return disable(cg)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_functorch\aot_autograd.py:1155, in aot_module_simplified(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler, cudagraphs)
   1145     compiled_fn = AOTAutogradCache.load(
   1146         dispatch_and_compile,
   1147         mod,
   (...)   1152         remote,
   1153     )
   1154 else:
-> 1155     compiled_fn = dispatch_and_compile()
   1157 if isinstance(mod, torch._dynamo.utils.GmWrapper):
   1158     # This function is called by the flatten_graph_inputs wrapper, which boxes
   1159     # the inputs so that they can be freed before the end of this scope.
   1160     # For overhead reasons, this is not the default wrapper, see comment:
   1161     # https://github.com/pytorch/pytorch/pull/122535/files#r1560096481
   1162     def boxed_forward(runtime_args: List[Any]):

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_functorch\aot_autograd.py:1131, in aot_module_simplified.<locals>.dispatch_and_compile()
   1129 functional_call = create_functional_call(mod, params_spec, params_len)
   1130 with compiled_autograd._disable():
-> 1131     compiled_fn, _ = create_aot_dispatcher_function(
   1132         functional_call,
   1133         fake_flat_args,
   1134         aot_config,
   1135         fake_mode,
   1136         shape_env,
   1137     )
   1138 return compiled_fn

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_functorch\aot_autograd.py:580, in create_aot_dispatcher_function(flat_fn, fake_flat_args, aot_config, fake_mode, shape_env)
    572 def create_aot_dispatcher_function(
    573     flat_fn,
    574     fake_flat_args: FakifiedFlatArgs,
   (...)    577     shape_env: Optional[ShapeEnv],
    578 ) -> Tuple[Callable, ViewAndMutationMeta]:
    579     with dynamo_timed("create_aot_dispatcher_function", log_pt2_compile_event=True):
--> 580         return _create_aot_dispatcher_function(
    581             flat_fn, fake_flat_args, aot_config, fake_mode, shape_env
    582         )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_functorch\aot_autograd.py:830, in _create_aot_dispatcher_function(flat_fn, fake_flat_args, aot_config, fake_mode, shape_env)
    826         return aot_dispatch_base
    828 compiler_fn = choose_dispatcher(needs_autograd, aot_config)
--> 830 compiled_fn, fw_metadata = compiler_fn(
    831     flat_fn,
    832     _dup_fake_script_obj(fake_flat_args),
    833     aot_config,
    834     fw_metadata=fw_metadata,
    835 )
    836 return compiled_fn, fw_metadata

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_functorch\_aot_autograd\jit_compile_runtime_wrappers.py:203, in aot_dispatch_base(flat_fn, flat_args, aot_config, fw_metadata)
    201         assert isinstance(fw_module, GraphModule)
    202         tensorify_python_scalars(fw_module, fake_mode.shape_env, fake_mode)
--> 203     compiled_fw = compiler(fw_module, updated_flat_args)
    205 if fakified_out_wrapper.needs_post_compile:
    206     fakified_out_wrapper.set_fwd_output_strides(fwd_output_strides)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_functorch\aot_autograd.py:489, in SerializableAOTDispatchCompiler.__call__(self, gm, example_inputs)
    484 def __call__(
    485     self,
    486     gm: torch.fx.GraphModule,
    487     example_inputs: Sequence[InputType],
    488 ) -> OutputCode:
--> 489     return self.compiler_fn(gm, example_inputs)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\compile_fx.py:1741, in compile_fx.<locals>.fw_compiler_base(gm, example_inputs, is_inference)
   1738 else:
   1739     model_outputs_node.meta["user_visible_output_idxs"] = []
-> 1741 return inner_compile(
   1742     gm,
   1743     example_inputs,
   1744     static_input_idxs=get_static_input_idxs(fixed),
   1745     cudagraphs=cudagraphs,
   1746     graph_id=graph_id,
   1747     is_inference=is_inference,
   1748     boxed_forward_device_index=forward_device,
   1749 )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\compile_fx.py:569, in compile_fx_inner(gm, example_inputs, **kwargs)
    562 stack.enter_context(DebugContext())
    564 get_chromium_event_logger().add_event_data(
    565     "inductor_compile",
    566     is_backward=kwargs["is_backward"],
    567 )
--> 569 return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
    570     gm,
    571     example_inputs,
    572     **kwargs,
    573 )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_dynamo\repro\after_aot.py:102, in wrap_compiler_debug.<locals>.debug_wrapper(gm, example_inputs, **kwargs)
     97 assert config.repro_after in ("dynamo", "aot", None)
     99 try:
    100     # Call the compiler_fn - which is either aot_autograd or inductor
    101     # with fake inputs
--> 102     inner_compiled_fn = compiler_fn(gm, example_inputs)
    103 except Exception as e:
    104     # TODO: Failures here are troublesome because no real inputs,
    105     # need a different serialization strategy
    106     if config.repro_after == "aot":

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\compile_fx.py:685, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
    683 TritonBundler.begin_compile()
    684 try:
--> 685     mb_compiled_graph = fx_codegen_and_compile(
    686         gm, example_inputs, inputs_to_check, **graph_kwargs
    687     )
    688     assert mb_compiled_graph is not None
    689     mb_compiled_graph._time_taken_ns = time.time_ns() - start_time

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\compile_fx.py:1129, in fx_codegen_and_compile(gm, example_inputs, inputs_to_check, **graph_kwargs)
   1119 def fx_codegen_and_compile(
   1120     gm: GraphModule,
   1121     example_inputs: Sequence[InputType],
   (...)   1125     **graph_kwargs: Unpack[_CompileFxKwargs],
   1126 ) -> OutputCode:
   1127     scheme: FxCompile = _InProcessFxCompile()
-> 1129     return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\compile_fx.py:1044, in _InProcessFxCompile.codegen_and_compile(self, gm, example_inputs, inputs_to_check, graph_kwargs)
   1036             compiled_fn = AotCodeCompiler.compile(
   1037                 graph,
   1038                 code,
   (...)   1041                 additional_files=additional_files,
   1042             )
   1043     else:
-> 1044         compiled_fn = graph.compile_to_module().call
   1046 num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
   1047 metrics.num_bytes_accessed += num_bytes

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\graph.py:2027, in GraphLowering.compile_to_module(self)
   2020 def compile_to_module(self) -> ModuleType:
   2021     with dynamo_timed(
   2022         "GraphLowering.compile_to_module",
   2023         phase_name="code_gen",
   2024         log_pt2_compile_event=True,
   2025         dynamo_compile_column_us="inductor_code_gen_cumulative_compile_time_us",
   2026     ):
-> 2027         return self._compile_to_module()

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\graph.py:2033, in GraphLowering._compile_to_module(self)
   2029 def _compile_to_module(self) -> ModuleType:
   2030     from .codecache import PyCodeCache
   2032     code, linemap = (
-> 2033         self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
   2034     )
   2035     if config.triton.autotune_at_compile_time:
   2036         tuning_code = (
   2037             '"""\n'
   2038             + "Compile-time auto-tuning block: \n"
   (...)   2041             + '"""\n'
   2042         )

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\graph.py:1968, in GraphLowering.codegen(self)
   1965 V.debug.draw_orig_fx_graph(self.orig_gm, self.scheduler.nodes)
   1967 self.wrapper_code.push_codegened_graph(self)
-> 1968 self.scheduler.codegen()
   1970 log.debug(
   1971     "Finished codegen for all nodes. The list of kernel names available: %s",
   1972     V.graph.all_codegen_kernel_names,
   1973 )
   1975 result = self.wrapper_code.generate(self.is_inference)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\scheduler.py:3477, in Scheduler.codegen(self)
   3475 def codegen(self) -> None:
   3476     with dynamo_timed("Scheduler.codegen"):
-> 3477         return self._codegen()

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\scheduler.py:3554, in Scheduler._codegen(self)
   3552     backend.codegen_combo_kernel(node)
   3553 elif isinstance(node, (FusedSchedulerNode, SchedulerNode)):
-> 3554     self.get_backend(device).codegen_node(node)
   3555 else:
   3556     assert isinstance(node, NopKernelSchedulerNode)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\codegen\cpp.py:4781, in CppScheduling.codegen_node(self, node)
   4779 nodes: List[SchedulerNode] = node.get_nodes()  # type: ignore[assignment]
   4780 nodes = self.try_loop_split(nodes)
-> 4781 cpp_kernel_proxy = CppKernelProxy(kernel_group)
   4782 cpp_kernel_proxy.codegen_nodes(nodes)
   4783 kernel_group.finalize_kernel(cpp_kernel_proxy, nodes)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\codegen\cpp.py:3632, in CppKernelProxy.__init__(self, kernel_group)
   3630 self.loop_nest = None
   3631 self.call_ranges = None
-> 3632 self.picked_vec_isa: cpu_vec_isa.VecISA = cpu_vec_isa.pick_vec_isa()
   3633 self.kernels: List[CppKernel] = []

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpu_vec_isa.py:414, in pick_vec_isa()
    411 if config.is_fbcode() and (platform.machine() in ["x86_64", "AMD64"]):
    412     return VecAVX2()
--> 414 _valid_vec_isa_list: List[VecISA] = valid_vec_isa_list()
    415 if not _valid_vec_isa_list:
    416     return invalid_vec_isa

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpu_vec_isa.py:401, in valid_vec_isa_list()
    397     """
    398     arch value is x86_64 on Linux, and the value is AMD64 on Windows.
    399     """
    400     _cpu_supported_x86_isa = x86_isa_checker()
--> 401     isa_list.extend(
    402         isa
    403         for isa in supported_vec_isa_list
    404         if all(flag in _cpu_supported_x86_isa for flag in str(isa).split()) and isa
    405     )
    407 return isa_list

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpu_vec_isa.py:404, in <genexpr>(.0)
    397     """
    398     arch value is x86_64 on Linux, and the value is AMD64 on Windows.
    399     """
    400     _cpu_supported_x86_isa = x86_isa_checker()
    401     isa_list.extend(
    402         isa
    403         for isa in supported_vec_isa_list
--> 404         if all(flag in _cpu_supported_x86_isa for flag in str(isa).split()) and isa
    405     )
    407 return isa_list

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpu_vec_isa.py:142, in VecISA.__bool__(self)
    141 def __bool__(self) -> bool:
--> 142     return self.__bool__impl(config.cpp.vec_isa_ok)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpu_vec_isa.py:152, in VecISA.__bool__impl(self, vec_isa_ok)
    149 if config.is_fbcode():
    150     return True
--> 152 return self.check_build(VecISA._avx_code)

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpu_vec_isa.py:102, in VecISA.check_build(self, code)
     92 from torch._inductor.codecache import get_lock_dir, LOCK_TIMEOUT, write
     93 from torch._inductor.cpp_builder import (
     94     CppBuilder,
     95     CppTorchOptions,
     96     normalize_path_separator,
     97 )
     99 key, input_path = write(
    100     code,
    101     "cpp",
--> 102     extra=_get_isa_dry_compile_fingerprint(self._arch_flags),
    103 )
    104 from filelock import FileLock
    106 lock_dir = get_lock_dir()

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpu_vec_isa.py:28, in _get_isa_dry_compile_fingerprint(isa_flags)
     19 def _get_isa_dry_compile_fingerprint(isa_flags: str) -> str:
     20     # ISA dry compile will cost about 1 sec time each startup time.
     21     # Please check the issue: https://github.com/pytorch/pytorch/issues/100378
   (...)     24     # and generated them to output binary hash path.
     25     # It would optimize and skip compile existing binary.
     26     from torch._inductor.cpp_builder import get_compiler_version_info, get_cpp_compiler
---> 28     compiler_info = get_compiler_version_info(get_cpp_compiler())
     29     torch_version = torch.__version__
     30     fingerprint = f"{compiler_info}={isa_flags}={torch_version}"

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpp_builder.py:144, in get_cpp_compiler()
    142 if _IS_WINDOWS:
    143     compiler = os.environ.get("CXX", "cl")
--> 144     check_compiler_exist_windows(compiler)
    145 else:
    146     if config.is_fbcode():

File ~\AppData\Local\anaconda3\envs\FUBAR\Lib\site-packages\torch\_inductor\cpp_builder.py:135, in check_compiler_exist_windows(compiler)
    129     output_msg = (
    130         subprocess.check_output([compiler, "/help"], stderr=subprocess.STDOUT)
    131         .strip()
    132         .decode(*SUBPROCESS_DECODE_ARGS)
    133     )
    134 except FileNotFoundError as exc:
--> 135     raise RuntimeError(f"Compiler: {compiler} is not found.") from exc
    136 except subprocess.SubprocessError:
    137     # Expected that some compiler(clang, clang++) is exist, but they not support `/help` args.
    138     pass

BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Compiler: cl is not found.

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True

I’ve been blocked for days.
Any guidance is appreciated!

I think I figured out how to address the 'cl' issue.
It turns out the folder containing cl was not on the PATH variable that the Anaconda environment sees during execution.
To remedy this, from the Anaconda PowerShell prompt, I set the environment's PATH variable to include the folder containing cl:

conda env config vars set PATH="C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Tools\MSVC\14.44.35207\bin\Hostx64\x64"
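
As a per-session alternative (a sketch, reusing the same MSVC folder as in the command above), the folder can also be prepended to PATH from inside Python before the first torch.compile call:

import os

# Prepend the MSVC bin folder so the cl lookup in this process succeeds.
msvc_bin = r"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Tools\MSVC\14.44.35207\bin\Hostx64\x64"
os.environ["PATH"] = msvc_bin + os.pathsep + os.environ["PATH"]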

Setting the variable resolved the cl issue, and the code executed.
However, it soon failed with an "InductorError: CppCompileError: C++ compile error".
This is the stack trace:

---------------------------------------------------------------------------
InductorError                             Traceback (most recent call last)
Cell In[1], line 10
      8     return a + b
      9 opt_foo1 = torch.compile(foo)
---> 10 print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10)))

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_dynamo\eval_frame.py:663, in _TorchDynamoContext.__call__.<locals>._fn(*args, **kwargs)
    659     raise e.with_traceback(None) from None
    660 except ShortenTraceback as e:
    661     # Failures in the backend likely don't have useful
    662     # data in the TorchDynamo frames, so we strip them out.
--> 663     raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    664 finally:
    665     # Restore the dynamic layer stack depth if necessary.
    666     set_eval_frame(None)

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\compile_fx.py:760, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
    758     raise
    759 except Exception as e:
--> 760     raise InductorError(e, currentframe()).with_traceback(
    761         e.__traceback__
    762     ) from None
    763 finally:
    764     TritonBundler.end_compile()

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\compile_fx.py:745, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
    743 TritonBundler.begin_compile()
    744 try:
--> 745     mb_compiled_graph = fx_codegen_and_compile(
    746         gm, example_inputs, inputs_to_check, **graph_kwargs
    747     )
    748     assert mb_compiled_graph is not None
    749     mb_compiled_graph._time_taken_ns = time.time_ns() - start_time

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\compile_fx.py:1295, in fx_codegen_and_compile(gm, example_inputs, inputs_to_check, **graph_kwargs)
   1291     from .compile_fx_subproc import _SubprocessFxCompile
   1293     scheme = _SubprocessFxCompile()
-> 1295 return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\compile_fx.py:1197, in _InProcessFxCompile.codegen_and_compile(self, gm, example_inputs, inputs_to_check, graph_kwargs)
   1184             compiled_fn = AotCodeCompiler.compile(
   1185                 graph,
   1186                 wrapper_code.value,
   (...)   1194                 ],
   1195             )
   1196     else:
-> 1197         compiled_fn = graph.compile_to_module().call
   1199 num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
   1200 metrics.num_bytes_accessed += num_bytes

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\graph.py:2083, in GraphLowering.compile_to_module(self)
   2076 def compile_to_module(self) -> ModuleType:
   2077     with dynamo_timed(
   2078         "GraphLowering.compile_to_module",
   2079         phase_name="code_gen",
   2080         log_pt2_compile_event=True,
   2081         dynamo_compile_column_us="inductor_code_gen_cumulative_compile_time_us",
   2082     ):
-> 2083         return self._compile_to_module()

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\graph.py:2130, in GraphLowering._compile_to_module(self)
   2124     trace_structured(
   2125         "inductor_output_code",
   2126         lambda: {"filename": path},
   2127         payload_fn=lambda: wrapper_code.value,
   2128     )
   2129 with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True):
-> 2130     mod = PyCodeCache.load_by_key_path(
   2131         key,
   2132         path,
   2133         linemap=linemap,  # type: ignore[arg-type]
   2134         attrs={**self.constants, **self.torchbind_constants},
   2135     )
   2136 self.cache_key = key
   2137 self.cache_path = path

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\codecache.py:2747, in PyCodeCache.load_by_key_path(cls, key, path, linemap, attrs)
   2744 if linemap is None:
   2745     linemap = []
-> 2747 mod = _reload_python_module(key, path)
   2749 # unzip into separate lines/nodes lists
   2750 cls.linemaps[path] = list(zip(*linemap))

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\runtime\compile_tasks.py:36, in _reload_python_module(key, path)
     34 mod.__file__ = path
     35 mod.key = key  # type: ignore[attr-defined]
---> 36 exec(code, mod.__dict__, mod.__dict__)
     37 sys.modules[mod.__name__] = mod
     38 return mod

File ~\AppData\Local\Temp\torchinductor_croda\wo\cwoory2aqk53pzwvty2orkc435qpc5tzio4sq6nvxm6vyfjcbl3w.py:31
     27 async_compile = AsyncCompile()
     28 empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
---> 31 cpp_fused_add_cos_sin_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'float*'], '''
     32 #include "C:/Users/croda/AppData/Local/Temp/torchinductor_croda/pi/cpicxudqmdsjh5cm4klbtbrvy2cxwr7whxl3md2zzdjdf3orvfdf.h"
     33 extern "C" __declspec(dllexport) void kernel(const float* in_ptr0,
     34                        const float* in_ptr1,
     35                        float* out_ptr0)
     36 {
     37     {
     38         for(int64_t x0=static_cast<int64_t>(0LL); x0<static_cast<int64_t>(100LL); x0+=static_cast<int64_t>(1LL))
     39         {
     40             {
     41                 {
     42                     auto tmp0 = in_ptr0[static_cast<int64_t>(x0)];
     43                     auto tmp2 = in_ptr1[static_cast<int64_t>(x0)];
     44                     auto tmp1 = std::sin(tmp0);
     45                     auto tmp3 = std::cos(tmp2);
     46                     auto tmp4 = decltype(tmp1)(tmp1 + tmp3);
     47                     out_ptr0[static_cast<int64_t>(x0)] = tmp4;
     48                 }
     49             }
     50         }
     51     }
     52 }
     53 ''')
     56 async_compile.wait(globals())
     57 del async_compile

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\async_compile.py:370, in AsyncCompile.cpp_pybinding(self, argtypes, source_code)
    368 kernel_code_log.info("CPP+Bindings Kernel:\n%s", source_code)
    369 if get_compile_threads() <= 1:
--> 370     return CppPythonBindingsCodeCache.load_pybinding(argtypes, source_code)
    371 else:
    372     get_result = CppPythonBindingsCodeCache.load_pybinding_async(
    373         argtypes, source_code, submit_fn=self.submit
    374     )

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\codecache.py:2250, in CppPythonBindingsCodeCache.load_pybinding(cls, *args, **kwargs)
   2248 @classmethod
   2249 def load_pybinding(cls, *args: Any, **kwargs: Any) -> Any:
-> 2250     return cls.load_pybinding_async(*args, **kwargs)()

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\codecache.py:2242, in CppPythonBindingsCodeCache.load_pybinding_async.<locals>.future()
   2240 nonlocal result
   2241 if result is None:
-> 2242     result = get_result()
   2243     assert isinstance(result, ModuleType)
   2244 return getattr(result, cls.entry_function)

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\codecache.py:2051, in CppCodeCache.load_async.<locals>.load_fn()
   2049 if future is not None:
   2050     future.result()
-> 2051 result = worker_fn()
   2052 assert result is None
   2053 lib = cls._load_library(binary_path, key)

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\codecache.py:2079, in _worker_compile_cpp(lock_path, cpp_builder)
   2077 with FileLock(lock_path, timeout=LOCK_TIMEOUT):
   2078     if not os.path.exists(cpp_builder.get_target_file_path()):
-> 2079         cpp_builder.build()

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\cpp_builder.py:1601, in CppBuilder.build(self)
   1598 _create_if_dir_not_exist(_build_tmp_dir)
   1600 build_cmd = self.get_command_line()
-> 1601 run_compile_cmd(build_cmd, cwd=_build_tmp_dir)
   1602 _remove_dir(_build_tmp_dir)

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\cpp_builder.py:355, in run_compile_cmd(cmd_line, cwd)
    353 def run_compile_cmd(cmd_line: str, cwd: str) -> None:
    354     with dynamo_timed("compile_file"):
--> 355         _run_compile_cmd(cmd_line, cwd)

File ~\AppData\Local\anaconda3\envs\SANFU\Lib\site-packages\torch\_inductor\cpp_builder.py:350, in _run_compile_cmd(cmd_line, cwd)
    340     instruction = (
    341         "\n\nOpenMP support not found. Please try one of the following solutions:\n"
    342         "(1) Set the `CXX` environment variable to a compiler other than Apple clang++/g++ "
   (...)    347         " with `include/omp.h` under it."
    348     )
    349     output += instruction
--> 350 raise exc.CppCompileError(cmd, output) from e

InductorError: CppCompileError: C++ compile error

Command:
cl /I C:/Users/croda/AppData/Local/anaconda3/envs/SANFU/Include /I C:/Users/croda/AppData/Local/anaconda3/envs/SANFU/Lib/site-packages/torch/include /I C:/Users/croda/AppData/Local/anaconda3/envs/SANFU/Lib/site-packages/torch/include/torch/csrc/api/include /D TORCH_INDUCTOR_CPP_WRAPPER /D STANDALONE_TORCH_HEADER /D C10_USING_CUSTOM_GENERATED_MACROS /DLL /MD /O2 /std:c++20 /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 /EHsc /openmp /openmp:experimental C:/Users/croda/AppData/Local/Temp/torchinductor_croda/jm/cjmqzws47b73f3xuzb3do7ir43gv5qqby4rpjbn3qztjfa7gpz2v.cpp /LD /FeC:/Users/croda/AppData/Local/Temp/torchinductor_croda/jm/cjmqzws47b73f3xuzb3do7ir43gv5qqby4rpjbn3qztjfa7gpz2v.pyd /link /LIBPATH:C:/Users/croda/AppData/Local/anaconda3/envs/SANFU/libs /LIBPATH:C:/Users/croda/AppData/Local/anaconda3/envs/SANFU/Lib/site-packages/torch/lib torch.lib torch_cpu.lib torch_python.lib sleef.lib

Output:
Microsoft (R) C/C++ Optimizing Compiler Version 19.44.35207.1 for x64
Copyright (C) Microsoft Corporation.  All rights reserved.

cl : Command line warning D9025 : overriding '/openmp' with '/openmp:experimental'
cjmqzws47b73f3xuzb3do7ir43gv5qqby4rpjbn3qztjfa7gpz2v.cpp
C:/Users/croda/AppData/Local/Temp/torchinductor_croda/pi/cpicxudqmdsjh5cm4klbtbrvy2cxwr7whxl3md2zzdjdf3orvfdf.h(3): fatal error C1083: Cannot open include file: 'algorithm': No such file or directory


Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
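
My suspicion is that setting PATH alone is not enough: the fatal C1083 on 'algorithm' suggests cl cannot find the C++ standard headers, which are located through the INCLUDE and LIB environment variables that vcvars64.bat (or the "x64 Native Tools Command Prompt for VS 2022") normally sets. A sketch of importing that environment into the current process, assuming the same Enterprise install used above:

import os
import subprocess

# Run vcvars64.bat in a cmd shell, dump the resulting environment with
# `set`, and merge it into this process (PATH, INCLUDE, LIB, etc.).
vcvars = r"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
env_dump = subprocess.check_output(f'"{vcvars}" && set', shell=True, text=True)
for line in env_dump.splitlines():
    name, sep, value = line.partition("=")
    if sep:
        os.environ[name] = value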

As always, any assistance is appreciated.

I'm not sure what the status of inductor on Windows is. Would you be able to open an issue on GitHub with the details, please?