Got it! I looked through the terminal output and found where the error is first reported. It happens at 69% progress, so I'm not sure why the build keeps going until it reaches 84% before displaying the error I showed above.
In any case, this is what I found:
[ 69%] Building CXX object caffe2/CMakeFiles/torch_cpu.dir/__/aten/src/ATen/core/adaption.cpp.o
prims_ll.h(198): error: identifier "__funnelshift_r" is undefined
detected during:
instantiation of "uint64_t Primitives<T, RedOp, Fan, Direct, ProtoLL>::DataLoader::loadFinish() [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<2>, Direct=1]"
(251): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL>::LLGenericOp<RECV,SEND,SrcBuf,DstBuf>(intptr_t, intptr_t, int, __nv_bool) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<2>, Direct=1, RECV=1, SEND=1, SrcBuf=0, DstBuf=1]"
(384): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL>::recvReduceCopySend(intptr_t, intptr_t, int, __nv_bool) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<2>, Direct=1]"
primitives.h(135): here
instantiation of "void PrimitivesWithoutDirect<RealPrimitives>::directRecvReduceCopySend(intptr_t, intptr_t, intptr_t, int, __nv_bool) [with RealPrimitives=Primitives<int8_t, FuncSum<int8_t>, FanSymmetric<2>, 1, ProtoLL>]"
all_reduce.h(206): here
instantiation of "void <unnamed>::runTreeSplit<T,RedOp,Proto>(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>, Proto=ProtoLL]"
all_reduce.h(372): here
instantiation of "void RunWorkElement<(ncclFunc_t)4, T, RedOp, 0, 0>::run(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>]"
common.h(77): here
instantiation of "void RunWork<Fn, T, RedOp, Algo, Proto>::run(ncclWork *) [with Fn=(ncclFunc_t)4, T=int8_t, RedOp=FuncSum<int8_t>, Algo=0, Proto=0]"
all_reduce.cu(11): here
prims_ll.h(198): error: identifier "__funnelshift_r" is undefined
detected during:
instantiation of "uint64_t Primitives<T, RedOp, Fan, Direct, ProtoLL>::DataLoader::loadFinish() [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanAsymmetric<2, 1>, Direct=1]"
(251): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL>::LLGenericOp<RECV,SEND,SrcBuf,DstBuf>(intptr_t, intptr_t, int, __nv_bool) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanAsymmetric<2, 1>, Direct=1, RECV=0, SEND=1, SrcBuf=0, DstBuf=-1]"
(363): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL>::send(intptr_t, int) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanAsymmetric<2, 1>, Direct=1]"
all_reduce.h(224): here
instantiation of "void <unnamed>::runTreeSplit<T,RedOp,Proto>(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>, Proto=ProtoLL]"
all_reduce.h(372): here
instantiation of "void RunWorkElement<(ncclFunc_t)4, T, RedOp, 0, 0>::run(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>]"
common.h(77): here
instantiation of "void RunWork<Fn, T, RedOp, Algo, Proto>::run(ncclWork *) [with Fn=(ncclFunc_t)4, T=int8_t, RedOp=FuncSum<int8_t>, Algo=0, Proto=0]"
all_reduce.cu(11): here
prims_ll.h(198): error: identifier "__funnelshift_r" is undefined
detected during:
instantiation of "uint64_t Primitives<T, RedOp, Fan, Direct, ProtoLL>::DataLoader::loadFinish() [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanAsymmetric<1, 2>, Direct=1]"
(251): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL>::LLGenericOp<RECV,SEND,SrcBuf,DstBuf>(intptr_t, intptr_t, int, __nv_bool) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanAsymmetric<1, 2>, Direct=1, RECV=1, SEND=0, SrcBuf=-1, DstBuf=1]"
(369): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL>::recv(intptr_t, int, __nv_bool) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanAsymmetric<1, 2>, Direct=1]"
primitives.h(125): here
instantiation of "void PrimitivesWithoutDirect<RealPrimitives>::directRecv(intptr_t, int) [with RealPrimitives=Primitives<int8_t, FuncSum<int8_t>, FanAsymmetric<1, 2>, 1, ProtoLL>]"
all_reduce.h(243): here
instantiation of "void <unnamed>::runTreeSplit<T,RedOp,Proto>(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>, Proto=ProtoLL]"
all_reduce.h(372): here
instantiation of "void RunWorkElement<(ncclFunc_t)4, T, RedOp, 0, 0>::run(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>]"
common.h(77): here
instantiation of "void RunWork<Fn, T, RedOp, Algo, Proto>::run(ncclWork *) [with Fn=(ncclFunc_t)4, T=int8_t, RedOp=FuncSum<int8_t>, Algo=0, Proto=0]"
all_reduce.cu(11): here
op128.h(51): error: identifier "__funnelshift_r" is undefined
detected during:
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL128>::loadRegsBegin(uint64_t (&)[WordPerThread], const T *, int) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<2>, Direct=1, WordPerThread=8]"
prims_ll128.h(305): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL128>::GenericOp<RECV,SEND,SrcBuf,DstBuf>(intptr_t, intptr_t, int, __nv_bool) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<2>, Direct=1, RECV=1, SEND=1, SrcBuf=0, DstBuf=1]"
prims_ll128.h(422): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL128>::recvReduceCopySend(intptr_t, intptr_t, int, __nv_bool) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<2>, Direct=1]"
primitives.h(135): here
instantiation of "void PrimitivesWithoutDirect<RealPrimitives>::directRecvReduceCopySend(intptr_t, intptr_t, intptr_t, int, __nv_bool) [with RealPrimitives=Primitives<int8_t, FuncSum<int8_t>, FanSymmetric<2>, 1, ProtoLL128>]"
all_reduce.h(206): here
instantiation of "void <unnamed>::runTreeSplit<T,RedOp,Proto>(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>, Proto=ProtoLL128]"
all_reduce.h(386): here
instantiation of "void RunWorkElement<(ncclFunc_t)4, T, RedOp, 0, 1>::run(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>]"
common.h(77): here
instantiation of "void RunWork<Fn, T, RedOp, Algo, Proto>::run(ncclWork *) [with Fn=(ncclFunc_t)4, T=int8_t, RedOp=FuncSum<int8_t>, Algo=0, Proto=1]"
all_reduce.cu(11): here
prims_ll.h(198): error: identifier "__funnelshift_r" is undefined
detected during:
instantiation of "uint64_t Primitives<T, RedOp, Fan, Direct, ProtoLL>::DataLoader::loadFinish() [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<1>, Direct=1]"
(251): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL>::LLGenericOp<RECV,SEND,SrcBuf,DstBuf>(intptr_t, intptr_t, int, __nv_bool) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<1>, Direct=1, RECV=0, SEND=1, SrcBuf=0, DstBuf=-1]"
(363): here
instantiation of "void Primitives<T, RedOp, Fan, Direct, ProtoLL>::send(intptr_t, int) [with T=int8_t, RedOp=FuncSum<int8_t>, Fan=FanSymmetric<1>, Direct=1]"
all_reduce.h(64): here
instantiation of "void <unnamed>::runRing<T,RedOp,Proto>(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>, Proto=ProtoLL]"
all_reduce.h(365): here
instantiation of "void RunWorkElement<(ncclFunc_t)4, T, RedOp, 1, 0>::run(ncclWorkElem *) [with T=int8_t, RedOp=FuncSum<int8_t>]"
common.h(77): here
instantiation of "void RunWork<Fn, T, RedOp, Algo, Proto>::run(ncclWork *) [with Fn=(ncclFunc_t)4, T=int8_t, RedOp=FuncSum<int8_t>, Algo=1, Proto=0]"
all_reduce.cu(11): here
5 errors detected in the compilation of "/tmp/tmpxft_000064a1_00000000-6_all_reduce.cpp1.ii".
/home/simulations/pytorch/build/nccl/obj/collectives/device/Makefile.rules:227: recipe for target '/home/simulations/pytorch/build/nccl/obj/collectives/device/all_reduce_sum_i8.o' failed
make[5]: *** [/home/simulations/pytorch/build/nccl/obj/collectives/device/all_reduce_sum_i8.o] Error 1
Makefile:50: recipe for target '/home/simulations/pytorch/build/nccl/obj/collectives/device/colldevice.a' failed
make[4]: *** [/home/simulations/pytorch/build/nccl/obj/collectives/device/colldevice.a] Error 2
Makefile:25: recipe for target 'src.build' failed
make[3]: *** [src.build] Error 2
CMakeFiles/nccl_external.dir/build.make:129: recipe for target 'nccl_external-prefix/src/nccl_external-stamp/nccl_external-build' failed
make[2]: *** [nccl_external-prefix/src/nccl_external-stamp/nccl_external-build] Error 2
CMakeFiles/Makefile2:1986: recipe for target 'CMakeFiles/nccl_external.dir/all' failed
make[1]: *** [CMakeFiles/nccl_external.dir/all] Error 2
Thanks again!!