I’m also learning from this tutorial. One question: does the simplest case run on the GPU or the CPU? I tried to use nvprof to check, but no kernels were detected, so I assume it spawns 2 processes on the CPU, is that right? How can I make it run as 2 GPU processes? I tried creating the tensor on the CUDA device like this:
tensor = torch.zeros(1).cuda(rank)
but nvprof still didn’t find any kernels launched. Am I missing something?
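For context, here is roughly what I’m running, a minimal sketch based on the tutorial’s scaffolding (the file name, the nccl backend, size = 2, and the all_reduce are my own guesses, not from the tutorial):

"""two_gpu_sketch.py: each rank allocates its tensor on its own GPU."""
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank, size):
    # The tensor lives on this rank's GPU, so the all_reduce should launch CUDA kernels.
    tensor = torch.zeros(1).cuda(rank)
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    print(f"rank {rank} has {tensor}")


def init_process(rank, size, fn, backend='nccl'):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)


if __name__ == "__main__":
    size = 2  # one process per GPU
    mp.set_start_method("spawn")
    processes = []
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()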
@BruceDai003 how are you running nvprof?
"""run.py:"""
#!/usr/bin/env python
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
def run(rank, size):
""" Distributed function to be implemented later. """
x = torch.ones(2, 2).cuda(rank)
y = x + x
print(y)
def init_process(rank, size, fn, backend='gloo'):
""" Initialize the distributed environment. """
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
dist.init_process_group(backend, rank=rank, world_size=size)
fn(rank, size)
if __name__ == "__main__":
size = 1
processes = []
mp.set_start_method("spawn")
for rank in range(size):
p = mp.Process(target=init_process, args=(rank, size, run))
p.start()
processes.append(p)
for p in processes:
p.join()
I run the program with nvprof --profile-child-processes python test.py. The --profile-child-processes flag matters here: mp.Process spawns the worker as a child process, so profiling only the parent shows no kernels.
==81172== NVPROF is profiling process 81172, command: /fsx/users/gcramer/conda/envs/pytorch1/bin/python -c from multiprocessing.spawn import spawn_main; spawn_main(tracker_fd=5, pipe_handle=7) --multiprocessing-fork
tensor([[2., 2.],
[2., 2.]], device='cuda:0')
==81172== Profiling application: /fsx/users/gcramer/conda/envs/pytorch1/bin/python -c from multiprocessing.spawn import spawn_main; spawn_main(tracker_fd=5, pipe_handle=7) --multiprocessing-fork
==81172== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 27.17% 22.688us 13 1.7450us 1.6000us 2.5280us [CUDA memcpy DtoH]
7.74% 6.4640us 1 6.4640us 6.4640us 6.4640us void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<double, at::native::func_wrapper_t<double, at::native::MinNanFunctor<double>>, unsigned int, double, int=4>>(double)
7.36% 6.1440us 1 6.1440us 6.1440us 6.1440us void at::native::reduce_kernel<int=512, int=1, at::native::ReduceOp<double, at::native::func_wrapper_t<double, at::native::MaxNanFunctor<double>>, unsigned int, double, int=4>>(double)
3.91% 3.2640us 2 1.6320us 1.4080us 1.8560us void at::native::vectorized_elementwise_kernel<int=4, at::native::AbsFunctor<float>, at::detail::Array<char*, int=2>>(int, float, at::native::AbsFunctor<float>)
3.79% 3.1680us 1 3.1680us 3.1680us 3.1680us void at::cuda::detail::cub::DeviceSelectSweepKernel<at::cuda::detail::cub::DispatchSelectIf<at::cuda::detail::cub::CountingInputIterator<long, long>, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__42_tmpxft_00009bab_00000000_7_Nonzero_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, long*, int*, at::cuda::detail::cub::NullType, at::cuda::detail::cub::NullType, int, bool=0>::PtxSelectIfPolicyT, at::cuda::detail::cub::CountingInputIterator<long, long>, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__42_tmpxft_00009bab_00000000_7_Nonzero_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, long*, int*, at::cuda::detail::cub::ScanTileState<int, bool=1>, at::cuda::detail::cub::NullType, at::cuda::detail::cub::NullType, int, bool=0>(long, at::cuda::detail::cub::CountingInputIterator<long, long>, bool, bool, at::native::_GLOBAL__N__42_tmpxft_00009bab_00000000_7_Nonzero_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__42_tmpxft_00009bab_00000000_7_Nonzero_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, int)
3.72% 3.1040us 1 3.1040us 3.1040us 3.1040us _ZN2at6native24index_elementwise_kernelILi128ELi4EZNS0_16gpu_index_kernelIZNS0_17index_kernel_implINS0_10OpaqueTypeILi4EEEEEvRNS_14TensorIteratorEN3c108ArrayRefIlEESA_EUlPcSB_lE_EEvS7_SA_SA_RKT_EUliE_EEviT1_
3.64% 3.0400us 2 1.5200us 1.3120us 1.7280us void at::native::vectorized_elementwise_kernel<int=4, at::native::BUnaryFunctor<at::native::CompareGTFunctor<double>>, at::detail::Array<char*, int=2>>(int, double, at::native::CompareGTFunctor<double>)
3.56% 2.9760us 1 2.9760us 2.9760us 2.9760us void at::native::vectorized_elementwise_kernel<int=4, at::native::DivFunctor<double>, at::detail::Array<char*, int=3>>(int, double, at::native::DivFunctor<double>)
3.52% 2.9440us 2 1.4720us 1.2480us 1.6960us void at::native::vectorized_elementwise_kernel<int=4, at::native::BUnaryFunctor<at::native::CompareNEFunctor<float>>, at::detail::Array<char*, int=2>>(int, float, at::native::CompareNEFunctor<float>)
3.49% 2.9120us 1 2.9120us 2.9120us 2.9120us void at::cuda::detail::cub::DeviceReduceSingleTileKernel<at::cuda::detail::cub::DeviceReducePolicy<bool, int, int, at::cuda::detail::cub::Sum>::Policy600, at::cuda::detail::cub::TransformInputIterator<bool, at::native::_GLOBAL__N__42_tmpxft_00009bab_00000000_7_Nonzero_cpp1_ii_cba1aaa0::NonZeroOp<bool>, bool*, long>, int*, int, at::cuda::detail::cub::Sum, int>(int, int, at::cuda::detail::cub::Sum, at::cuda::detail::cub::DeviceReducePolicy<bool, int, int, at::cuda::detail::cub::Sum>::Policy600, bool)
3.41% 2.8480us 1 2.8480us 2.8480us 2.8480us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_21copy_device_to_deviceERNS_14TensorIteratorEbENKUlvE1_clEvENKUlvE4_clEvEUldE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory12LoadWithCastILi1EEENSD_13StoreWithCastEEEviT_T0_T1_T2_T3_T4_
3.30% 2.7520us 2 1.3760us 1.2480us 1.5040us _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_16ceil_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEE23TrivialOffsetCalculatorILi1EjESC_NS0_6memory15LoadWithoutCastENSD_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
3.29% 2.7510us 2 1.3750us 1.2480us 1.5030us void at::native::unrolled_elementwise_kernel<at::native::CompareNEFunctor<float>, at::detail::Array<char*, int=3>, TrivialOffsetCalculator<int=2, unsigned int>, TrivialOffsetCalculator<int=1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, float, at::native::CompareNEFunctor<float>, char*, int=3, at::detail::Array<char*, int=3>, int=2)
2.68% 2.2400us 1 2.2400us 2.2400us 2.2400us void at::native::vectorized_elementwise_kernel<int=4, at::native::BitwiseAndFunctor<bool>, at::detail::Array<char*, int=3>>(int, bool, at::native::BitwiseAndFunctor<bool>)
2.64% 2.2080us 1 2.2080us 2.2080us 2.2080us [CUDA memcpy HtoD]
2.41% 2.0160us 1 2.0160us 2.0160us 2.0160us void at::native::vectorized_elementwise_kernel<int=4, at::native::MulFunctor<bool>, at::detail::Array<char*, int=3>>(int, bool, at::native::MulFunctor<bool>)
2.30% 1.9200us 1 1.9200us 1.9200us 1.9200us void at::native::vectorized_elementwise_kernel<int=4, at::native::AddFunctor<float>, at::detail::Array<char*, int=3>>(int, float, at::native::AddFunctor<float>)
2.15% 1.7920us 1 1.7920us 1.7920us 1.7920us void at::native::vectorized_elementwise_kernel<int=4, at::native::CompareEqFunctor<float>, at::detail::Array<char*, int=3>>(int, float, at::native::CompareEqFunctor<float>)
2.11% 1.7600us 1 1.7600us 1.7600us 1.7600us void at::native::vectorized_elementwise_kernel<int=2, at::native::CompareNEFunctor<float>, at::detail::Array<char*, int=3>>(int, float, at::native::CompareNEFunctor<float>)
2.11% 1.7600us 1 1.7600us 1.7600us 1.7600us void at::native::vectorized_elementwise_kernel<int=4, at::native::CompareNEFunctor<float>, at::detail::Array<char*, int=3>>(int, float, at::native::CompareNEFunctor<float>)
2.07% 1.7280us 1 1.7280us 1.7280us 1.7280us _ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_16ceil_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
2.03% 1.6960us 1 1.6960us 1.6960us 1.6960us _ZN2at6native29vectorized_elementwise_kernelILi2EZZZNS0_16ceil_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE_clEvENKUlvE2_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_
1.61% 1.3440us 1 1.3440us 1.3440us 1.3440us void at::cuda::detail::cub::DeviceCompactInitKernel<at::cuda::detail::cub::ScanTileState<int, bool=1>, int*>(int, int, bool=1)
API calls: 99.71% 4.88775s 1 4.88775s 4.88775s 4.88775s cudaStreamIsCapturing
0.08% 3.8803ms 26 149.24us 12.256us 3.3780ms cudaLaunchKernel
0.08% 3.8186ms 8 477.32us 471.60us 506.55us cudaGetDeviceProperties
0.06% 2.9157ms 4 728.93us 712.43us 769.01us cuDeviceTotalMem
0.05% 2.2443ms 404 5.5550us 753ns 229.15us cuDeviceGetAttribute
0.01% 370.98us 300 1.2360us 952ns 10.083us cudaGetDevice
0.01% 344.41us 14 24.600us 17.998us 32.813us cudaMemcpyAsync
0.01% 331.36us 1 331.36us 331.36us 331.36us cudaMalloc
0.00% 199.46us 4 49.864us 45.765us 61.253us cuDeviceGetName
0.00% 86.061us 14 6.1470us 4.6040us 9.2300us cudaStreamSynchronize
0.00% 53.606us 59 908ns 737ns 1.3350us cudaGetLastError
0.00% 15.076us 1 15.076us 15.076us 15.076us cudaFuncGetAttributes
0.00% 12.104us 4 3.0260us 2.2260us 4.0030us cuDeviceGetPCIBusId
0.00% 10.313us 12 859ns 754ns 1.5130us cuDevicePrimaryCtxGetState
0.00% 9.4360us 5 1.8870us 1.3530us 3.1390us cudaSetDevice
0.00% 7.8430us 8 980ns 791ns 1.3650us cuDeviceGet
0.00% 5.4070us 3 1.8020us 1.1060us 2.9200us cudaDeviceGetAttribute
0.00% 5.0910us 6 848ns 766ns 1.0290us cudaPeekAtLastError
0.00% 4.2730us 4 1.0680us 747ns 1.5770us cudaGetDeviceCount
0.00% 3.7520us 1 3.7520us 3.7520us 3.7520us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 3.7080us 4 927ns 839ns 1.0140us cuDeviceGetUuid
0.00% 3.6310us 3 1.2100us 784ns 1.6020us cuDeviceGetCount