Here is my simple program:
# test.py
import torch
from torch.cuda import nvtx

with nvtx.range("init"):
    stream1 = torch.cuda.Stream()
    stream2 = torch.cuda.Stream()
    a = torch.zeros(233, device='cuda')

with torch.cuda.stream(stream1):
    with nvtx.range("first add"):
        a += 1

with torch.cuda.stream(stream2):
    with nvtx.range("second add"):
        a += 1

with nvtx.range("print"):
    torch.cuda.synchronize()
    average = a.mean().item()
    print(f'Average of the elements in tensor a: {average}')
I’m profiling it with nsys profile -o test python test.py, and during the "init" stage I see hundreds of cudaStreamCreateWithPriority and cudaStreamIsCapturing calls in Nsight Systems. But I only created two CUDA streams. What happened?
You can run the script under cuda-gdb, set a breakpoint at cudaStreamIsCapturing (break cudaStreamIsCapturing, then run and bt), and check the stack traces. They show, for example, that the CUDAGenerator calls this function during initialization, e.g. to set its seed:
#0 0x00007fffec4775e4 in cudaStreamIsCapturing () from /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.12
#1 0x00007fff8894508f in at::cuda::assertNotCapturing(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so
#2 0x00007fff8894228a in at::CUDAGeneratorImpl::set_current_seed(unsigned long) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so
#3 0x00007fff889425d2 in at::CUDAGeneratorImpl::seed() () from /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so
#4 0x00007fff889441fa in at::cuda::detail::getDefaultCUDAGenerator(signed char) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so
#5 0x00007fffeb882c0e in THCPModule_initExtension(_object*, _object*) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so
#6 0x00005555556aba3e in ?? ()
#7 0x000055555569ccfa in _PyEval_EvalFrameDefault ()
#8 0x00005555556ae9fc in _PyFunction_Vectorcall ()
#9 0x0000555555687469 in ?? ()
#10 0x000055555578aa48 in PyObject_CallMethod ()
#11 0x00007fffeb80af2f in torch::utils::device_lazy_init(c10::DeviceType) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so
#12 0x00007fffeb20ecb7 in torch::autograd::THPVariable_zeros(_object*, _object*, _object*) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so
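Note the device_lazy_init and THPVariable_zeros frames: importing torch does not touch CUDA at all, so the first CUDA operation (the torch.zeros call in the "init" range) has to set up the whole context, including seeding the default generator. A minimal sketch of that lazy initialization, using the public torch.cuda.is_initialized() check:

import torch

print(torch.cuda.is_initialized())    # False: importing torch does not initialize CUDA

# The first CUDA op goes through device_lazy_init (frame #11 above), which sets up
# the context and seeds the default generator, producing some of the
# cudaStreamIsCapturing calls seen in the profile.
a = torch.zeros(233, device='cuda')

print(torch.cuda.is_initialized())    # True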
Later during the run, the caching allocator also reaches this check through cudaMallocMaybeCapturing:
#0 0x00007fffec4775e4 in cudaStreamIsCapturing () from /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.12
#1 0x00007ffff6596aa6 in c10::cuda::CUDACachingAllocator::Native::(anonymous namespace)::cudaMallocMaybeCapturing(void**, unsigned long) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so
#2 0x00007ffff65a0331 in c10::cuda::CUDACachingAllocator::Native::DeviceCachingAllocator::alloc_block(c10::cuda::CUDACachingAllocator::Native::(anonymous namespace)::AllocParams&, bool, std::shared_ptr<c10::GatheredContext> const&, std::unique_lock<std::recursive_mutex>&) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so
#3 0x00007ffff65a4d7a in c10::cuda::CUDACachingAllocator::Native::DeviceCachingAllocator::malloc(signed char, unsigned long, CUstream_st*) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so
#4 0x00007ffff65a65d5 in c10::cuda::CUDACachingAllocator::Native::NativeCachingAllocator::malloc(void**, signed char, unsigned long, CUstream_st*) () from /usr/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so
#5 0x00007ffff65a6a5b in c10::cuda::CUDACachingAllocator::Native::NativeCachingAllocator::allocate(unsigned long) const () from /usr/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so
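Only allocation requests that miss the allocator's cache go down this path; blocks reused from the cache never call back into the CUDA runtime. A rough way to observe this (a sketch that assumes the segment.* counters reported by torch.cuda.memory_stats()):

import torch

def segments_allocated():
    # cumulative number of segments obtained via cudaMalloc so far
    return torch.cuda.memory_stats().get("segment.all.allocated", 0)

a = torch.empty(1 << 20, device='cuda')   # cache miss: goes through cudaMallocMaybeCapturing
print(segments_allocated())

del a
b = torch.empty(1 << 20, device='cuda')   # same size: served from the cache, no new cudaMalloc
print(segments_allocated())               # expected to be unchanged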
These checks are there so that PyTorch can fail with a clear error when one of these disallowed operations is issued during a CUDA Graphs capture; outside of a capture they are cheap queries that simply show up as extra cudaStreamIsCapturing calls in the profile.
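For example, re-seeding the default CUDA generator (the very call in the first backtrace) is rejected while a graph is being captured. A minimal sketch of what that looks like, assuming the same set_current_seed check shown above:

import torch

g = torch.cuda.CUDAGraph()
x = torch.zeros(233, device='cuda')

try:
    with torch.cuda.graph(g):
        x += 1                      # fine: captured into the graph
        torch.cuda.manual_seed(42)  # set_current_seed hits assertNotCapturing and raises
except RuntimeError as e:
    print(f"rejected during capture: {e}")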