Thank you for your reply! The following is the output of the first test run under cuda-gdb (a rough sketch of what the test does is included after the log).
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7fff60ac4700 (LWP 1001)]
[New Thread 0x7fff602c3700 (LWP 1002)]
[New Thread 0x7fff5bac2700 (LWP 1003)]
[New Thread 0x7fff55375700 (LWP 1004)]
[Thread 0x7fff5bac2700 (LWP 1003) exited]
[Thread 0x7fff602c3700 (LWP 1002) exited]
[Thread 0x7fff60ac4700 (LWP 1001) exited]
[Detaching after fork from child process 1005]
[New Thread 0x7fff5bac2700 (LWP 1012)]
[New Thread 0x7fff602c3700 (LWP 1013)]
Tensor created successfully on GPU 0
[New Thread 0x7fff60ac4700 (LWP 1015)]
Tensor created successfully on GPU 1
[New Thread 0x7fff48af1700 (LWP 1016)]
Tensor created successfully on GPU 2
[New Thread 0x7fff41dff700 (LWP 1017)]
Tensor created successfully on GPU 3
[New Thread 0x7fff4115d700 (LWP 1018)]
Tensor created successfully on GPU 4
[New Thread 0x7fff2dfff700 (LWP 1019)]
Tensor created successfully on GPU 5
[New Thread 0x7fff2d7fe700 (LWP 1020)]
Tensor created successfully on GPU 6
[New Thread 0x7fff2cb5c700 (LWP 1021)]
Tensor created successfully on GPU 7
GPU 3 memory summary:
[PyTorch CUDA memory summary for device ID 3; CUDA OOMs: 0. The table reports Allocated memory, Active memory, Requested memory, GPU reserved memory, Non-releasable memory, Allocations, Active allocs, GPU reserved segments, Non-releasable allocs, Oversize allocations, and Oversize GPU segments, each broken down into large and small pools; the numeric columns did not survive the paste.]
a.device: cuda:3
CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x7fff27455280
Thread 1 "python" received signal CUDA_EXCEPTION_14, Warp Illegal Address.
[Switching focus to CUDA kernel 0, grid 2, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 1, lane 0]
0x00007fff27455310 in void cutlass::Kernel<cutlass::gemm::kernel::GemmGrouped<cutlass::gemm::threadblock::MmaMultistage<cutlass::gemm::GemmShape<128, 128, 32>, cutlass::transform::threadblock::PredicatedTileAccessIterator<cutlass::MatrixShape<128, 32>, cutlass::bfloat16_t, cutlass::layout::RowMajor, 1, cutlass::transform::PitchLinearWarpRakedThreadMap<cutlass::PitchLinearShape<32, 128>, 128, cutlass::PitchLinearShape<4, 8>, 8>, cutlass::Array<cutlass::bfloat16_t, 8, false>, false, cutlass::layout::NoPermute>, cutlass::transform::threadblock::RegularTileAccessIterator<cutlass::MatrixShape<128, 32>, cutlass::bfloat16_t, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<16, 32>, 0, cutlass::transform::PitchLinearWarpRakedThreadMap<cutlass::PitchLinearShape<32, 128>, 128, cutlass::PitchLinearShape<4, 8>, 8>, 16>, (cutlass::arch::CacheOperation::Kind)1, cutlass::transform::threadblock::PredicatedTileAccessIterator<cutlass::MatrixShape<32, 128>, cutlass::bfloat16_t, cutlass::layout::RowMajor, 0, cutlass::transform::PitchLinearWarpRakedThreadMap<cutlass::PitchLinearShape<128, 32>, 128, cutlass::PitchLinearShape<8, 4>, 8>, cutlass::Array<cutlass::bfloat16_t, 8, false>, false, cutlass::layout::NoPermute>, cutlass::transform::threadblock::RegularTileAccessIterator<cutlass::MatrixShape<32, 128>, cutlass::bfloat16_t, cutlass::layout::RowMajorTensorOpMultiplicandCongruous<16, 64>, 0, cutlass::transform::PitchLinearWarpRakedThreadMap<cutlass::PitchLinearShape<128, 32>, 128, cutlass::PitchLinearShape<8, 4>, 8>, 16>, (cutlass::arch::CacheOperation::Kind)1, float, cutlass::layout::RowMajor, cutlass::gemm::threadblock::MmaPolicy<cutlass::gemm::warp::MmaTensorOp<cutlass::gemm::GemmShape<64, 64, 32>, cutlass::bfloat16_t, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<16, 32>, cutlass::bfloat16_t, cutlass::layout::RowMajorTensorOpMultiplicandCongruous<16, 64>, float, cutlass::layout::RowMajor, cutlass::gemm::warp::MmaTensorOpPolicy<cutlass::arch::Mma<cutlass::gemm::GemmShape<16, 8, 16>, 32, cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, cutlass::layout::ColumnMajor, float, cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>, cutlass::MatrixShape<1, 1> >, 1, false, bool>, cutlass::MatrixShape<0, 0>, cutlass::MatrixShape<0, 0>, 1>, 4, (cutlass::gemm::SharedMemoryClearOption)0, bool>, cutlass::epilogue::threadblock::Epilogue<cutlass::gemm::GemmShape<128, 128, 32>, cutlass::gemm::warp::MmaTensorOp<cutlass::gemm::GemmShape<64, 64, 32>, cutlass::bfloat16_t, cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<16, 32>, cutlass::bfloat16_t, cutlass::layout::RowMajorTensorOpMultiplicandCongruous<16, 64>, float, cutlass::layout::RowMajor, cutlass::gemm::warp::MmaTensorOpPolicy<cutlass::arch::Mma<cutlass::gemm::GemmShape<16, 8, 16>, 32, cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, cutlass::layout::ColumnMajor, float, cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>, cutlass::MatrixShape<1, 1> >, 1, false, bool>, 1, cutlass::epilogue::threadblock::PredicatedTileIterator<cutlass::epilogue::threadblock::OutputTileOptimalThreadMap<cutlass::epilogue::threadblock::OutputTileShape<128, 8, 2, 1, 1>, cutlass::epilogue::threadblock::OutputTileShape<1, 8, 1, 1, 8>, 128, 8, 16>, cutlass::bfloat16_t, false, cutlass::layout::NoPermute, false>, cutlass::epilogue::warp::FragmentIteratorTensorOp<cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, float, cutlass::Array<float, 4, true>, cutlass::layout::RowMajor>, cutlass::epilogue::warp::TileIteratorTensorOp<cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, float, cutlass::layout::RowMajor>, cutlass::epilogue::threadblock::SharedLoadIterator<cutlass::epilogue::threadblock::OutputTileOptimalThreadMap<cutlass::epilogue::threadblock::OutputTileShape<128, 8, 2, 1, 1>, cutlass::epilogue::threadblock::OutputTileShape<1, 8, 1, 1, 8>, 128, 8, 16>::CompactedThreadMap, float, 32>, cutlass::epilogue::LinearCombination<cutlass::bfloat16_t, 8, float, float, (cutlass::epilogue::ScaleType::Kind)0, (cutlass::FloatRoundStyle)2, cutlass::bfloat16_t>, cutlass::MatrixShape<0, 8>, 1, 1>, cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, (cutlass::gemm::kernel::GroupScheduleMode)0, false> >(cutlass::gemm::kernel::GemmGrouped<...>::Params)<<<(1,1,1),(128,1,1)>>> ()
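For context, this is roughly what the first test does, reconstructed from the log above; the script name, tensor shapes, and dtype are assumptions rather than the actual code that was run:

```python
# Minimal sketch of the first test (assumed), launched under the debugger
# roughly as: cuda-gdb --args python test1.py, then `run` at the prompt.
# The script name test1.py, the shapes, and the dtype are placeholders.
import torch

tensors = []
for dev in range(torch.cuda.device_count()):
    # One tensor per visible GPU, matching the
    # "Tensor created successfully on GPU <n>" lines in the log.
    t = torch.randn(1024, 1024, dtype=torch.bfloat16, device=f"cuda:{dev}")
    tensors.append(t)
    print(f"Tensor created successfully on GPU {dev}")

print("GPU 3 memory summary:")
print(torch.cuda.memory_summary(device=3))

a = tensors[3]
print("a.device:", a.device)

# The Warp Illegal Address is raised after this point, inside a CUTLASS
# grouped-GEMM kernel on device 0; the call that launches that kernel is
# not visible in the log, so it is not reproduced in this sketch.
```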