Type Time(%) Time Calls Avg Min Max Name
GPU activities: 14.53% 3.69784s 45899 80.564us 2.6560us 373.79us void nchwToNhwcKernel<__half, __half, float, bool=1>(int, int, int, int, __half const , __half, float, float)
5.36% 1.36389s 4950 275.53us 83.712us 663.61us void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>(float, float, float, float, cudnnTensorStruct, __half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, __half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, cudnnTensorStruct, float const , float, float const , float const , float const , float, cudnn::reduced_divisor, int, float, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)
5.25% 1.33595s 9702 137.70us 18.528us 605.53us void kernelPointwiseApply3<ThresholdUpdateGradInput<__half>, __half, __half, __half, unsigned int, int=-2, int=-2, int=-2>(TensorInfo<ThresholdUpdateGradInput<__half>, __half>, TensorInfo<__half, __half>, TensorInfo<__half, __half>, __half, __half)
4.45% 1.13237s 4587 246.86us 146.40us 501.88us volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
4.36% 1.10995s 5754 192.90us 121.38us 287.71us volta_s884cudnn_fp16_128x128_ldg8_wgrad_exp_interior_nhwc_nt_v1
3.81% 969.50ms 4975 194.87us 64.127us 662.71us void cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>(cudnnTensorStruct, __half2 const , cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>, cudnnTensorStruct, float const , float const , float, float, float, float const , float const , float const , float, float, cudnn::reduced_divisor, int, float, cudnn::detail::bnFwPersistentState, int, float, float, float, int, float, float, cudnnStatus_t, bool)
3.67% 933.74ms 408 2.2886ms 914.49us 3.7064ms void cudnn::detail::dgrad_engine<__half, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::dgrad_engine<__half, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>, kernel_grad_params, int, int, float, int, int, int)
3.56% 906.06ms 9751 92.919us 11.488us 404.80us void kernelPointwiseApply1<ThresholdUpdateOutputIP<__half>, __half, unsigned int, int=-2>(TensorInfo<ThresholdUpdateOutputIP<__half>, __half>, __half, __half)
3.52% 895.44ms 2382 375.92us 246.17us 470.17us volta_s884cudnn_fp16_64x256_sliced1x4_ldg8_wgrad_exp_interior_nhwc_nt_v1
3.24% 824.41ms 3770 218.68us 141.92us 457.72us volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
3.18% 809.50ms 14019 57.742us 2.1440us 560.16us void kernelPointwiseApply2<TensorAddOp<__half>, __half, __half, unsigned int, int=-2, int=-2>(TensorInfo<TensorAddOp<__half>, __half>, TensorInfo<__half, __half>, __half, __half)
2.85% 723.98ms 3168 228.53us 66.528us 555.55us void kernelPointwiseApply3<TensorAddOp<__half>, __half, __half, __half, unsigned int, int=-2, int=-2, int=-2>(TensorInfo<TensorAddOp<__half>, __half>, TensorInfo<__half, __half>, TensorInfo<__half, __half>, __half, __half)
2.84% 722.40ms 666 1.0847ms 1.1830us 4.3038ms [CUDA memcpy HtoD]
2.64% 671.42ms 2393 280.58us 227.68us 365.60us volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
2.56% 651.84ms 206 3.1643ms 786.01us 3.4006ms void cudnn::detail::wgrad_alg0_engine<__half, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>(int, int, int, __half const , int, cudnn::detail::wgrad_alg0_engine<__half, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>, __half const , kernel_grad_params, int, float, int, int, int, int)
2.28% 580.28ms 200 2.9014ms 2.8869ms 2.9322ms void cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=6, int=6, int=5, int=4, int=4, bool=1, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=6, int=6, int=5, int=4, int=4, bool=1, bool=1>, kernel_grad_params, int, int, float, int, int)
2.25% 571.60ms 198 2.8869ms 2.8822ms 2.9079ms void MaxPoolBackward<__half, float>(int, __half const , long const , int, int, int, int, int, int, int, int, int, int, int, int, int, int, __half)
2.06% 524.42ms 212 2.4737ms 1.1865ms 6.4547ms void cudnn::detail::dgrad_engine<__half, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::dgrad_engine<__half, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>, kernel_grad_params, int, int, float, int, int, int)
1.93% 490.37ms 200 2.4518ms 2.4242ms 2.5017ms void cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>, kernel_grad_params, int, int, float, int, int)
1.80% 458.96ms 222 2.0674ms 450.88us 11.184ms void cudnn::detail::dgrad_engine<__half, int=128, int=6, int=7, int=3, int=3, int=5, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::dgrad_engine<__half, int=128, int=6, int=7, int=3, int=3, int=5, bool=1>, kernel_grad_params, int, int, float, int, int, int)
1.59% 404.37ms 1586 254.96us 217.85us 315.81us volta_fp16_s884cudnn_fp16_128x128_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
1.42% 360.54ms 3762 95.837us 47.199us 180.00us void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=7>(float, float, float, float, cudnnTensorStruct, __half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=7>, __half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=7>, cudnnTensorStruct, float const , float, float const , float const , float const , float, cudnn::reduced_divisor, int, float, cudnn::detail::bnBwPersistentState, int, float, float, float, int, float, cudnnStatus_t*, bool)
1.41% 359.91ms 10318 34.882us 4.4480us 258.72us void nhwcToNchwKernel<float, __half, float, bool=1>(int, int, int, int, float const , __half, float, float)
1.41% 359.58ms 1587 226.58us 199.04us 422.33us volta_s884cudnn_fp16_128x256_sliced1x4_ldg8_wgrad_exp_interior_nhwc_nt_v1
1.33% 338.69ms 63689 5.3170us 1.9520us 51.648us void kernelPointwiseApply2<TensorCAddOp, float, float, unsigned int, int=-2, int=-2>(TensorInfo<TensorCAddOp, float>, TensorInfo<float, float>, float, float)
1.29% 328.45ms 1993 164.80us 148.00us 192.09us volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
1.12% 285.80ms 201 1.4219ms 1.4188ms 1.4311ms volta_fp16_scudnn_fp16_128x64_relu_medium_nn_v1
1.03% 263.05ms 1784 147.45us 138.40us 170.85us volta_fp16_s884cudnn_fp16_128x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
1.02% 260.47ms 3781 68.889us 36.576us 126.82us void cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=10>(cudnnTensorStruct, __half2 const , cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=10>, cudnnTensorStruct, float const , float const , float, float, float, float const , float const , float const , float, float, cudnn::reduced_divisor, int, float, cudnn::detail::bnFwPersistentState, int, float, float, float, int, float, float, cudnnStatus_t, bool)
0.84% 214.78ms 992 216.52us 111.71us 683.10us volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
0.81% 206.46ms 52726 3.9150us 1.9830us 51.328us void kernelPointwiseApply2<TensorAddOp, float, float, unsigned int, int=-2, int=-2>(TensorInfo<TensorAddOp, float>, TensorInfo<float, float>, float, float)
0.76% 194.14ms 1291 150.38us 1.7600us 358.78us void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, float)
0.73% 186.68ms 598 312.18us 295.04us 334.81us volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
0.72% 182.66ms 595 307.00us 293.37us 330.37us volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
0.55% 140.61ms 32240 4.3610us 1.7280us 36.224us void kernelPointwiseApply1<TensorMulConstantOp, float, unsigned int, int=-2>(TensorInfo<TensorMulConstantOp, float>, float, float)
0.47% 120.85ms 599 201.76us 115.17us 261.25us volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
0.47% 120.47ms 1782 67.603us 32.896us 114.02us void cudnn::detail::bn_bw_1C11_singleread<__half, int=512, bool=1, int=1, int=2, int=0>(float, float, float, float, cudnnTensorStruct, __half const , cudnn::detail::bn_bw_1C11_singleread<__half, int=512, bool=1, int=1, int=2, int=0>, __half const , cudnn::detail::bn_bw_1C11_singleread<__half, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct, float const , float, float const , float const , float const , float, cudnn::reduced_divisor, int, float, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)
0.46% 117.00ms 397 294.72us 293.57us 296.57us volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
0.44% 112.01ms 42192 2.6540us 1.5990us 19.648us [CUDA memcpy DtoD]
0.42% 107.89ms 1791 60.241us 29.824us 100.61us void cudnn::detail::bn_fw_tr_1C11_singleread<__half, int=512, bool=1, int=1, int=2, int=0>(cudnnTensorStruct, __half const , cudnn::detail::bn_fw_tr_1C11_singleread<__half, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct, float const , float const , float, float, float, float const , float const , float const , float, float, cudnn::reduced_divisor, int, float, cudnn::detail::bnFwPersistentState, int, float, float, float, int, float, float, cudnnStatus_t, bool)
0.39% 100.32ms 11301 8.8770us 1.9200us 99.679us void kernelPointwiseApply2<CopyOp<__half, float>, __half, float, unsigned int, int=-2, int=-2>(TensorInfo<float, __half>, TensorInfo<CopyOp<__half, float>, __half>, __half, __half)
0.39% 98.802ms 189 522.76us 20.768us 7.1216ms volta_cgemm_32x32_tn
0.34% 87.034ms 199 437.36us 435.23us 446.75us void MaxPoolForward<__half, float>(int, __half const , int, int, int, int, int, int, int, int, int, int, int, int, int, int, __half, long*)
0.31% 78.373ms 73599 1.0640us 959ns 12.479us [CUDA memset]
0.29% 73.676ms 11157 6.6030us 1.7920us 29.760us void kernelPointwiseApply2<CopyOp<float, __half>, float, __half, unsigned int, int=-2, int=-2>(TensorInfo<__half, float>, TensorInfo<CopyOp<float, __half>, float>, float, float)
0.29% 73.177ms 595 122.99us 121.41us 128.16us volta_s884cudnn_fp16_256x128_ldg8_wgrad_exp_interior_nhwc_nt_v1
0.28% 71.223ms 30 2.3741ms 1.0590ms 5.3502ms void cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, __half const , int, cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>, __half const , kernel_grad_params, int, float, int, int, int, int)
0.22% 56.426ms 200 282.13us 280.16us 294.97us volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
0.20% 51.367ms 10318 4.9780us 1.7270us 18.975us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.19% 48.392ms 19733 2.4520us 2.2400us 11.552us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.17% 42.731ms 30 1.4244ms 826.23us 2.3345ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const , int, __half, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.17% 42.278ms 198 213.53us 212.93us 214.94us void AvePoolBackward<__half, float, bool=1>(int, __half const , int, int, int, int, int, int, int, int, int, int, int, int, __half)
0.17% 42.203ms 32 1.3188ms 120.03us 4.9324ms volta_gcgemm_32x32_nt
0.16% 40.706ms 30 1.3569ms 661.34us 2.3880ms volta_fp16_scudnn_fp16_128x128_stridedB_splitK_interior_nn_v1
0.13% 33.018ms 199 165.92us 160.64us 191.04us void AvePoolForward<__half, float, bool=1>(int, __half const , int, int, int, int, int, int, int, int, int, int, int, int, __half)
0.12% 29.295ms 10318 2.8390us 2.7190us 11.328us cudnn::gemm::computeWgradOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.11% 28.355ms 45 630.10us 234.43us 1.9987ms void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams, float2 const , float2, float2 const )
0.10% 25.341ms 14 1.8100ms 587.48us 4.1213ms volta_gcgemm_64x32_nt
0.08% 20.366ms 15 1.3578ms 747.13us 2.2130ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>, kernel_conv_params, int, int, float, float, int, __half const , __half const )
0.08% 19.409ms 42 462.11us 68.480us 2.3088ms void fft2d_r2c_16x16<__half>(float2, __half const , int, int, int, int, int, int, int, int)
0.08% 19.092ms 30 636.40us 220.06us 1.5351ms void fft2d_r2c_64x64<__half>(float2, __half const , int, int, int, int, int, int, int, int)
0.07% 18.422ms 9140 2.0150us 1.8240us 11.520us cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
0.07% 17.810ms 12 1.4842ms 1.4022ms 1.6220ms volta_fp16_scudnn_fp16_128x128_stridedB_splitK_small_nn_v1
0.07% 16.701ms 3 5.5671ms 5.5633ms 5.5720ms volta_fp16_scudnn_fp16_128x128_stridedB_splitK_xregs_large_nn_v1
0.06% 16.212ms 32 506.62us 9.2800us 2.8943ms void flip_filter<__half, __half>(__half, __half const , int, int, int, int)
0.05% 13.378ms 2 6.6888ms 6.6877ms 6.6900ms volta_cgemm_64x32_tn
0.05% 12.655ms 8 1.5818ms 867.77us 2.2093ms void cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>(int, int, int, __half const , int, cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>, __half const , kernel_grad_params, int, float, int, int, int, int)
0.05% 11.573ms 44 263.01us 16.992us 586.91us void fft1d_r2c_32<__half, float, float2, bool=0, bool=0>(float2, __half const , int, int3, int3, int2, int2)
0.04% 11.011ms 8 1.3764ms 775.39us 1.6947ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const , int, __half, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.04% 10.466ms 199 52.590us 51.839us 57.920us volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_tn
0.04% 10.208ms 8 1.2760ms 261.47us 2.0466ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const , int, __half, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.04% 10.116ms 12 842.97us 337.02us 1.1940ms volta_fp16_scudnn_fp16_128x128_stridedB_interior_nn_v1
0.04% 10.038ms 4 2.5094ms 1.3837ms 3.6539ms void cudnn::winograd::winograd3x3Kernel<__half, float, int=4, int=1, int=8, bool=0>(cudnn::maxwell::winograd::KernelParams)
0.04% 9.5547ms 12 796.23us 173.60us 1.2495ms volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1
0.04% 9.2725ms 21 441.55us 75.232us 2.2465ms void fft2d_c2r_16x16<__half, bool=0>(__half, float2, int, int, int, int, int, int, int, int, int, int, float, float, int, __half*, __half*)
0.04% 9.2358ms 16 577.24us 65.023us 2.2708ms void fft2d_r2c_32x32<__half, unsigned int=1, bool=0>(float2*, __half const , int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool)
0.03% 8.4036ms 15 560.24us 229.89us 1.4916ms void fft2d_c2r_64x64<__half, bool=0>(__half, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, __half*, __half*)
0.03% 8.2520ms 2 4.1260ms 4.1244ms 4.1276ms void cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>, kernel_grad_params, int, int, float, int, int)
0.03% 8.0379ms 14 574.13us 340.32us 1.5127ms volta_sgemm_128x64_nt
0.03% 7.2837ms 4 1.8209ms 1.2608ms 2.5253ms volta_fp16_scudnn_fp16_128x128_stridedB_small_nn_v1
0.03% 7.1027ms 5 1.4205ms 1.3001ms 1.5353ms volta_fp16_scudnn_fp16_128x64_relu_small_nn_v1
0.03% 6.7729ms 23 294.48us 84.000us 696.09us void im2col4d_kernel<__half, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, __half const , __half, int)
0.03% 6.6484ms 2 3.3242ms 3.3134ms 3.3349ms volta_fp16_scudnn_fp16_128x128_stridedB_splitK_medium_nn_v1
0.03% 6.6066ms 32 206.46us 69.376us 407.65us void fft1d_c2r_32<float2, float, __half, bool=0, bool=1, bool=0, bool=0>(__half*, float2 const , int, int3, int3, int2, int, float, float, __half, __half*)
0.02% 6.3018ms 198 31.827us 27.936us 32.416us volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nn
0.02% 5.6277ms 26 216.45us 78.176us 511.55us void fft2d_r2c_32x32<__half, unsigned int=0, bool=0>(float2*, __half const , int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool)
0.02% 5.2845ms 4 1.3211ms 276.25us 2.0459ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>, kernel_conv_params, int, int, float, float, int, __half const , __half const )
0.02% 5.2756ms 4 1.3189ms 736.15us 1.5857ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>(int, int, int, __half const , int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>, kernel_conv_params, int, int, float, float, int, __half const , __half const )
0.02% 3.8684ms 198 19.537us 17.952us 20.640us void gatherTopK<__half, unsigned int, int=2, bool=1>(TensorInfo<__half, unsigned int>, unsigned int, unsigned int, unsigned int, unsigned int, TensorInfo<__half, unsigned int>, unsigned int, unsigned int, TensorInfo<long, unsigned int>, unsigned int)
0.01% 3.6530ms 2 1.8265ms 1.7856ms 1.8673ms void cudnn::winograd::winograd3x3Kernel<__half, float, int=1, int=4, int=8, bool=0>(cudnn::maxwell::winograd::KernelParams)
0.01% 3.5723ms 198 18.042us 17.152us 19.263us volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nt
0.01% 3.3434ms 8 417.93us 154.91us 1.1319ms void fft2d_c2r_32x32<__half, bool=0, unsigned int=1, bool=0, bool=0>(__half, float2 const , int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half, __half)
0.01% 3.2245ms 4 806.13us 54.591us 2.3797ms void fft2d_r2c_32x32<__half, unsigned int=1, bool=1>(float2, __half const , int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool)
0.01% 3.2015ms 3 1.0672ms 691.19us 1.3466ms volta_fp16_scudnn_fp16_128x128_relu_interior_nn_v1
0.01% 3.1821ms 2 1.5910ms 1.5543ms 1.6277ms void cudnn::winograd::winograd3x3Kernel<__half, float, int=2, int=2, int=8, bool=0>(cudnn::maxwell::winograd::KernelParams)
0.01% 2.7595ms 2 1.3797ms 1.2562ms 1.5033ms volta_fp16_scudnn_fp16_128x128_relu_small_nn_v1
0.01% 2.6409ms 18 146.72us 69.599us 369.12us void fft2d_c2r_32x32<__half, bool=0, unsigned int=0, bool=0, bool=0>(__half, float2 const , int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half, __half)
0.01% 2.4941ms 6 415.68us 345.28us 468.86us volta_sgemm_128x64_nn
0.01% 2.4192ms 16 151.20us 55.903us 313.85us void cudnn::winograd_nonfused::winogradForwardData4x4<float, __half>(cudnn::winograd_nonfused::WinogradDataParams<float, __half>)
0.01% 2.1313ms 16 133.21us 52.512us 269.21us void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, __half>(cudnn::winograd_nonfused::WinogradOutputParams<float, __half>)
0.01% 2.0791ms 198 10.500us 10.272us 10.880us void bitonicSortKVInPlace<__half, long, int=2, int=-1, GTComp<__half>, unsigned int, int=32>(TensorInfo<__half, GTComp<__half>>, GTComp<__half>, GTComp<__half>, GTComp<__half>, TensorInfo<long, GTComp<__half>>, GTComp<__half>, __half const &)
0.01% 1.7048ms 12 142.07us 20.448us 473.02us void fft1d_r2c_32<__half, float, float2, bool=1, bool=0>(float2, __half const , int, int3, int3, int2, int2)
0.01% 1.4985ms 198 7.5680us 7.3920us 9.6320us void cunn_SoftMaxForward<int=2, __half, float, LogSoftMaxForwardEpilogue>(__half, __half*, int)
0.01% 1.4098ms 2 704.89us 704.35us 705.44us volta_sgemm_128x128_nt
0.01% 1.4040ms 2 701.98us 700.57us 703.39us volta_sgemm_128x128_nn
0.00% 1.2654ms 396 3.1950us 2.8160us 3.8080us void kernelReduceContigDim<thrust::identity, ReduceAdd<float, float>, ReduceAdd<float, float>, float, float, unsigned int, int=-2, int=-2>(TensorInfo<float, ReduceAdd<float, float>>, TensorInfo<float, ReduceAdd<float, float>>, ReduceAdd<float, float>, ReduceAdd<float, float>, ReduceAdd<float, float>, float, thrust::identity, float)
0.00% 1.1407ms 8 142.59us 44.928us 299.33us void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, __half>(cudnn::winograd_nonfused::WinogradDeltaParams<float, __half>)
0.00% 1.1289ms 8 141.11us 48.576us 284.93us void cudnn::winograd_nonfused::winogradWgradData4x4<float, __half>(cudnn::winograd_nonfused::WinogradDataParams<float, __half>)
0.00% 1.1188ms 198 5.6500us 5.5040us 5.9520us void cunn_ClassNLLCriterion_updateOutput_kernel<__half, float>(__half*, __half*, __half*, long*, __half*, int, int, int, int, long)
0.00% 1.1180ms 198 5.6460us 5.4710us 6.0160us void cunn_SoftMaxBackward<int=2, __half, float, LogSoftMaxBackwardEpilogue>(__half*, __half*, __half*, int)
0.00% 971.10us 198 4.9040us 4.4800us 5.2480us void cunn_ClassNLLCriterion_updateGradInput_kernel<__half>(__half*, __half*, long*, __half*, __half*, int, int, int, int, long)
0.00% 930.10us 396 2.3480us 1.9840us 6.3350us void kernelPointwiseApply2<TensorMulConstantOp<__half>, __half, __half, unsigned int, int=-2, int=-2>(TensorInfo<TensorMulConstantOp<__half>, __half>, TensorInfo<__half, __half>, __half, __half)
0.00% 826.94us 594 1.3920us 1.2480us 3.6160us [CUDA memcpy DtoH]
0.00% 800.92us 396 2.0220us 1.8230us 2.3360us void kernelPointwiseApply2<CopyOp<float, unsigned char>, float, unsigned char, unsigned int, int=-2, int=-2>(TensorInfo<unsigned char, float>, TensorInfo<CopyOp<float, unsigned char>, float>, float, float)
0.00% 778.62us 198 3.9320us 3.8720us 4.0960us void kernelReduceNoncontigDim_shared<thrust::identity<__half>, ReduceAdd<__half, float>, ReduceAdd<float, float>, __half, float, unsigned int, int=-2, int=-2>(TensorInfo<float, float>, TensorInfo<float, float>, float, float, float, ReduceAdd<__half, float>, __half, thrust::identity<__half>, __half)
0.00% 680.57us 199 3.4190us 3.2320us 3.6800us void kernelPointwiseApply2<CopyOp<__half, __half>, __half, __half, unsigned int, int=-2, int=2>(TensorInfo<__half, __half>, TensorInfo<CopyOp<__half, __half>, __half>, __half, __half)
0.00% 665.37us 198 3.3600us 3.2960us 3.5200us void kernelPointwiseApply3<TensorEQOp<long, unsigned char>, unsigned char, long, long, unsigned int, int=-2, int=2, int=2>(TensorInfo<unsigned char, long>, TensorInfo<TensorEQOp<long, unsigned char>, long>, TensorInfo<unsigned char, long>, long, long)
0.00% 477.31us 16 29.831us 5.7920us 85.887us void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, __half>(cudnn::winograd_nonfused::WinogradFilterParams<float, __half>)
0.00% 417.92us 198 2.1100us 2.0480us 2.2720us void kernelPointwiseApply1<TensorFillOp<__half>, __half, unsigned int, int=-2>(TensorInfo<TensorFillOp<__half>, __half>, __half, __half)
0.00% 245.06us 8 30.631us 7.0400us 83.871us void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, __half>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, __half>)
0.00% 169.60us 47 3.6080us 2.0160us 5.6640us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.00% 142.97us 8 17.871us 6.2070us 45.536us void cudnn::winograd::generateWinogradTilesKernel<int=0, __half, float>(cudnn::winograd::GenerateWinogradTilesParams<__half, float>)
0.00% 107.01us 47 2.2760us 2.1120us 2.8480us cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.00% 106.27us 47 2.2610us 2.1750us 2.5280us cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)