Hi,
I cannot share the complete (and rather complex) model but I extended the minimal example from above a bit to make the effect more obvious (it’s attached below).
I am not sure about the padding though; I would assume the input shape dimensions need to be divisible by 8, hence I first use a normal conv on the 64x64 input and then use the padded conv.
But I also tried other combinations…
Running the code in 2D mode I get:
2020-07-27 08:12:32,971 - Executing 2D Test
2020-07-27 08:12:36,583 - FP32 duration: 2.420s
2020-07-27 08:12:37,398 - FP16 duration: 0.814s
2020-07-27 08:12:38,229 - AMP duration: 0.828s
When profiling, I set the rounds to 1 to make the report more readable (and because profiling is slow).
The profiler shows tensor core usage for the FP16 and AMP parts.
Running in 3D mode:
2020-07-27 08:25:13,797 - Executing 3D Test
2020-07-27 08:25:17,523 - FP32 duration: 2.203s
2020-07-27 08:25:21,319 - FP16 duration: 3.790s
2020-07-27 08:25:25,137 - AMP duration: 3.811s
For 3D, the profiler doesn’t show any tensor core usage.
I used the nv-nsight-cu
GUI profiler and configured it so that it calls the command line profiler with the following arguments: /opt/nvidia/nsight-compute/2019.5.0/target/linux-desktop-glibc_2_11_3-x64/nv-nsight-cu-cli --export "report" --force-overwrite --target-processes all --kernel-regex-base function --launch-skip-before-match 0 --section LaunchStats --section Occupancy --section SpeedOfLight --sampling-interval auto --sampling-max-passes 5 --sampling-buffer-size 33554432 --profile-from-start 1 --cache-control all --clock-control base --apply-rules --metrics sm__inst_executed_pipe_tensor_op_hmma.sum "conda/envs/pytorchcudnn8/bin/python" torch-try-tensorcores_minimal2d3d.py
The code:
import torch
from torch import nn
import torch.nn.functional as F
from time import time
import logging
from apex import amp
class Net(nn.Module):
    """Small convolutional stack used for the tensor-core timing test.

    One unpadded 5x5 conv followed by five padded 5x5 convs, each with a
    ReLU. Relies on the module-level names ``nn_conv`` and ``nn_pool``
    being bound (to the 2D or 3D layer classes) before instantiation.
    """

    def __init__(self):
        super().__init__()
        # 5x5 kernel without padding shrinks each spatial dim by 4.
        self.conv = nn_conv(64, 64, 5)
        # padding=2 keeps the spatial dims constant across repeated convs.
        self.conv_padded = nn_conv(64, 64, 5, padding=2)
        # Registered as a submodule but not used in forward().
        self.pool = nn_pool(2, 2)

    def forward(self, x):
        out = F.relu(self.conv(x))
        for _ in range(5):
            out = F.relu(self.conv_padded(out))
        return out
if __name__ == "__main__":
    logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)

    # === CONFIGURATION ===
    dimensionality = 2  # 2 for 2D, 3 for 3D

    # === TEST CODE ===
    if dimensionality == 2:
        logging.info("Executing 2D Test")
        rounds = 500
        nn_conv = nn.Conv2d
        nn_pool = nn.MaxPool2d
        image_dims = (8, 64, 64, 64)  # (N, C, H, W)
    else:
        logging.info("Executing 3D Test")
        rounds = 2
        nn_conv = nn.Conv3d
        nn_pool = nn.MaxPool3d
        image_dims = (8, 64, 64, 64, 64)  # (N, C, D, H, W)

    def _time_forward(model, inp, n_rounds):
        """Run ``n_rounds`` forward passes of ``model`` on ``inp`` and
        return the elapsed wall-clock time in seconds.

        Synchronizes the CUDA stream before starting the clock (so work
        queued earlier, e.g. module/weight transfers, is not counted) and
        again before stopping it (so all launched kernels are included).
        """
        # NOTE(review): runs with autograd enabled, as the original did;
        # wrap the loop in torch.no_grad() to exclude graph construction.
        torch.cuda.synchronize()
        start = time()
        for _ in range(n_rounds):
            model(inp)
        torch.cuda.synchronize()
        return time() - start

    input_image = torch.rand(image_dims, device="cuda")

    # Run in FP32 mode
    net = Net().to("cuda").to(torch.float32)
    logging.info(f"FP32 duration: {_time_forward(net, input_image, rounds):.03f}s")

    # Run in FP16 mode (weights and input both cast to half)
    net = Net().to("cuda").to(torch.float32)
    fp16_net = net.to(torch.float16)
    fp16_input = input_image.to(torch.float16)
    logging.info(f"FP16 duration: {_time_forward(fp16_net, fp16_input, rounds):.03f}s")

    # Run with AMP (apex O1 mixed precision); the input stays FP32 and
    # apex inserts the casts where appropriate.
    net = Net().to("cuda").to(torch.float32)
    amp_net = amp.initialize(net, opt_level="O1", verbosity=0)
    logging.info(f"AMP duration: {_time_forward(amp_net, input_image, rounds):.03f}s")