Hi, I’ve found that two downsampling 1x1 convolutions result in a very slow backward pass on a Quadro RTX 5000 when `torch.backends.cudnn.deterministic = True` is set. The timings are 0.3 ms for the forward pass and 115 ms for the backward pass. On other GPUs, the backward pass takes 3 ms. The input image has shape 256x512.
The module code:
class CustomModule(torch.nn.Module):
    """Two stacked, strided 1x1 convolutions that reproduce the slow backward pass.

    Every layer is a bias-free ``nn.Conv2d`` with ``kernel_size=1`` and
    ``stride=2``. Layer ``i`` maps ``inplanes[i]`` input channels to
    ``planes[i] * 4`` output channels, i.e. 64 -> 64 and then 64 -> 256.
    """

    def __init__(self):
        super().__init__()
        inplanes = [64, 64]
        planes = [16, 64]
        self.layers = nn.Sequential(
            *(
                nn.Conv2d(c_in, c_out * 4, stride=2, kernel_size=1, bias=False)
                for c_in, c_out in zip(inplanes, planes)
            )
        )
        # Exposed so the input tensor can be sized from ``inplanes[0]``.
        self.inplanes = inplanes

    def forward(self, x):
        """Apply the layers in order and return the output of the last one."""
        result = x
        for conv in self.layers:
            result = conv(result)
        return result
PyTorch version: 1.9.0+cu111
It seems that cuDNN chooses a slow kernel, dgrad2d_alg1_1, on this particular GPU architecture, but not on other GPU architectures. I believe similar timings occur on all Turing-architecture GPUs.
Profiling code:
class Timer(object):
    """Context manager that times a block of (possibly asynchronous) GPU work.

    CUDA kernels are launched asynchronously, so the device is synchronized
    both before the clock starts and before it is read; otherwise only the
    launch overhead would be measured. On hosts without CUDA the
    synchronization is skipped, so the timer also works for CPU-only runs.

    Args:
        name: Label printed in front of the measured time.
        verbose: When true, print ``"<name> <elapsed>ms"`` on exit.

    Attributes:
        start: ``time.perf_counter()`` reading taken on entry.
    """

    def __init__(self, name: str, verbose: bool) -> None:
        super().__init__()
        self.name = name
        self.verbose = verbose
        # Queried once here so the check never runs inside the timed region.
        self._sync_cuda = torch.cuda.is_available()

    def __enter__(self):
        if self._sync_cuda:
            torch.cuda.synchronize()
        # perf_counter is monotonic and high resolution; time.time() can jump
        # when the system clock is adjusted and may be too coarse for
        # sub-millisecond intervals.
        self.start = time.perf_counter()
        return self

    def __exit__(self, type, value, traceback):
        if self._sync_cuda:
            torch.cuda.synchronize()
        elapsed_ms = (time.perf_counter() - self.start) * 1e3
        if self.verbose:
            print(self.name, "{:.3f}ms".format(elapsed_ms))
def custom_step(custom_model, data, verbose=True):
    """Run one timed forward pass followed by one timed backward pass.

    Args:
        custom_model: Callable model applied to ``data``.
        data: Input passed to ``custom_model``.
        verbose: Forwarded to ``Timer``; when true the timings are printed.
    """
    with Timer("forward", verbose):
        output = custom_model(data)

    # Upstream gradient of all ones, created outside both timed regions so its
    # allocation is not counted in either measurement.
    grad_output = torch.ones_like(output)

    with Timer("backward", verbose):
        output.backward(grad_output)
def profile(model, data, model_step, warmup_steps):
    """Warm up silently, then run one final step with its default verbosity.

    Args:
        model: Passed through to ``model_step`` as its first argument.
        data: Passed through to ``model_step`` as its second argument.
        model_step: Callable invoked as ``model_step(model, data, verbose=False)``
            during warm-up and as ``model_step(model, data)`` for the final run.
        warmup_steps: Number of warm-up invocations before the final run.
    """
    for _warmup_index in range(warmup_steps):
        model_step(model, data, verbose=False)

    # The final call leaves ``verbose`` at whatever default ``model_step`` declares.
    model_step(model, data)
if __name__ == "__main__":
    # cuDNN settings used for this reproduction.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

    custom_module = CustomModule().cuda()

    # A single 256x512 input whose channel count matches the first layer.
    input_shape = (1, custom_module.inplanes[0], 256, 512)
    model_input = torch.randn(input_shape, device="cuda")

    # Ten silent warm-up steps, then one step whose timings are printed.
    profile(custom_module, model_input, custom_step, warmup_steps=10)
These results reproduce with both `torch.backends.cudnn.benchmark = True` and `torch.backends.cudnn.benchmark = False`.