GPU training 10x slower when running two modules back to back

Each module's running time is 0.003 sec and 0.02 sec respectively, but combining them and performing backward passes on only the second module results in a cumulative time of 0.2 sec. After running torch.utils.bottleneck, it says that aten::normal_ takes up 90% of the running time, even though there are no random operations and no torch.randn() calls… This makes training infeasible, since the training time also increases with batch size (training takes 2x longer for batch size 256 than for 128), and there is no data transfer between CUDA and CPU in the code.

start = time.time()


with torch.no_grad():
    # latents = model.module.get_encoder_latents(torch.randint(0, 5, (128, 512), device="cuda"))
    latents = model.module.get_encoder_latents(torch.zeros(128, 512, device="cuda", dtype=torch.int64))

print('time ', time.time()-start)
# time.sleep(1)

loss = model.module.train_vae(latents)
# loss = model.module.train_vae(torch.randn((128, 512, 768), device="cuda"))
loss.backward()

# print(latents.grad)
# for param in model.module.encoder.parameters():
#     print(param.grad)

print('time ', time.time()-start)

# seq_len 512
# 16, 0.027439117431640625 (both), 0.0023941993713378906 (encoder), 0.020084381103515625 (vae)
# 32, 0.04886817932128906 (both), 0.002727508544921875 (encoder),  0.02025008201599121 (vae)
# 64, 0.10552453994750977 (both), 0.0029969215393066406 (encoder), 0.021106719970703125 (vae)
# 128, 0.210113525390625 (both), 0.0033850669860839844 (encoder), 0.02246260643005371 (vae)

After uncommenting time.sleep(1), the cumulative training time goes back to normal, even though nothing has changed except waiting for one second.

-------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
               Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
      aten::normal_        43.50%     218.735ms        43.50%     218.735ms     218.735ms     218.821ms        43.48%     218.821ms     218.821ms             1  
      aten::normal_        34.57%     173.861ms        34.57%     173.861ms     173.861ms     173.933ms        34.56%     173.933ms     173.933ms             1  
        aten::copy_         3.37%      16.950ms         3.37%      16.950ms      16.950ms      16.960ms         3.37%      16.960ms      16.960ms             1  
           aten::to         0.00%       5.000us         3.22%      16.213ms      16.213ms       5.000us         0.00%      16.215ms      16.215ms             1  
     aten::_to_copy         0.00%      16.000us         3.22%      16.207ms      16.207ms      15.000us         0.00%      16.210ms      16.210ms             1  
           aten::to         0.00%       7.000us         3.20%      16.076ms      16.076ms       7.000us         0.00%      16.077ms      16.077ms             1  
     aten::_to_copy         0.00%      17.000us         3.20%      16.068ms      16.068ms      17.000us         0.00%      16.070ms      16.070ms             1  
        aten::copy_         0.00%       8.000us         3.19%      16.058ms      16.058ms      16.063ms         3.19%      16.063ms      16.063ms             1  
        aten::copy_         0.00%      13.000us         3.19%      16.033ms      16.033ms      16.036ms         3.19%      16.036ms      16.036ms             1  
    cudaMemcpyAsync         3.18%      15.974ms         3.18%      15.974ms      15.974ms       0.000us         0.00%       0.000us       0.000us             1  
    cudaMemcpyAsync         3.17%      15.943ms         3.17%      15.943ms      15.943ms       0.000us         0.00%       0.000us       0.000us             1  
     aten::uniform_         3.07%      15.438ms         3.07%      15.438ms      15.438ms      15.448ms         3.07%      15.448ms      15.448ms             1  
     aten::uniform_         3.06%      15.375ms         3.06%      15.375ms      15.375ms      15.385ms         3.06%      15.385ms      15.385ms             1  
     aten::uniform_         3.03%      15.260ms         3.03%      15.260ms      15.260ms      15.270ms         3.03%      15.270ms      15.270ms             1  
     aten::uniform_         3.03%      15.251ms         3.03%      15.251ms      15.251ms      15.259ms         3.03%      15.259ms      15.259ms             1  
-------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 502.853ms
Self CUDA time total: 503.219ms


Profiler output using torch.autograd.profiler (import torch.autograd.profiler as profiler):

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       cudaLaunchKernel        74.51%        1.152s        74.51%        1.152s     118.010us           0 b           0 b        -512 b        -512 b          9760  
                                        cudaMemsetAsync        10.03%     154.968ms        10.03%     154.968ms     156.533us           0 b           0 b      65.00 Kb      65.00 Kb           990  
                                            aten::addmm         1.32%      20.433ms         8.91%     137.729ms     148.096us           0 b           0 b      47.59 Gb      47.59 Gb           930  
                                              aten::mul         1.12%      17.297ms        11.19%     172.921ms      95.536us         784 b         784 b      33.10 Gb      33.10 Gb          1810  
                                               aten::mm         1.07%      16.483ms        11.06%     170.996ms     155.451us           0 b           0 b      20.40 Gb      20.40 Gb          1100  
                                              aten::sum         0.87%      13.462ms        10.30%     159.165ms     157.589us           0 b           0 b      47.93 Mb      47.87 Mb          1010  
                                             aten::add_         0.77%      11.923ms        15.94%     246.393ms     139.996us           0 b           0 b           0 b           0 b          1760  
                                        cudaMemcpyAsync         0.60%       9.224ms         0.60%       9.224ms      57.650us           0 b           0 b           0 b           0 b           160  
                                              aten::add         0.55%       8.500ms        10.55%     163.110ms     217.480us           0 b           0 b      11.88 Gb      11.88 Gb           750  
    autograd::engine::evaluate_function: AddmmBackward0         0.43%       6.606ms        17.92%     276.955ms     485.886us           0 b           0 b     -11.42 Gb     -31.83 Gb           570  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  

You are not properly synchronizing the code while using host timers, so your profiling is invalid: CUDA operations are executed asynchronously, and the next synchronizing operation will accumulate the execution time of all previously queued CUDA work.
Add warmup iterations and synchronize before starting and stopping the host timers to get a proper profile.
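
Something along these lines (a minimal sketch with a toy stand-in model; replace run_step with your actual encoder + VAE forward/backward step):

import time

import torch
import torch.nn as nn

# toy stand-in for the real model; the shapes are only illustrative
model = nn.Sequential(nn.Linear(768, 768), nn.ReLU(), nn.Linear(768, 768)).cuda()
x = torch.randn(128, 512, 768, device="cuda")

def run_step():
    model(x).sum().backward()
    model.zero_grad(set_to_none=True)

# warmup iterations: lazy initialization, memory-pool growth, etc. happen outside the timed region
for _ in range(5):
    run_step()

torch.cuda.synchronize()   # all warmup kernels have finished before the timer starts
start = time.time()
for _ in range(10):
    run_step()
torch.cuda.synchronize()   # the timed kernels have actually finished, not just been queued
print('avg step time ', (time.time() - start) / 10)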

I modified the code accordingly, but the execution time still doubles as the batch size doubles.

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                  cudaDeviceSynchronize        88.22%     230.149ms        88.22%     230.149ms     230.149ms           0 b           0 b           0 b           0 b             1  
                                             cudaMalloc         1.73%       4.510ms         1.73%       4.510ms      72.742us           0 b           0 b           0 b           0 b            62  
                                       cudaLaunchKernel         1.32%       3.440ms         1.32%       3.440ms       3.695us           0 b           0 b        -512 b        -512 b           931  
                                               aten::mm         0.77%       2.018ms         1.07%       2.795ms      22.910us           0 b           0 b       3.78 Gb       3.78 Gb           122  
                                            aten::addmm         0.72%       1.868ms         1.53%       3.989ms      61.369us           0 b           0 b       4.46 Gb       4.46 Gb            65  
                                              aten::mul         0.71%       1.843ms         1.14%       2.978ms      16.544us          88 b          88 b       6.53 Gb       6.53 Gb           180  
                                              aten::sum         0.56%       1.459ms         0.74%       1.927ms      17.679us           0 b           0 b       7.71 Mb       7.70 Mb           109  
                                             aten::add_         0.45%       1.172ms         0.73%       1.914ms       9.765us           0 b           0 b           0 b           0 b           196  
                                              aten::add         0.30%     785.000us         0.58%       1.505ms      24.672us           0 b           0 b       1.16 Gb       1.16 Gb            61  
    autograd::engine::evaluate_function: AddmmBackward0         0.29%     762.000us         2.35%       6.132ms      94.338us           0 b           0 b      -2.68 Gb      -6.46 Gb            65  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
import torch
import time
import os
# os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'] = '1'
# x = torch.randn((32, 2, 4), device="cuda")
#
# start = time.time()
# for i in range(10000):
#     x *= x
# print(time.time()-start)

# 32, 0.046601057052612305
# 320, 0.044339895248413086

import torch.nn as nn
from latent_models import BART

model = BART(seq_len=512).cuda()

# start = time.time()
# model.module.encoder(torch.randint(0, 5, (32, 512), device="cuda"))
model.encoder(torch.zeros(32, 512, device="cuda", dtype=torch.int64))
# 32, 0.3491981029510498
# 64, 0.4617133140563965
# 128, 0.510127067565918
# 320, 0.7788600921630859
# 640, 1.2591266632080078

# start = time.time()
# print(model.module.forward(torch.zeros(32, 512, device="cuda", dtype=torch.int64),
#                            torch.zeros(32, 512, device="cuda", dtype=torch.int64)))
# print(time.time()-start)

# 16, 0.3350405693054199
# 32, 0.381974458694458
# 64, 0.47709083557128906

start = time.time()
model.train_vae(torch.zeros(16, 512, 768, device="cuda")).backward()
model.train_vae(torch.zeros(16, 512, 768, device="cuda")).backward()
model.train_vae(torch.zeros(16, 512, 768, device="cuda")).backward()
model.train_vae(torch.zeros(16, 512, 768, device="cuda")).backward()
model.train_vae(torch.zeros(16, 512, 768, device="cuda")).backward()

# print(model.module.train_vae(torch.randn((16, 64, 768), device="cuda")).backward())
# print('time ', time.time()-start)

# 16, 0.38356733322143555
# 32, 0.3720133304595947
# 64, 0.39897871017456055
# 128, 0.43648314476013184
# 256, 0.5453002452850342


import torch.autograd.profiler as profiler

torch.cuda.synchronize()
start = time.time()
torch.cuda.empty_cache()


# @torch.jit.script
def testing(model):

    for i in range(1):
        with torch.no_grad():
            # latents = model.module.get_encoder_latents(torch.randint(0, 5, (128, 512), device="cuda"))
            latents = model.get_encoder_latents(torch.zeros(128, 512, device="cuda", dtype=torch.int64))

        # print('time ', time.time()-start)
        # time.sleep(1) 10280

        # loss = model.train_vae(latents)
        loss = model.train_vae(torch.randn((64, 512, 768), device="cuda"))
        loss.backward()

        # print(latents.grad)
        # for param in model.module.encoder.parameters():
        #     print(param.grad)

        torch.cuda.synchronize()

    print('time ', time.time() - start)


testing(model)

# seq_len 512
# 16, 0.13292932510375977 (both), 0.06869959831237793 (encoder), 0.11004400253295898 (vae)
# 32, 0.22198772430419922 (both), 0.12623834609985352 (encoder),  0.1743018627166748 (vae)
# 64, 0.40708303451538086 (both), 0.1575465202331543 (encoder), 0.30747365951538086 (vae)
# 128, 0.8132572174072266 (both), 0.2463982105255127 (encoder), 0.6213085651397705 (vae)

I’m currently trying to scale a model on an H100, but the training time doubles whenever the batch size doubles. Shouldn’t the parallelism of the GPU keep the training time almost the same regardless of batch size? The encoder and VAE are just a few attention layers.

No, compute isn’t a free resource that keeps the processing time constant. Increasing the batch size can improve GPU utilization, assuming the GPU wasn’t already busy at all times at the lower batch sizes (which is often the case).
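
You can see the effect with a standalone sketch (just a large matmul, not your model): with proper synchronization, the time per iteration barely moves while the batch is too small to saturate the device, and then grows roughly linearly with batch size once it is saturated.

import time

import torch

w = torch.randn(768, 768, device="cuda")

for batch in (1, 8, 64, 256, 1024):    # larger batches need a few GB of GPU memory
    x = torch.randn(batch, 512, 768, device="cuda")
    for _ in range(3):                  # warmup
        _ = x @ w
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(20):
        _ = x @ w
    torch.cuda.synchronize()
    print(batch, (time.time() - start) / 20)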

For a 12-layer encoder and an 8-layer VAE (dim 768), the training time doubles from batch size 16 to 32, even on an H100? It is only about 2x faster than my 4090, which is significantly less powerful; this makes training very slow and almost infeasible.

Profile the code with a visual profiler, e.g. Nsight Systems, and check for bottlenecks in your code, as it’s impossible to speculate about what might be causing the unexpected slowdown.
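
One possible workflow (a sketch assuming the BART / get_encoder_latents / train_vae code from your earlier posts; substitute your own script name): wrap the regions you care about in NVTX ranges, run the script under nsys, and inspect the timeline in the Nsight Systems GUI.

import torch
from latent_models import BART

# run with:  nsys profile -o report python <your_script>.py
# then open the generated report in the Nsight Systems GUI
model = BART(seq_len=512).cuda()
tokens = torch.zeros(128, 512, device="cuda", dtype=torch.int64)

torch.cuda.nvtx.range_push("encoder")
with torch.no_grad():
    latents = model.get_encoder_latents(tokens)
torch.cuda.nvtx.range_pop()

torch.cuda.nvtx.range_push("vae_forward_backward")
loss = model.train_vae(latents)
loss.backward()
torch.cuda.nvtx.range_pop()

torch.cuda.synchronize()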

import math

import torch
import torch.nn as nn


class DilatedConvBlock(nn.Module):
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int = 3,
                 stride: int = 1,
                 padding: int = 1):
        super().__init__()
        # dilation is tied to padding so the spatial size is preserved
        self.diconv = nn.Sequential(
            nn.Conv2d(in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=kernel_size,
                      stride=stride,
                      padding=padding,
                      dilation=padding),
            nn.ReLU())

    def forward(self, x):
        return self.diconv(x)


class FCN(nn.Module):
    def __init__(self,
                 n_actions: int,
                 input_shape: int = 1,
                 hidden_units: int = 64,
                 output_shape: int = 1):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=input_shape,
                      out_channels=hidden_units,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.ReLU()
        )
        # shared dilated trunk
        self.diconv2 = DilatedConvBlock(in_channels=hidden_units,
                                        out_channels=hidden_units,
                                        kernel_size=3,
                                        stride=1,
                                        padding=2)
        self.diconv3 = DilatedConvBlock(in_channels=hidden_units,
                                        out_channels=hidden_units,
                                        kernel_size=3,
                                        stride=1,
                                        padding=3)
        self.diconv4 = DilatedConvBlock(in_channels=hidden_units,
                                        out_channels=hidden_units,
                                        kernel_size=3,
                                        stride=1,
                                        padding=4)
        # policy head
        self.diconv5_pi = DilatedConvBlock(in_channels=hidden_units,
                                           out_channels=hidden_units,
                                           kernel_size=3,
                                           stride=1,
                                           padding=3)
        self.diconv6_pi = DilatedConvBlock(in_channels=hidden_units,
                                           out_channels=hidden_units,
                                           kernel_size=3,
                                           stride=1,
                                           padding=2)
        self.conv7_pi = nn.Sequential(
            nn.Conv2d(in_channels=hidden_units,
                      out_channels=n_actions,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.Softmax(dim=1)
        )
        # value head
        self.diconv5_v = DilatedConvBlock(in_channels=hidden_units,
                                          out_channels=hidden_units,
                                          kernel_size=3,
                                          stride=1,
                                          padding=3)
        self.diconv6_v = DilatedConvBlock(in_channels=hidden_units,
                                          out_channels=hidden_units,
                                          kernel_size=3,
                                          stride=1,
                                          padding=2)
        self.conv7_v = nn.Conv2d(in_channels=hidden_units,
                                 out_channels=output_shape,
                                 kernel_size=3,
                                 stride=1,
                                 padding=1)

        self.conv1.apply(self.weight_init)
        self.diconv2.apply(self.weight_init)
        self.diconv3.apply(self.weight_init)
        self.diconv4.apply(self.weight_init)
        self.diconv5_pi.apply(self.weight_init)
        self.diconv5_v.apply(self.weight_init)
        self.diconv6_pi.apply(self.weight_init)
        self.diconv6_v.apply(self.weight_init)
        self.conv7_pi.apply(self.weight_init)
        self.conv7_v.apply(self.weight_init)

    def weight_init(self, m):
        classname = m.__class__.__name__
        if classname.find("Conv2d") != -1:
            # He-style init scaled by the conv layer's fan-out
            n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            m.weight.data.normal_(0, math.sqrt(2. / n))
            if m.bias is not None:
                m.bias.data.zero_()
        elif classname.find('Linear') != -1:
            m.weight.data.normal_(0, 0.01)
            m.bias.data = torch.ones(m.bias.data.size())

    def pi_and_v(self, x):
        h = self.conv1(x)
        h = self.diconv2(h)
        h = self.diconv3(h)
        h = self.diconv4(h)
        h_pi = self.diconv5_pi(h)
        h_pi = self.diconv6_pi(h_pi)
        p_out = self.conv7_pi(h_pi)
        h_v = self.diconv5_v(h)
        h_v = self.diconv6_v(h_v)
        v_out = self.conv7_v(h_v)

        return p_out, v_out
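
For reference, a quick smoke test of the cleaned-up module (the n_actions value and the 64x64 input size are arbitrary placeholders, not from the original post):

import torch

net = FCN(n_actions=9)
x = torch.randn(1, 1, 64, 64)   # N, C, H, W with input_shape=1
p_out, v_out = net.pi_and_v(x)
print(p_out.shape)              # torch.Size([1, 9, 64, 64]): per-pixel action probabilities
print(v_out.shape)              # torch.Size([1, 1, 64, 64]): per-pixel value estimate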