Hi, I am trying to compare the transfer bandwidth of large and small tensors. Suppose I have many layers of linear modules; there are two ways to save the checkpoint. One is to save all the tensors of each module directly; the other is to use a memory manager that places all the tensors in one huge chunk and to save that single large tensor directly. I would expect saving one large tensor to achieve higher bandwidth than saving a lot of small tensors. However, I find that it doesn't. The large tensor is around 1 GB and achieves 2.7 GB/s, while for the small tensors the D2H bandwidth is 15-16 GB/s. The results were obtained with torch.profiler. This is quite strange. Here is the code to reproduce:
Note: I am using an RTX 3090 GPU + PyTorch 1.12.1 + CUDA 11.6. I disabled _use_new_zipfile_serialization
because I found that it takes additional time (visible on the Nsight Systems profile timeline) before calling the system write.
import os
import time
import torch
import torch.nn as nn
def get_timestamp():
    """Return a high-resolution timestamp in nanoseconds.

    When CUDA is available, the device is synchronized first so that work
    queued before this call is finished before the clock is read. On a host
    without CUDA the synchronization is skipped instead of raising.

    Returns:
        int: value of ``time.perf_counter_ns()``; only differences between
        two calls are meaningful.
    """
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.perf_counter_ns()
class TestNet(nn.Module):
    """Stack of ``nlayers`` Linear pairs used as a many-tensor checkpoint source.

    Each pair is ``Linear(hidden_size, 4 * hidden_size)`` followed by
    ``Linear(4 * hidden_size, hidden_size)``; no activation is applied
    between them. The modules live in ``self.layers`` in application order,
    so the state dict holds ``4 * nlayers`` tensors (a weight and a bias per
    Linear).
    """

    def __init__(self, nlayers, hidden_size):
        """Build the network.

        Args:
            nlayers: number of Linear pairs to stack.
            hidden_size: feature width of the network input and output.
        """
        super().__init__()
        self.nlayers = nlayers
        self.hidden_size = hidden_size
        self.layers = nn.ModuleList()
        for _ in range(nlayers):
            # Expansion first, then contraction: keeps the module order (and
            # therefore parameter names and RNG consumption) unchanged.
            self.layers.append(nn.Linear(hidden_size, 4 * hidden_size))
            self.layers.append(nn.Linear(4 * hidden_size, hidden_size))

    def forward(self, x):
        """Apply every Linear pair in order and return the result."""
        for idx in range(0, 2 * self.nlayers, 2):
            expand = self.layers[idx]
            contract = self.layers[idx + 1]
            x = contract(expand(x))
        return x
def benchmark_split(nlayers, hidden_size):
    """Time ``torch.save`` on a CUDA state dict made of many separate tensors.

    A ``TestNet`` is built on the GPU and run forward a few times to warm up
    CUDA, then its state dict is saved ``ncnts`` times with the legacy
    (non-zipfile) serializer. The timed interval wraps the whole
    ``torch.save`` call, so it covers everything that call does, not only
    the device-to-host copy.

    Args:
        nlayers: number of Linear pairs in the network.
        hidden_size: feature width of the network.

    Returns:
        float: mean wall-clock time per save, in milliseconds.
    """
    ckpt_path = 'log/test_split.pt'
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)

    net = TestNet(nlayers, hidden_size).cuda()

    # Warm-up forward passes so CUDA initialization is not part of the timing.
    batch_size = 2
    seq_len = 1024
    x = torch.randn(batch_size, seq_len, hidden_size).cuda()
    with torch.no_grad():
        for _ in range(5):
            net(x)

    ncnts = 5
    try:
        st = get_timestamp()
        for _ in range(ncnts):
            torch.save(net.state_dict(), ckpt_path, _use_new_zipfile_serialization=False)
        et = get_timestamp()
    finally:
        # Remove the checkpoint even if a save failed part-way through.
        if os.path.exists(ckpt_path):
            os.remove(ckpt_path)
    # Timestamps are in ns: /1e6 converts to ms, /ncnts averages per save.
    return (et - st) / 1e6 / ncnts
def benchmark_cont(nparam, hidden_size):
    """Time ``torch.save`` on a CUDA state dict holding one large tensor.

    A single bias-free ``nn.Linear(hidden_size, nparam // hidden_size)`` is
    built on the GPU, so its state dict contains one weight tensor of
    ``(nparam // hidden_size) * hidden_size`` elements. After a few warm-up
    forward passes the state dict is saved ``ncnts`` times with the legacy
    (non-zipfile) serializer. The timed interval wraps the whole
    ``torch.save`` call, so it covers everything that call does, not only
    the device-to-host copy.

    Args:
        nparam: target number of parameters for the single tensor.
        hidden_size: input width of the Linear layer.

    Returns:
        float: mean wall-clock time per save, in milliseconds.
    """
    ckpt_path = 'log/test_cont.pt'
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)

    net = nn.Linear(hidden_size, nparam // hidden_size, bias=False).cuda()

    # Warm-up forward passes so CUDA initialization is not part of the timing.
    batch_size = 1
    seq_len = 12
    x = torch.randn(batch_size, seq_len, hidden_size).cuda()
    with torch.no_grad():
        for _ in range(5):
            net(x)

    ncnts = 5
    try:
        st = get_timestamp()
        for _ in range(ncnts):
            torch.save(net.state_dict(), ckpt_path, _use_new_zipfile_serialization=False)
        et = get_timestamp()
    finally:
        # Remove the checkpoint even if a save failed part-way through.
        if os.path.exists(ckpt_path):
            os.remove(ckpt_path)
    # Timestamps are in ns: /1e6 converts to ms, /ncnts averages per save.
    return (et - st) / 1e6 / ncnts
def main():
    """Run the split and contiguous save benchmarks and print their timings.

    For each network depth, the parameter count of the split network is
    computed and passed to the contiguous benchmark so both variants save
    the same number of parameters.
    """
    hidden_size = 512  # other widths tried: 1024, 2048
    # Depths to benchmark; a wider sweep is (1, 2, 4, 8, 16, 32, 48, 64).
    depths = (128,)
    for nlayers in depths:
        # Per Linear pair: two weights of 4*h*h each, plus biases of 4*h and h.
        params_per_pair = 2 * 4 * hidden_size * hidden_size + 5 * hidden_size
        nparam = params_per_pair * nlayers
        split_time = benchmark_split(nlayers, hidden_size)
        cont_time = benchmark_cont(nparam, hidden_size)
        print(f'#param: {nparam}, split: {split_time:.2f} ms, cont: {cont_time:.2f} ms, ratio: {cont_time / split_time:.2f}')
if __name__ == '__main__':
    def trace_handler(prof):
        """Print the per-op summary table and export a Chrome trace of the run."""
        print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        prof.export_chrome_trace('log/test_split_cont.json')

    # The trace is written under log/ only after the benchmarks have finished;
    # create the directory up front so a missing folder cannot waste the run.
    os.makedirs('log', exist_ok=True)

    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        on_trace_ready=trace_handler,
    ) as prof:
        main()
        prof.step()
    # To run the benchmarks without the profiler, call main() directly instead.
Besides, I also followed https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/ to benchmark the raw data transfer speed; here is the result:
Device: NVIDIA GeForce RTX 3090
Transfer size (MB): 1600
Pageable transfers
Host to Device bandwidth (GB/s): 15.161337
Device to Host bandwidth (GB/s): 10.506727
Pinned transfers
Host to Device bandwidth (GB/s): 26.113239
Device to Host bandwidth (GB/s): 18.666413
Can anyone help me find out what is the problem? Thanks a lot!