– EDIT: some final answers. In short, CUDA streams are very limited.
hi,
The PyTorch docs define a CUDA stream as "a linear sequence of execution that belongs to a specific device".
- I assume that by "linear sequence" they mean a set of CUDA instructions without control statements. (?)
- what would happen if the sequence is not linear?
- What would happen if the instructions are mixed: some for CUDA, others for the CPU, such as print(!!!) or other CPU work? If your set of instructions is mixed and not linear, CUDA streams don't seem to be a solution for a speedup.
- So far, I am unable to see any speedup using streams over a linear CUDA sequence. Any explanations? This suggests that when the CUDA instructions run in a time shorter than the time the CPU needs to start the next stream, you won't see any speedup. I modified the code to slow down one stream so the CPU has enough time to start the other one, but I still don't see any speedup.
thanks
this is the run time of the below code, and it uses only 3gb/16gb of gpu memory:
$ CUDA_LAUNCH_BLOCKING=1 python streamer.py
time linear: 276498.625ms
time concurrent: 277744.0625ms
Code (streamer.py):
import time
import torch
import torch.nn as nn
def run(iters=10, streams=False, device_index=1):
    """Launch batches of large matmuls, serially or on two CUDA streams.

    Args:
        iters: Number of timed iterations.
        streams: If ``True``, issue the ``w1`` matmuls on stream ``s1``
            and the ``w2`` matmuls on stream ``s2``; otherwise issue all
            of them on the current (default) stream.
        device_index: CUDA device ordinal to run on. Defaults to 1,
            matching the original hard-coded ``torch.device(1)``.

    NOTE(review): the matmul results are discarded on purpose — this is
    a pure kernel-launch/throughput benchmark, not a numerical one.
    """
    device = torch.device(device_index)
    s1 = torch.cuda.Stream(device=device)
    s2 = torch.cuda.Stream(device=device)

    n = 1024 * 10
    x = torch.rand(size=(n, n)).to(device)
    w1 = torch.rand(size=(n, n)).to(device)
    w2 = torch.rand(size=(n, n)).to(device)

    # The original pasted code repeated these calls literally; the loops
    # below are behaviorally identical (48 matmuls with w1, 60 with w2).
    def op():
        for _ in range(48):
            x.matmul(w1)
        for _ in range(60):
            x.matmul(w2)
        torch.cuda.synchronize()

    for i in range(iters):
        torch.cuda.nvtx.range_push('iter{}'.format(i))
        if streams:
            with torch.cuda.stream(s1):
                for _ in range(48):
                    x.matmul(w1)
            with torch.cuda.stream(s2):
                for _ in range(60):
                    x.matmul(w2)
        else:
            op()
        torch.cuda.nvtx.range_pop()
        # Wait for all queued work (both streams) before the next iteration.
        torch.cuda.synchronize()
if __name__ == '__main__':
    # Warmup: run once so cuBLAS handles and kernels are initialized
    # before anything is timed.
    run()

    torch.cuda.cudart().cudaProfilerStart()
    # NOTE(review): these events are created on the *current* device
    # (device 0 by default) while run() works on device 1; since run()
    # synchronizes internally, the measurement is effectively wall time.
    # Consider torch.cuda.set_device(1) first — TODO confirm intent.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    start_event.record()
    run(streams=False)
    end_event.record()
    # BUG FIX: elapsed_time() requires the end event to have completed;
    # without a synchronize this only "worked" under CUDA_LAUNCH_BLOCKING=1.
    torch.cuda.synchronize()
    elapsed_time_ms = start_event.elapsed_time(end_event)
    print('time linear: {}ms'.format(elapsed_time_ms))
    torch.cuda.cudart().cudaProfilerStop()

    start_event.record()
    run(streams=True)
    end_event.record()
    torch.cuda.synchronize()  # same fix as above
    elapsed_time_ms = start_event.elapsed_time(end_event)
    print('time concurrent: {}ms'.format(elapsed_time_ms))