CUDA streams, no speedup, mixed CPU/GPU instructions

– EDIT: some final answers. In short, CUDA streams are very limited.

Hi,
The PyTorch docs define a CUDA stream as "a linear sequence of execution that belongs to a specific device."

  1. I assume that by "linear sequence" they mean a set of CUDA instructions without control statements. (?)
  2. What would happen if the sequence is not linear?
  3. What would happen if the instructions are mixed: some for CUDA, others for the CPU, such as print(!!!) or more CPU work? If your instructions are mixed and not linear, CUDA streams don't seem to be a way to get a speedup. (See the sketch right after this list.)
  4. So far, I am unable to see any speedup from streams over a linear CUDA sequence. Any explanations? This suggests that when the CUDA instructions finish in less time than the CPU needs to launch the next stream, you won't see any speedup. I modified the code to slow one stream down, to give the CPU enough time to start the other one, but I still don't see any speedup.
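
To make point 3 concrete, here is a minimal sketch of what I mean by mixing CPU and GPU instructions (the tensor names and sizes are made up for illustration): kernels queued inside a torch.cuda.stream context run asynchronously with respect to the CPU, but any CPU-side read of a GPU tensor, such as print, blocks until the queued kernels have finished.

import torch

s = torch.cuda.Stream()
a = torch.rand(1024, 1024, device='cuda')

with torch.cuda.stream(s):
    b = a.matmul(a)  # queued on s; the CPU moves on immediately
    # a CPU instruction such as print(b) here would copy b to the host
    # and block until everything queued on s has finished

# before the default stream reuses b, it must wait for s
torch.cuda.current_stream().wait_stream(s)
print(b.sum().item())  # CPU-side read: blocks until the result is ready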

Thanks

This is the run time of the code below; it only uses 3 GB of the 16 GB of GPU memory. (Note that CUDA_LAUNCH_BLOCKING=1 makes every kernel launch synchronous, so by itself it already prevents the two streams from overlapping.)

$ CUDA_LAUNCH_BLOCKING=1 python streamer.py 
time linear:     276498.625ms
time concurrent: 277744.0625ms

code streamer.py:

import time
import torch
import torch.nn as nn

def run(iters=10, streams=False):
    device = torch.device('cuda:1')  # second GPU; use 'cuda:0' on a single-GPU machine

    # two streams on the same device, plus three large 10240 x 10240 matrices
    s1 = torch.cuda.Stream(device=device)
    s2 = torch.cuda.Stream(device=device)
    x = torch.rand(size=(1024 * 10, 1024 * 10), device=device)
    w1 = torch.rand(size=(1024 * 10, 1024 * 10), device=device)
    w2 = torch.rand(size=(1024 * 10, 1024 * 10), device=device)

    def op():
        # 48 matmuls with w1 followed by 60 matmuls with w2,
        # all queued on the current (default) stream
        for _ in range(48):
            x.matmul(w1)
        for _ in range(60):
            x.matmul(w2)

    # make sure setup (tensor initialization) on this device is done
    # before the timed loop starts
    torch.cuda.synchronize(device)

    for i in range(iters):
        torch.cuda.nvtx.range_push('iter{}'.format(i))

        if streams:
            # queue the w1 matmuls on s1 and the w2 matmuls on s2; both
            # streams belong to the same device, so their kernels can only
            # overlap if a single kernel leaves the GPU with spare resources
            with torch.cuda.stream(s1):
                for _ in range(48):
                    x.matmul(w1)

            with torch.cuda.stream(s2):
                for _ in range(60):
                    x.matmul(w2)

        else:
            op()

        torch.cuda.nvtx.range_pop()

    # drain both streams on the work device so the timing is meaningful
    torch.cuda.synchronize(device)


if __name__ == '__main__':
    # warmup
    run()

    torch.cuda.cudart().cudaProfilerStart()
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    run(streams=False)
    end_event.record()
    torch.cuda.synchronize()  # wait for both events to complete before reading the time
    elapsed_time_ms = start_event.elapsed_time(end_event)
    print('time linear: {}ms'.format(elapsed_time_ms))
    torch.cuda.cudart().cudaProfilerStop()

    start_event.record()
    run(streams=True)
    end_event.record()
    torch.cuda.synchronize()  # wait for both events to complete before reading the time
    elapsed_time_ms = start_event.elapsed_time(end_event)
    print('time concurrent: {}ms'.format(elapsed_time_ms))
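
To see whether the kernels from s1 and s2 actually overlap, the nvtx ranges and the cudaProfilerStart/Stop calls in the code can be inspected in a profiler, e.g. (assuming Nsight Systems is installed):

$ nsys profile -o streamer python streamer.py

This should be run without CUDA_LAUNCH_BLOCKING=1, since synchronous launches serialize the streams by construction.

For reference, here is a sketch of a setup where overlap has a chance to show up, under the assumption that concurrency only helps when a single kernel cannot saturate the GPU. The sizes and iteration counts are made up; whether a speedup actually appears still depends on kernel launch overhead versus kernel run time, which is exactly the effect described in point 4 above.

import torch

device = torch.device('cuda')
s1 = torch.cuda.Stream(device=device)
s2 = torch.cuda.Stream(device=device)

# small matrices: a single 512 x 512 matmul does not fill a big GPU,
# so kernels from the two streams have room to run concurrently
a = torch.rand(512, 512, device=device)
b = torch.rand(512, 512, device=device)

torch.cuda.synchronize()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
with torch.cuda.stream(s1):
    for _ in range(200):
        a.matmul(a)
with torch.cuda.stream(s2):
    for _ in range(200):
        b.matmul(b)
# make the default stream wait for both side streams, then stamp the end
torch.cuda.current_stream().wait_stream(s1)
torch.cuda.current_stream().wait_stream(s2)
end.record()
torch.cuda.synchronize()
print('elapsed: {:.1f}ms'.format(start.elapsed_time(end)))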