I’m writing a CUDA extension with CUDAExtension from torch.utils.cpp_extension. When I benchmark the write speed for 1,000,000 integers, it seems very slow: a million writes take ~2 ms:
// Fill edges[0..size) with the constant value 23.
//
// Uses a grid-stride loop so the kernel is correct for ANY 1-D launch
// configuration — including grids smaller than `size` — instead of relying
// on the launch exactly covering the data. For launches that do cover
// `size` (as in some_func below) the results are identical to the original
// guarded single-write version. No shared memory required.
__global__ void write_test(int* edges, int size)
{
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride) {
        edges[i] = 23;
    }
}
// Entry point cpu code for python.
//
// Allocates a device buffer of `size` ints, launches write_test over it,
// waits for completion, and releases the buffer. Fixes in this version:
//  - cudaMalloc result is checked (the original ignored it);
//  - grid size is computed by ceil-division instead of a hard-coded 1024
//    blocks that only accidentally covers size = 1,000,000;
//  - cudaGetLastError() catches launch-configuration errors;
//  - cudaDeviceSynchronize() is required because kernel launches are
//    asynchronous — without it the function (and any host-side timer)
//    returns before the writes actually finish, so wall-clock timings of
//    this function also include the synchronous, slow cudaMalloc;
//  - the buffer is freed (the original leaked it on every call).
void some_func()
{
    const int size = 1000000;
    int* edges = nullptr;

    // NOTE: cudaMalloc itself is synchronous and expensive; for a fair
    // benchmark of the kernel alone, allocate once outside the timed region.
    if (cudaMalloc(reinterpret_cast<void**>(&edges), size * sizeof(int)) != cudaSuccess) {
        return;  // allocation failed; nothing to clean up
    }

    const int threads = 256;
    const int blocks = (size + threads - 1) / threads;  // ceil-div so the grid covers `size`
    write_test<<<blocks, threads>>>(edges, size);

    // Launches return no status directly; fetch any launch-config error here,
    // and surface asynchronous execution errors at the sync below.
    cudaGetLastError();
    cudaDeviceSynchronize();

    cudaFree(edges);  // fix leak: original never released the buffer
}
In contrast, timing an nn.Conv2d with a similar number of inputs takes less than 0.5 ms:
def test_conv(self):
    """Time one Conv2d forward pass on the current CUDA device.

    Synchronizes before starting the timer and again before reading it,
    since CUDA kernel launches are asynchronous; prints the elapsed
    wall-clock seconds. Requires a CUDA-capable device.

    Fix: the original mixed two aliases for the same module (`torch.` and
    `t.`), which only works if both names happen to be imported; this
    version uses `torch` consistently.
    """
    device = torch.cuda.current_device()
    mod = torch.nn.Conv2d(32, 32, 3).cuda().eval()
    data = torch.randint(0, 255, [40, 32, 32, 32], dtype=torch.float, device=device)
    torch.cuda.synchronize(device)  # drain pending work so the timer starts clean
    tic = time.time()
    out = mod(data)
    torch.cuda.synchronize(device)  # wait for the conv to actually finish
    print(time.time() - tic)
The convolution has to perform many multiplications and then write out its results, yet it is still faster. Is there some way to write to global memory on the GPU faster?