I compare the time of tensor.pin_memory().cuda() with tensor.cuda(), the first one is significantly slower. It is assumed that tensor.cuda() will copy data from pageable memory to pinned memory first, so it is should be of similar speed, but it is almost 10 times slower.
#include <torch/extension.h>
#include <vector>
#include <thread>
#include <mutex>
#include <sys/time.h>
// Function to pin a single tensor
void pin_tensor(torch::Tensor& tensor) {
tensor = tensor.pin_memory();
}
// Custom op to pin a list of tensors in parallel
std::vector<torch::Tensor> pin_tensors_parallel(const std::vector<torch::Tensor>& tensors) {
struct timeval ts0, ts1;
gettimeofday(&ts0, NULL);
std::vector<torch::Tensor> pinned_tensors(tensors.size());
torch::Device cuda_device(torch::kCUDA, 0);
for (size_t i = 0; i < tensors.size(); ++i) {
// to pin memory first
pinned_tensors[i] = tensors[i].pin_memory().to(cuda_device);
// direct copy to cuda
// pinned_tensors[i] = tensors[i].to(cuda_device);
}
torch::cuda::synchronize();
gettimeofday(&ts1, NULL);
double ts = (ts1.tv_sec - ts0.tv_sec) * 1000 * 1000 + (ts1.tv_usec - ts0.tv_usec);
printf("total time=%.3g\n", ts / 1000.0);
return pinned_tensors;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("pin_tensors_parallel", &pin_tensors_parallel, "Pin a list of tensors in parallel");
}
import torch
import my_custom_op
import time
# Create a list of tensors
tensors = [torch.randn(1000, 1000) for _ in range(10)]
# the first time is always slower
tensors[0].pin_memory()
cuda_tensors = my_custom_op.pin_tensors_parallel(tensors)
torch.cuda.synchronize(device=0)
tensor.pin_memory().cuda() takes around 100 ms;
tensor.cuda() takes around 15ms.