Hi,
On the CPU side, block resources are never freed. From CachingHostAllocator.cpp:
at::OptionalDeviceGuard device_guard;
auto primary_ctx_device_index =
    at::cuda::detail::getDeviceIndexWithPrimaryContext();
if (primary_ctx_device_index.has_value()) {
  device_guard.reset_device(
      at::Device(at::DeviceType::CUDA, *primary_ctx_device_index));
}

// Round up the allocation to the nearest power of two to improve reuse.
void* ptr = nullptr;
C10_CUDA_CHECK(cudaHostAlloc(
    &ptr, c10::llvm::PowerOf2Ceil(size), cudaHostAllocDefault));

auto block = new Block();
block->size_ = c10::llvm::PowerOf2Ceil(size);
block->ptr_ = ptr;
block->allocated_ = true;

{
  std::lock_guard<std::mutex> g(blocks_mutex_);
  blocks_.insert(block);
  ptr_to_block_.insert({block->ptr_, block});
}

return {block->ptr_, reinterpret_cast<void*>(block)};
Although there is an empty_cache API that can release these resources, it is not called anywhere at the moment.
So why not free those resources explicitly, and how do we ensure they are not leaked?
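For reference, here is a minimal sketch of releasing the cache explicitly (assuming the CachingHostAllocator_emptyCache hook declared in ATen/cuda/CachingHostAllocator.h):

#include <ATen/cuda/CachingHostAllocator.h>

void release_pinned_cache() {
  // Frees the cached pinned blocks held by the host allocator.
  at::cuda::CachingHostAllocator_emptyCache();
}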
A caching allocator tries to avoid allocating and freeing memory on every use, for performance reasons. Resources can be freed by explicitly clearing the cache or at teardown. Also, the CUDACachingAllocator
will free its cache if it hits OOM, before retrying the allocation.
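For illustration, a minimal sketch of that free-and-retry pattern (release_cached_blocks here is a hypothetical stand-in for the allocator's internal cache-flushing step, not the real CUDACachingAllocator API):

#include <cuda_runtime.h>
#include <cstddef>
#include <new>

void release_cached_blocks() {
  // Hypothetical: return cached device blocks to CUDA via cudaFree.
}

void* allocate_with_retry(size_t size) {
  void* ptr = nullptr;
  cudaError_t err = cudaMalloc(&ptr, size);
  if (err == cudaErrorMemoryAllocation) {
    (void)cudaGetLastError();      // clear the sticky OOM error
    release_cached_blocks();       // empty the cache...
    err = cudaMalloc(&ptr, size);  // ...then retry the allocation once
  }
  if (err != cudaSuccess) {
    throw std::bad_alloc();
  }
  return ptr;
}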
Yep, that makes sense. I still have two questions about this:
1) CachingHostAllocator does not free its cache on OOM, so the allocation will simply fail;
2) Judging from the empty_cache function, it just destroys the cudaEvents and frees the pinned memory from cudaHostAlloc directly, without any device sync. Is this OK?
https://github.com/pytorch/pytorch/blob/release/1.13/aten/src/ATen/cuda/CachingHostAllocator.cpp#L283
Why would you need a device sync to free page-locked memory on the host?
In async mode, the page-locked memory might be freed before the kernel that uses it has finished.
After reading the code, I found this is actually safe: CachingHostAllocator_recordEvent records an event for each use, and the block is only released via cudaFreeHost after those events have completed, so the page-locked memory stays alive until the kernel finishes.
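As a minimal standalone sketch of that guarantee (an assumed example, not PyTorch's actual code), an event recorded after the last use gates the cudaFreeHost:

#include <cuda_runtime.h>

// Frees a pinned host buffer only after all work queued on `stream`
// (e.g. a kernel reading the buffer) has finished.
void safe_free_pinned(void* pinned, cudaStream_t stream) {
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  cudaEventRecord(done, stream);  // marks the point after the last use
  cudaEventSynchronize(done);     // wait until the kernel has consumed it
  cudaEventDestroy(done);
  cudaFreeHost(pinned);           // now the release is safe
}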
Thanks a lot.