Here is a simplified program that doesn't work properly; the error it throws is shown in the output after the code.
#include <torch/torch.h>
#include <c10/cuda/CUDACachingAllocator.h> // For raw_alloc and raw_delete
#include <c10/cuda/CUDAFunctions.h>        // For c10::cuda::set_device
#include <c10/cuda/CUDAException.h>        // For c10::CUDAError
#include <c10/core/DeviceGuard.h>          // For c10::DeviceGuard
#include <cuda_runtime.h>                  // For cudaMemcpy and cudaGetErrorString
#include <iostream>
#include <vector>
#include <exception> // For std::exception
int main(int argc, char* argv[]) {
    // Define the size of memory to allocate (e.g., 1 KB)
    size_t size_bytes = 1024;

    // 1. Prepare data on the CPU
    std::vector<char> cpu_data(size_bytes);
    for (size_t i = 0; i < size_bytes; ++i) {
        cpu_data[i] = static_cast<char>(i % 128);
    }
    char* cpu_ptr = cpu_data.data();
    std::cout << "CPU data prepared, size: " << size_bytes << " bytes." << std::endl;
    // Check that CUDA is available
    if (!torch::cuda::is_available()) {
        std::cerr << "Error: CUDA is not available. This test requires CUDA." << std::endl;
        return 1; // Exit if CUDA is not available
    } else {
        int device_count = torch::cuda::device_count();
        std::cout << "CUDA is available. Found " << device_count << " GPU device(s)." << std::endl;
        if (device_count == 0) {
            std::cerr << "Error: CUDA is available but no GPU devices were found." << std::endl;
            return 1;
        }
    }
    void* gpu_ptr = nullptr;
    try {
        // (Optional) Select the CUDA device to operate on; the default is device 0
        c10::cuda::set_device(0);
        c10::DeviceGuard device_guard(c10::Device(c10::kCUDA, 0)); // Keep subsequent operations on the chosen device

        // 2. Allocate GPU memory using the CUDACachingAllocator
        std::cout << "Attempting to allocate " << size_bytes << " bytes of GPU memory..." << std::endl;
        gpu_ptr = c10::cuda::CUDACachingAllocator::raw_alloc(size_bytes);
        if (gpu_ptr == nullptr) {
            std::cerr << "Error: raw_alloc returned a null pointer." << std::endl;
            return 1;
        }

        // 3. Copy the CPU data to the GPU
        std::cout << "Attempting to copy CPU data to GPU..." << std::endl;
        cudaError_t cuda_err = cudaMemcpy(gpu_ptr, cpu_ptr, size_bytes, cudaMemcpyHostToDevice);
        if (cuda_err != cudaSuccess) {
            std::cerr << "Error: cudaMemcpy HostToDevice failed: " << cudaGetErrorString(cuda_err) << std::endl;
            // Free the allocated GPU memory on error
            c10::cuda::CUDACachingAllocator::raw_delete(gpu_ptr);
            return 1;
        }
        std::cout << "Data successfully copied from CPU to GPU." << std::endl;

        // (Optional) Use the GPU memory pointed to by gpu_ptr here...

        // 4. Free the GPU memory
        std::cout << "Freeing GPU memory..." << std::endl;
        c10::cuda::CUDACachingAllocator::raw_delete(gpu_ptr);
        gpu_ptr = nullptr; // Avoid a dangling pointer
        std::cout << "GPU memory has been freed." << std::endl;
    } catch (const c10::CUDAError& e) {
        std::cerr << "Caught c10::CUDAError: " << e.what() << std::endl;
        // If the exception was thrown after allocation, free the memory
        if (gpu_ptr) {
            std::cout << "Freeing GPU memory in exception handler..." << std::endl;
            c10::cuda::CUDACachingAllocator::raw_delete(gpu_ptr);
        }
        return 1;
    } catch (const std::exception& e) {
        std::cerr << "Caught std::exception: " << e.what() << std::endl;
        // If the exception was thrown after allocation, free the memory
        if (gpu_ptr) {
            std::cout << "Freeing GPU memory in exception handler..." << std::endl;
            c10::cuda::CUDACachingAllocator::raw_delete(gpu_ptr);
        }
        return 1;
    }
std::cout << "Memory allocation and copy test completed successfully." << std::endl;
return 0; // Main function exits normally
}
CPU data prepared, size: 1024 bytes.
CUDA is available. Found 7 GPU device(s).
Attempting to allocate 1024 bytes of GPU memory...
Caught std::exception: 0 <= device && static_cast<size_t>(device) < device_allocator.size() INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":2906, please report a bug to PyTorch. Allocator not initialized for device
The version of libtorch I am using is 2.3.1+cu118.
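From the assert message, my guess is that the caching allocator's per-device state is never initialized, because the program calls raw_alloc directly without first touching CUDA through any tensor operation. Below is a minimal sketch of the two warm-up steps I am considering; treat the explicit init call as an assumption on my part about the right entry point in this libtorch version:

#include <torch/torch.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>

// Sketch only: make sure CUDA and the caching allocator are initialized
// before calling raw_alloc directly.
void warm_up_cuda_allocator() {
    // Option A (assumed entry point): explicitly initialize the allocator
    // for all visible devices.
    c10::cuda::CUDACachingAllocator::init(c10::cuda::device_count());

    // Option B: run any CUDA tensor operation, which creates the CUDA
    // context and initializes the allocator lazily as a side effect.
    // auto warmup = torch::empty({1}, torch::TensorOptions().device(torch::kCUDA, 0));
}

Is calling CUDACachingAllocator::raw_alloc without any such prior initialization supposed to work, or is one of these warm-up steps required first?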