[report a bug] I allocate some GPU memory using libtorch and want to copy data from the CPU to the GPU, but it keeps failing

Here is a simplified program that doesn't work properly; its output, including the error, is shown below the code:


#include <torch/torch.h>
#include <c10/cuda/CUDACachingAllocator.h> // For raw_alloc / raw_delete
#include <c10/cuda/CUDAException.h> // For c10::CUDAError
#include <c10/cuda/CUDAFunctions.h> // For c10::cuda::set_device
#include <c10/core/DeviceGuard.h> // For c10::DeviceGuard
#include <cuda_runtime.h> // For cudaMemcpy, cudaGetErrorString
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
#include <exception> // For std::exception

int main(int argc, char* argv[]) {
    // Define the size of memory to allocate (e.g., 1KB)
    size_t size_bytes = 1024; // 1 KB

    // 1. Prepare data on the CPU
    std::vector<char> cpu_data(size_bytes);
    for (size_t i = 0; i < size_bytes; ++i) {
        cpu_data[i] = static_cast<char>(i % 128);
    }
    char* cpu_ptr = cpu_data.data();
    std::cout << "CPU data prepared, size: " << size_bytes << " bytes." << std::endl;

    // Check if CUDA is available
    if (!torch::cuda::is_available()) {
        std::cerr << "Error: CUDA is not available. This test requires CUDA." << std::endl;
        return 1; // Exit if CUDA is not available
    } else {
        int device_count = torch::cuda::device_count();
        std::cout << "CUDA is available. Found " << device_count << " GPU device(s)." << std::endl;
        if (device_count == 0) {
             std::cerr << "Error: CUDA is available but no GPU devices were found." << std::endl;
             return 1;
        }
    }

    void* gpu_ptr = nullptr;
    try {
        // (Optional) Set the CUDA device to operate on, default is 0
        c10::cuda::set_device(0);
        c10::DeviceGuard device_guard(c10::Device(c10::kCUDA, 0)); // Ensure subsequent operations are on the specified device

        // 2. Allocate GPU memory using CUDACachingAllocator
        std::cout << "Attempting to allocate " << size_bytes << " bytes of GPU memory..." << std::endl;
        gpu_ptr = c10::cuda::CUDACachingAllocator::raw_alloc(size_bytes);
        std::cout << "Attempting to copy CPU data to GPU..." << std::endl;
        cudaError_t cuda_err = cudaMemcpy(gpu_ptr, cpu_ptr, size_bytes, cudaMemcpyHostToDevice);
        if (cuda_err != cudaSuccess) {
            std::cerr << "Error: cudaMemcpy HostToDevice failed: " << cudaGetErrorString(cuda_err) << std::endl;
            // Free allocated GPU memory on error
            c10::cuda::CUDACachingAllocator::raw_delete(gpu_ptr);
            return 1;
        }
        std::cout << "Data successfully copied from CPU to GPU." << std::endl;

        // (Optional) Add code here to use the GPU memory pointed to by gpu_ptr...

        // 4. Free GPU memory
        std::cout << "Freeing GPU memory..." << std::endl;
        c10::cuda::CUDACachingAllocator::raw_delete(gpu_ptr);
        gpu_ptr = nullptr; // Set pointer to null to avoid dangling pointer
        std::cout << "GPU memory has been freed." << std::endl;

    } catch (const c10::CUDAError& e) {
        std::cerr << "Caught c10::CUDAError: " << e.what() << std::endl;
        // In case of exception, try to free memory if it was allocated
        if (gpu_ptr) {
            std::cout << "Freeing GPU memory in exception handler..." << std::endl;
            c10::cuda::CUDACachingAllocator::raw_delete(gpu_ptr);
        }
        return 1;
    } catch (const std::exception& e) {
        std::cerr << "Caught std::exception: " << e.what() << std::endl;
        // In case of exception, try to free memory if it was allocated
        if (gpu_ptr) {
             std::cout << "Freeing GPU memory in exception handler..." << std::endl;
            c10::cuda::CUDACachingAllocator::raw_delete(gpu_ptr);
        }
        return 1;
    }

    std::cout << "Memory allocation and copy test completed successfully." << std::endl;
    return 0; // Main function exits normally
}

CPU data prepared, size: 1024 bytes.
CUDA is available. Found 7 GPU device(s).
Attempting to allocate 1024 bytes of GPU memory...
Caught std::exception: 0 <= device && static_cast<size_t>(device) < device_allocator.size() INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":2906, please report a bug to PyTorch. Allocator not initialized for device

The version of libtorch I am using is 2.3.1+cu118.

If I add a line that allocates a tensor on the GPU before the raw allocation, the program starts working; it seems that creating a tensor implicitly initializes the device and allocator. Why is that necessary?
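
For reference, the workaround looks roughly like this (a sketch; the warmup tensor and its name are just for illustration, the rest is unchanged from the program above):

// Creating any CUDA tensor before raw_alloc appears to trigger the lazy
// CUDA context / caching-allocator initialization for device 0.
auto warmup = torch::empty({1},
    torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kUInt8));

// After this, the original raw allocation succeeds:
gpu_ptr = c10::cuda::CUDACachingAllocator::raw_alloc(size_bytes);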

Did you try to initialize the allocator via c10::cuda::CUDACachingAllocator::init?
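
Something along these lines (an untested sketch against the 2.3.x headers), run once before the first raw_alloc, should be enough:

// Explicitly initialize the per-device allocator state;
// c10::cuda::CUDACachingAllocator::init takes the number of devices.
int device_count = static_cast<int>(torch::cuda::device_count());
c10::cuda::CUDACachingAllocator::init(device_count);

// With the allocator initialized, raw_alloc no longer hits the
// "Allocator not initialized for device" assert.
void* gpu_ptr = c10::cuda::CUDACachingAllocator::raw_alloc(size_bytes);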

This solved the problem; thank you very much for your reply.