I'm trying to use a custom CUDAPluggableAllocator backed by cudaMallocManaged:
#include <sys/types.h>
#include <cuda_runtime_api.h>
#include <iostream>
#include <stdexcept>
#include <string>
// g++ alloc.cc -o alloc.so -I/usr/local/cuda/include -shared -fPIC -lcuda
// Reports a failed CUDA runtime call.
//
// Does nothing when `error` is cudaSuccess. Otherwise writes
// "<message>: <CUDA error string>" to stderr and throws std::runtime_error
// carrying that same text, so whoever catches the exception sees the actual
// CUDA failure instead of a generic label.
//
//   error   - status returned by a CUDA runtime API call.
//   message - caller-supplied context for the failure; a null pointer is
//             replaced by the generic text "CUDA error".
void checkCudaError(cudaError_t error, const char* message) {
  if (error == cudaSuccess) {
    return;
  }
  // Build the description once so stderr and the exception agree.
  const std::string description =
      std::string(message != nullptr ? message : "CUDA error") + ": " +
      cudaGetErrorString(error);
  std::cerr << description << std::endl;
  throw std::runtime_error(description);
}
extern "C" {
void* my_malloc(ssize_t size, int device, cudaStream_t stream) {
void* ptr = nullptr;
cudaError_t error = cudaMallocManaged(&ptr, size);
checkCudaError(error, "Failed to allocate memory");
return ptr;
}
void my_free(void* ptr, int device, cudaStream_t stream) {
cudaError_t error = cudaFree(ptr);
checkCudaError(error, "Failed to free memory");
}
}
When I ran the following code:
import torch

# Load the allocator hooks exported by the shared library built from alloc.cc.
new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
    './alloc.so', 'my_malloc', 'my_free')
# Swap the current allocator so later CUDA tensor allocations go through it.
torch.cuda.memory.change_current_allocator(new_alloc)

# Create the tensor directly on the device. The previous form,
# torch.rand(...).to('cuda'), first materialised the full tensor in host RAM
# (100000 * 40000 float32 values, about 16 GB) and then allocated a second
# buffer of the same size through the allocator, so the data was resident
# twice while it was being copied.
# NOTE(review): the managed buffer is still far larger than the 4 GB of VRAM
# shown in the nvidia-smi output, so the part that does not fit on the GPU is
# presumably still backed by host memory -- confirm RAM + swap can hold it.
x = torch.rand(100000, 40000, device='cuda')
x.pow_(2)
print(x)
the terminal was killed, apparently because the system ran out of memory:
[ 3235.978176] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0,global_oom,task_memcg=/user.slice/user-1000.slice/user@1000.service,task=gnome-shell,pid=12981,uid=1000
[ 3235.978233] Out of memory: Killed process 12981 (gnome-shell) total-vm:5898356kB, anon-rss:0kB, file-rss:0kB, shmem-rss:0kB, UID:1000 pgtables:1440kB oom_score_adj:0
I traced RAM, VRAM, and swap usage while the script was running:
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 43C P8 6W / 60W | 13MiB / 4096MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:38:22 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 44C P8 6W / 60W | 13MiB / 4096MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:38:27 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 45C P0 16W / 60W | 1622MiB / 4096MiB | 11% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:38:32 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 46C P0 16W / 60W | 3542MiB / 4096MiB | 10% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:38:37 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 46C P0 17W / 60W | 3900MiB / 4096MiB | 18% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:38:42 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 46C P0 17W / 60W | 3900MiB / 4096MiB | 19% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:38:47 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 47C P0 17W / 60W | 3900MiB / 4096MiB | 21% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:38:52 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 47C P0 17W / 60W | 3900MiB / 4096MiB | 21% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:38:57 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 47C P0 17W / 60W | 3900MiB / 4096MiB | 23% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:39:02 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 47C P0 17W / 60W | 3900MiB / 4096MiB | 29% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:39:07 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 48C P0 17W / 60W | 3900MiB / 4096MiB | 23% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 68MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:39:12 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 51C P0 32W / 60W | 3900MiB / 4096MiB | 100% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 13119 C python 78MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:39:17 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 47C P8 15W / 60W | 13MiB / 4096MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1127 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 9023 G /usr/lib/xorg/Xorg 4MiB |
+---------------------------------------------------------------------------------------+
Thu Apr 18 17:39:23 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 47C P8 7W / 60W | 13MiB / 4096MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
When I changed x = torch.rand(100000, 40000).to('cuda')
to x = torch.rand(100000, 30000).to('cuda'),
the program ran to completion, although RAM was still oversubscribed and swap space was used as well.
I traced the memory usage:
I found that under normal circumstances RAM usage suddenly drops after a while and then slowly rises again. When RAM is oversubscribed too heavily, however, usage still exceeds physical memory even after the drop, so the running process gets killed. Is there any way to avoid this?