I’ve been playing around with LLMs on my laptop. Since I have rather limited VRAM, I had to get creative with memory allocation.
Initially, I tried UVM (CUDA unified/managed memory), which got the model running at about 8s per token.
After some experimentation, I settled on a switching allocator that puts the model weights in page locked host memory and everything else in VRAM, which runs at about 3s per token, a substantial improvement!
#include <cuda_runtime.h>
#include <iostream>
#include <unordered_set>
bool host_alloc = true;
std::unordered_set<void*> host_ptrs;
extern "C" {
void* sw_alloc(ssize_t size, int device, cudaStream_t stream) {
void *ptr;
auto ec = host_alloc ? cudaMallocHost(&ptr, size) : cudaMalloc(&ptr, size);
if (ec != cudaSuccess) {
std::cout << "sw_alloc failed: " << cudaGetErrorString(ec) << std::endl;
}
if (host_alloc) {
host_ptrs.insert(ptr);
}
return ptr;
}
void sw_free(void* ptr, ssize_t size, int device, cudaStream_t stream) {
cudaError_t ec;
if (host_ptrs.contains(ptr)) {
ec = cudaFreeHost(ptr);
host_ptrs.erase(ptr);
} else {
ec = cudaFree(ptr);
}
if (ec != cudaSuccess) {
std::cout << "sw_free failed: " << cudaGetErrorString(ec) << std::endl;
}
}
void switch_alloc() {
std::cout << "switching allocator" << std::endl;
host_alloc = !host_alloc;
}
}
Any thoughts on adding an API that lets the user choose which kind of CUDA memory to use for a particular tensor/module?