I’ve been playing around with LLMs on my laptop. Since I have rather limited VRAM, I had to get creative with memory allocation.
Initially, I tried UVM (CUDA unified/managed memory), which got the model running at about 8s per token.
After some experimentation, I settled on a switching allocator that puts the model weights in page locked host memory and everything else in VRAM, which runs at about 3s per token, a substantial improvement!
#include <cuda_runtime.h>
#include <iostream>
#include <unordered_set>
bool host_alloc = true;
std::unordered_set<void*> host_ptrs;
extern "C" {
void* sw_alloc(ssize_t size, int device, cudaStream_t stream) {
void *ptr;
auto ec = host_alloc ? cudaMallocHost(&ptr, size) : cudaMalloc(&ptr, size);
if (ec != cudaSuccess) {
std::cout << "sw_alloc failed: " << cudaGetErrorString(ec) << std::endl;
}
if (host_alloc) {
host_ptrs.insert(ptr);
}
return ptr;
}
void sw_free(void* ptr, ssize_t size, int device, cudaStream_t stream) {
cudaError_t ec;
if (host_ptrs.contains(ptr)) {
ec = cudaFreeHost(ptr);
host_ptrs.erase(ptr);
} else {
ec = cudaFree(ptr);
}
if (ec != cudaSuccess) {
std::cout << "sw_free failed: " << cudaGetErrorString(ec) << std::endl;
}
}
void switch_alloc() {
std::cout << "switching allocator" << std::endl;
host_alloc = !host_alloc;
}
}
Any thoughts on adding an API that lets the user choose which kind of CUDA memory to use for a particular tensor/module?