I would like to limit the maximum GPU memory that PyTorch can use to a fraction of the total GPU memory. (At deployment, many processes share the GPU, and I would like to ensure that deployed PyTorch models do not consume too many resources.)
Let's say the following code is used at inference (from the docs):
#include <torch/script.h> // One-stop header.

#include <iostream>
#include <memory>

int main(int argc, const char* argv[]) {
  if (argc != 2) {
    std::cerr << "usage: example-app <path-to-exported-script-module>\n";
    return -1;
  }

  torch::jit::script::Module module;
  try {
    // Deserialize the ScriptModule from a file using torch::jit::load().
    module = torch::jit::load(argv[1]);
  }
  catch (const c10::Error& e) {
    std::cerr << "error loading the model\n";
    return -1;
  }

  std::vector<torch::jit::IValue> inputs;
  inputs.push_back(torch::ones({1, 3, 224, 224}));

  // Execute the model and turn its output into a tensor.
  at::Tensor output = module.forward(inputs).toTensor();
  std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';

  std::cout << "ok\n";
}
The CUDACachingAllocator API seems to permit setting a limit on the total GPU memory; see pytorch/CUDACachingAllocator.h at f84f89b1c3f2bc74512e7a7b05ae6185164a9b3e · pytorch/pytorch · GitHub:
class CUDAAllocator : public Allocator {
 public:
  virtual void* raw_alloc(size_t nbytes) = 0;
  virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
  virtual void raw_delete(void* ptr) = 0;
  virtual void init(int device_count) = 0;
  virtual bool initialized() = 0;
  virtual void setMemoryFraction(double fraction, int device) = 0;
  // ...
};
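As far as I can tell, the same header also exposes free functions in the c10::cuda::CUDACachingAllocator namespace that forward to the active allocator, so I imagine the call would look roughly like this (untested; the helper name limit_gpu_memory is mine, not part of the PyTorch API):

#include <c10/cuda/CUDACachingAllocator.h>

// Untested sketch: cap this process's CUDA caching allocator at the given
// fraction of the device's total memory. limit_gpu_memory is my own helper
// name, not something defined by PyTorch.
void limit_gpu_memory(double fraction, int device) {
  c10::cuda::CUDACachingAllocator::setMemoryFraction(fraction, device);
}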
Can somebody explain how I could use the CUDAAllocator class to limit total GPU memory consumption in the example above?
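For concreteness, this is roughly what I have in mind (untested; I am guessing that the allocator must be initialized before setMemoryFraction is called, so I trigger initialization with a throwaway CUDA allocation, and I move the model and inputs to the GPU since the original example runs on the CPU):

#include <torch/script.h>
#include <c10/cuda/CUDACachingAllocator.h>

#include <iostream>
#include <memory>
#include <vector>

int main(int argc, const char* argv[]) {
  if (argc != 2) {
    std::cerr << "usage: example-app <path-to-exported-script-module>\n";
    return -1;
  }

  // Guess: force lazy CUDA/allocator initialization with a throwaway
  // allocation before touching the caching allocator.
  torch::zeros({1}, torch::kCUDA);

  // Guess: limit this process to 50% of device 0's total memory.
  c10::cuda::CUDACachingAllocator::setMemoryFraction(/*fraction=*/0.5, /*device=*/0);

  torch::jit::script::Module module;
  try {
    module = torch::jit::load(argv[1]);
    module.to(torch::kCUDA); // run the model on the GPU so the limit applies
  }
  catch (const c10::Error& e) {
    std::cerr << "error loading the model\n";
    return -1;
  }

  std::vector<torch::jit::IValue> inputs;
  inputs.push_back(torch::ones({1, 3, 224, 224}, torch::kCUDA));

  at::Tensor output = module.forward(inputs).toTensor();
  std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';
}

Is this the intended way to use setMemoryFraction, or is there a more official approach?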