There appears to be a bug in libtorch (C++) that leads to a memory leak.
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

#include <torch/extension.h>
// Plain-vector element type used as the control case of the repro:
// holds one pre-allocated frame of 7056 (= 84*84) ints.
class FrameVector {
private:
std::vector<int> data; // frame buffer, value-initialized to zeros
public:
// Construct the buffer directly in the member initializer list.
// The original default-constructed the member, then built a shadowing
// local `data` and copy-assigned it — allocating and writing the
// buffer twice per object.
FrameVector() : data(7056) {}
};
class FrameTensor {
private:
torch::Tensor data;
public:
FrameTensor() {
this->data = torch::zeros({1, 84, 84});
}
};
// Repro driver: a ring buffer of `capacity` single-element vectors of T.
// Pass 1 (i < capacity) fills every slot with one T; after a pause,
// pass 2 pushes a fresh T into each slot and erases the old front one,
// so every slot again holds exactly one element (FIFO replacement).
//
// @param capacity  number of slots in the buffer; default matches the
//                  original hard-coded repro value of 1'000'000
// @param pause     sleep inserted once the buffer is first full, so the
//                  process's steady-state memory can be observed;
//                  default matches the original 2-second sleep
template<class T>
void f(int capacity = 1000000,
       std::chrono::seconds pause = std::chrono::seconds(2)) {
std::vector<std::vector<T>> frames(capacity);
// Two full passes over the buffer (original bound: capacity + 1000000,
// which equals 2 * capacity at the default).
for (int i = 0; i < 2 * capacity; ++i) {
if (i == capacity) {
std::cout << "buffer is full!" << std::endl;
std::this_thread::sleep_for(pause);
std::cout << "restart!" << std::endl;
}
frames[i % capacity].push_back(T());
if (i >= capacity) {
// Drop the previous occupant of this slot; the T it held is
// destroyed here, so memory should not grow during pass 2.
frames[i % capacity].erase(frames[i % capacity].begin());
}
}
}
// Runs the repro twice with identical buffer mechanics, differing only
// in the element type, to contrast steady-state memory use:
// FrameTensor (holds a torch::Tensor) vs FrameVector (holds a
// std::vector<int>). Memory figures in the trailing comments are the
// reporter's observations, not something this code measures.
int main(int argc, char *argv[])
{
f<FrameTensor>(); // needs 34G to fill the replay buffer, then memory increases to 60G
f<FrameVector>(); // needs 34G to fill the replay buffer, then memory stay constant (as it should)
}
The bug only seems to occur when the torch::Tensor
is stored in nested containers, for example:
std::vector<std::vector<T>>
std::vector<std::deque<T>>
I believe the internal counter that keeps track of the number of references to the torch::Tensor
fails to count the correct number of references. This causes the tensors' memory to never be released.