Good day, fellow PyTorch enthusiasts,
I have a multi-file dataset that I index into using boost::iostreams::mapped_file and then collate into tensors.
pybind11 is used to build the extension that is used from PyTorch.
I send these tensors to the GPU on the PyTorch side.
This works fine with num_workers = 0, but when I use more workers the dataset class has to be picklable.
I want the file handler class to be a singleton because it owns the file handles used by boost::iostreams::mapped_file; keeping a single set of mappings improves caching, since the files are bigger than my RAM.
The thing is that I don’t know how to get all the instances of the OneMinuteDataset class to point to a single FileHolder instance (the file_holder member).
Here is my attempt using constructor overloading (only the relevant parts are included):
/// Default constructor: creates a new FileHolder for this dataset and loads its files.
///
/// The diagnostic line prints the address of the FileHolder object itself
/// (`file_holder.get()`), not the address of the smart-pointer member. The member
/// address differs for every dataset instance, so it cannot be used to tell
/// whether two datasets share one holder; the pointee address can.
OneMinuteDataset::OneMinuteDataset()
{
    file_holder = std::make_shared<FileHolder>();
    std::cout << "OneMinuteDataset ctor " << file_holder.get() << "\n";
    // Loading happens here because this overload is the one that creates the holder.
    file_holder->load_files();
}
/// Sharing constructor: attaches this dataset to an existing FileHolder.
///
/// @param file_holder  Holder to share. It is stored as-is; `load_files()` is NOT
///                     called here, and no null check is made.
///
/// The diagnostic line prints the address of the shared FileHolder object. The
/// parameter shadows the member of the same name, so taking `&file_holder` here
/// would print the address of the local parameter, which says nothing about
/// which holder is being shared.
OneMinuteDataset::OneMinuteDataset(std::shared_ptr<FileHolder> file_holder)
{
    this->file_holder = file_holder;
    std::cout << "increase instance count" << this->file_holder.get() << "\n";
}
#define PYBIND11_DETAILED_ERROR_MESSAGES
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
py::class_<OneMinuteDataset>(m, "OneMinuteDataset")
.def(py::init<>())
.def(py::init<std::shared_ptr<FileHolder>>())
.def("size", &OneMinuteDataset::size)
.def("get_input", &OneMinuteDataset::get_input)
.def("get_target", &OneMinuteDataset::get_target)
.def(py::pickle(
[](const OneMinuteDataset &p) { // __getstate__
return p.file_holder;
},
[](std::shared_ptr<FileHolder> s) { // __setstate__
OneMinuteDataset p(s);
return p;
}
));
py::class_<FileHolder, std::shared_ptr<FileHolder>>(m, "FileHolder")
.def(py::init<>())
.def("load_files", &FileHolder::load_files);
/*
.def(py::pickle(
[](const std::shared_ptr<FileHolder> &p) {
return p;
},
[](const std::shared_ptr<FileHolder> &p) {
return p;
}
));
*/
}
I have to include a pybind11 binding for FileHolder because it is exposed in the pickle state of OneMinuteDataset,
but then pybind11 wants FileHolder to be picklable too.
Is there a way to hide some class internals from pybind11 so I don’t have to pickle all the way down?
Should I instead attempt a separate service process to serve the data?
Any hint or advice would be appreciated.
Regards,
Rob