I am using libTorch with C++20. I create a DataLoader like this:
template <typename DatasetType>
auto BuildDataLoader(const Settings& sets, int& batchesCount)
{
    auto ds = DatasetType();
    auto dsMapped = ds.map(torch::data::transforms::Stack<typename DatasetType::ExampleType>());

    auto datasetSize = dsMapped.size().value();
    batchesCount = static_cast<int>((datasetSize + sets.batchSize - 1) / sets.batchSize); // ceil(datasetSize / batchSize)

    auto loader = torch::data::make_data_loader(
        std::move(dsMapped),
        torch::data::DataLoaderOptions()
            .batch_size(sets.batchSize)
            .workers(sets.numWorkers)
            .enforce_ordering(false)
    );
    return loader;
}
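I instantiate it roughly like this (simplified sketch; the concrete values are placeholders):

Settings sets;           // carries batchSize, numWorkers, device, perf.* as used above
sets.batchSize = 8;      // placeholder value
sets.numWorkers = 2;     // any value > 0 reproduces the crash

int batchesCount = 0;
auto dl = BuildDataLoader<DefaultDataset>(sets, batchesCount);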
where
DataLoaderData DefaultDataset::get(size_t index)
{
    DataLoaderData ld;
    ld.input = at::ones({ 3, 64, 64 }, at::kFloat);
    ld.target = at::ones({ 1, 64, 64 }, at::kFloat);
    return ld;
}

torch::optional<size_t> DefaultDataset::size() const
{
    return 30;
}
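For context, DefaultDataset derives from the standard libTorch dataset base; its declaration looks roughly like this (sketch reconstructed from the stack trace below, which shows DataLoaderData as the example type):

struct DefaultDataset : public torch::data::datasets::Dataset<DefaultDataset, DataLoaderData>
{
    DataLoaderData get(size_t index) override;
    torch::optional<size_t> size() const override;
};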
and
struct DataLoaderData
{
    at::Tensor input;
    at::Tensor target;

    void setupDevice(const Settings& sets)
    {
        input = input.to(sets.device, input.dtype(), sets.perf.useNonBlockingTransfers);
        target = target.to(sets.device, target.dtype(), sets.perf.useNonBlockingTransfers);
    }
};
//=================================================================================
/*
With this, we can use Dataset::map with
torch::data::transforms::Stack<typename DatasetType::ExampleType>(),
which stacks the examples of each batch into single tensors of shape (B, ...).
*/
namespace torch::data::transforms {
    template <>
    struct Stack<DataLoaderData> : public Collation<DataLoaderData> {
        DataLoaderData apply_batch(std::vector<DataLoaderData> ds) override {
            std::vector<torch::Tensor> inputs, targets;
            inputs.reserve(ds.size());
            targets.reserve(ds.size());
            for (auto& d : ds)
            {
                inputs.push_back(d.input);
                targets.push_back(d.target);
            }
            DataLoaderData d;
            d.input = torch::stack(inputs);   // (B, 3, 64, 64)
            d.target = torch::stack(targets); // (B, 1, 64, 64)
            return d;
        }
    };
}
However, when I run training with numWorkers > 0, the code randomly crashes while iterating over the DataLoader:

for (auto& batch : *dl) { /* .. some code.. */ }
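A minimal sketch of the loop body (the crash happens regardless of what is inside; setupDevice is the helper shown above):

for (auto& batch : *dl)
{
    batch.setupDevice(sets); // move input/target tensors to sets.device
    // ... forward pass, loss, backward, optimizer step (elided) ...
}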
The crash comes from a DataLoader worker thread; stack trace:
LibTorchFramework.exe!free_dbg_nolock(void * const block=0x00000150060b4040, const int block_use=1) Line 996 C++
LibTorchFramework.exe!_free_dbg(void * block=0x00000150060b4040, int block_use=-1) Line 1030 C++
[External Code]
LibTorchFramework.exe!torch::data::detail::Queue<torch::data::DataLoaderBase<torch::data::datasets::MapDataset<DefaultDataset,torch::data::transforms::Stack<DataLoaderData>>,DataLoaderData,std::vector<unsigned __int64,std::allocator<unsigned __int64>>>::Job>::pop(std::optional<std::chrono::duration<__int64,std::ratio<1,1000>>> timeout={...}) Line 57 C++
LibTorchFramework.exe!torch::data::detail::DataShuttle<torch::data::DataLoaderBase<torch::data::datasets::MapDataset<DefaultDataset,torch::data::transforms::Stack<DataLoaderData>>,DataLoaderData,std::vector<unsigned __int64,std::allocator<unsigned __int64>>>::Job,torch::data::DataLoaderBase<torch::data::datasets::MapDataset<DefaultDataset,torch::data::transforms::Stack<DataLoaderData>>,DataLoaderData,std::vector<unsigned __int64,std::allocator<unsigned __int64>>>::Result>::pop_job() Line 40 C++
LibTorchFramework.exe!torch::data::DataLoaderBase<torch::data::datasets::MapDataset<DefaultDataset,torch::data::transforms::Stack<DataLoaderData>>,DataLoaderData,std::vector<unsigned __int64,std::allocator<unsigned __int64>>>::worker_thread(torch::data::datasets::MapDataset<DefaultDataset,torch::data::transforms::Stack<DataLoaderData>> & dataset={...}) Line 186 C++
LibTorchFramework.exe!torch::data::StatelessDataLoader<torch::data::datasets::MapDataset<DefaultDataset,torch::data::transforms::Stack<DataLoaderData>>,torch::data::samplers::RandomSampler>::{ctor}::__l6::<lambda_1>::operator()() Line 48 C++
[External Code]
LibTorchFramework.exe!thread_start<unsigned int (__cdecl*)(void *),1>(void * const parameter=0x00000150060de940) Line 97 C++
I have tried commenting out most of the code, but I cannot pinpoint the problem. No matter what is inside the DataLoader loop, it crashes sooner or later with the same error.