TLDR:
Using torch::save
and torch::load
on std::stringstream
works just fine to save and load tensors. However, transforming the std::stringstream
to a std::string
and then a char*
to send over a TCP socket results in a broken message during deserialization. What could be the cause? Am I using it incorrectly?
Details:
I am working on a project that needs to transfer tensors over a tcp socket and need to serialize the tensors. Initially, I had code looking like the following to transfer a Tensor:
// CLIENT
while (this->keep_alive.load()) {
this_thread::sleep_for(chrono::seconds(5));
auto tensor = torch::ones({3, 4});
stringstream stream;
torch::save(tensor, stream);
string stream_str = stream.str();
const char* stream_str_char_ptr = stream.c_str()
if(::send(client_fd, stream_str_char_ptr, stream_str.size(), 0) < 0){
// Handle cases
}
}
// SERVER
while (this->keep_alive.load()) {
size_t read_so_far = 0;
while (read_so_far < this->read_size) {
ssize_t valread =
::read(this->new_socket, this->read_buffer + read_so_far,
this->read_size - read_so_far);
read_so_far += valread;
};
torch::Tensor input_tensor;
stringstream buffer_stream(this->read_buffer);
torch::load(input_tensor, buffer_stream);
cout << input_tensor << endl;
}
Now this code is raising the following error on the server side and I’m assuming this message means the serialization stream is corrupted:
libc++abi: terminating with uncaught exception of type c10::Error: istream reader failed: getting the current position.
Exception raised from validate at /Users/runner/work/pytorch/pytorch/pytorch/caffe2/serialize/istream_adapter.cc:32 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >) + 81 (0x1070a4ca1 in libc10.dylib)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) + 98 (0x1070a3342 in libc10.dylib)
frame #2: caffe2::serialize::IStreamAdapter::validate(char const*) const + 124 (0x1231bd7bc in libtorch_cpu.dylib)
frame #3: caffe2::serialize::IStreamAdapter::size() const + 65 (0x1231bd6b1 in libtorch_cpu.dylib)
frame #4: caffe2::serialize::PyTorchStreamReader::init() + 99 (0x1231b81c3 in libtorch_cpu.dylib)
frame #5: caffe2::serialize::PyTorchStreamReader::PyTorchStreamReader(std::__1::basic_istream<char, std::__1::char_traits<char> >*) + 184 (0x1231b8b68 in libtorch_cpu.dylib)
frame #6: torch::jit::import_ir_module(std::__1::shared_ptr<torch::jit::CompilationUnit>, std::__1::basic_istream<char, std::__1::char_traits<char> >&, c10::optional<c10::Device>, std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > >&, bool, bool) + 529 (0x12467e241 in libtorch_cpu.dylib)
frame #7: torch::jit::import_ir_module(std::__1::shared_ptr<torch::jit::CompilationUnit>, std::__1::basic_istream<char, std::__1::char_traits<char> >&, c10::optional<c10::Device>, bool) + 75 (0x12467df6b in libtorch_cpu.dylib)
frame #8: torch::jit::load(std::__1::basic_istream<char, std::__1::char_traits<char> >&, c10::optional<c10::Device>, bool) + 147 (0x124681c73 in libtorch_cpu.dylib)
frame #9: torch::serialize::InputArchive::load_from(std::__1::basic_istream<char, std::__1::char_traits<char> >&, c10::optional<c10::Device>) + 28 (0x124ef814c in libtorch_cpu.dylib)
frame #10: void torch::load<at::Tensor, std::__1::basic_stringstream<char, std::__1::char_traits<char>, std::__1::allocator<char> >&>(at::Tensor&, std::__1::basic_stringstream<char, std::__1::char_traits<char>, std::__1::allocator<char> >&) + 94 (0x106e3de0e in baton)
frame #11: baton::OutputClient::thread_handler() + 369 (0x106e42cb1 in baton)
frame #12: decltype(*(static_cast<baton::OutputClient*>(fp0)).*fp()) std::__1::__invoke<void (baton::OutputClient::*)(), baton::OutputClient*, void>(void (baton::OutputClient::*&&)(), baton::OutputClient*&&) + 105 (0x106e4d7f9 in baton)
frame #13: void std::__1::__thread_execute<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (baton::OutputClient::*)(), baton::OutputClient*, 2ul>(std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (baton::OutputClient::*)(), baton::OutputClient*>&, std::__1::__tuple_indices<2ul>) + 62 (0x106e4d73e in baton)
frame #14: void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (baton::OutputClient::*)(), baton::OutputClient*> >(void*) + 98 (0x106e4cf42 in baton)
frame #15: _pthread_start + 125 (0x7ff8149e14e1 in libsystem_pthread.dylib)
frame #16: thread_start + 15 (0x7ff8149dcf6b in libsystem_pthread.dylib)
So then I tried doing the serialization → string conversion → char* conversion all on the client side to make sure the issue was not caused by code on my side.
//CLIENT
while (this->keep_alive.load()) {
this_thread::sleep_for(chrono::seconds(5));
auto tensor = torch::ones({3, 4});
stringstream stream;
torch::save(tensor, stream);
torch::Tensor load_tensor;
string stream_str = stream.str();
const char *stream_str_char_ptr = stream_str.c_str();
stringstream load_stream = stringstream(stream_str_char_ptr);
torch::load(load_tensor, load_stream);
cout << load_tensor << endl;
}
However, this still raised the same error I pasted above. Suggesting the issue is with converting the std::stringstream
to a std::string
and then a char *
. I will appreciate some advice on how to go around resolving this issue.
Edit:
The client and the server are both locally running on x86_64 MacOS.