Binary (de)serialization in c++

Question 1

I'm trying to make a simple library for de/serialization in c++, but I know it can be tricky to implement, so I'd really like to have my code reviewed to see if there's anything that stands out and/or can be improved. Here is the full repo containing all the code. Any other necessary code can be found there, as most of it was written by others.

binaryio/reader.h:

#include <memory>
#include <span>
#include <stdexcept>
#include "binaryio/interface.h"
#include "binaryio/swap.h"
#ifndef BINARYIO_READER_H
#define BINARYIO_READER_H
namespace binaryio {
class BinaryReader : IBinaryIO {
public:
 BinaryReader(void* const begin, void* const end)
 : m_begin(reinterpret_cast<char*>(begin)),
 m_end(reinterpret_cast<char*>(end)),
 m_current(reinterpret_cast<char*>(begin)){};
 BinaryReader(void* const begin, void* const end, const endian& endianness)
 : m_begin{reinterpret_cast<char*>(begin)},
 m_end{reinterpret_cast<char*>(end)},
 m_current{reinterpret_cast<char*>(begin)}, m_endian{endianness} {};
 BinaryReader(void* const begin, const std::ptrdiff_t& size)
 : m_begin{reinterpret_cast<char*>(begin)}, m_end{m_begin + size},
 m_current{reinterpret_cast<char*>(begin)} {};
 BinaryReader(void* const begin, const std::ptrdiff_t& size,
 const endian& endianness)
 : m_begin{reinterpret_cast<char*>(begin)}, m_end{m_begin + size},
 m_current{reinterpret_cast<char*>(begin)}, m_endian{endianness} {};
 void seek(std::ptrdiff_t offset) override {
 if (m_begin + offset > m_end)
 throw std::out_of_range("out of bounds seek");
 m_current = m_begin + offset;
 }
 size_t tell() const override {
 size_t offset{static_cast<size_t>(m_current - m_begin)};
 return offset;
 }
 template <typename T>
 T read() {
 if (m_current + sizeof(T) > m_end)
 throw std::out_of_range("out of bounds read");
 T val = *(T*)m_current;
 swap_if_needed_in_place(val, m_endian);
 m_current += sizeof(T);
 return val;
 }
 std::string read_string(size_t max_len = 0) {
 if (m_current + max_len > m_end || max_len == 0)
 max_len = m_end - m_current;
 return {m_current, strnlen(m_current, max_len)};
 }
 template <typename T>
 std::span<T> read_many(int count) {
 if (m_current + sizeof(T) * count > m_end)
 throw std::out_of_range("out of bound read");
 std::span<T> vals{{}, count};
 for (int i{0}; i < count; ++i) {
 vals[i] = *(T*)m_current;
 swap_if_needed_in_place(vals[i], m_endian);
 m_current += sizeof(T);
 }
 return vals;
 }
 endian endianness() { return m_endian; }
 void set_endianness(endian new_endian) { m_endian = new_endian; }
 void swap_endianness() {
 if (m_endian == endian::big)
 m_endian = endian::little;
 else
 m_endian = endian::big;
 }
private:
 char* m_begin;
 char* m_end;
 char* m_current;
 endian m_endian{endian::native};
};
} // namespace binaryio
#endif

binaryio/writer.h:

#include <memory>
#include <span>
#include <vector>
#include "binaryio/align.h"
#include "binaryio/interface.h"
#include "binaryio/swap.h"
#ifndef BINARYIO_WRITER_H
#define BINARYIO_WRITER_H
namespace binaryio {
class BinaryWriter : IBinaryIO {
public:
 // Based on
 // https://github.com/zeldamods/oead/blob/master/src/include/oead/util/binary_reader.h
 BinaryWriter() = default;
 BinaryWriter(endian byte_order) : m_endian{byte_order} {};
 std::vector<uint8_t> finalize() { return std::move(m_storage); }
 void seek(std::ptrdiff_t offset) override { m_offset = offset; };
 size_t tell() const override { return m_offset; }
 void write_bytes(const uint8_t* data, size_t size) {
 std::span<const uint8_t> bytes{data, size};
 if (m_offset + bytes.size() > m_storage.size())
 m_storage.resize(m_offset + bytes.size());
 std::memcpy(&m_storage[m_offset], bytes.data(), bytes.size());
 m_offset += bytes.size();
 };
 template <typename T,
 typename std::enable_if_t<!std::is_pointer_v<T> &&
 std::is_trivially_copyable_v<T>>* = nullptr>
 void write(T value) {
 swap_if_needed_in_place(value, m_endian);
 write_bytes(reinterpret_cast<const uint8_t*>(&value), sizeof(value));
 }
 void write(std::string_view str) {
 write_bytes(reinterpret_cast<const uint8_t*>(str.data()), str.size());
 }
 void write_null() { write<uint8_t>(0); }
 void write_cstr(std::string_view str) {
 write(str);
 write_null();
 }
 void align_up(size_t n) { seek(AlignUp(tell(), n)); }
private:
 std::vector<uint8_t> m_storage;
 size_t m_offset{0};
 endian m_endian{endian::native};
};
} // namespace binaryio
#endif

And here's a test I wrote using a wav file as a base:

#include <iostream>
#include <fstream>
#include <filesystem>
#include "binaryio/reader.h"
#include "binaryio/writer.h"
struct FileHeader {
 uint32_t magic;
 uint32_t fileSize;
 BINARYIO_DEFINE_FIELDS(FileHeader, magic, fileSize);
};
struct WaveFile {
 FileHeader riffHeader;
 std::array<uint8_t, 4> magic;
 std::array<uint8_t, 4> fmt;
 uint32_t fmtSize;
 uint16_t audioFormat;
 uint16_t numChannels;
 uint32_t sampleRate;
 uint32_t byteRate;
 uint16_t blockAlign;
 uint16_t bitsPerSample;
 std::array<uint8_t, 4> dataMagic;
 uint32_t dataSize;
 // Data starts
 BINARYIO_DEFINE_FIELDS(WaveFile, riffHeader, magic, fmt, fmtSize, audioFormat, numChannels, sampleRate, byteRate, blockAlign, bitsPerSample, dataMagic, dataSize);
};
int main(int argc, char** argv)
try {
 std::vector<uint8_t> bytes(std::filesystem::file_size(argv[1]));
 {
 std::ifstream ifs{argv[1]};
 ifs.read(reinterpret_cast<char*>(bytes.data()), bytes.size());
 }
 // Read from the byte buffer
 binaryio::BinaryReader reader{bytes.begin().base(), bytes.end().base()};
 WaveFile wav {reader.read<WaveFile>()};
 std::vector<uint8_t> data;
 for (int i {sizeof(WaveFile)}; i<bytes.size(); ++i)
 data.push_back(reader.read<uint8_t>());
 std::cout << "Riff Magic: " << std::hex << wav.riffHeader.magic << std::endl;
 std::cout << "Wave Magic: " << wav.magic.data() << std::endl;
 // Write a new file
 binaryio::BinaryWriter writer{binaryio::endian::little};
 writer.write(wav);
 for (uint8_t byte : data)
 writer.write(byte);
 bytes.clear();
 bytes = writer.finalize();
 {
 std::ofstream ofs{"out_little.wav"};
 ofs.write(reinterpret_cast<char*>(bytes.data()), bytes.size());
 }
 // Write a different file with its endianness swapped
 writer = {binaryio::endian::big};
 writer.write(wav);
 for (uint8_t byte : data)
 writer.write(byte);
 bytes.clear();
 bytes = writer.finalize();
 {
 std::ofstream ofs{"out_big.wav"};
 ofs.write(reinterpret_cast<char*>(bytes.data()), wav.riffHeader.fileSize + sizeof(FileHeader));
 }
 // Read the new file, and compare the result with the original struct
 reader = {bytes.begin().base(), bytes.end().base()};
 if (reader.read<uint32_t>() == 0x52494646) {
 reader.swap_endianness();
 reader.seek(0);
 }
 WaveFile other_wav {reader.read<WaveFile>()};
 std::cout << "Riff Magic: " << std::hex << other_wav.riffHeader.magic << std::endl;
 std::cout << "Wave Magic: " << other_wav.magic.data() << std::endl;
 if (wav.sampleRate == other_wav.sampleRate)
 std::cout << "Data preserved, endianness swapped" << std::endl;
 else {
 throw std::runtime_error("Something went wrong and the data was changed");
 }
 return 0;
}
catch (std::runtime_error& err) {
 std::cerr << err.what() << std::endl;
 return 1;
}
```

Question 2

I don't understand this code at all. And I don't want the author to tell me, I want the code to tell me. Here's what I'm grappling with. We serialize int64's to a file on host A. We read them back on host B. Where did we write out a Magic Number file prefix? (I think we didn't.) Where did we write out "A was using big-endian, and the file does, too!"? (I think we didn't.) Where does B notice it needs byte swaps to convert to B's little-endian setup? Also, get_platform_endianness() seems to have no notion of A writing data once that will be convenient for B to read a million times. Unit test?

Question 3

I'm guessing swap_if_needed_in_place() is significant, yet that's not even declared here. I think we need more of the code to give it a decent review. And as J_H says, show the unit-tests to give us an idea of intended usage.

Question 4

@Nitram The comments say your code is based on code from another project. I checked and that project is licensed under the GPL version 2 or later. You are allowed to reuse that code, but you have to make sure your code also is published under the same license, and regardless, you should credit the original author of the code.

Question 5

@G.Sliepen I've licensed it under it GPL version 3. Does that not work? Also, should credits be more visible? I assumed it was enough with the copyright notice of each file detailing where they're from, but I'll add them to the README if that's preferable.

Question 6

@Nitram Yes, credits should be visible. The exact details are in the text of the GPL license itself. There might also be a requirement from copyright law, although that varies by jurisdiction of course, and I am not a lawyer. Also have a look at this post. Also note that just adding a link is not enough, and is problematic; websites might move or go offline.

Question 7

Make the (de)serializer work on streams

If I look at your example main(), I see a lot of inefficiencies. Part of that is caused by having to read the data into a memory buffer before it can be deserialized, and similarly you have to completely fill a memory buffer before you can finalize the results and write it out. It would be much nicer if your code could work on files directly. Consider being able to write:

std::ifstream ifs{argv[1]};
binaryio::BinaryReader reader{ifs};
auto wav = reader.read<WaveFile>();
...

This could be implemented like so:

class BinaryReader {
 std::istream& ifs;
public:
 BinaryReader(std::ifstream& ifs): ifs{ifs} {}
 template<typename T>
 T read() {
 T val;
 if (!ifs.read(reinterpret_cast<char*>(&val), sizeof val))
 throw std::out_of_range("out of bounds read");
 swap_if_needed_in_place(val, m_endian);
 return val;
 }
 ...
};

Now you might think: but what if my data is not in a file to begin with? The great thing about C++'s I/O functions is that they support more than just files. For example, the above reader can also work on std::istringstreams, and C++23 makes it easy to turn any contiguous buffer of memory into a stream using std::ispanstream.

Note that if your classes work on streams, it is no longer necessary to provide stream-like functions like tell() and seek() yourself.

The same goes for output.

Make it easy to read and write arrays

Your example shows reading the data from a WAV file byte by byte. That's potentially going to be slow, and it is inconvenient. It would be much nicer if you could write:

auto data = reader.read<std::vector<uint8_t>>(wav.datasize);

You can make this work by creating an overload for read() that checks if T is a container (in C++20, that is easy using concepts like std::ranges::output_range), and then creates a value of that type and read directly into it.

It might be some work to get it to work for all container types though, and you could instead consider creating a function that only returns std::vectors, or a function that takes a std::span to read into, so that you can write code like:

auto data = reader.read_vector<uint8_t>(wav.datasize);
// or:
std::vector<uint8_t> data(wav.datasize);
reader.read_span(std::span{data});

About endianness

If you call reader.read<WaveFile>(), how does your code know how to swap endianness? Not everything is a 32-bit value in the WAV header, and the WAV data might contain samples in various formats, not just 8 bit values. If you swap 32 bits at a time, then two consecutive 16-bit fields in the header might get swapped. If your data consists of 8-bit samples, they should never be swapped.

Most serialization libraries allow you to provide custom serializers for your own types. That way, you can provide a serializer for WaveFile that serializes each header value separately. There are several approaches possible. For example, you could allow user code to overload the read and write functions, or you could check if a given type has serialize() and deserialize() member functions. Have a look at how Boost Serialization handles this.

Question 8

The way it deals with endianness is with the BINARYIO_DEFINE_FIELDS macro, which exposes every field passed to it so that it can automatically check the size and and swap accordingly.

Question 9

I don't see a definition of that macro, nor of swap_if_needed_in_place(), so I had to guess.

Question 10

I just wanted to say thanks for the feedback. Refactoring it to use streams instead makes the code much more streamlined, although I can't test it with std::spanstream since C++23 doesn't seem to be available yet (please correct me if I'm wrong). I also would like to point out that reading straight into the vector data was about 2x faster that what I was doing before.

Question 11

Found out that I just had to update on my end, so please ignore that part about C++23 not being available.

Question 12

No problem! It's good to hear that my suggestions improved your code. Consider posting the refactored code as a new question on Code Review. It would also be interesting to see how you implemented BINARYIO_DEFINE_FIELDS and swap_if_needed_in_place().

Question 13

BinaryReader

constructors:

I think the constructors should take char*s, not void*s to be clear to the user what's actually needed (a contiguous array of characters). Any necessary reinterpret_casting should be done by the user, where it's nice and visible.
Why does a "Reader" class take a mutable range? We should probably be taking a const char* (or char const*).
Use std::size_t, not std::ptrdiff_t, since we expect a contiguous range, and begin should always be before end (we should probably check that). Pass it by value, not const&.
Assuming endian is an enum, it should also be passed by value.

seek() / tell():

The range check in seek is inadequate; since std::ptrdiff_t can be negative we may end up before m_begin, as well as after m_end.
In C++ we should use std::size_t instead of size_t, as the latter is technically not guaranteed to be present.

read():

C-style casting should be avoided, because it's ambiguous and dangerous.
We also need to be careful about C++'s "strict aliasing" rules. We can examine the underlying representation of a T by casting a T* to a char*, but there's no guarantee that the opposite will work. To convert our array of chars to a T, we need to use std::memcpy, bearing in mind that T must be trivially copyable. So:

 T val;
 std::memcpy(&val, m_current, sizeof(T));

BinaryWriter::write() checks that the object type is trivially copyable - we should do that here too.

read_string():

As a user, I'd expect that setting max_len to zero would return an empty string. Consider setting std::string::npos as the default value for max_len, and use it to symbolize taking the maximum possible string, similar to how std::string::substr behaves.

read_many():

Should take a std::size_t, not an int parameter.
Could use the existing read() function to read the value (or abstract the functionality to a private member function to avoid duplicate bounds checking).

`BinaryWriter`

Using uint8_t for the storage type is inconsistent with the BinaryReader. It might be simpler to use char for both.

seek() / tell():

Your compiler should give you an error message about mismatched types, since m_offset is a size_t, and offset a std::ptrdiff_t. If not, turn up your compiler warning level and enable warnings as errors!
The offset in seek is treated as an absolute value, rather than a relative value which is how it's treated in BinaryReader. This inconsistency is confusing and potentially dangerous. If we do want to use an absolute value, std::size_t would be a better choice than std::ptrdiff_t. (As is, we need to check somewhere that m_offset isn't negative).

write_bytes():

Why not just take the span as the parameter?
The C++ version of uint8_t is std::uint8_t. However, I think we actually need a char* or unsigned char* instead. std::uint8_t is usually (almost certainly) a typedef for unsigned char, but it may technically be something else. If it isn't unsigned char then we run into "strict aliasing" problems again. We can only get the object representation by casting to char*, unsigned char*, or std::byte. So we should be taking one of those three types as a parameter to this function.

We're already using std::memcpy to go from the object representation to storage, which is good!
We're missing an equivalent read_bytes on the BinaryReader.

write():

See above about accessing the underlying object representation.
write(string_view) write_null, write_cstr and align_up are all oddly specific for a general purpose binary writer. Perhaps they'd be better placed with the relevant reading / writing code for the specific file formats that need them?

Question 14

Could you clarify why should some things be passed by value? I assumed it would be more memory efficient to pass by (const) reference in most cases

Question 15

On a 64 bit system, a std::ptrdiff_t will usually be 64 bits. A pointer (or reference) will also be 64 bits and has the additional performance cost of the indirection (although in practice it will probably be optimized to simply pass by value anyway). It's just more readable to pass small built-in types by value, and makes passing by mutable reference stand out more.

G. Sliepen G. Sliepen 68.7k3 gold badges74 silver badges179 bronze badges · Accepted Answer · 2023-02-16 12:37:06Z

Make the (de)serializer work on streams

If I look at your example main(), I see a lot of inefficiencies. Part of that is caused by having to read the data into a memory buffer before it can be deserialized, and similarly you have to completely fill a memory buffer before you can finalize the results and write it out. It would be much nicer if your code could work on files directly. Consider being able to write:

std::ifstream ifs{argv[1]};
binaryio::BinaryReader reader{ifs};
auto wav = reader.read<WaveFile>();
...

This could be implemented like so:

class BinaryReader {
 std::istream& ifs;
public:
 BinaryReader(std::ifstream& ifs): ifs{ifs} {}
 template<typename T>
 T read() {
 T val;
 if (!ifs.read(reinterpret_cast<char*>(&val), sizeof val))
 throw std::out_of_range("out of bounds read");
 swap_if_needed_in_place(val, m_endian);
 return val;
 }
 ...
};

Now you might think: but what if my data is not in a file to begin with? The great thing about C++'s I/O functions is that they support more than just files. For example, the above reader can also work on std::istringstreams, and C++23 makes it easy to turn any contiguous buffer of memory into a stream using std::ispanstream.

Note that if your classes work on streams, it is no longer necessary to provide stream-like functions like tell() and seek() yourself.

The same goes for output.

Make it easy to read and write arrays

Your example shows reading the data from a WAV file byte by byte. That's potentially going to be slow, and it is inconvenient. It would be much nicer if you could write:

auto data = reader.read<std::vector<uint8_t>>(wav.datasize);

You can make this work by creating an overload for read() that checks if T is a container (in C++20, that is easy using concepts like std::ranges::output_range), and then creates a value of that type and read directly into it.

It might be some work to get it to work for all container types though, and you could instead consider creating a function that only returns std::vectors, or a function that takes a std::span to read into, so that you can write code like:

auto data = reader.read_vector<uint8_t>(wav.datasize);
// or:
std::vector<uint8_t> data(wav.datasize);
reader.read_span(std::span{data});

About endianness

If you call reader.read<WaveFile>(), how does your code know how to swap endianness? Not everything is a 32-bit value in the WAV header, and the WAV data might contain samples in various formats, not just 8 bit values. If you swap 32 bits at a time, then two consecutive 16-bit fields in the header might get swapped. If your data consists of 8-bit samples, they should never be swapped.

Most serialization libraries allow you to provide custom serializers for your own types. That way, you can provide a serializer for WaveFile that serializes each header value separately. There are several approaches possible. For example, you could allow user code to overload the read and write functions, or you could check if a given type has serialize() and deserialize() member functions. Have a look at how Boost Serialization handles this.

The way it deals with endianness is with the BINARYIO_DEFINE_FIELDS macro, which exposes every field passed to it so that it can automatically check the size and and swap accordingly.
I don't see a definition of that macro, nor of swap_if_needed_in_place(), so I had to guess.
I just wanted to say thanks for the feedback. Refactoring it to use streams instead makes the code much more streamlined, although I can't test it with std::spanstream since C++23 doesn't seem to be available yet (please correct me if I'm wrong). I also would like to point out that reading straight into the vector data was about 2x faster that what I was doing before.
Found out that I just had to update on my end, so please ignore that part about C++23 not being available.
No problem! It's good to hear that my suggestions improved your code. Consider posting the refactored code as a new question on Code Review. It would also be interesting to see how you implemented BINARYIO_DEFINE_FIELDS and swap_if_needed_in_place().

Stack Exchange Network

Binary (de)serialization in c++

2 Answers 2

Make the (de)serializer work on streams

Make it easy to read and write arrays

About endianness

BinaryReader

`BinaryWriter`

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Binary (de)serialization in c++

2 Answers 2

Make the (de)serializer work on streams

Make it easy to read and write arrays

About endianness

BinaryReader

BinaryWriter

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions

`BinaryWriter`