I'm trying to make a simple library for de/serialization in c++, but I know it can be tricky to implement, so I'd really like to have my code reviewed to see if there's anything that stands out and/or can be improved. Here is the full repo containing all the code. Any other necessary code can be found there, as most of it was written by others.
binaryio/reader.h:
#include <memory>
#include <span>
#include <stdexcept>
#include "binaryio/interface.h"
#include "binaryio/swap.h"
#ifndef BINARYIO_READER_H
#define BINARYIO_READER_H
namespace binaryio {
class BinaryReader : IBinaryIO {
public:
BinaryReader(void* const begin, void* const end)
: m_begin(reinterpret_cast<char*>(begin)),
m_end(reinterpret_cast<char*>(end)),
m_current(reinterpret_cast<char*>(begin)){};
BinaryReader(void* const begin, void* const end, const endian& endianness)
: m_begin{reinterpret_cast<char*>(begin)},
m_end{reinterpret_cast<char*>(end)},
m_current{reinterpret_cast<char*>(begin)}, m_endian{endianness} {};
BinaryReader(void* const begin, const std::ptrdiff_t& size)
: m_begin{reinterpret_cast<char*>(begin)}, m_end{m_begin + size},
m_current{reinterpret_cast<char*>(begin)} {};
BinaryReader(void* const begin, const std::ptrdiff_t& size,
const endian& endianness)
: m_begin{reinterpret_cast<char*>(begin)}, m_end{m_begin + size},
m_current{reinterpret_cast<char*>(begin)}, m_endian{endianness} {};
void seek(std::ptrdiff_t offset) override {
if (m_begin + offset > m_end)
throw std::out_of_range("out of bounds seek");
m_current = m_begin + offset;
}
size_t tell() const override {
size_t offset{static_cast<size_t>(m_current - m_begin)};
return offset;
}
template <typename T>
T read() {
if (m_current + sizeof(T) > m_end)
throw std::out_of_range("out of bounds read");
T val = *(T*)m_current;
swap_if_needed_in_place(val, m_endian);
m_current += sizeof(T);
return val;
}
std::string read_string(size_t max_len = 0) {
if (m_current + max_len > m_end || max_len == 0)
max_len = m_end - m_current;
return {m_current, strnlen(m_current, max_len)};
}
template <typename T>
std::span<T> read_many(int count) {
if (m_current + sizeof(T) * count > m_end)
throw std::out_of_range("out of bound read");
std::span<T> vals{{}, count};
for (int i{0}; i < count; ++i) {
vals[i] = *(T*)m_current;
swap_if_needed_in_place(vals[i], m_endian);
m_current += sizeof(T);
}
return vals;
}
endian endianness() { return m_endian; }
void set_endianness(endian new_endian) { m_endian = new_endian; }
void swap_endianness() {
if (m_endian == endian::big)
m_endian = endian::little;
else
m_endian = endian::big;
}
private:
char* m_begin;
char* m_end;
char* m_current;
endian m_endian{endian::native};
};
} // namespace binaryio
#endif
binaryio/writer.h:
#include <memory>
#include <span>
#include <vector>
#include "binaryio/align.h"
#include "binaryio/interface.h"
#include "binaryio/swap.h"
#ifndef BINARYIO_WRITER_H
#define BINARYIO_WRITER_H
namespace binaryio {
class BinaryWriter : IBinaryIO {
public:
// Based on
// https://github.com/zeldamods/oead/blob/master/src/include/oead/util/binary_reader.h
BinaryWriter() = default;
BinaryWriter(endian byte_order) : m_endian{byte_order} {};
std::vector<uint8_t> finalize() { return std::move(m_storage); }
void seek(std::ptrdiff_t offset) override { m_offset = offset; };
size_t tell() const override { return m_offset; }
void write_bytes(const uint8_t* data, size_t size) {
std::span<const uint8_t> bytes{data, size};
if (m_offset + bytes.size() > m_storage.size())
m_storage.resize(m_offset + bytes.size());
std::memcpy(&m_storage[m_offset], bytes.data(), bytes.size());
m_offset += bytes.size();
};
template <typename T,
typename std::enable_if_t<!std::is_pointer_v<T> &&
std::is_trivially_copyable_v<T>>* = nullptr>
void write(T value) {
swap_if_needed_in_place(value, m_endian);
write_bytes(reinterpret_cast<const uint8_t*>(&value), sizeof(value));
}
void write(std::string_view str) {
write_bytes(reinterpret_cast<const uint8_t*>(str.data()), str.size());
}
void write_null() { write<uint8_t>(0); }
void write_cstr(std::string_view str) {
write(str);
write_null();
}
void align_up(size_t n) { seek(AlignUp(tell(), n)); }
private:
std::vector<uint8_t> m_storage;
size_t m_offset{0};
endian m_endian{endian::native};
};
} // namespace binaryio
#endif
And here's a test I wrote using a wav file as a base:
#include <iostream>
#include <fstream>
#include <filesystem>
#include "binaryio/reader.h"
#include "binaryio/writer.h"
struct FileHeader {
uint32_t magic;
uint32_t fileSize;
BINARYIO_DEFINE_FIELDS(FileHeader, magic, fileSize);
};
struct WaveFile {
FileHeader riffHeader;
std::array<uint8_t, 4> magic;
std::array<uint8_t, 4> fmt;
uint32_t fmtSize;
uint16_t audioFormat;
uint16_t numChannels;
uint32_t sampleRate;
uint32_t byteRate;
uint16_t blockAlign;
uint16_t bitsPerSample;
std::array<uint8_t, 4> dataMagic;
uint32_t dataSize;
// Data starts
BINARYIO_DEFINE_FIELDS(WaveFile, riffHeader, magic, fmt, fmtSize, audioFormat, numChannels, sampleRate, byteRate, blockAlign, bitsPerSample, dataMagic, dataSize);
};
int main(int argc, char** argv)
try {
std::vector<uint8_t> bytes(std::filesystem::file_size(argv[1]));
{
std::ifstream ifs{argv[1]};
ifs.read(reinterpret_cast<char*>(bytes.data()), bytes.size());
}
// Read from the byte buffer
binaryio::BinaryReader reader{bytes.begin().base(), bytes.end().base()};
WaveFile wav {reader.read<WaveFile>()};
std::vector<uint8_t> data;
for (int i {sizeof(WaveFile)}; i<bytes.size(); ++i)
data.push_back(reader.read<uint8_t>());
std::cout << "Riff Magic: " << std::hex << wav.riffHeader.magic << std::endl;
std::cout << "Wave Magic: " << wav.magic.data() << std::endl;
// Write a new file
binaryio::BinaryWriter writer{binaryio::endian::little};
writer.write(wav);
for (uint8_t byte : data)
writer.write(byte);
bytes.clear();
bytes = writer.finalize();
{
std::ofstream ofs{"out_little.wav"};
ofs.write(reinterpret_cast<char*>(bytes.data()), bytes.size());
}
// Write a different file with its endianness swapped
writer = {binaryio::endian::big};
writer.write(wav);
for (uint8_t byte : data)
writer.write(byte);
bytes.clear();
bytes = writer.finalize();
{
std::ofstream ofs{"out_big.wav"};
ofs.write(reinterpret_cast<char*>(bytes.data()), wav.riffHeader.fileSize + sizeof(FileHeader));
}
// Read the new file, and compare the result with the original struct
reader = {bytes.begin().base(), bytes.end().base()};
if (reader.read<uint32_t>() == 0x52494646) {
reader.swap_endianness();
reader.seek(0);
}
WaveFile other_wav {reader.read<WaveFile>()};
std::cout << "Riff Magic: " << std::hex << other_wav.riffHeader.magic << std::endl;
std::cout << "Wave Magic: " << other_wav.magic.data() << std::endl;
if (wav.sampleRate == other_wav.sampleRate)
std::cout << "Data preserved, endianness swapped" << std::endl;
else {
throw std::runtime_error("Something went wrong and the data was changed");
}
return 0;
}
catch (std::runtime_error& err) {
std::cerr << err.what() << std::endl;
return 1;
}
```
2 Answers 2
Make the (de)serializer work on streams
If I look at your example main()
, I see a lot of inefficiencies. Part of that is caused by having to read the data into a memory buffer before it can be deserialized, and similarly you have to completely fill a memory buffer before you can finalize the results and write it out. It would be much nicer if your code could work on files directly. Consider being able to write:
std::ifstream ifs{argv[1]};
binaryio::BinaryReader reader{ifs};
auto wav = reader.read<WaveFile>();
...
This could be implemented like so:
class BinaryReader {
std::istream& ifs;
public:
BinaryReader(std::ifstream& ifs): ifs{ifs} {}
template<typename T>
T read() {
T val;
if (!ifs.read(reinterpret_cast<char*>(&val), sizeof val))
throw std::out_of_range("out of bounds read");
swap_if_needed_in_place(val, m_endian);
return val;
}
...
};
Now you might think: but what if my data is not in a file to begin with? The great thing about C++'s I/O functions is that they support more than just files. For example, the above reader can also work on std::istringstream
s, and C++23 makes it easy to turn any contiguous buffer of memory into a stream using std::ispanstream
.
Note that if your classes work on streams, it is no longer necessary to provide stream-like functions like tell()
and seek()
yourself.
The same goes for output.
Make it easy to read and write arrays
Your example shows reading the data from a WAV file byte by byte. That's potentially going to be slow, and it is inconvenient. It would be much nicer if you could write:
auto data = reader.read<std::vector<uint8_t>>(wav.datasize);
You can make this work by creating an overload for read()
that checks if T
is a container (in C++20, that is easy using concepts like std::ranges::output_range
), and then creates a value of that type and read directly into it.
It might be some work to get it to work for all container types though, and you could instead consider creating a function that only returns std::vector
s, or a function that takes a std::span
to read into, so that you can write code like:
auto data = reader.read_vector<uint8_t>(wav.datasize);
// or:
std::vector<uint8_t> data(wav.datasize);
reader.read_span(std::span{data});
About endianness
If you call reader.read<WaveFile>()
, how does your code know how to swap endianness? Not everything is a 32-bit value in the WAV header, and the WAV data might contain samples in various formats, not just 8 bit values. If you swap 32 bits at a time, then two consecutive 16-bit fields in the header might get swapped. If your data consists of 8-bit samples, they should never be swapped.
Most serialization libraries allow you to provide custom serializers for your own types. That way, you can provide a serializer for WaveFile
that serializes each header value separately. There are several approaches possible. For example, you could allow user code to overload the read and write functions, or you could check if a given type has serialize()
and deserialize()
member functions. Have a look at how Boost Serialization handles this.
-
\$\begingroup\$ The way it deals with endianness is with the
BINARYIO_DEFINE_FIELDS
macro, which exposes every field passed to it so that it can automatically check the size and and swap accordingly. \$\endgroup\$Nitram– Nitram2023年02月16日 21:39:43 +00:00Commented Feb 16, 2023 at 21:39 -
\$\begingroup\$ I don't see a definition of that macro, nor of
swap_if_needed_in_place()
, so I had to guess. \$\endgroup\$G. Sliepen– G. Sliepen2023年02月16日 22:14:43 +00:00Commented Feb 16, 2023 at 22:14 -
\$\begingroup\$ I just wanted to say thanks for the feedback. Refactoring it to use streams instead makes the code much more streamlined, although I can't test it with
std::spanstream
since C++23 doesn't seem to be available yet (please correct me if I'm wrong). I also would like to point out that reading straight into the vector data was about 2x faster that what I was doing before. \$\endgroup\$Nitram– Nitram2023年02月19日 16:49:45 +00:00Commented Feb 19, 2023 at 16:49 -
\$\begingroup\$ Found out that I just had to update on my end, so please ignore that part about C++23 not being available. \$\endgroup\$Nitram– Nitram2023年02月19日 17:03:34 +00:00Commented Feb 19, 2023 at 17:03
-
\$\begingroup\$ No problem! It's good to hear that my suggestions improved your code. Consider posting the refactored code as a new question on Code Review. It would also be interesting to see how you implemented BINARYIO_DEFINE_FIELDS and
swap_if_needed_in_place()
. \$\endgroup\$G. Sliepen– G. Sliepen2023年02月19日 17:04:54 +00:00Commented Feb 19, 2023 at 17:04
BinaryReader
constructors:
I think the constructors should take
char*
s, notvoid*
s to be clear to the user what's actually needed (a contiguous array of characters). Any necessaryreinterpret_cast
ing should be done by the user, where it's nice and visible.Why does a "Reader" class take a mutable range? We should probably be taking a
const char*
(orchar const*
).Use
std::size_t
, notstd::ptrdiff_t
, since we expect a contiguous range, andbegin
should always be beforeend
(we should probably check that). Pass it by value, notconst&
.Assuming
endian
is an enum, it should also be passed by value.
seek()
/ tell()
:
The range check in seek is inadequate; since
std::ptrdiff_t
can be negative we may end up beforem_begin
, as well as afterm_end
.In C++ we should use
std::size_t
instead ofsize_t
, as the latter is technically not guaranteed to be present.
read()
:
C-style casting should be avoided, because it's ambiguous and dangerous.
We also need to be careful about C++'s "strict aliasing" rules. We can examine the underlying representation of a
T
by casting aT*
to achar*
, but there's no guarantee that the opposite will work. To convert our array of chars to aT
, we need to usestd::memcpy
, bearing in mind thatT
must be trivially copyable. So:
T val;
std::memcpy(&val, m_current, sizeof(T));
BinaryWriter::write()
checks that the object type is trivially copyable - we should do that here too.
read_string()
:
- As a user, I'd expect that setting
max_len
to zero would return an empty string. Consider settingstd::string::npos
as the default value formax_len
, and use it to symbolize taking the maximum possible string, similar to howstd::string::substr
behaves.
read_many()
:
Should take a
std::size_t
, not anint
parameter.Could use the existing
read()
function to read the value (or abstract the functionality to a private member function to avoid duplicate bounds checking).
BinaryWriter
Using uint8_t
for the storage type is inconsistent with the BinaryReader
. It might be simpler to use char
for both.
seek()
/ tell()
:
Your compiler should give you an error message about mismatched types, since
m_offset
is asize_t
, andoffset
astd::ptrdiff_t
. If not, turn up your compiler warning level and enable warnings as errors!The offset in
seek
is treated as an absolute value, rather than a relative value which is how it's treated inBinaryReader
. This inconsistency is confusing and potentially dangerous. If we do want to use an absolute value,std::size_t
would be a better choice thanstd::ptrdiff_t
. (As is, we need to check somewhere thatm_offset
isn't negative).
write_bytes()
:
Why not just take the
span
as the parameter?The C++ version of
uint8_t
isstd::uint8_t
. However, I think we actually need achar*
orunsigned char*
instead.std::uint8_t
is usually (almost certainly) a typedef forunsigned char
, but it may technically be something else. If it isn'tunsigned char
then we run into "strict aliasing" problems again. We can only get the object representation by casting tochar*
,unsigned char*
, orstd::byte
. So we should be taking one of those three types as a parameter to this function.We're already using
std::memcpy
to go from the object representation to storage, which is good!We're missing an equivalent
read_bytes
on theBinaryReader
.
write()
:
See above about accessing the underlying object representation.
write(string_view)
write_null
,write_cstr
andalign_up
are all oddly specific for a general purpose binary writer. Perhaps they'd be better placed with the relevant reading / writing code for the specific file formats that need them?
-
\$\begingroup\$ Could you clarify why should some things be passed by value? I assumed it would be more memory efficient to pass by (const) reference in most cases \$\endgroup\$Nitram– Nitram2023年02月19日 17:02:29 +00:00Commented Feb 19, 2023 at 17:02
-
\$\begingroup\$ On a 64 bit system, a std::ptrdiff_t will usually be 64 bits. A pointer (or reference) will also be 64 bits and has the additional performance cost of the indirection (although in practice it will probably be optimized to simply pass by value anyway). It's just more readable to pass small built-in types by value, and makes passing by mutable reference stand out more. \$\endgroup\$user673679– user6736792023年02月20日 21:54:14 +00:00Commented Feb 20, 2023 at 21:54
swap_if_needed_in_place()
is significant, yet that's not even declared here. I think we need more of the code to give it a decent review. And as J_H says, show the unit-tests to give us an idea of intended usage. \$\endgroup\$