What I'd like to do is read data from a stream (like ifstream::read), but reserve N = 4 bytes from being returned. The last 4 bytes of the stream are not part of the data itself, but are metadata.
In the general case, the stream is not an ifstream so I won't be able to seek, putback, or determine the size of the stream ahead of time. Here I'm just using it as an example.
My approach is to keep a reserve buffer of 4 bytes to ensure I never accidentally return the last four bytes of the stream. However, the logic turned out to be pretty hard to manage... I'm hoping there's a cleaner way to accomplish what I want.
I'd like to ask for a code review in terms readability (is there a better approach?) and/or performance (is there anything I'm doing that is suboptimal?)
#include <iostream>
#include <array>
#include <vector>
#include <fstream>
#include <cstring>
#define RESERVE_SIZE 4
static std::array<char, RESERVE_SIZE> reserve = {0,0,0,0}; // buffer for holding bytes at end of stream
static bool first_read = true; // true if first read and reserve buffer is empty
// helper function to print output
void data_printer(std::string msg, char * bd, size_t bytes_read) {
std::cout << msg << bytes_read << ": ";
for(size_t i=0; i < bytes_read; i++) std::cout << (int)bd[i] << " ";
std::cout << std::endl;
}
// helper function to read and return # of bytes read
size_t read_count(std::ifstream & myFile, char * dst, size_t length) {
myFile.read(dst, length);
return myFile.gcount();
}
// dst -- output buffer, has at least "length" bytes
// exact -- if true, expect "length" bytes read from myFile and copied to dst; throw error if less than that
// return value is the number of bytes read from my File and copied to dst
size_t read_reserve(std::ifstream & myFile, char * dst, size_t length, bool exact=false) {
if(first_read) {
myFile.read(reserve.data(), RESERVE_SIZE);
first_read = false;
}
if(exact) {
if(length >= RESERVE_SIZE) {
std::memcpy(dst, reserve.data(), RESERVE_SIZE);
myFile.read(dst + RESERVE_SIZE, length - RESERVE_SIZE);
myFile.read(reserve.data(), RESERVE_SIZE);
if(myFile.gcount() != RESERVE_SIZE) throw std::runtime_error("not enough data in file :(");
return length;
} else { // RESERVE_SIZE > length
std::memcpy(dst, reserve.data(), length);
// since some of the reserve buffer was consumed, shift the unconsumed bytes to beginning of array
// then read from file to fill up reserve buffer
std::memmove(reserve.data(), reserve.data() + length, RESERVE_SIZE - length);
myFile.read(reserve.data() + RESERVE_SIZE - length, length);
if(myFile.gcount() != length) throw std::runtime_error("not enough data in file :(");
return length;
}
} else { // !exact -- we can't assume that "length" bytes are left in myFile; there could even be zero bytes left
if(length >= RESERVE_SIZE) {
// use "dst" as a temporary buffer, since it's already allocated
// it is not a good idea to allocate a temp buffer of size length, as length can be large
std::memcpy(dst, reserve.data(), RESERVE_SIZE);
size_t n_read = read_count(myFile, dst + RESERVE_SIZE, length - RESERVE_SIZE);
size_t n_bufferable = n_read + RESERVE_SIZE;
if(n_bufferable < length) {
std::memcpy(reserve.data(), dst + n_bufferable - RESERVE_SIZE, RESERVE_SIZE);
return n_bufferable - RESERVE_SIZE;
} else {
std::array<char, RESERVE_SIZE> temp_buffer = {0,0,0,0};
size_t temp_size = read_count(myFile, temp_buffer.data(), RESERVE_SIZE);
std::memcpy(reserve.data(), dst + n_bufferable - (RESERVE_SIZE - temp_size), RESERVE_SIZE - temp_size);
std::memcpy(reserve.data() + RESERVE_SIZE - temp_size, temp_buffer.data(), temp_size);
return n_bufferable - (RESERVE_SIZE - temp_size);
}
} else { // length < RESERVE_SIZE
std::vector<char> temp_buffer(length, '0円');
size_t return_value = read_count(myFile, temp_buffer.data(), length);
// n_bufferable is at most RESERVE_SIZE*2 - 1 = 7
std::memcpy(dst, reserve.data(), return_value);
std::memmove(reserve.data(), reserve.data() + return_value, RESERVE_SIZE - return_value);
std::memcpy(reserve.data() + (RESERVE_SIZE - return_value), temp_buffer.data(), return_value);
return return_value;
}
}
}
// example function usage
int main() {
std::ofstream outfile("/tmp/temp.bin", std::ios::out | std::ios::binary);
std::array<char, 30> mydata = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29};
outfile.write(mydata.data(), 30);
outfile.close();
std::cout << "test1" << std::endl;
std::ifstream infile("/tmp/temp.bin", std::ios::in | std::ios::binary);
std::vector<char> buffer(100, '0円');
size_t bytes_read;
char * bd = buffer.data();
bytes_read = read_reserve(infile, bd, 3, true); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE);
bytes_read = read_reserve(infile, bd, 3, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE);
bytes_read = read_reserve(infile, bd, 5, true); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE);
bytes_read = read_reserve(infile, bd, 5, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE);
bytes_read = read_reserve(infile, bd, 100, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE);
infile.close();
std::cout << "test2" << std::endl;
first_read = true;
std::ifstream infile2("/tmp/temp.bin", std::ios::in | std::ios::binary);
bytes_read = read_reserve(infile2, bd, 28, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE);
bytes_read = read_reserve(infile2, bd, 3, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE);
infile2.close();
}
// g++ -std=c++11 -O3 read_reserve.cpp -o test
1 Answer 1
In general, the code looks nice. It is structured and well-formatted.
It is probably a bad idea to have more than 80 characters in each line, because this may happen:
Also, put the body of a control statement on a separate line. For example, instead of
for(size_t i=0; i < bytes_read; i++) std::cout << (int)bd[i] << " ";
do
for (size_t i=0; i < bytes_read; i++)
std::cout << (int)bd[i] << " ";
(more on this loop later.)
General design
It seems that you are using a translation unit as a "module" and use global static
variables for implementation. Requiring the user to set first_read
manually feels wrong. What you are developing is a class that wraps the functionalities.
Also, your read_reserve
function is way too complex. A rule of thumb is that if a function is longer than about 10 lines, it is doing too much. In your case, the exact
parameter should be a separate function:
class Reader {
public:
Reader(std::istream& i)
:is{i}
{
init();
}
std::streamsize read (char* dst, std::streamsize length);
std::streamsize read_exact(char* dst, std::streamsize length);
// for the test
void print_reserve(std::ostream& os);
private:
void init();
std::streamsize read_count(char* dst, std::streamsize length);
std::istream& is;
static constexpr std::size_t reserve_size = 4;
std::array<char, reserve_size> reserve{};
};
void Reader::init()
{
read_count(reserve.data(), reserve_size);
}
This way, the user does not need to bother with first_read
. They simply create a Reader
object when they want to read something. The implementation also doesn't have to check for first_read
each time.
Code
Now let's go through the code and figure out some possible improvements.
#include <iostream> #include <array> #include <vector> #include <fstream> #include <cstring>
It is considered good practice to sort the #include
s in alphabetical order, so that you can easily figure out whether a particular header is included. Like:
#include <array>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>
In particular, you forgot #include <cstddef>
for std::size_t
.
#define RESERVE_SIZE 4 static std::array<char, RESERVE_SIZE> reserve = {0,0,0,0}; // buffer for holding bytes at end of stream static bool first_read = true; // true if first read and reserve buffer is empty
In C++, don't use #define
for constants. Use constexpr
instead.
constexpr std::size_t reserve_size = 4;
The initialization of reserve
can be a simple {}
instead of = {0,0,0,0}
. In my opinion, that's clearer.
// helper function to print output void data_printer(std::string msg, char * bd, size_t bytes_read) { std::cout << msg << bytes_read << ": "; for(size_t i=0; i < bytes_read; i++) std::cout << (int)bd[i] << " "; std::cout << std::endl; }
Don't take msg
by value. Take by const reference instead. Also, I don't see why bd
is a non-const pointer. size_t
should be std::size_t
. Prefer ++i
over i++
. In C++, avoid C-style casts. Use '\n'
instead of std::endl
when flushing is not needed. Something along the lines of:
// helper function to print output
void data_printer(const std::string& msg, const char * bd, std::size_t bytes_read)
{
std::cout << msg << bytes_read << ": ";
for (std::size_t i = 0; i < bytes_read; ++i)
std::cout << static_cast<int>(bd[i]) << ' ';
std::cout << '\n';
}
I would put a space after the for
keyword, but that is a matter of taste, I guess.
Also, the STL algorithms can be used here: (needs #include <algorithm>
and #include <iterator>
)
// helper function to print output
void data_printer(const std::string& msg, const char* bd, std::size_t bytes_read)
{
std::cout << msg << bytes_read << ": ";
std::copy_n(bd, bytes_read, std::ostream_iterator<int>{std::cout, " "});
std::cout << '\n';
}
In fact, I feel that msg
doesn't fit in here well. And it would be nice if you take a std::ostream&
parameter instead of always outputting to std::cout
. Also, this function may deserve an inline
. I may write the function like this:
void print_data(std::ostream& os, const char* data, std::size_t cnt)
{
std::copy_n(bd, bytes_read, std::ostream_iterator<int>{std::cout, " "});
}
and let the user handle the message and/or newline. It would be even nicer if we write an I/O manipulator so that we can use it like
std::cout << "bytes read: " << print_data(bd, bytes_read) << '\n';
The implementation is left as an exercise to the reader. (oops)
// helper function to read and return # of bytes read size_t read_count(std::ifstream & myFile, char * dst, size_t length) { myFile.read(dst, length); return myFile.gcount(); }
Nice little function. Also a good candidate for inline
. myFile
can be any std::istream
, not just std::ifstream
. It may be better to use std::streamsize
instead of std::size_t
here:
// helper function to read and return # of bytes read
std::streamsize read_count(std::istream& myFile, char* dst, std::streamsize length)
{
myFile.read(dst, length);
return myFile.gcount();
}
Also, stream operations can be chained, but I'm not sure whether return myFile.read(dst, length).gcount()
is more readable.
Initialization code:
myFile.read(reserve.data(), RESERVE_SIZE);
As I wrote above, since you wrote read_count
, why not use it?
Non-exact reading code, part one:
if(length >= RESERVE_SIZE) { std::memcpy(dst, reserve.data(), RESERVE_SIZE); myFile.read(dst + RESERVE_SIZE, length - RESERVE_SIZE); myFile.read(reserve.data(), RESERVE_SIZE); if(myFile.gcount() != RESERVE_SIZE) throw std::runtime_error("not enough data in file :("); return length; }
Here, the logic is actually simpler than it seems to be: emit the cached data, read the remaining characters and emit immediately, and then read and cache specified amount of data. It would be nice if you place a comment explaining this. Also, std::copy
is easier to work with than std::memcpy
, and it increments the pointer for you. (std::copy
automatically calls std::memcpy
for trivial types, so there is no performance quality.) And since you have written read_count
, use it. I am starting to feel that the if statement deserves its own function:
void read_ensure(char* dest, std::streampos size)
{
if (read_count(dest, size) != size)
throw std::runtime{"not enough data in file :("};
}
then you can use it to make the code more readable:
if (length >= reserve_size) {
dst = std::copy(reserve.begin(), reserve.end(), dst);
read_count(dst, length - reserve_size);
read_ensure(reserve.data(), reserve_size);
}
The return statement is common to both branches, so don't repeat it.
Non-exact reading code, part two:
else { // RESERVE_SIZE > length std::memcpy(dst, reserve.data(), length); // since some of the reserve buffer was consumed, shift the unconsumed bytes to beginning of array // then read from file to fill up reserve buffer std::memmove(reserve.data(), reserve.data() + length, RESERVE_SIZE - length); myFile.read(reserve.data() + RESERVE_SIZE - length, length); if(myFile.gcount() != length) throw std::runtime_error("not enough data in file :("); return length; }
We can use std::copy
here since we know the direction: (std::copy
is likely to call std::memmove
internally)
else {
std::copy(reserve.begin(), reserve.end(), dst);
std::copy(reserve.begin() + length, reserve.end(), reserve.begin());
read_ensure(reserve, length);
}
Exact reading code, part one:
if(length >= RESERVE_SIZE) { // use "dst" as a temporary buffer, since it's already allocated // it is not a good idea to allocate a temp buffer of size length, as length can be large std::memcpy(dst, reserve.data(), RESERVE_SIZE); size_t n_read = read_count(myFile, dst + RESERVE_SIZE, length - RESERVE_SIZE); size_t n_bufferable = n_read + RESERVE_SIZE; if(n_bufferable < length) { std::memcpy(reserve.data(), dst + n_bufferable - RESERVE_SIZE, RESERVE_SIZE); return n_bufferable - RESERVE_SIZE; } else { std::array<char, RESERVE_SIZE> temp_buffer = {0,0,0,0}; size_t temp_size = read_count(myFile, temp_buffer.data(), RESERVE_SIZE); std::memcpy(reserve.data(), dst + n_bufferable - (RESERVE_SIZE - temp_size), RESERVE_SIZE - temp_size); std::memcpy(reserve.data() + RESERVE_SIZE - temp_size, temp_buffer.data(), temp_size); return n_bufferable - (RESERVE_SIZE - temp_size); } }
You are actually leaking the reserved bytes. You rely on the user to not read them, but are users always careful? It may be beneficial to std::fill
the remaining bytes to zero, but that depends on your case.
Exact reading code, part two:
else { // length < RESERVE_SIZE std::vector<char> temp_buffer(length, '0円'); size_t return_value = read_count(myFile, temp_buffer.data(), length); // n_bufferable is at most RESERVE_SIZE*2 - 1 = 7 std::memcpy(dst, reserve.data(), return_value); std::memmove(reserve.data(), reserve.data() + return_value, RESERVE_SIZE - return_value); std::memcpy(reserve.data() + (RESERVE_SIZE - return_value), temp_buffer.data(), return_value); return return_value; }
Well, using std::vector
seems a little bit strange here. reserve_size
is a small number, and since you are using it in other places, so why not use std::array
?
// example function usage int main() { std::ofstream outfile("/tmp/temp.bin", std::ios::out | std::ios::binary); std::array<char, 30> mydata = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29}; outfile.write(mydata.data(), 30); outfile.close(); std::cout << "test1" << std::endl; std::ifstream infile("/tmp/temp.bin", std::ios::in | std::ios::binary); std::vector<char> buffer(100, '0円'); size_t bytes_read; char * bd = buffer.data(); bytes_read = read_reserve(infile, bd, 3, true); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE); bytes_read = read_reserve(infile, bd, 3, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE); bytes_read = read_reserve(infile, bd, 5, true); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE); bytes_read = read_reserve(infile, bd, 5, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE); bytes_read = read_reserve(infile, bd, 100, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE); infile.close(); std::cout << "test2" << std::endl; first_read = true; std::ifstream infile2("/tmp/temp.bin", std::ios::in | std::ios::binary); bytes_read = read_reserve(infile2, bd, 28, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE); bytes_read = read_reserve(infile2, bd, 3, false); data_printer("bytes read: ", bd, bytes_read); data_printer("reserve: ", reserve.data(), RESERVE_SIZE); infile2.close(); }
out
is implied for std::ofstream
, and in
is implied for std::ifstream
, so omit them. Initializing mydata
like that is no fun; std::iota
is better. Also, it may be better to use scopes instead of manually calling close
. The reading tests are a lot of duplicate code and may deserve a helper function. Also, the three parts can probably be extracted into their own functions. This is just a test, so it doesn't really matter too much, though.
-
\$\begingroup\$ Thank you! This was more than I could have hoped for! :) \$\endgroup\$thc– thc2019年08月05日 17:48:48 +00:00Commented Aug 5, 2019 at 17:48