5
\$\begingroup\$

Intro

This post is the continuation of Get histogram of bytes in any set of files in C++14 - take II. This time, I have incorporated almost all suggestions in the answers.

Code

My newest trial looks like this:

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <ios>
#include <istream>
#include <iostream>
#include <sstream>
#include <cstdio>
#include <string>
#include <vector>
static constexpr size_t screenWidth = 80;
static constexpr size_t linePreambleWidth = 11;
using Histogram = std::vector<std::size_t>;
class ByteHistogram {
 static const size_t histogramCapacity = 0x100;
 Histogram histogram_;
 public:
 ByteHistogram() {
 histogram_.resize(histogramCapacity, 0);
 }
 void insert(unsigned char ch) {
 ++histogram_[ch];
 }
 Histogram::const_iterator cbegin() const {
 return histogram_.cbegin();
 }
 Histogram::const_iterator cend() const {
 return histogram_.cend();
 }
 Histogram::iterator begin() {
 return histogram_.begin();
 }
 Histogram::iterator end() {
 return histogram_.end();
 }
 size_t size() const {
 return histogram_.size();
 }
 size_t getMaximumCount() const {
 return *std::max_element(histogram_.cbegin(),
 histogram_.cend());
 }
 std::istream& insert(std::istream& is)
 {
 std::istream::int_type c;
 while ((c = is.get()) != std::istream::traits_type::eof()) {
 insert(static_cast<unsigned char>(c));
 }
 return is;
 }
 auto const operator[](std::size_t i) const {
 return histogram_[i];
 }
};
static size_t computeCounterStringLength(const size_t maximumCount) {
 return std::snprintf(nullptr, 0, "%zu", maximumCount);
}
static std::size_t computeBarLength(const size_t count,
 const size_t maximumCount,
 const size_t maximumCountLength) {
 auto const availableSpace = 
 static_cast<double>(screenWidth - linePreambleWidth 
 - maximumCountLength);
 return static_cast<size_t>(availableSpace * count / maximumCount);
}
std::ostream& operator<<(std::ostream& os,
 ByteHistogram& hist) {
 const size_t maximumCount = hist.getMaximumCount();
 const size_t countStringLength = computeCounterStringLength(maximumCount);
 const size_t maximumBarLength = screenWidth - linePreambleWidth 
 - countStringLength;
 std::size_t i = 0;
 for (auto const& count : hist) {
 const double ratio = static_cast<double>(count) / 
 static_cast<double>(maximumCount);
 const std::size_t barLength = 
 static_cast<std::size_t>(ratio * maximumBarLength);
 const unsigned char ch = static_cast<char>(i);
 std::cout << "0x" 
 << std::hex 
 << std::setfill('0') 
 << std::setw(2) 
 << i
 << " ["
 << (std::isprint(ch) ? (char) ch : '?')
 << "]: "
 << std::dec
 << std::setfill(' ')
 << std::setw(countStringLength) 
 << count 
 << ' '
 << std::setfill('*') 
 << std::setw(barLength) 
 << '\n';
 i++;
 }
 return os;
}
int main(int argc, char* argv[]) {
 ByteHistogram hist;
 if (argc == 1) {
 hist.insert(std::cin);
 if (std::cin.bad()) {
 std::cerr << "Reading from standard input failed.\n";
 return EXIT_FAILURE;
 }
 } else {
 for (int i = 1; i < argc; ++i) {
 std::ifstream stream{ argv[i], std::ios::binary };
 hist.insert(stream);
 if (stream.bad()) {
 std::cerr << "Reading from \"" << argv[i] << "\" failed.\n";
 return EXIT_FAILURE;
 }
 }
 }
 std::cout << hist;
 return EXIT_SUCCESS;
}

Typical output

0x00 [?]: 519 ****************
0x01 [?]: 513 ****************
0x02 [?]: 436 *************
0x03 [?]: 520 ****************
0x04 [?]: 449 **************
0x05 [?]: 516 ****************
0x06 [?]: 495 ***************
0x07 [?]: 479 ***************
0x08 [?]: 479 ***************
0x09 [?]: 499 ***************
0x0a [?]: 1007 ********************************
0x0b [?]: 486 ***************
0x0c [?]: 481 ***************
0x0d [?]: 499 ***************
0x0e [?]: 529 ****************
0x0f [?]: 518 ****************
0x10 [?]: 461 **************
0x11 [?]: 476 **************
0x12 [?]: 454 **************
0x13 [?]: 498 ***************
0x14 [?]: 493 ***************
0x15 [?]: 468 **************
0x16 [?]: 502 ***************
0x17 [?]: 522 ****************
0x18 [?]: 511 ****************
0x19 [?]: 500 ***************
0x1a [?]: 493 ***************
0x1b [?]: 517 ****************
0x1c [?]: 508 ****************
0x1d [?]: 473 **************
0x1e [?]: 499 ***************
0x1f [?]: 525 ****************
0x20 [ ]: 1941 ****************************************************************
0x21 [!]: 502 ***************
0x22 ["]: 443 *************
0x23 [#]: 490 ***************
0x24 [$]: 496 ***************
0x25 [%]: 508 ****************
0x26 [&]: 443 *************
0x27 [']: 473 **************
0x28 [(]: 492 ***************
0x29 [)]: 496 ***************
0x2a [*]: 446 *************
0x2b [+]: 475 **************
0x2c [,]: 510 ****************
0x2d [-]: 518 ****************
0x2e [.]: 533 ****************
0x2f [/]: 886 ****************************
0x30 [0]: 1361 ********************************************
0x31 [1]: 635 ********************
0x32 [2]: 681 *********************
0x33 [3]: 680 *********************
0x34 [4]: 661 *********************
0x35 [5]: 759 ************************
0x36 [6]: 693 **********************
0x37 [7]: 670 *********************
0x38 [8]: 598 *******************
0x39 [9]: 584 ******************
0x3a [:]: 563 *****************
0x3b [;]: 524 ****************
0x3c [<]: 641 ********************
0x3d [=]: 543 *****************
0x3e [>]: 619 *******************
0x3f [?]: 516 ****************
0x40 [@]: 483 ***************
0x41 [A]: 572 ******************
0x42 [B]: 538 *****************
0x43 [C]: 574 ******************
0x44 [D]: 561 *****************
0x45 [E]: 452 **************
0x46 [F]: 628 ********************
0x47 [G]: 513 ****************
0x48 [H]: 512 ****************
0x49 [I]: 572 ******************
0x4a [J]: 448 **************
0x4b [K]: 488 ***************
0x4c [L]: 446 *************
0x4d [M]: 500 ***************
0x4e [N]: 505 ***************
0x4f [O]: 525 ****************
0x50 [P]: 495 ***************
0x51 [Q]: 466 **************
0x52 [R]: 549 *****************
0x53 [S]: 536 ****************
0x54 [T]: 539 *****************
0x55 [U]: 504 ***************
0x56 [V]: 493 ***************
0x57 [W]: 493 ***************
0x58 [X]: 474 **************
0x59 [Y]: 508 ****************
0x5a [Z]: 525 ****************
0x5b [[]: 493 ***************
0x5c [\]: 499 ***************
0x5d []]: 527 ****************
0x5e [^]: 464 **************
0x5f [_]: 566 *****************
0x60 [`]: 505 ***************
0x61 [a]: 640 ********************
0x62 [b]: 629 ********************
0x63 [c]: 577 ******************
0x64 [d]: 694 **********************
0x65 [e]: 940 ******************************
0x66 [f]: 540 *****************
0x67 [g]: 561 *****************
0x68 [h]: 553 *****************
0x69 [i]: 640 ********************
0x6a [j]: 635 ********************
0x6b [k]: 481 ***************
0x6c [l]: 619 *******************
0x6d [m]: 594 ******************
0x6e [n]: 800 *************************
0x6f [o]: 792 *************************
0x70 [p]: 603 *******************
0x71 [q]: 518 ****************
0x72 [r]: 628 ********************
0x73 [s]: 658 *********************
0x74 [t]: 844 ***************************
0x75 [u]: 521 ****************
0x76 [v]: 484 ***************
0x77 [w]: 512 ****************
0x78 [x]: 561 *****************
0x79 [y]: 557 *****************
0x7a [z]: 503 ***************
0x7b [{]: 568 ******************
0x7c [|]: 536 ****************
0x7d [}]: 537 ****************
0x7e [~]: 554 *****************
0x7f [?]: 541 *****************
0x80 [?]: 465 **************
0x81 [?]: 512 ****************
0x82 [?]: 475 **************
0x83 [?]: 468 **************
0x84 [?]: 505 ***************
0x85 [?]: 483 ***************
0x86 [?]: 509 ****************
0x87 [?]: 501 ***************
0x88 [?]: 463 **************
0x89 [?]: 461 **************
0x8a [?]: 496 ***************
0x8b [?]: 501 ***************
0x8c [?]: 507 ***************
0x8d [?]: 467 **************
0x8e [?]: 475 **************
0x8f [?]: 544 *****************
0x90 [?]: 505 ***************
0x91 [?]: 512 ****************
0x92 [?]: 490 ***************
0x93 [?]: 482 ***************
0x94 [?]: 428 *************
0x95 [?]: 511 ****************
0x96 [?]: 489 ***************
0x97 [?]: 518 ****************
0x98 [?]: 485 ***************
0x99 [?]: 464 **************
0x9a [?]: 522 ****************
0x9b [?]: 528 ****************
0x9c [?]: 493 ***************
0x9d [?]: 516 ****************
0x9e [?]: 558 *****************
0x9f [?]: 531 ****************
0xa0 [?]: 529 ****************
0xa1 [?]: 550 *****************
0xa2 [?]: 474 **************
0xa3 [?]: 534 ****************
0xa4 [?]: 497 ***************
0xa5 [?]: 487 ***************
0xa6 [?]: 487 ***************
0xa7 [?]: 491 ***************
0xa8 [?]: 495 ***************
0xa9 [?]: 483 ***************
0xaa [?]: 498 ***************
0xab [?]: 451 **************
0xac [?]: 474 **************
0xad [?]: 480 ***************
0xae [?]: 490 ***************
0xaf [?]: 522 ****************
0xb0 [?]: 493 ***************
0xb1 [?]: 507 ***************
0xb2 [?]: 492 ***************
0xb3 [?]: 504 ***************
0xb4 [?]: 528 ****************
0xb5 [?]: 516 ****************
0xb6 [?]: 499 ***************
0xb7 [?]: 519 ****************
0xb8 [?]: 451 **************
0xb9 [?]: 507 ***************
0xba [?]: 517 ****************
0xbb [?]: 533 ****************
0xbc [?]: 483 ***************
0xbd [?]: 517 ****************
0xbe [?]: 527 ****************
0xbf [?]: 538 *****************
0xc0 [?]: 510 ****************
0xc1 [?]: 464 **************
0xc2 [?]: 490 ***************
0xc3 [?]: 492 ***************
0xc4 [?]: 449 **************
0xc5 [?]: 505 ***************
0xc6 [?]: 512 ****************
0xc7 [?]: 529 ****************
0xc8 [?]: 472 **************
0xc9 [?]: 493 ***************
0xca [?]: 510 ****************
0xcb [?]: 490 ***************
0xcc [?]: 482 ***************
0xcd [?]: 512 ****************
0xce [?]: 521 ****************
0xcf [?]: 497 ***************
0xd0 [?]: 498 ***************
0xd1 [?]: 524 ****************
0xd2 [?]: 482 ***************
0xd3 [?]: 545 *****************
0xd4 [?]: 459 **************
0xd5 [?]: 506 ***************
0xd6 [?]: 459 **************
0xd7 [?]: 527 ****************
0xd8 [?]: 485 ***************
0xd9 [?]: 523 ****************
0xda [?]: 515 ****************
0xdb [?]: 482 ***************
0xdc [?]: 468 **************
0xdd [?]: 557 *****************
0xde [?]: 532 ****************
0xdf [?]: 542 *****************
0xe0 [?]: 513 ****************
0xe1 [?]: 543 *****************
0xe2 [?]: 494 ***************
0xe3 [?]: 549 *****************
0xe4 [?]: 493 ***************
0xe5 [?]: 464 **************
0xe6 [?]: 505 ***************
0xe7 [?]: 545 *****************
0xe8 [?]: 539 *****************
0xe9 [?]: 540 *****************
0xea [?]: 513 ****************
0xeb [?]: 537 ****************
0xec [?]: 514 ****************
0xed [?]: 506 ***************
0xee [?]: 496 ***************
0xef [?]: 556 *****************
0xf0 [?]: 552 *****************
0xf1 [?]: 545 *****************
0xf2 [?]: 520 ****************
0xf3 [?]: 532 ****************
0xf4 [?]: 566 *****************
0xf5 [?]: 509 ****************
0xf6 [?]: 497 ***************
0xf7 [?]: 533 ****************
0xf8 [?]: 547 *****************
0xf9 [?]: 539 *****************
0xfa [?]: 518 ****************
0xfb [?]: 524 ****************
0xfc [?]: 560 *****************
0xfd [?]: 527 ****************
0xfe [?]: 482 ***************
0xff [?]: 496 ***************

Critique request

Is there anything left to improve? Please tell me anything that comes to mind.

Toby Speight
87.1k14 gold badges104 silver badges322 bronze badges
asked Nov 13, 2024 at 7:56
\$\endgroup\$
3
  • \$\begingroup\$ It's worth pointing out that I disagree with switching from array to vector. \$\endgroup\$ Commented Nov 13, 2024 at 13:59
  • \$\begingroup\$ One thing you might consider doing is unit tests for the ByteHistogram class. You'll need to spilt your source into separate translation units first. \$\endgroup\$ Commented Nov 13, 2024 at 15:53
  • \$\begingroup\$ The earliest iteration of this question is Get histogram of bytes in any set of files in C++14, where my answer points out an easy 10x speedup for non-tiny files. (This version still calls .get() in a loop, and performs the same as before.) BTW, I tested this with blitiri.com.ar/p/libfiu/doc/posix.html and it does correctly stop on file read errors and report them, so apparently error also returns eof from .get(). At least glibc / libstdc++ does. \$\endgroup\$ Commented Nov 14, 2024 at 7:09

2 Answers 2

5
\$\begingroup\$

It's got all my suggestions incorporated (apart from a late edit to recommend std::uint_fast64_t from <cstdint> instead of std::size_t for the counts), so not much I can add now.

Just a handful of tiny observations:

  1. Includes are almost in sorted order - except for <cstdio>.

  2. We use std::isprint(), but failed to include <cctype>.

  3. size_t assumed in global namespace - should be std::size_t.

  4. It's not clear why the Histogram class needs to be visible outside of ByteHistogram.

  5. We should construct histogram_ just once, in the initialiser list, rather than default-constructing and then resizing:

     public:
     ByteHistogram()
     : histogram_(histogramCapacity)
     {
     }
    
  6. I think we should be providing (only) const versions of begin() and end() - unless you really want to allow modification of the internals from outside (I don't recommend that). A member function to clear() the container may be worth considering.

  7. This code might print one fewer star than you expect:

     std::cout << std::setfill('*') 
     << std::setw(barLength) 
     << '\n';
    

    I don't see anywhere that the length of the \n is accounted for when computing barLength.

  8. Eliminate all compiler warnings - I'm getting a few for lossy arithmetic conversions and for the unused function computeBarLength().

answered Nov 13, 2024 at 8:29
\$\endgroup\$
5
\$\begingroup\$

Making more use of C++20

Like the previous versions, this is valid C++14 code. But this one is tagged C++20, so let's have a look at what newer features we might take advantage of, now we're no longer supporting ten-year-old compilers.

These are all light suggestions rather than recommendations - the code being replaced is not awful, and the suggestion isn't always unequivocally better.


Finding the maximum value can be done with a simpler function call. Instead of

 return *std::max_element(histogram_.cbegin(),
 histogram_.cend());

we can write

 return std::ranges::max(histogram_);

We might choose to add a template overload of insert() that accepts any range type (such as strings, string-views, other containers, etc):

#include <concepts>
#include <ranges>
 template<std::ranges::input_range R>
 void insert(const R& input)
 requires std::convertible_to<std::ranges::range_value_t<R>, unsigned char>
 {
 for (unsigned char c: input) {
 insert(c);
 }
 }

We're using two constraints here. The first is secreted away in template<std::ranges::input_range R>, meaning that R can be any type that satisfies the std::ranges::input_range concept. The second is more explicit (written with requires) and says that this template exists only if the value type of the range can be converted to unsigned char.

You might be tempted to use this function to implement the stream input (by creating a std::views::istream<unsigned char> from the stream to make a range), but that won't quite work, because the range takes formatted input from the stream and we want unformatted. Disappointingly, there's no std::views::istreambuf to parallel std::istreambuf_iterator, so if you want such a view, you'll have to make your own.


With a range adapter, we can change this for loop that iterates over the arguments into a range-based loop:

 for (int i = 1; i < argc; ++i) {
 std::ifstream stream{ argv[i], std::ios::binary };
 ⋮

Sadly, the need to convert argc to std::size_t for the count argument obfuscates it a bit:

#include <ranges>
#include <span>
 std::span const args{argv, static_cast<std::size_t>(argc)};
 for (auto filename: args | std::views::drop(1)) {
 std::ifstream stream{filename, std::ios::binary};
 ⋮

I think it's a bit more readable if we use the span constructor that takes two iterators (pointers, in this case):

 for (auto filename: std::span{argv+1, argv+argc}) {

Here, I was able to adjust the first pointer to skip the program name as a simpler alternative to using the "drop" view.


Anticipating C++23

If and when we're able to move to C++23, the new formatted printing facility could be used for our error reporting. We can replace

 std::cerr << "Reading from \"" << filename << "\" failed.\n";

with

 std::println(std::cerr, R"(Reading from "{}" failed.)", filename);

(I've used a raw string here so that we don't need backslashes; that's available as far back as C++11, but often overlooked).

answered Nov 13, 2024 at 14:39
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.