I rarely program C++ in anger, and considered myself somewhat rusty, and certainly not up to speed on C++11. For a bit of exercise, I thought I'd try implementing sha1sum, as wasn't really up to speed with topics such as cryptographic hash functions either.
(Yes, yes, you shouldn't use SHA-1 in new code, understood!).
Would this pass muster as modern idiomatic C++?
#include <vector>
#include <cinttypes>
#include <iostream>
#include <ostream>
#include <istream>
#include <ios>
#include <numeric>
#include <iomanip>
#include <functional>
#include <fstream>
#include <algorithm>
namespace {
const auto blockBits = 512;
const auto int32Size = 4;
const auto int32Bits = int32Size * 8;
const auto intsPerBlock = blockBits / int32Bits;
const auto resultBits = 160;
const auto intsInResult = resultBits / int32Bits;
typedef std::vector<uint32_t> BlockVector;
typedef std::vector<uint32_t> HashVector;
uint32_t rotl(const uint32_t x, const unsigned int& n) {
const unsigned int rshift = int32Bits - n;
return (x << n) | (x >> rshift);
}
uint32_t accumulateUint(const uint32_t& left, unsigned char& right) {
return (left << 8) | right;
}
typedef std::function<uint32_t(const uint32_t&, const uint32_t&, const uint32_t&)> RoundFunction;
RoundFunction& getFunctionForRound(const unsigned int round) {
static RoundFunction roundFunctions[] = {
// Ch: rounds 0 - 19
[] (const uint32_t& x, const uint32_t& y, const uint32_t& z) -> uint32_t {
return (x & y) ^ (~x & z);
},
// Parity: 20-39, 60 - 79
[] (const uint32_t& x, const uint32_t& y, const uint32_t& z) -> uint32_t {
return x ^ y ^ z;
},
// Maj: 40 - 59
[] (const uint32_t& x, const uint32_t& y, const uint32_t& z) -> uint32_t {
return (x & y) ^ (x & z) ^ (y & z);
}
};
return roundFunctions[(round > 59 ? round - 40 : round) / 20];
}
const uint32_t& getConstantForRound(const unsigned int round) {
static uint32_t constants[] = {
0x5a827999, // 0 - 29
0x6ed9eba1, // 20 - 39
0x8f1bbcdc, // 40 - 59
0xca62c1d6, // 60 - 79
};
return constants[round / 20];
}
struct RoundVariables {
HashVector workingVars;
BlockVector W;
RoundVariables(HashVector& hash, BlockVector&& w) : workingVars(HashVector(hash)), W(w) {}
HashVector&& getHash() { return std::move(workingVars); }
};
void hashRound(RoundVariables& roundVars, const unsigned int& round) {
enum {a, b, c, d, e};
HashVector& vars = roundVars.workingVars;
uint32_t T = 0u;
T = rotl(vars[a], 5) + getFunctionForRound(round)(vars[b], vars[c], vars[d]) +
vars[e] + getConstantForRound(round) + roundVars.W[round];
std::rotate(vars.rbegin(), vars.rbegin() + 1, vars.rend());
vars[c] = rotl(vars[c], 30);
vars[a] = T;
}
BlockVector computeHash(HashVector&& previousHash, const BlockVector::const_iterator& blockStart, const BlockVector::const_iterator& blockEnd) {
BlockVector w;
std::copy(blockStart, blockEnd, std::back_inserter(w));
for (auto i = 16; i < 80; i++) {
w.emplace_back(rotl(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1));
}
RoundVariables roundVars(previousHash, std::move(w));
for (auto i = 0; i < 80; i++) {
hashRound(roundVars, i);
}
HashVector newHash = roundVars.getHash();
std::transform(newHash.begin(), newHash.end(), previousHash.begin(), newHash.begin(),
std::plus<uint32_t>());
return newHash;
}
HashVector sha1(const BlockVector& input) {
HashVector hashes = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0};
for (auto it = input.begin(); it < input.end(); it += intsPerBlock) {
hashes = computeHash(std::move(hashes), it, it + intsPerBlock);
}
return hashes;
}
typedef std::pair<BlockVector, uint64_t> Input;
BlockVector PadInput(Input&& input) {
BlockVector padded(std::get<0>(input));
uint64_t byteCount = std::get<1>(input);
// Add the end-of-message marker
if ((byteCount % int32Size) != 0) {
unsigned char marker = 0x80u;
uint32_t last = padded.back();
last = accumulateUint(last, marker);
last = last << ((int32Size - (byteCount % int32Size) - 1) * 8);
padded.back() = last;
} else {
padded.emplace_back(0x80000000);
}
unsigned int paddingRequired = intsPerBlock - ((padded.size() + 2) % intsPerBlock);
padded.insert(padded.end(), paddingRequired, '0円');
// Add the number of bits in the original message
byteCount *= 8;
uint32_t messageLenLSB = static_cast<uint32_t>(byteCount & 0xFFFFFFFF);
uint32_t messageLenMSB = static_cast<uint32_t>((byteCount >> (int32Size * 8)) & 0xFFFFFFFF);
// The standard assumes big-endian
padded.emplace_back(messageLenMSB);
padded.emplace_back(messageLenLSB);
return padded;
}
Input GetInput(std::istream& in) {
int c;
uint64_t count = 0u;
uint32_t next = 0u;
BlockVector v;
v.reserve(intsPerBlock);
while ((c = in.get()) != EOF) {
unsigned char charInput = static_cast<unsigned char>(c);
next = accumulateUint(next, charInput);
if ((++count % int32Size) == 0) {
v.emplace_back(next);
next = 0;
}
}
if (count > 0) {
v.emplace_back(next);
}
return std::make_pair(std::move(v), count);
}
void hashSource(std::istream& in, const char* source) {
HashVector hashed = sha1(PadInput(move(GetInput(in))));
std::cout << std::hex << std::setfill('0') << std::setw(int32Size);
for (auto i : hashed) {
std::cout << i;
}
std::cout << " " << source << std::endl;
}
} // namespace
int main(int argc, char** argv) {
if (argc == 1) {
hashSource(std::cin, "-");
return 0;
}
int exitCode = 0;
for (auto i = 1; i < argc; i++) {
std::ifstream in(argv[i]);
if (in.fail()) {
std::cerr << argv[0] << ": " << argv[i] << ": no such file or directory" << std::endl;
in.close();
exitCode = 1;
continue;
}
hashSource(in, argv[i]);
in.close();
}
return exitCode;
}
1 Answer 1
Without doing a full comprehensive review, here are some things you could do to improve your code:
While it's generally a good practice to always explicitly include the header that you need from the standard library,
<iostream>
is guaranteed to include<istream>
,<ostream>
and<ios>
; moreover it's easy to remember. Therefore, you could get rid of the explicit#include
s of<istream>
,<ostream>
and<ios>
.Instead of putting everything in a single file and putting everything but
main
in an anonymousnamespace
, you could break your code in reusable pieces of code (you already have reusable functions, which is good) and create a header to put their prototypes.const auto int32Size = 4;
and friends could be safer. If you know that you can only work when a 32-bits integer is present, then I would write it like that:constexpr std::int32_t in32Size = sizeof(std::int32_t);
While using
auto
is generally a good practice, especially when writing generic code, you are actually writing code for a very specific type, namelystd::int32_t
(it's even in the name, using anything else could be misleading). Therefore, I would document that by usingstd::int32_t
and lettingsizeof
deduce its size. This way, you can never be wrong.const auto int32Bits = int32Size * 8;
can also be trivially replaced with this:constexpr std::int32_t int32Bits = std::numeric_limits<std::int32_t>::digits;
std::numeric_limits
is the tool you want to use if you want to get type-safe and implementation-safe information about the built-in integer and floating point types (and even more if the types are overloaded).Note that I used
constexpr
in both of the previous points. It's the C++11 standard way to require the compiler to compute the value at compile time so that it can be used in more contexts requiring a compile-time constant expression.The two following lines could benefit from a little bit of C++11:
typedef std::vector<uint32_t> BlockVector; typedef std::vector<uint32_t> HashVector;
Using the new alias template, they could be rewritten as:
using BlockVector = std::vector<uint32_t>; using HashVector = std::vector<uint32_t>;
While the old and new version are semantically the same, I find the new alias template syntax closer to the variable assignment and namespace assignment syntaxes which helps not having to remember the
typedef
syntax which becomes awkward with functions pointers. Morevoer, this using declaration can be templated, unliketypedef
.It is often considered good practice to pass built-in types and small types (~smaller than the size of two pointers) by value to functions instead of passing them by
const&
. The rationale being that types that can fit in registers are easier to optimize if the compiler doesn't have to take into account possible aliasing problems introduced by references.This line does not feel safe:
HashVector&& getHash() { return std::move(workingVars); }
Generally speaking, one should return by rvalue-reference only if the current instance is guaranteed to be a temporary. In order to do this, you could rvlue-qualify the function itself:
HashVector&& getHash() && { return std::move(workingVars); }
But then it would also probably need another overload for lvalue-references. Actually, rvalue-references were also introduced so that simple code could be simple to write. Which is to say, your function should simply be:
HashVector getHash() { return workingVars; }
The compiler will do the job of finding what is a temporary and move when appropriate.