128-bit unsigned integer

Question 1

This implements 128-bit unsigned integer using C++14. It works on MSVC and 32-bit architectures being complementary to the unsigned __int128 type provided by GCC and clang on 64-bit architectures.

// intx: extended precision integer library.
// Copyright 2019 Pawel Bylica.
// Licensed under the Apache License, Version 2.0.
#pragma once
#include <algorithm>
#include <climits>
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <type_traits>
#ifdef _MSC_VER
#include <intrin.h>
#endif
namespace intx
{
template <unsigned N>
struct uint;
/// The 128-bit unsigned integer.
///
/// This type is defined as a specialization of uint<> to easier integration with full intx package,
/// however, uint128 may be used independently.
template <>
struct uint<128>
{
 uint64_t lo = 0;
 uint64_t hi = 0;
 constexpr uint() noexcept = default;
 constexpr uint(uint64_t high, uint64_t low) noexcept : lo{low}, hi{high} {}
 template <typename T,
 typename = typename std::enable_if_t<std::is_convertible<T, uint64_t>::value>>
 constexpr uint(T x) noexcept : lo(static_cast<uint64_t>(x)) // NOLINT
 {}
#ifdef __SIZEOF_INT128__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
 constexpr uint(unsigned __int128 x) noexcept // NOLINT
 : lo{uint64_t(x)}, hi{uint64_t(x >> 64)}
 {}
 constexpr explicit operator unsigned __int128() const noexcept
 {
 return (static_cast<unsigned __int128>(hi) << 64) | lo;
 }
#pragma GCC diagnostic pop
#endif
 constexpr explicit operator bool() const noexcept { return hi | lo; }
 /// Explicit converting operator for all builtin integral types.
 template <typename Int, typename = typename std::enable_if<std::is_integral<Int>::value>::type>
 constexpr explicit operator Int() const noexcept
 {
 return static_cast<Int>(lo);
 }
};
using uint128 = uint<128>;
/// Linear arithmetic operators.
/// @{
constexpr uint128 operator+(uint128 x, uint128 y) noexcept
{
 const auto lo = x.lo + y.lo;
 const auto carry = x.lo > lo;
 const auto hi = x.hi + y.hi + carry;
 return {hi, lo};
}
constexpr uint128 operator+(uint128 x) noexcept
{
 return x;
}
constexpr uint128 operator-(uint128 x, uint128 y) noexcept
{
 const auto lo = x.lo - y.lo;
 const auto borrow = x.lo < lo;
 const auto hi = x.hi - y.hi - borrow;
 return {hi, lo};
}
constexpr uint128 operator-(uint128 x) noexcept
{
 // Implementing as subtraction is better than ~x + 1.
 // Clang9: Perfect.
 // GCC8: Does something weird.
 return 0 - x;
}
inline uint128& operator++(uint128& x) noexcept
{
 return x = x + 1;
}
inline uint128& operator--(uint128& x) noexcept
{
 return x = x - 1;
}
inline uint128 operator++(uint128& x, int) noexcept
{
 auto ret = x;
 ++x;
 return ret;
}
inline uint128 operator--(uint128& x, int) noexcept
{
 auto ret = x;
 --x;
 return ret;
}
/// Optimized addition.
///
/// This keeps the multiprecision addition until CodeGen so the pattern is not
/// broken during other optimizations.
constexpr uint128 fast_add(uint128 x, uint128 y) noexcept
{
#ifdef __SIZEOF_INT128__xxx
 return (unsigned __int128){x} + (unsigned __int128){y};
#else
 // Fallback to regular addition.
 return x + y;
#endif
}
/// @}
/// Comparison operators.
///
/// In all implementations bitwise operators are used instead of logical ones
/// to avoid branching.
///
/// @{
constexpr bool operator==(uint128 x, uint128 y) noexcept
{
 // Clang7: generates perfect xor based code,
 // much better than __int128 where it uses vector instructions.
 // GCC8: generates a bit worse cmp based code
 // although it generates the xor based one for __int128.
 return (x.lo == y.lo) & (x.hi == y.hi);
}
constexpr bool operator!=(uint128 x, uint128 y) noexcept
{
 // Analogous to ==, but == not used directly, because that confuses GCC8.
 return (x.lo != y.lo) | (x.hi != y.hi);
}
constexpr bool operator<(uint128 x, uint128 y) noexcept
{
 // OPT: This should be implemented by checking the borrow of x - y,
 // but compilers (GCC8, Clang7)
 // have problem with properly optimizing subtraction.
 return (x.hi < y.hi) | ((x.hi == y.hi) & (x.lo < y.lo));
}
constexpr bool operator<=(uint128 x, uint128 y) noexcept
{
 // OPT: This also should be implemented by subtraction + flag check.
 // TODO: Clang7 is not able to fully optimize
 // the naive implementation as (x < y) | (x == y).
 return (x.hi < y.hi) | ((x.hi == y.hi) & (x.lo <= y.lo));
}
constexpr bool operator>(uint128 x, uint128 y) noexcept
{
 return !(x <= y);
}
constexpr bool operator>=(uint128 x, uint128 y) noexcept
{
 return !(x < y);
}
/// @}
/// Bitwise operators.
/// @{
constexpr uint128 operator~(uint128 x) noexcept
{
 return {~x.hi, ~x.lo};
}
constexpr uint128 operator|(uint128 x, uint128 y) noexcept
{
 // Clang7: perfect.
 // GCC8: stupidly uses a vector instruction in all bitwise operators.
 return {x.hi | y.hi, x.lo | y.lo};
}
constexpr uint128 operator&(uint128 x, uint128 y) noexcept
{
 return {x.hi & y.hi, x.lo & y.lo};
}
constexpr uint128 operator^(uint128 x, uint128 y) noexcept
{
 return {x.hi ^ y.hi, x.lo ^ y.lo};
}
constexpr uint128 operator<<(uint128 x, unsigned shift) noexcept
{
 return (shift < 64) ?
 // Find the part moved from lo to hi.
 // For shift == 0 right shift by (64 - shift) is invalid so
 // split it into 2 shifts by 1 and (63 - shift).
 uint128{(x.hi << shift) | ((x.lo >> 1) >> (63 - shift)), x.lo << shift} :
 // Guarantee "defined" behavior for shifts larger than 128.
 (shift < 128) ? uint128{x.lo << (shift - 64), 0} : 0;
}
constexpr uint128 operator<<(uint128 x, uint128 shift) noexcept
{
 if (shift < 128)
 return x << unsigned(shift);
 return 0;
}
constexpr uint128 operator>>(uint128 x, unsigned shift) noexcept
{
 return (shift < 64) ?
 // Find the part moved from lo to hi.
 // For shift == 0 left shift by (64 - shift) is invalid so
 // split it into 2 shifts by 1 and (63 - shift).
 uint128{x.hi >> shift, (x.lo >> shift) | ((x.hi << 1) << (63 - shift))} :
 // Guarantee "defined" behavior for shifts larger than 128.
 (shift < 128) ? uint128{0, x.hi >> (shift - 64)} : 0;
}
constexpr uint128 operator>>(uint128 x, uint128 shift) noexcept
{
 if (shift < 128)
 return x >> unsigned(shift);
 return 0;
}
/// @}
/// Multiplication
/// @{
/// Portable full unsigned multiplication 64 x 64 -> 128.
constexpr uint128 constexpr_umul(uint64_t x, uint64_t y) noexcept
{
 uint64_t xl = x & 0xffffffff;
 uint64_t xh = x >> 32;
 uint64_t yl = y & 0xffffffff;
 uint64_t yh = y >> 32;
 uint64_t t0 = xl * yl;
 uint64_t t1 = xh * yl;
 uint64_t t2 = xl * yh;
 uint64_t t3 = xh * yh;
 uint64_t u1 = t1 + (t0 >> 32);
 uint64_t u2 = t2 + (u1 & 0xffffffff);
 uint64_t lo = (u2 << 32) | (t0 & 0xffffffff);
 uint64_t hi = t3 + (u2 >> 32) + (u1 >> 32);
 return {hi, lo};
}
/// Full unsigned multiplication 64 x 64 -> 128.
inline uint128 umul(uint64_t x, uint64_t y) noexcept
{
#if defined(__SIZEOF_INT128__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
 const auto p = static_cast<unsigned __int128>(x) * y;
 return {uint64_t(p >> 64), uint64_t(p)};
#pragma GCC diagnostic pop
#elif defined(_MSC_VER)
 unsigned __int64 hi;
 const auto lo = _umul128(x, y, &hi);
 return {hi, lo};
#else
 return constexpr_umul(x, y);
#endif
}
inline uint128 operator*(uint128 x, uint128 y) noexcept
{
 auto p = umul(x.lo, y.lo);
 p.hi += (x.lo * y.hi) + (x.hi * y.lo);
 return {p.hi, p.lo};
}
constexpr uint128 constexpr_mul(uint128 x, uint128 y) noexcept
{
 auto p = constexpr_umul(x.lo, y.lo);
 p.hi += (x.lo * y.hi) + (x.hi * y.lo);
 return {p.hi, p.lo};
}
/// @}
/// Assignment operators.
/// @{
constexpr uint128& operator+=(uint128& x, uint128 y) noexcept
{
 return x = x + y;
}
constexpr uint128& operator-=(uint128& x, uint128 y) noexcept
{
 return x = x - y;
}
inline uint128& operator*=(uint128& x, uint128 y) noexcept
{
 return x = x * y;
}
constexpr uint128& operator|=(uint128& x, uint128 y) noexcept
{
 return x = x | y;
}
constexpr uint128& operator&=(uint128& x, uint128 y) noexcept
{
 return x = x & y;
}
constexpr uint128& operator^=(uint128& x, uint128 y) noexcept
{
 return x = x ^ y;
}
constexpr uint128& operator<<=(uint128& x, unsigned shift) noexcept
{
 return x = x << shift;
}
constexpr uint128& operator>>=(uint128& x, unsigned shift) noexcept
{
 return x = x >> shift;
}
/// @}
inline unsigned clz(uint32_t x) noexcept
{
#ifdef _MSC_VER
 unsigned long most_significant_bit;
 _BitScanReverse(&most_significant_bit, x);
 return 31 ^ (unsigned)most_significant_bit;
#else
 return unsigned(__builtin_clz(x));
#endif
}
inline unsigned clz(uint64_t x) noexcept
{
#ifdef _MSC_VER
 unsigned long most_significant_bit;
 _BitScanReverse64(&most_significant_bit, x);
 return 63 ^ (unsigned)most_significant_bit;
#else
 return unsigned(__builtin_clzll(x));
#endif
}
inline unsigned clz(uint128 x) noexcept
{
 // In this order `h == 0` we get less instructions than in case of `h != 0`.
 return x.hi == 0 ? clz(x.lo) | 64 : clz(x.hi);
}
inline uint64_t bswap(uint64_t x) noexcept
{
#ifdef _MSC_VER
 return _byteswap_uint64(x);
#else
 return __builtin_bswap64(x);
#endif
}
inline uint128 bswap(uint128 x) noexcept
{
 return {bswap(x.lo), bswap(x.hi)};
}
/// Division.
/// @{
template <typename T>
struct div_result
{
 T quot;
 T rem;
};
namespace internal
{
constexpr uint16_t reciprocal_table_item(uint8_t d9) noexcept
{
 return uint16_t(0x7fd00 / (0x100 | d9));
}
#define REPEAT4(x) \
 reciprocal_table_item((x) + 0), reciprocal_table_item((x) + 1), \
 reciprocal_table_item((x) + 2), reciprocal_table_item((x) + 3)
#define REPEAT32(x) \
 REPEAT4((x) + 4 * 0), REPEAT4((x) + 4 * 1), REPEAT4((x) + 4 * 2), REPEAT4((x) + 4 * 3), \
 REPEAT4((x) + 4 * 4), REPEAT4((x) + 4 * 5), REPEAT4((x) + 4 * 6), REPEAT4((x) + 4 * 7)
#define REPEAT256() \
 REPEAT32(32 * 0), REPEAT32(32 * 1), REPEAT32(32 * 2), REPEAT32(32 * 3), REPEAT32(32 * 4), \
 REPEAT32(32 * 5), REPEAT32(32 * 6), REPEAT32(32 * 7)
/// Reciprocal lookup table.
constexpr uint16_t reciprocal_table[] = {REPEAT256()};
#undef REPEAT4
#undef REPEAT32
#undef REPEAT256
} // namespace internal
/// Computes the reciprocal (2^128 - 1) / d - 2^64 for normalized d.
///
/// Based on Algorithm 2 from "Improved division by invariant integers".
inline uint64_t reciprocal_2by1(uint64_t d) noexcept
{
 auto d9 = uint8_t(d >> 55);
 auto v0 = uint64_t{internal::reciprocal_table[d9]};
 auto d40 = (d >> 24) + 1;
 auto v1 = (v0 << 11) - (v0 * v0 * d40 >> 40) - 1;
 auto v2 = (v1 << 13) + (v1 * (0x1000000000000000 - v1 * d40) >> 47);
 auto d0 = d % 2;
 auto d63 = d / 2 + d0; // ceil(d/2)
 auto nd0 = uint64_t(-int64_t(d0));
 auto e = ((v2 / 2) & nd0) - v2 * d63;
 auto mh = umul(v2, e).hi;
 auto v3 = (v2 << 31) + (mh >> 1);
 // OPT: The compiler tries a bit too much with 128 + 64 addition and ends up using subtraction.
 // Compare with __int128.
 auto mf = umul(v3, d);
 auto m = fast_add(mf, d);
 auto v3a = m.hi + d;
 auto v4 = v3 - v3a;
 return v4;
}
inline uint64_t reciprocal_3by2(uint128 d) noexcept
{
 auto v = reciprocal_2by1(d.hi);
 auto p = d.hi * v;
 p += d.lo;
 if (p < d.lo)
 {
 --v;
 if (p >= d.hi)
 {
 --v;
 p -= d.hi;
 }
 p -= d.hi;
 }
 auto t = umul(v, d.lo);
 p += t.hi;
 if (p < t.hi)
 {
 --v;
 if (uint128{p, t.lo} >= d)
 --v;
 }
 return v;
}
inline div_result<uint64_t> udivrem_2by1(uint128 u, uint64_t d, uint64_t v) noexcept
{
 auto q = umul(v, u.hi);
 q = fast_add(q, u);
 ++q.hi;
 auto r = u.lo - q.hi * d;
 if (r > q.lo)
 {
 --q.hi;
 r += d;
 }
 if (r >= d)
 {
 ++q.hi;
 r -= d;
 }
 return {q.hi, r};
}
inline div_result<uint128> udivrem_3by2(
 uint64_t u2, uint64_t u1, uint64_t u0, uint128 d, uint64_t v) noexcept
{
 auto q = umul(v, u2);
 q = fast_add(q, {u2, u1});
 auto r1 = u1 - q.hi * d.hi;
 auto t = umul(d.lo, q.hi);
 auto r = uint128{r1, u0} - t - d;
 r1 = r.hi;
 ++q.hi;
 if (r1 >= q.lo)
 {
 --q.hi;
 r += d;
 }
 if (r >= d)
 {
 ++q.hi;
 r -= d;
 }
 return {q.hi, r};
}
inline div_result<uint128> udivrem(uint128 x, uint128 y) noexcept
{
 if (y.hi == 0)
 {
 uint64_t xn_ex, xn_hi, xn_lo, yn;
 auto lsh = clz(y.lo);
 if (lsh != 0)
 {
 auto rsh = 64 - lsh;
 xn_ex = x.hi >> rsh;
 xn_hi = (x.lo >> rsh) | (x.hi << lsh);
 xn_lo = x.lo << lsh;
 yn = y.lo << lsh;
 }
 else
 {
 xn_ex = 0;
 xn_hi = x.hi;
 xn_lo = x.lo;
 yn = y.lo;
 }
 auto v = reciprocal_2by1(yn);
 auto res = udivrem_2by1({xn_ex, xn_hi}, yn, v);
 auto q1 = res.quot;
 res = udivrem_2by1({res.rem, xn_lo}, yn, v);
 return {{q1, res.quot}, res.rem >> lsh};
 }
 if (y.hi > x.hi)
 return {0, x};
 auto lsh = clz(y.hi);
 if (lsh == 0)
 {
 const auto q = unsigned{y.hi < x.hi} | unsigned{y.lo <= x.lo};
 return {q, x - (q ? y : 0)};
 }
 auto rsh = 64 - lsh;
 auto yn_lo = y.lo << lsh;
 auto yn_hi = (y.lo >> rsh) | (y.hi << lsh);
 auto xn_ex = x.hi >> rsh;
 auto xn_hi = (x.lo >> rsh) | (x.hi << lsh);
 auto xn_lo = x.lo << lsh;
 auto v = reciprocal_3by2({yn_hi, yn_lo});
 auto res = udivrem_3by2(xn_ex, xn_hi, xn_lo, {yn_hi, yn_lo}, v);
 return {res.quot, res.rem >> lsh};
}
inline div_result<uint128> sdivrem(uint128 x, uint128 y) noexcept
{
 constexpr auto sign_mask = uint128{1} << 127;
 const auto x_is_neg = (x & sign_mask) != 0;
 const auto y_is_neg = (y & sign_mask) != 0;
 const auto x_abs = x_is_neg ? -x : x;
 const auto y_abs = y_is_neg ? -y : y;
 const auto q_is_neg = x_is_neg ^ y_is_neg;
 const auto res = udivrem(x_abs, y_abs);
 return {q_is_neg ? -res.quot : res.quot, x_is_neg ? -res.rem : res.rem};
}
inline uint128 operator/(uint128 x, uint128 y) noexcept
{
 return udivrem(x, y).quot;
}
inline uint128 operator%(uint128 x, uint128 y) noexcept
{
 return udivrem(x, y).rem;
}
inline uint128& operator/=(uint128& x, uint128 y) noexcept
{
 return x = x / y;
}
inline uint128& operator%=(uint128& x, uint128 y) noexcept
{
 return x = x % y;
}
/// @}
} // namespace intx
namespace std
{
template <unsigned N>
struct numeric_limits<intx::uint<N>>
{
 using type = intx::uint<N>;
 static constexpr bool is_specialized = true;
 static constexpr bool is_integer = true;
 static constexpr bool is_signed = false;
 static constexpr bool is_exact = true;
 static constexpr bool has_infinity = false;
 static constexpr bool has_quiet_NaN = false;
 static constexpr bool has_signaling_NaN = false;
 static constexpr float_denorm_style has_denorm = denorm_absent;
 static constexpr bool has_denorm_loss = false;
 static constexpr float_round_style round_style = round_toward_zero;
 static constexpr bool is_iec559 = false;
 static constexpr bool is_bounded = true;
 static constexpr bool is_modulo = true;
 static constexpr int digits = CHAR_BIT * sizeof(type);
 static constexpr int digits10 = int(0.3010299956639812 * digits);
 static constexpr int max_digits10 = 0;
 static constexpr int radix = 2;
 static constexpr int min_exponent = 0;
 static constexpr int min_exponent10 = 0;
 static constexpr int max_exponent = 0;
 static constexpr int max_exponent10 = 0;
 static constexpr bool traps = std::numeric_limits<unsigned>::traps;
 static constexpr bool tinyness_before = false;
 static constexpr type min() noexcept { return 0; }
 static constexpr type lowest() noexcept { return min(); }
 static constexpr type max() noexcept { return ~type{0}; }
 static constexpr type epsilon() noexcept { return 0; }
 static constexpr type round_error() noexcept { return 0; }
 static constexpr type infinity() noexcept { return 0; }
 static constexpr type quiet_NaN() noexcept { return 0; }
 static constexpr type signaling_NaN() noexcept { return 0; }
 static constexpr type denorm_min() noexcept { return 0; }
};
} // namespace std
namespace intx
{
template <typename Int>
constexpr Int from_string(const char* s)
{
 using namespace std::literals;
 auto x = Int{};
 int num_digits = 0;
 if (s[0] == '0' && s[1] == 'x')
 {
 s += 2;
 while (auto d = *s++)
 {
 if (++num_digits > int{sizeof(x) * 2})
 throw std::overflow_error{"Integer overflow"};
 x <<= 4;
 if (d >= '0' && d <= '9')
 d -= '0';
 else if (d >= 'a' && d <= 'f')
 d -= 'a' - 10;
 else if (d >= 'A' && d <= 'F')
 d -= 'A' - 10;
 else
 throw std::invalid_argument{"Invalid literal character: "s + d};
 x |= d;
 }
 return x;
 }
 while (auto d = *s++)
 {
 if (num_digits++ > std::numeric_limits<Int>::digits10)
 throw std::overflow_error{"Integer overflow"};
 x = constexpr_mul(x, Int{10});
 if (d >= '0' && d <= '9')
 d -= '0';
 else
 throw std::invalid_argument{"Invalid literal character: "s + d};
 x += d;
 if (x < d)
 throw std::overflow_error{"Integer overflow"};
 }
 return x;
}
template <typename Int>
constexpr Int from_string(const std::string& s)
{
 return from_string<Int>(s.c_str());
}
constexpr uint128 operator""_u128(const char* s)
{
 return from_string<uint128>(s);
}
template <unsigned N>
inline std::string to_string(uint<N> x, int base = 10)
{
 if (base < 2 || base > 36)
 throw std::invalid_argument{"invalid base: " + std::to_string(base)};
 if (x == 0)
 return "0";
 auto s = std::string{};
 while (x != 0)
 {
 // TODO: Use constexpr udivrem_1?
 const auto res = udivrem(x, base);
 const auto d = int(res.rem);
 const auto c = d < 10 ? '0' + d : 'a' + d - 10;
 s.push_back(char(c));
 x = res.quot;
 }
 std::reverse(s.begin(), s.end());
 return s;
}
template <unsigned N>
inline std::string hex(uint<N> x)
{
 return to_string(x, 16);
}
} // namespace intx

Question 2

When you write "complementary to the unsigned __int128 type" do you mean that it's a drop-in replacement? (so one could write #define u128 unsigned __int128 or #define u128 intx::uint<128> depending on whether there's a native 128-bit integer)

Question 3

std::uint64_t is consistently misspelt throughout the code, and may be a poor choice anyway (since an exact 64-bit type need not be provided). It's better to use one of std::uint_fast64_t or std::uint_least64_t instead, for better portability. (Edit: as suggested by supercat in a comment, only the exact-width type will work correctly here, given the implicit masking we rely on for detecting carry and elsewhere, so disregard this recommendation: we want to fail to compile where this won't hold.)

I'd expect constructors from other intx::uint<> instantiations (obviously, the narrowing conversions should be explicit).

The ++ and -- operators could be implemented much more efficiently by using the members rather than converting and adding/subtracting. Unary - might also be better implemented element-wise.

It's good to see that you've included a specialization of std::numeric_limits for this type. A minor nitpick: I think that digits10 needs to round up rather than down. It's unfortunate that we're not allowed to specialize std::is_integral too.

A few problems in from_string():

Style (minor): writing sizeof(x) instead of simply sizeof x makes it look like x is a type name.
Actually, there's a more serious error in that line, in assuming that char is 8 bits (2 hex digits), rather than using CHAR_BIT - I'd write sizeof x * CHAR_BIT / 4 there for full portability.
Alternatively, allow redundant leading zeros by simply checking clz(x) >= 4 before shifting.
Also, why are octal inputs to from_string() treated as decimal? That's surprising to me.
It might be useful to have (private) multiply/divide routines for the small multiplicands in from_string() and to_string().
Perhaps we could skip over any leading whitespace and/or +, like the standard conversion functions (std::stoi(), std::from_chars(), etc) do?

These string conversions look like they could easily be adapted to become streaming operators << and >>.

Finally, it's a great shame that you didn't include the unit tests in the review; that would have greatly aided reviewers (especially when making suggestions for improvement).

Question 4

Code which uses objects of type uint64_t can rely upon any values being stored to such types being implicitly masked with 0xFFFFFFFFFFFFFFFFull, and will be rejected on any machines where such masking would not occur. Code which relies upon such masking but uses uint_least64_t would compile on machines where the masking would not occur, but may run incorrectly. Having such code be rejected would be preferable to having it run but produce wrong output.

Question 5

That's a good point @supercat - fully agreed.

Question 6

A bit of necromancy, since this came up in a web search.

On Clang and ICX, the __int128_t and __uint128_t types work with no warnings. #ifdef __clang__ detects either of them.
On GCC, the __int128__ and unsigned __int128__ types work with no warnings, even with -Wpedantic on. So you can remove those #pragma GCC diagnostic commands. Since several other compilers also define __GNUC__, check this afterward. or test __GNUC >= 10 (since other compilers that pretend to be GCC pretend to be a very early version).
MSVC does not have this type, but does support a few AVX intrinsics that can accelerate bitwise operations on 128-bit data, including left-shift, right-shift, and, or and xor. One way to synthesize multiplication would be from SIMD high- and low-word horizontal multiplications of the four 64-bit halves of the two operands.
C23 now supports _BitInt and unsigned _BitInt, and some compilers, including Clang, add that as an extension to C++. (This, however, does generate a compiler warning.)
A future implementation that adds support for int128_t will be required to define INT128_MAX in <cstdint>/<stdint.h>, so you can check that this is defined, at least before including any other headers that might define it.

Even though support is spotty and compilers do not accept constants larger than UINT64_MAX, the following declarations work for many definitions of these types:

#define INT128_MAX (~((int128_t)1 << 127U))
#define INT128_MIN ((int128_t)1 << 127U)
#define UINT128_MAX ((uint128_t)-1)
#define UINT128_MIN ((uint128_t)0)

With these definitions, GCC 20 and Clang 15 do not support stream output with <<, but do support <format>, so this works (although not in C++14):

std::println("{:L} {:L} {:#x} {:x}", INT128_MAX, INT128_MIN, UINT128_MAX, UINT128_MIN);

Most importantly, if you declare your 16-byte datatype alignas(16), all atomic operations on it are lock-free on x86_64 and ARM64. This the most practical use for them (given that both targets have other ways to get the high word of 64-bit multiplication).

Question 7

Comparison: you use | instead of ||. I suggest you don't, and that is why: even if the left test is successful, the right test will be run also. Unlike |, || short-circuits, i.e. if the left test succeeds, the right test isn't run. Same applies to & vs. &&.

Toby Speight Toby Speight 87.9k14 gold badges104 silver badges325 bronze badges · Answer 1 · 2019-05-23 11:30:32Z

std::uint64_t is consistently misspelt throughout the code, and may be a poor choice anyway (since an exact 64-bit type need not be provided). It's better to use one of std::uint_fast64_t or std::uint_least64_t instead, for better portability. (Edit: as suggested by supercat in a comment, only the exact-width type will work correctly here, given the implicit masking we rely on for detecting carry and elsewhere, so disregard this recommendation: we want to fail to compile where this won't hold.)

I'd expect constructors from other intx::uint<> instantiations (obviously, the narrowing conversions should be explicit).

The ++ and -- operators could be implemented much more efficiently by using the members rather than converting and adding/subtracting. Unary - might also be better implemented element-wise.

It's good to see that you've included a specialization of std::numeric_limits for this type. A minor nitpick: I think that digits10 needs to round up rather than down. It's unfortunate that we're not allowed to specialize std::is_integral too.

A few problems in from_string():

Style (minor): writing sizeof(x) instead of simply sizeof x makes it look like x is a type name.
Actually, there's a more serious error in that line, in assuming that char is 8 bits (2 hex digits), rather than using CHAR_BIT - I'd write sizeof x * CHAR_BIT / 4 there for full portability.
Alternatively, allow redundant leading zeros by simply checking clz(x) >= 4 before shifting.
Also, why are octal inputs to from_string() treated as decimal? That's surprising to me.
It might be useful to have (private) multiply/divide routines for the small multiplicands in from_string() and to_string().
Perhaps we could skip over any leading whitespace and/or +, like the standard conversion functions (std::stoi(), std::from_chars(), etc) do?

These string conversions look like they could easily be adapted to become streaming operators << and >>.

Finally, it's a great shame that you didn't include the unit tests in the review; that would have greatly aided reviewers (especially when making suggestions for improvement).

Code which uses objects of type uint64_t can rely upon any values being stored to such types being implicitly masked with 0xFFFFFFFFFFFFFFFFull, and will be rejected on any machines where such masking would not occur. Code which relies upon such masking but uses uint_least64_t would compile on machines where the masking would not occur, but may run incorrectly. Having such code be rejected would be preferable to having it run but produce wrong output.

Davislor Davislor 9,10519 silver badges39 bronze badges · Answer 2 · 2025-08-29 11:17:05Z

A bit of necromancy, since this came up in a web search.

On Clang and ICX, the __int128_t and __uint128_t types work with no warnings. #ifdef __clang__ detects either of them.
On GCC, the __int128__ and unsigned __int128__ types work with no warnings, even with -Wpedantic on. So you can remove those #pragma GCC diagnostic commands. Since several other compilers also define __GNUC__, check this afterward. or test __GNUC >= 10 (since other compilers that pretend to be GCC pretend to be a very early version).
MSVC does not have this type, but does support a few AVX intrinsics that can accelerate bitwise operations on 128-bit data, including left-shift, right-shift, and, or and xor. One way to synthesize multiplication would be from SIMD high- and low-word horizontal multiplications of the four 64-bit halves of the two operands.
C23 now supports _BitInt and unsigned _BitInt, and some compilers, including Clang, add that as an extension to C++. (This, however, does generate a compiler warning.)
A future implementation that adds support for int128_t will be required to define INT128_MAX in <cstdint>/<stdint.h>, so you can check that this is defined, at least before including any other headers that might define it.

Even though support is spotty and compilers do not accept constants larger than UINT64_MAX, the following declarations work for many definitions of these types:

#define INT128_MAX (~((int128_t)1 << 127U))
#define INT128_MIN ((int128_t)1 << 127U)
#define UINT128_MAX ((uint128_t)-1)
#define UINT128_MIN ((uint128_t)0)

With these definitions, GCC 20 and Clang 15 do not support stream output with <<, but do support <format>, so this works (although not in C++14):

std::println("{:L} {:L} {:#x} {:x}", INT128_MAX, INT128_MIN, UINT128_MAX, UINT128_MIN);

Most importantly, if you declare your 16-byte datatype alignas(16), all atomic operations on it are lock-free on x86_64 and ARM64. This the most practical use for them (given that both targets have other ways to get the high word of 64-bit multiplication).

coderodde coderodde 31.8k15 gold badges77 silver badges202 bronze badges · Answer 3 · 2025-08-29 12:59:38Z

Comparison: you use | instead of ||. I suggest you don't, and that is why: even if the left test is successful, the right test will be run also. Unlike |, || short-circuits, i.e. if the left test succeeds, the right test isn't run. Same applies to & vs. &&.

Stack Exchange Network

128-bit unsigned integer

3 Answers 3

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

128-bit unsigned integer

3 Answers 3

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions