Arbitrary-layout Floating-point Number conversion library

Question 1

This is a library I've been writing, for now named ALFPN, for converting between native floats and layouts that are not native - half, various 8-bit formats etc. Any number of exponent and mantissa bits should be supported, sign bit is optional, Inf/NaN support is optional, subnormals are not optional.

There is an intermediate class used for conversions that stores the sign, exponent, mantissa and classification as separate variables. The Numbers can be constructed from and casted to all native numeric types - floating and integral. Formats matching the native types can be easily created (using from_native_t<> template) (though for ints it is still sign-magnitude and not 2s complement).

I'm also providing some tests, comparisons of conversions done by my code and by FPU. All seem to pass (with expected exceptions listed in comments).

Here is the code:

#pragma once
//#include <cstdint>
//#include <cstring>
#include <iostream>
#include <bitset>
#include <cmath>
#include <cfloat> // FLT_ROUNDS
#include <bit>
#include <type_traits>
#include <limits>
#define PRT(x) std::cout << #x << "\t" << x << "\n"
#define PRTB(x) std::cout << #x << "\t" << std::bitset<8*sizeof(x)>(x) << "\n"
// Template for arbitrary float representation
// Resources:
// https://stackoverflow.com/a/3542975/7321403
// https://en.wikipedia.org/wiki/Minifloat
#if (__cplusplus >= 202302L)
#define cmath_var constexpr
#define cmath_ret constexpr
#else
#define cmath_var const
#define cmath_ret 
#endif
//#define DEBUG_ROUNDING
namespace ALFPN
{
#ifndef ALFPN_EXT_TYPES
 using exp_t = std::intmax_t;
 using mnt_t = std::uintmax_t;
#endif
 static_assert(std::is_signed_v<exp_t>);
 static_assert(std::is_unsigned_v<mnt_t>);
 using bits_t = int; // size_t;// unsigned int;
 namespace
 {
 template <typename T>
 constexpr T safe_shift_left(T val, bits_t shift)
 {
 static_assert(std::is_integral_v<T>, "T must be an integral type");
 constexpr size_t bit_width = sizeof(T) * CHAR_BIT;
 if (shift >= bit_width)
 return 0;
 return val << shift;
 }
 template <typename T>
 constexpr T safe_shift_right(T val, bits_t shift)
 {
 static_assert(std::is_integral_v<T>, "T must be an integral type");
 constexpr size_t bit_width = sizeof(T) * CHAR_BIT;
 if (shift >= bit_width)
 return (std::is_signed_v<T> && val < 0) ? -1 : 0;
 return val >> shift;
 }
 template <typename T>
 constexpr T safe_shift(T val, int shift)
 {
 static_assert(std::is_integral_v<T>, "T must be an integral type");
 size_t abssh = std::abs(shift);
 if (shift < 0) // "decrease" power
 return safe_shift_right(val, abssh);
 else
 return safe_shift_left(val, abssh);
 }
 template <typename T>
 constexpr T cexpr_exp2(int n)
 {
 T result = 1;
 T base = 2;
 bool neg = n < 0;
 n = neg ? -n : n;
 while (n)
 {
 if (n % 2)
 result *= base;
 base *= base;
 n /= 2;
 }
 return neg ? (1 / result) : result;
 }
 template <typename T>
 constexpr T mask_lowest_N_bits(size_t N)
 {
 using UT = std::make_unsigned_t<T>;
 constexpr size_t len = sizeof(T) * CHAR_BIT;
 bool nonzero = N; // (N != 0)
 //bits_t subn_pwr = (len - N % len) % len;
 if (N > len)
 N = len;
 size_t shift = (len - N) % len;
 //size_t subn_pwr = (0 - N) & (len-1);
 return static_cast<UT>(static_cast<UT>(0) - static_cast<UT>(nonzero)) >> shift;
 }
 /*/
 template <typename T>
 constexpr T mask_highest_N_bits(size_t N)
 {
 using UT = std::make_unsigned_t<T>;
 constexpr size_t len = sizeof(T) * CHAR_BIT;
 bool nonzero = N; // (N != 0)
 //bits_t subn_pwr = (len - N % len) % len;
 size_t subn_pwr = (0 - N) % len;
 //size_t subn_pwr = (0 - N) & (len-1);
 return static_cast<UT>(static_cast<UT>(0) - static_cast<UT>(nonzero)) << subn_pwr;
 }
 //*/
 template<typename T>
 constexpr bool check_sub(T a, T b)
 {
 static_assert(std::is_integral_v<T>, "Only supports integers");
 if (b < 0)
 return a <= std::numeric_limits<T>::max() + b;
 else
 return a >= std::numeric_limits<T>::min() + b;
 }
 constexpr bits_t width1b = sizeof(uint_least8_t) * CHAR_BIT;
 constexpr bits_t width2b = sizeof(uint_least16_t) * CHAR_BIT;
 constexpr bits_t width4b = sizeof(uint_least32_t) * CHAR_BIT;
 constexpr bits_t width8b = sizeof(uint_least64_t) * CHAR_BIT;
 template <size_t N>
 using LeastUIntBits = std::tuple_element_t< (N > width1b) + (N > width2b) + (N > width4b) + (N > width8b),
 std::tuple<uint_least8_t, uint_least16_t, uint_least32_t, uint_least64_t, uintmax_t>
 >;
 class NumberParent {};
 template <class T>
 constexpr bool is_instance_of_Number_v = std::is_base_of_v<NumberParent, T>;
 }
 //========================================//
 //========================================//
 //========================================//
 constexpr bits_t ExpMaxBits = sizeof(exp_t) * CHAR_BIT;
 constexpr bits_t MntMaxBits = sizeof(mnt_t) * CHAR_BIT;
 constexpr exp_t ExpBjs_Auto = std::numeric_limits<exp_t>::min();
 enum class FPclss
 {
 Zero = FP_ZERO,
 Normal = FP_NORMAL,
 Subnorm = FP_SUBNORMAL,
 Inf = FP_INFINITE,
 NaN = FP_NAN,
 };
 using FPrndg = std::float_round_style;
 //========================================//
 //========================================//
 template<
 size_t P_ExpBits,
 size_t P_MntBits,
 bool P_IsSigned = true,
 bool P_HasInfNan = true,
 exp_t P_ExpBjs = ExpBjs_Auto
 >
 class Number;
 //========================================//
 //========================================//
 template<typename T, typename = void>
 struct native_type_info {};
 //template<std::floating_point F>
 template<typename F>
 struct native_type_info<F, std::enable_if_t<std::is_floating_point_v<F>>>
 {
 public:
 static constexpr bool has_sgn = std::numeric_limits<F>::is_signed;
 static constexpr bool has_infnan = std::numeric_limits<F>::has_infinity || std::numeric_limits<F>::has_quiet_NaN || std::numeric_limits<F>::has_signaling_NaN;
 private:
 static constexpr size_t mexp = std::numeric_limits<F>::max_exponent - std::numeric_limits<F>::min_exponent + has_infnan;
 public:
 static constexpr std::size_t expb = std::bit_width(mexp); // count bits of exponent
 static constexpr std::size_t mntb = std::numeric_limits<F>::digits - 1; // subtract one implicit bit
 static constexpr exp_t expbjs = ExpBjs_Auto;
 };
 //template<std::integral I>
 template<typename I>
 struct native_type_info<I, std::enable_if_t<std::is_integral_v<I>>>
 {
 public:
 static constexpr bool has_sgn = std::numeric_limits<I>::is_signed;
 static constexpr bool has_infnan = false;
 static constexpr std::size_t expb = 0; // 0 of exponent
 static constexpr std::size_t mntb = std::numeric_limits<I>::digits; // sign already subtracted
 //static constexpr exp_t expbjs = -(static_cast<exp_t>(1) << (mntb - 1));
 static constexpr exp_t expbjs = -static_cast<exp_t>(mntb) + 1;
 };
 template<typename T, typename = void>
 struct from_native
 {
 private:
 using TI = native_type_info<T>;
 public:
 using type = Number<TI::expb, TI::mntb, TI::has_sgn, TI::has_infnan, TI::expbjs>;
 };
 template<typename T>
 using from_native_t = from_native<T>::type;
 //========================================//
 // Struct for storing "unpacked" float data - widest possible exponent and widest possible mantissa.
 struct Unpacked
 {
 template <typename F, std::enable_if_t<std::is_floating_point_v<F>, bool> = true>
 //static constexpr bits_t TypeMntBits = std::min<bits_t>(MntMaxBits, std::numeric_limits<F>::digits - 1); // -0 works, -1 works, -2 and more fails
 static constexpr bits_t TypeMntBits = std::numeric_limits<F>::digits - 1; // -0 works, -1 works, -2 and more fails
 template <typename F, std::enable_if_t<std::is_floating_point_v<F>, bool> = true>
 static constexpr F TypeMntMult = cexpr_exp2<F>(TypeMntBits<F>); // std::ldexp(static_cast<F>(1), TypeMntBits<F>);
 public:
 FPclss clss = FPclss::Zero; // Class cannot be Subnorm (except as return from .format). Exp and Mnt are meaningless if class is not (Sub)Normal. 
 exp_t e = 0; // ALWAYS unbiased (actual power of 2)
 mnt_t m = 0; // ALWAYS normalized (implicit 1.mmmmmm) (except when Subnorm as return from .format).
 bool s = false;
 Unpacked() = default;
 ~Unpacked() = default;
 Unpacked(const Unpacked&) = default;
 Unpacked(Unpacked&&) = default;
 Unpacked& operator=(const Unpacked&) = default;
 template <typename F, std::enable_if_t<std::is_floating_point_v<F>, bool> = true>
 cmath_ret Unpacked(F f)
 {
 clss = static_cast<FPclss>(std::fpclassify(f));
 s = std::signbit(f);
 f = std::abs(f);
 switch (clss)
 {
 case FPclss::Subnorm:
 case FPclss::Normal:
 {
 clss = FPclss::Normal;
 e = std::ilogb(f); // get exponent
 f /= std::scalbn(static_cast<F>(1), e); // normalize to [1, 2)
 // choose (A) or (B) or (C)
 // I dont think working in subnormal range (if it even exists) is a good idea, so I'd skip (A)
 // (B) seems faster than (C) but idk
 //f -= 1; // (A) subtract implicit bit, [1, 2) => [0, 1)
 //f *= TypeMntMult<F>; // (A) convert to "integer"
 f *= TypeMntMult<F>; // (B) convert to "integer"
 f -= TypeMntMult<F>; // (B) subtract implicit bit, "[1, 2)" => "[0, 1)"
 //f = std::fma(f, TypeMntMult<F>, -TypeMntMult<F>); // (C) fused MultiplyAdd, same as (B)
 f = std::nearbyint(f); // round before casting, in case of precision greater than MntMaxBits
 m = static_cast<mnt_t>(f) << (MntMaxBits - TypeMntBits<F>);
 break;
 }
 case FPclss::Zero:
 case FPclss::Inf:
 case FPclss::NaN:
 break;
 default:
 throw std::invalid_argument("Invalid classification");
 }
 }
 template <typename I, std::enable_if_t<std::is_integral_v<I>, bool> = true>
 cmath_ret Unpacked(I i)
 {
 using UI = std::make_unsigned_t<I>;
 if (i == 0)
 {
 clss = FPclss::Zero;
 return;
 }
 clss = FPclss::Normal;
 UI ui = i;
 if constexpr (std::is_signed_v<I>)
 {
 s = (i < 0);
 if (s)
 ui = static_cast<UI>(0) - ui; // abs with type change
 }
 
 m = static_cast<mnt_t>(ui);
 size_t pos = std::countl_zero(m);
 e = MntMaxBits - (pos + 1);
 //m = safe_shift_left(m, pos+1);
 m <<= pos;
 m <<= 1; // remove implicit 1.
 }
 template <typename F, std::enable_if_t<std::is_floating_point_v<F>, bool> = true>
 explicit cmath_ret operator F () const // reverse the code of constructor
 {
 F f;
 Unpacked n = format<false>(std::numeric_limits<F>::max_exponent - 1, std::numeric_limits<F>::min_exponent - 1, std::numeric_limits<F>::digits - 1);
 switch (n.clss)
 {
 case FPclss::Normal:
 {
 f = static_cast<F>(n.m >> (MntMaxBits - TypeMntBits<F>));
 // choose (A) or (B)
 // I dont think working in subnormal range (if it even exists) is a good idea, so I'd skip (A)
 //f /= TypeMntMult<F>; // (A) deconvert from "integer"
 //f += 1; // (A) add implicit bit, normalize [0, 1) => [1, 2)
 f += TypeMntMult<F>; // (B) add implicit bit, normalize "[0, 1)" => "[1, 2)"
 f /= TypeMntMult<F>; // (B) deconvert from "integer"
 f *= std::scalbn(static_cast<F>(1), n.e);
 break;
 }
 case FPclss::Zero:
 f = static_cast<F>(0);
 break;
 case FPclss::Inf:
 f = std::numeric_limits<F>::infinity();
 break;
 case FPclss::NaN:
 f = std::numeric_limits<F>::quiet_NaN();
 break;
 default:
 throw std::invalid_argument("Invalid classification");
 }
 //if (u.s)
 // f = std::copysign(f, static_cast<F>(-1));
 //f = -f;
 f = std::copysign(f, static_cast<F>(0) - n.s);
 return f;
 }
 
 template <typename I, std::enable_if_t<std::is_integral_v<I>, bool> = true>
 explicit cmath_ret operator I () const
 {
 I i;
 Unpacked n = format<true>(-std::numeric_limits<I>::digits, -std::numeric_limits<I>::digits + 1, std::numeric_limits<I>::digits);
 std::cout << n << std::endl;
 switch (n.clss)
 {
 case FPclss::Subnorm:
 {
 i = static_cast<I>(n.m >> (MntMaxBits - std::numeric_limits<I>::digits));
 break;
 }
 case FPclss::Zero:
 i = static_cast<I>(0);
 break;
 case FPclss::Inf:
 i = s ? std::numeric_limits<I>::min() : std::numeric_limits<I>::max();
 break;
 case FPclss::NaN:
 i = static_cast<I>(0);
 break;
 default:
 throw std::invalid_argument("Invalid classification");
 }
 if (n.s)
 i = -i;
 return i;
 }
 template <bool subnormalize = false>
 Unpacked format(exp_t ExpMaxVal, exp_t ExpMinVal, size_t MntBits) const
 {
 FPrndg rndg = static_cast<FPrndg>(FLT_ROUNDS);
 const exp_t ExpSubVal = ExpMinVal - static_cast<exp_t>(MntBits);
 Unpacked ret(*this);
 switch (clss)
 {
 case FPclss::Normal:
 {
 exp_t subn_pwr = (e < ExpMinVal) ? (ExpMinVal - e) : 0;
 const mnt_t RMNDR_MASK = mask_lowest_N_bits<mnt_t>(MntMaxBits - MntBits + subn_pwr);
 const mnt_t STICKY_MASK = mask_lowest_N_bits<mnt_t>(MntMaxBits - MntBits + subn_pwr - 1);
 const mnt_t RNDGBIT_MASK = RMNDR_MASK & ~STICKY_MASK;
 const mnt_t UNCHNGD_MASK = ~RMNDR_MASK;
 const mnt_t RSLTLSB_MASK = UNCHNGD_MASK & ~(UNCHNGD_MASK << 1);
 bool roundup = false;
 bool denyinf = false;
 bool denyzro = false;
 switch (rndg)
 {
 case FPrndg::round_toward_zero: // simple clear
 roundup = false;
 denyinf = true;
 denyzro = false;
 break;
 case FPrndg::round_toward_infinity: // round up if positive
 roundup = !s && (m & RMNDR_MASK);
 denyinf = s;
 denyzro = !s;
 break;
 case FPrndg::round_toward_neg_infinity: // round up if negative
 roundup = s && (m & RMNDR_MASK);
 denyinf = !s;
 denyzro = s;
 break;
 case FPrndg::round_to_nearest: // oh god
 case FPrndg::round_indeterminate:
 {
 bool rndg_bit = (m & RNDGBIT_MASK) || (!RNDGBIT_MASK);
 bool stck_bit = (m & STICKY_MASK);
 bool rslt_lsb = (m & RSLTLSB_MASK) || (!RSLTLSB_MASK && RNDGBIT_MASK);
 roundup = rndg_bit && stck_bit || rndg_bit && rslt_lsb;
 denyinf = false;
 denyzro = false;
 break;
 }
 default:
 throw std::invalid_argument("Unknown rounding mode");
 }
#ifdef DEBUG_ROUNDING
 std::cout << "\n\n";
 PRTB((((RMNDR_MASK))));
 PRTB((((UNCHNGD_MASK))));
 PRTB((((STICKY_MASK))));
 PRTB((((RNDGBIT_MASK))));
 PRTB((((RSLTLSB_MASK))));
 std::cout << "\n";
 PRTB(((m | m | m | m)));
 PRTB((m & RNDGBIT_MASK));
 PRTB((m & STICKY_MASK));
 PRTB((m & RSLTLSB_MASK));
#endif
 ret.m &= UNCHNGD_MASK;
 if (roundup)
 {
 ret.m += RSLTLSB_MASK;
 if (ret.m == 0)// [[unlikely]]
 {
#ifdef DEBUG_ROUNDING
 std::cout << "Overflow of mantissa" << std::endl;
#endif
#ifdef DEBUG_ROUNDING
 PRTB(RSLTLSB_MASK);
#endif
 ++ret.e;
 if (ret.e == std::numeric_limits<exp_t>::min()) [[unlikely]] // will this ever happen?
 {
 --ret.e;
 ret.clss = FPclss::Inf;
 throw std::range_error("Underflow of exponent");
 break;
 }
 }
 }
 if (ret.e > ExpMaxVal)
 {
 if (!denyinf)
 {
 ret.clss = FPclss::Inf;
 //ret.e = 0;
 //ret.m = 0;
 break;
 }
 else
 {
 ret.e = ExpMaxVal;
 ret.m = std::numeric_limits<mnt_t>::max();
 }
 }
 //*/
 if (ret.e < ExpSubVal)
 {
 if (!denyzro)
 {
 ret.clss = FPclss::Zero;
 //ret.e = 0;
 //ret.m = 0;
 break;
 }
 else
 {
 ret.e = ExpSubVal;
 //ret.m = 0;
 }
 
 }
 //*/
 if constexpr (subnormalize)
 if (ret.e < ExpMinVal)
 {
 exp_t subn_pwr = ExpMinVal - ret.e;
 ret.clss = FPclss::Subnorm;
 ret.e += subn_pwr;
 mnt_t l = safe_shift<mnt_t>(1, MntMaxBits - subn_pwr);
 #ifdef DEBUG_ROUNDING
 std::cout << "M, M>>, L, M|L\n";
 PRTB((ret.m | ret.m));
 #endif
 ret.m >>= subn_pwr; // ok
 #ifdef DEBUG_ROUNDING
 PRTB((ret.m | ret.m));
 PRTB((l | l | l | l));
 #endif
 ret.m |= l;
 #ifdef DEBUG_ROUNDING
 PRTB((ret.m | ret.m));
 std::cout << "\n";
 #endif
 }
 break;
 }
 case FPclss::Zero:
 case FPclss::Inf:
 case FPclss::NaN:
 break;
 default:
 throw std::invalid_argument("Invalid classification");
 }
 //ret.clss = FPclss::NaN;
 return ret;
 }
 };
 std::ostream& operator<<(std::ostream& os, const Unpacked& u)
 {
 //os << "(" << std::underlying_type_t<FPclss>(u.clss) << ") " << (u.s ? '-' : '+') << "0x1." << std::bitset<MntMaxBits>(u.m) << 'p' << u.e;
 os << (u.s ? '-' : '+');
 switch (u.clss)
 {
 case FPclss::Normal:
 os << "0x1." << std::bitset<MntMaxBits>(u.m) << 'p' << u.e; // this will fail for non-native mnt_t type
 break;
 case FPclss::Subnorm:
 os << "0x0." << std::bitset<MntMaxBits>(u.m) << 'p' << u.e;
 break;
 case FPclss::Zero:
 os << "0x0.0";
 break;
 case FPclss::Inf:
 os << "Inf";
 break;
 case FPclss::NaN:
 os << "NaN";
 break;
 default:
 throw std::invalid_argument("Invalid classification");
 }
 return os;
 }
 //========================================//
 //========================================//
 //========================================//
#define DOASSERT
#define DOTHROWINF
 //========================================//
#define HIDE_IMPL public
#ifdef DOASSERT
 #define ALFPN_ASSERT(x) static_assert((x))
#else
 #define ALFPN_ASSERT(x) 
#endif
 template<
 size_t P_ExpBits,
 size_t P_MntBits,
 bool P_IsSigned,
 bool P_HasInfNan,
 exp_t P_ExpBjs
 >
 class Number : private NumberParent
 {
 using this_t = Number;
 //template<size_t, size_t, bool, bool, bool, exp_t>
 //friend class Number;
 //
 //static_assert(P_ExpBits >= size_t(P_HasInfNan || P_HasSbnrmls));
 //static_assert(P_ExpBits >= 1);
 //static_assert(P_MntBits > 0);
 HIDE_IMPL:
 static constexpr size_t ExpBits = P_ExpBits;
 static constexpr size_t MntBits = P_MntBits;
 static constexpr bool HAS_SGN = P_IsSigned;
 static constexpr bool HAS_INF = P_HasInfNan && P_ExpBits > 0;
 static constexpr bool HAS_NAN = P_HasInfNan && P_ExpBits > 0 && P_MntBits > 0;
 static constexpr bool HAS_SUB = true;
 // P_ can only be used above this line
 static constexpr size_t TtlBits = MntBits + ExpBits + HAS_SGN; // Required space
 using raw_t = LeastUIntBits<TtlBits>;
 using Utype = std::make_unsigned_t<raw_t>;
 using Stype = std::make_signed_t<raw_t>;
 using Rtype = std::conditional_t<HAS_SGN, Stype, Utype>;
 static constexpr size_t TypeBits = sizeof(raw_t) * CHAR_BIT; // Storage space
 // We have to make sure that the format fits in the selected underlying type
 ALFPN_ASSERT(TtlBits <= TypeBits);
 //========================================//
 //========================================//
 HIDE_IMPL:
 static constexpr Utype ExpNulBits = 0;
 static constexpr Utype ExpAllBits = mask_lowest_N_bits<Utype>(ExpBits);
 static constexpr Utype ExpMinBits = ExpNulBits + 1; // Reduce range by 1 for subnormals
 static constexpr Utype ExpMaxBits = ExpAllBits - HAS_INF; // Reduce range by 1 for Inf/NaN
 static constexpr Utype MntZroBits = 0;
 static constexpr Utype MntAllBits = mask_lowest_N_bits<Utype>(MntBits);
 //
 static constexpr size_t MntShift = 0; // Position of Mantissa
 static constexpr size_t ExpShift = MntBits; // Position of Exponent
 static constexpr size_t SgnShift = MntBits + ExpBits; // Position of Sign
 static constexpr Utype MntMask = MntAllBits << MntShift; // Bitmask of Mantissa
 static constexpr Utype ExpMask = ExpAllBits << ExpShift; // Bitmask of Exponent
 static constexpr Utype SgnMask = Utype(0 - HAS_SGN) & ~(MntMask | ExpMask); // Bitmask of Sign
 
 //
 static constexpr exp_t ExpBjs = (P_ExpBjs == ExpBjs_Auto) ? (ExpAllBits / 2) : P_ExpBjs;
 // The smallest and largest actual (unbiased) exponent must fit into the Unpacked format. 
 ALFPN_ASSERT(check_sub<exp_t>(ExpMinBits, ExpBjs));
 ALFPN_ASSERT(check_sub<exp_t>(ExpMaxBits, ExpBjs));
 //static constexpr exp_t ExpNulVal = static_cast<exp_t>(ExpNulBits) - ExpBjs; // no guarantee of existence, should be unused
 //static constexpr exp_t ExpInfVal = static_cast<exp_t>(ExpAllBits) - ExpBjs; // same
 static constexpr exp_t ExpMinVal = static_cast<exp_t>(ExpMinBits) - ExpBjs;
 static constexpr exp_t ExpMaxVal = static_cast<exp_t>(ExpMaxBits) - ExpBjs;
 // The smallest subnormal must fit into the Unpacked format.
 ALFPN_ASSERT(check_sub<exp_t>(ExpMinVal, MntBits));
 //static constexpr exp_t ExpSubVal = ExpMinVal - static_cast<exp_t>(HAS_SUB ? MntBits : 0);
 //ALFPN_ASSERT(ExpMinVal <= ExpMaxVal); // false for integers
 static constexpr mnt_t MntZroVal = 0; // un-use?
 //========================================//
 HIDE_IMPL:
 Utype strg = 0;
 //========================================//
 //========================================//
 public:
 static constexpr this_t Inf()
 {
 if constexpr (HAS_INF)
 {
 this_t temp;
 temp.set_exp_raw(ExpAllBits);
 return temp;
 }
 else
 {
#ifdef DOTHROWINF
 throw std::range_error("This format has no Inf representation!");
#endif
 this_t temp;
 temp.set_exp_raw(ExpMaxBits);
 temp.set_mnt_raw(MntAllBits);
 return temp;
 }
 }
 static constexpr this_t NaN()
 {
 if constexpr (HAS_NAN)
 {
 this_t temp;
 temp.set_exp_raw(ExpAllBits);
 temp.set_mnt_raw(MntAllBits & ~(MntAllBits >> 1));
 return temp;
 }
 else
 throw std::domain_error("This format has no NaN representation!");
 }
 static constexpr this_t Zero()
 {
 this_t temp;
 return temp;
 }
 //========================================//
 //========================================//
 public:
 Number() = default;
 ~Number() = default;
 Number(const Unpacked& u)
 {
 pack(u);
 }
 template <typename N, std::enable_if_t<std::is_arithmetic_v<N>, bool> = true>
 cmath_ret Number(N n) : Number(Unpacked(n))
 {
 }
 constexpr Number(const Number&) = default;
 constexpr Number(Number&&) = default;
 template<typename RHS, std::enable_if_t<!std::is_same_v<RHS, this_t> && is_instance_of_Number_v<RHS>, bool> = true>
 Number(const RHS& rhs)
 {
 operator=(rhs);
 }
 //========================================//
 //========================================//
 HIDE_IMPL:
 inline constexpr Utype get_exp_raw() const
 {
 return (strg & ExpMask) >> ExpShift;
 }
 inline constexpr Utype get_mnt_raw() const
 {
 return (strg & MntMask) >> MntShift;
 }
 inline constexpr exp_t get_exp() const
 {
 return static_cast<exp_t>(get_exp_raw()) - ExpBjs;
 }
 inline constexpr mnt_t get_mnt() const
 {
 return static_cast<mnt_t>(get_mnt_raw()) << (MntMaxBits - MntBits);
 }
 //========================================//
 inline constexpr void set_exp_raw(Utype e)
 {
 strg &= ~ExpMask;
 strg |= (e << ExpShift) & ExpMask;
 }
 inline constexpr void set_mnt_raw(Utype m)
 {
 strg &= ~MntMask;
 strg |= m & MntMask;
 }
 inline constexpr void set_exp(exp_t e)
 {
 if (e < ExpMinVal || e > ExpMaxVal)
 throw std::range_error("Invalid exponent");
 set_exp_raw(static_cast<Utype>(e + ExpBjs));
 }
 inline constexpr void set_mnt(mnt_t m)
 {
 mnt_t mnt = m >> (MntMaxBits - MntBits);
 return set_mnt_raw(static_cast<Utype>(mnt));
 }
 //========================================//
 inline constexpr void set_sgn(bool neg)
 {
 if constexpr (HAS_SGN)
 {
 if (neg)
 strg |= SgnMask;
 else
 strg &= ~SgnMask;
 }
 else
 if (neg)
 throw std::range_error("This format has no sign!");
 }
 //========================================//
 //public?
 inline constexpr this_t& be_nan()
 {
 return operator=(NaN());
 }
 inline constexpr this_t& be_inf()
 {
 return operator=(Inf());
 }
 inline constexpr this_t& be_zero()
 {
 return operator=(Zero());
 }
 //========================================//
 constexpr Unpacked unpack() const
 {
 Unpacked u;
 u.clss = classify();
 u.s = is_neg();
 // if IF commented, Unpacked e & m can have trash - dont care, didnt ask, check your classifications
 if (u.clss == FPclss::Subnorm || u.clss == FPclss::Normal) 
 {
 u.e = get_exp();
 u.m = get_mnt();
 }
 if (u.clss == FPclss::Subnorm)
 {
 auto subn_pwr = std::countl_zero(u.m) + 1;
 u.e += 1; // because exp is actually the smallest normal one
 u.e -= subn_pwr; // decrease exp by subn_pwr
 u.m <<= subn_pwr; // increase mnt by subn_pwr
 u.clss = FPclss::Normal;
 }
 return u;
 }
 constexpr void pack(const Unpacked& u)//const Unpacked& u)
 {
 Unpacked n = u.format<true>(ExpMaxVal, ExpMinVal, MntBits);
 //PRT(n);
 switch (n.clss)
 {
 case FPclss::Normal:
 set_exp(n.e);
 set_mnt(n.m);
 break;
 case FPclss::Subnorm:
 set_exp_raw(ExpNulBits);
 set_mnt(n.m);
 break;
 case FPclss::Zero:
 be_zero();
 break;
 case FPclss::Inf:
 be_inf();
 break;
 case FPclss::NaN:
 be_nan();
 break;
 default:
 throw std::invalid_argument("Invalid classification");
 }
 set_sgn(u.s);
 }
 //========================================//
 //========================================//
 public:
 constexpr FPclss classify() const
 {
 Utype exp = get_exp_raw();
 Utype mnt = get_mnt_raw();
 size_t eidx = (HAS_INF && exp == ExpAllBits) + (exp != ExpNulBits); // 0 = nul, 1 = xxx, 2 = inf
 size_t midx = (mnt != MntZroVal); // 0 = nul, 1 = xxx
 static constexpr FPclss LUT[3][2] =
 { // mnt is zero / mnt is nonzero
 {FPclss::Zero, FPclss::Subnorm }, // exp is zero
 {FPclss::Normal, FPclss::Normal }, // exp is nonzero
 {HAS_INF ? FPclss::Inf : FPclss::Normal, HAS_NAN ? FPclss::NaN : FPclss::Normal }, // exp is inf
 };
 return LUT[eidx][midx];
 }
 //
 inline bool is_inf() const
 {
 return HAS_INF && (classify() == FPclss::Inf);
 }
 inline bool is_nan() const
 {
 return HAS_NAN && (classify() == FPclss::NaN);
 }
 inline bool is_zro() const
 {
 return classify() == FPclss::Zero;
 }
 inline bool is_sub() const
 {
 return classify() == FPclss::Subnorm;
 }
 inline bool is_neg() const
 {
 return HAS_SGN && (strg & SgnMask);
 }
 //========================================//
 //========================================//
 this_t& operator=(const this_t& rhs)
 {
 strg = rhs.strg;
 return *this;
 }
 template<typename RHS, std::enable_if_t<!std::is_same_v<RHS, this_t> && is_instance_of_Number_v<RHS>, bool> = true>
 this_t& operator=(const RHS& rhs)
 {
 Unpacked u = rhs.unpack();
 pack(u);
 return *this;
 }
 template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, bool> = true>
 explicit constexpr operator T () const
 {
 return static_cast<T>(unpack());
 }
 
 Rtype export_sm() const // sign-magnitude representation
 {
 return strg;
 }
 void import_sm(Rtype n) const // sign-magnitude representation
 {
 strg = n;
 }
 Rtype export_1c() const // 1s-complement representation
 {
 return export_sm() ^ ~SgnMask;
 }
 void import_1c(Rtype n) const // 1s-complement representation
 {
 import_sm(n ^ ~SgnMask);
 }
 };
 //
 using Single = from_native<float>::type;
 using Double = from_native<double>::type;
 using LDouble = from_native<long double>::type;
 using Half = Number<5, 10>;
 //using Quad = ALFPN<int128_t, 15, 112>;
 //using Octuple = ALFPN<int256_t, 19, 236>;
}

I will provide the tests as a link. Tests

The code is still a bit dirty - macros for debug, testing, etc.

I'm looking for any advice: optimization (removing branching?), design ideas (eg. how to best allow external types for exp_t, mnt_t and LeastUInt), code structure, exceptions (usage and types), even naming, whatever comes to mind.

Edit: I noticed the mistake in export/import 1's complement - not checking sign.

Edit 2: Here is the updated code, as of 2025年06月06日, implementing the changes suggested by Toby's partial review up to line 450 of the original code.

Question 2

Nice clean code, btw. :-) A couple of if() { yadda; return; } else ... that do not need to be flagged as else. Elvis has left the building... And a couple of places where an 'early return' might remove a bit of indent...

Question 3

I get a few compilation errors. Missing definition of CHAR_BIT is easily resolved by including <climits>. To fix the lack of iostream << for ALFPN::Unpacked, I had to remove line 421.

Question 4

@TobySpeight the overload for iostream << for Unpacked is at line 638. I dont know why it doesn't work for you, it worked for me?

Question 5

Also seems to be a missing include of <stdint.h> (though I'd prefer to namespace-qualify the types and include <cstdint> instead).

Question 6

Do you have unit tests that go with this? It would be really helpful to include them in the review. Failing that, at least some example usage would certainly help our understanding of the code context.

Question 7

Naming

The established convention in C and C++ is that we use all-caps names for preprocessor macros and not for other identifiers (except for initial-caps type name placeholders in templates, which may be as short as one letter). This helps alert readers to these textual transformations that don't obey the rules of scope and precedence we expect from the language.

The macros PRT() and PRTB() conform to this convention, but macros cmath_ver and cmath_ret do not, nor the namespace ALFPN, nor parameter N to mask_lowest_N_bits(), nor variables RMNDR_MASK, RNDGBIT_MASK, etc. That dilutes the value of the convention and makes life harder for readers.

I expect the abbreviations are somewhat meaningful for you, right now, but be warned that names such as RSLTLSB_MASK and P_ExpBjs are an impediment to others (including Future You) trying to understand the code. I recommend the use of more vowels; they are not expensive.

Includes

We use types such as size_t, uint_least64_t, etc without including <stdint.h>, which is an error. That said, I would include instead and change those usages to std::size_t, std::uint_least64_t, etc.

Also missing is <climits>, to provide definition of CHAR_BIT.

Namespaces

We frequently refer to size_t without a definition. I'm guessing that this is intended to be std::size_t (which it will be if you use a Standard Library that exercises its latitude to declare both when only one is asked for - it's not portable to rely on that).

Utility functions

safe_shift_left() and safe_shift_right() both fail to handle negative shifts correctly. We could fix that, but probably simpler to remove these functions and inline into safe_shift(), which is the only one that's actually called:

 template<std::integral T>
 constexpr T safe_shift(T val, int shift)
 {
 constexpr auto bit_width = sizeof val * CHAR_BIT;
 if (shift <= -bit_width) {
 // overshift right
 return val >> (bit_width - 1);
 } else if (shift < 0) {
 return val >> -shift;
 } else if (shift < bit_width) {
 return val << shift;
 } else {
 // overshift left
 return 0;
 }
 }

The computation sizeof (T) * CHAR_BIT occurs in several places; it might be worth writing a small template for this:

 template<typename T>
 int type_width = sizeof (T) * CHAR_BIT;

(Not called bit_width, to avoid confusion with standard-library function of that name)

Standard warning about possible overflow in cexpr_exp2() when n == INT_MIN. We might be able to prove that would never happen, but then we should have a comment justifying that. Or we could avoid the issue altogether, using result /= for negative n.

mask_lowest_N_bits() is unclear, and the comments aren't especially helpful. In particular, I cannot see why the static_cast of the return value from UT to UT is needed.

I think the intent is that for n = 0, 1, 2, 3, we should return 0, 0b1, 0b11, 0b111, respectively. The usual technique for that would be to form an all-ones value, shift that left by n bits, then complement it:

 template <typename T>
 constexpr T mask_lowest_N_bits(std::size_t N)
 {
 return ~safe_shift(~T{}, n);
 }

Public constants

The public parts of the namespace are as underdocumented as the anonymous namespace. For example, there's no clarity of what ExpBjs_Auto means, or what I might use it for.

Public types

I think it would be clearer to use Concepts than std::enable_if. Compare

template<typename F>
struct native_type_info<F, std::enable_if_t<std::is_floating_point_v<F>>>

with

template<std::floating_point F>
struct native_type_info<F>

(We're evidently not concerned with C++17 compatibility, as we already fail to compile with that standard).

I don't recommend using default in switch statements that are intended to be exhaustive. Doing so inhibits a useful compiler warning when an enumeration is incomplete. For instance, this pattern:

switch (n.clss)
{
case FPclss::Normal:
 f = ...
 break;
case FPclss::Zero:
 f = ...
 break;
case FPclss::Inf:
 f = ...
 break;
case FPclss::NaN:
 f = ...
 break;
default:
 throw std::invalid_argument("Invalid classification");
}
return f;

would be better as

switch (n.clss)
{
case FPclss::Normal:
 return ...
case FPclss::Zero:
 return ...
case FPclss::Inf:
 return ...
case FPclss::NaN:
 return ...
case FPclss::Subnorm:
 throw std::invalid_argument("Unexpected subnormal value");
}
throw std::invalid_argument("Invalid classification");

Generalisation

At present, the code only represents binary floating-point formats. Before getting too deep, it might be worth considering how decimal floating-point differs and in what ways it's the same, so we can share as much of the implementation as possible between the two.

Question 8

I think the usage of all-caps T as a template argument is well accepted and that's not a macro.

Question 9

Your safe_shift isn't safe, it attempts to pass a negative operand to << (in the else block). That's undefined behavior. Original code's comparison to 0 is quite necessary. Original safe_shift helpers for _left and _right don't have a requirement to handle negative inputs, only inputs passed to them by the safe_shift entrypoint.

Question 10

Yes @Ben, single-letter uppercase don't fall into this convention but into a different one. I perhaps over-simplified. I forgot negative shifts are UB; I'll fix that now.

Question 11

I only mentioned the single-letter uppercase because you gave an example of N as breaking the rule on macros.

Question 12

Now mentioned as a specific exception @Ben - thanks.

Toby Speight Toby Speight 87.9k14 gold badges104 silver badges325 bronze badges · Answer 1 · 2025-05-28 08:55:10Z

Naming

The established convention in C and C++ is that we use all-caps names for preprocessor macros and not for other identifiers (except for initial-caps type name placeholders in templates, which may be as short as one letter). This helps alert readers to these textual transformations that don't obey the rules of scope and precedence we expect from the language.

The macros PRT() and PRTB() conform to this convention, but macros cmath_ver and cmath_ret do not, nor the namespace ALFPN, nor parameter N to mask_lowest_N_bits(), nor variables RMNDR_MASK, RNDGBIT_MASK, etc. That dilutes the value of the convention and makes life harder for readers.

I expect the abbreviations are somewhat meaningful for you, right now, but be warned that names such as RSLTLSB_MASK and P_ExpBjs are an impediment to others (including Future You) trying to understand the code. I recommend the use of more vowels; they are not expensive.

Includes

We use types such as size_t, uint_least64_t, etc without including <stdint.h>, which is an error. That said, I would include instead and change those usages to std::size_t, std::uint_least64_t, etc.

Also missing is <climits>, to provide definition of CHAR_BIT.

Namespaces

We frequently refer to size_t without a definition. I'm guessing that this is intended to be std::size_t (which it will be if you use a Standard Library that exercises its latitude to declare both when only one is asked for - it's not portable to rely on that).

Utility functions

safe_shift_left() and safe_shift_right() both fail to handle negative shifts correctly. We could fix that, but probably simpler to remove these functions and inline into safe_shift(), which is the only one that's actually called:

 template<std::integral T>
 constexpr T safe_shift(T val, int shift)
 {
 constexpr auto bit_width = sizeof val * CHAR_BIT;
 if (shift <= -bit_width) {
 // overshift right
 return val >> (bit_width - 1);
 } else if (shift < 0) {
 return val >> -shift;
 } else if (shift < bit_width) {
 return val << shift;
 } else {
 // overshift left
 return 0;
 }
 }

The computation sizeof (T) * CHAR_BIT occurs in several places; it might be worth writing a small template for this:

 template<typename T>
 int type_width = sizeof (T) * CHAR_BIT;

(Not called bit_width, to avoid confusion with standard-library function of that name)

Standard warning about possible overflow in cexpr_exp2() when n == INT_MIN. We might be able to prove that would never happen, but then we should have a comment justifying that. Or we could avoid the issue altogether, using result /= for negative n.

mask_lowest_N_bits() is unclear, and the comments aren't especially helpful. In particular, I cannot see why the static_cast of the return value from UT to UT is needed.

I think the intent is that for n = 0, 1, 2, 3, we should return 0, 0b1, 0b11, 0b111, respectively. The usual technique for that would be to form an all-ones value, shift that left by n bits, then complement it:

 template <typename T>
 constexpr T mask_lowest_N_bits(std::size_t N)
 {
 return ~safe_shift(~T{}, n);
 }

Public constants

The public parts of the namespace are as underdocumented as the anonymous namespace. For example, there's no clarity of what ExpBjs_Auto means, or what I might use it for.

Public types

I think it would be clearer to use Concepts than std::enable_if. Compare

template<typename F>
struct native_type_info<F, std::enable_if_t<std::is_floating_point_v<F>>>

with

template<std::floating_point F>
struct native_type_info<F>

(We're evidently not concerned with C++17 compatibility, as we already fail to compile with that standard).

I don't recommend using default in switch statements that are intended to be exhaustive. Doing so inhibits a useful compiler warning when an enumeration is incomplete. For instance, this pattern:

switch (n.clss)
{
case FPclss::Normal:
 f = ...
 break;
case FPclss::Zero:
 f = ...
 break;
case FPclss::Inf:
 f = ...
 break;
case FPclss::NaN:
 f = ...
 break;
default:
 throw std::invalid_argument("Invalid classification");
}
return f;

would be better as

switch (n.clss)
{
case FPclss::Normal:
 return ...
case FPclss::Zero:
 return ...
case FPclss::Inf:
 return ...
case FPclss::NaN:
 return ...
case FPclss::Subnorm:
 throw std::invalid_argument("Unexpected subnormal value");
}
throw std::invalid_argument("Invalid classification");

Generalisation

At present, the code only represents binary floating-point formats. Before getting too deep, it might be worth considering how decimal floating-point differs and in what ways it's the same, so we can share as much of the implementation as possible between the two.

I think the usage of all-caps T as a template argument is well accepted and that's not a macro.
Your safe_shift isn't safe, it attempts to pass a negative operand to << (in the else block). That's undefined behavior. Original code's comparison to 0 is quite necessary. Original safe_shift helpers for _left and _right don't have a requirement to handle negative inputs, only inputs passed to them by the safe_shift entrypoint.
Yes @Ben, single-letter uppercase don't fall into this convention but into a different one. I perhaps over-simplified. I forgot negative shifts are UB; I'll fix that now.
I only mentioned the single-letter uppercase because you gave an example of N as breaking the rule on macros.

Stack Exchange Network

Arbitrary-layout Floating-point Number conversion library

1 Answer 1

Naming

Includes

Namespaces

Utility functions

Public constants

Public types

Generalisation

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Arbitrary-layout Floating-point Number conversion library

1 Answer 1

Naming

Includes

Namespaces

Utility functions

Public constants

Public types

Generalisation

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions