The code makes following assumptions:
std::string_view
is not null-terminated- there are no exponents, all floating point inputs are in the way 123.123
- limiting digits before and after decimal point
<ctype.h>
free. No locale. Uses ownisspace()
andisdigits()
#include <cstdint>
#include <string_view>
#include <limits>
// based on http://www.jbox.dk/sanos/source/lib/strtod.c.html
namespace to_fp_impl_{
namespace{
constexpr char decimal_point = '.';
auto isspace(char c){
return c == ' ' || c == '\t';
};
auto isdigit(char c){
return c >= '0' && c <= '9';
}
template<typename FP>
void add_char(FP &number, char c){
number = number * 10 + (c - '0');
}
}
template<typename FP>
struct ResultFP{
FP num;
bool ok = true;
static auto err(){
return ResultFP{ 0, false };
}
};
} // namespace to_fp_impl_
template<
typename FP,
uint8_t digits = std::numeric_limits<FP>::digits10,
uint8_t decimals = digits
>
auto to_fp(std::string_view const str){
static_assert(
std::is_floating_point_v<FP> &&
digits <= std::numeric_limits<FP>::digits10 &&
decimals <= std::numeric_limits<FP>::digits10
);
// -----
using namespace to_fp_impl_;
using Result = ResultFP<FP>;
auto it = std::begin(str);
auto end = std::end(str);
// -----
// Skip leading whitespace
while (it != end && isspace(*it))
++it;
// -----
// Handle optional sign
int8_t sign = +1;
if (it != end){
if (*it == '-'){
sign = -1;
++it;
}else if (*it == '+'){
// sign = +1;
++it;
}
}
// -----
FP number = 0.0;
uint8_t total_digits = 0;
// Process digits
{
uint8_t num_digits = 0;
while (it != end && isdigit(*it)){
if (++num_digits > digits)
return Result::err();
add_char(number, *it);
++it;
}
total_digits += num_digits;
}
// -----
// Process decimal part
if (it != end && *it == decimal_point){
++it;
FP exp = 1;
uint8_t num_digits = 0;
while (it != end && isdigit(*it)){
if (++num_digits > decimals)
return Result::err();
add_char(number, *it);
++it;
exp *= 0.1;
}
total_digits += num_digits;
number *= exp;
}
// -----
return Result{ number * sign, total_digits != 0 };
}
template<
typename FP,
uint8_t digits = std::numeric_limits<FP>::digits10,
uint8_t decimals = digits
>
auto to_fp_def(std::string_view const str, FP def = 0){
auto[num, ok] = to_fp<FP, digits, decimals>(str);
return ok ? num : def;
}
template<
uint8_t digits = std::numeric_limits<double>::digits10,
uint8_t decimals = digits
>
auto to_double_def(std::string_view const str, double def = 0){
return to_fp_def<double, digits, decimals>(str, def);
}
template<
uint8_t digits = std::numeric_limits<float>::digits10,
uint8_t decimals = digits
>
auto to_float_def(std::string_view const str, float def = 0){
return to_fp_def<float, digits, decimals>(str, def);
}
#include <cstdio>
int main(){
auto _ = [](const char *s){
const char *mask = ">%32.15f<\n";
constexpr double def = -0.111111111111111;
printf(mask, to_double_def(s, def));
};
_(" ." ); //error
_(" -." ); //error
_(" +." ); //error
_(" .0" );
_(" -.0" );
_(" +.0" );
_(" 0." );
_(" -0." );
_(" +0." );
_(" 0" );
_(" -0" );
_(" +0" );
_(" 100.1" );
_("-100.1" );
_("+100.1" );
_(" 123456789012345" );
_(" 123456789012345." );
_(" .123456789012345" );
_(" 123456789012345.123456789012345" ); // OK, but truncated
_(" -123456789012345" );
_(" -123456789012345." );
_(" -.123456789012345" );
_(" -123456789012345.123456789012345" ); // OK, but truncated
_(" 1234567890123456." ); //error, too long
_(" .1234567890123456" ); //error, too long
_(" 1234567890123456.1234567890123456" ); //error, too long
}
Result:
> -0.111111111111111<
> -0.111111111111111<
> -0.111111111111111<
> 0.000000000000000<
> -0.000000000000000<
> 0.000000000000000<
> 0.000000000000000<
> -0.000000000000000<
> 0.000000000000000<
> 0.000000000000000<
> -0.000000000000000<
> 0.000000000000000<
> 100.100000000000009<
> -100.100000000000009<
> 100.100000000000009<
> 123456789012345.000000000000000<
> 123456789012345.000000000000000<
> 0.123456789012345<
> 123456789012345.250000000000000<
>-123456789012345.000000000000000<
>-123456789012345.000000000000000<
> -0.123456789012345<
>-123456789012345.250000000000000<
> -0.111111111111111<
> -0.111111111111111<
> -0.111111111111111<
Benchmark:
the code is 4.6 x faster than std::strtod()
, but slower than std::from_chars()
https://quick-bench.com/q/cHVL5PW9m4WSp6OvPnrF-ApCrG4
2 Answers 2
hoist out of loop
Consider comparing num_digits
to decimals
at the very end,
so you bench quicker.
FP accumulated error
This expression concerns me:
exp *= 0.1;
We're supposed to be able to roundtrip any expression between FP and string, losslessly. Yet this is an inexact representation of 1/10th.
Consider incrementing an integer and then
computing the exp
multiplier afresh each time
from that integer.
EDIT
Actually, it turns out that this is pretty terrible:
number *= exp;
Why?
Because exp
is inexact.
It is 10 ** -N, e.g. 10 ** -6.
A much better approach would be
to assign it 10 ** N, e.g. a million, and compute
number /= exp;
In this case we're dividing an exact quantity,
an integer (many integers fit fine within a double),
by another exact quantity: number / exp
.
Only then do we round-off to 53 bits.
In contrast, the OP code suffers from a pair of rounding operations:
- infinite repeating binary fraction for 10 ** -N
- division
Truncating twice, to 53 bits, leads to worse errors.
Try it and see.
Good examples are 0.3
and 0.7
.
We don't want to obtain a result
like 0.30000000000000004
or 0.7000000000000001 -- we'd much
rather be able to roundtrip back and forth
without such noise creeping in.
There is a subtler difficulty with this computation. The spec requires rounding toward an even number in some cases.
Let's back up a moment.
A 64-bit double
doesn't quite represent a number.
Rather, it represents an interval on the real number line,
bounded by adjacent bit patterns.
Here, "adjacent" pretty much corresponds to
C cast punning where you treat the significand
bits as an int and use the ++
increment operator.
While the real number line is infinitely divisible,
FP space consists of a finite number of adjacent intervals
(plus NaNs, infinities, and that whole -0.0
thing).
So the double 1.0
covers 1 through 1+ε,
and 2.0
covers 2 through 2 plus a slightly bigger epsilon.
If our division result falls within that ε interval,
then we can roundtrip back and forth all day long
without noise, going from string to double to string to double.
Many numbers are amenable to this, such as 0.2
and even 0.3
.
They are infinite repeating binary fractions
which we truncate to 53 bits and obtain a perfectly nice result.
The fraction goes on infinitely but it lands squarely
within an interval.
The truncation can be viewed as appending an
infinite number of zeros on the end.
For a number that appears to land right on an interval boundary, when rounding to 53 bits we're required to round to even. Kind of a tricky requirement. Again, this is like having infinite zeros to the right.
Here is an example ASCII string which illustrates this:
"0.8750000000582075"
.
The correct to_fp()
result would be
0.8750000000582076 rather than
0.8750000000582075.
Once we see that ...76 ending in the double,
we can roundtrip to string to double to string
all day long and it remains ...76.
The underlying representation for ...75, and for ...76, is
0x1.c00000007ffffp-1
, while for ...77 it is
0x1.c000000080000p-1
.
-
\$\begingroup\$ FP accumulated error - I did changed this to a lookup array with values 1, 0.1, 0.01 and so on. If I benchmark it. Is not faster, but probably error is smaller. However I dont get what do you mean with comparison - what / how do you mean to do it? \$\endgroup\$Nick– Nick2023年07月14日 17:39:57 +00:00Commented Jul 14, 2023 at 17:39
-
\$\begingroup\$ I understand what you mean. That's very clever :) however, if I do it, nobody will understand the code. Also there may be a case like 5.1 where both are equal to 1. So may be better way will be to use bool, but still the code will became hard to understand. \$\endgroup\$Nick– Nick2023年07月14日 19:19:45 +00:00Commented Jul 14, 2023 at 19:19
-
\$\begingroup\$ A "slightly bigger" epsilon is actually twice the size, because that's where we move up to the next exponent. \$\endgroup\$Toby Speight– Toby Speight2023年07月19日 09:54:17 +00:00Commented Jul 19, 2023 at 9:54
I don't see any benefit to ResultFP
over simply using a std::optional
type, which is already familiar to users rather than something new to learn.
The helper functions could (and probably should) be declared constexpr
.
std::uint8_t
is written throughout as uint8_t
as if we'd included <stdint.h>
rather than <cstdint>
. Also, in the test program, std::printf
is similarly misspelt.
Is std::uint8_t
really the appropriate type for counting? std::uint_fast8_t
seems a better choice, since it's guaranteed to exist. Or perhaps just use int
for consistency with std::numeric_limits<FP>::digits10
.
I'm not sure why we limit the integral and fractional components to digits10
. That restricts us to quite a small portion of the floating-point range; consider max_exponent10
and digits10-min_exponent10
respectively for full coverage.
I'd prefer separate static_assert()
s for the three tests - that gives more informative feedback to client code which violates them. We should also test that digits
and decimals
aren't both zero.
We're missing an include of <iterator>
for the declarations of std::begin()
and std::end()
. However, it's simpler just to use std::string_view
's member begin()
and end()
here.
As J_H's answer points out, we should avoid inexact numbers such as 0.1
or 1 / 10.
The easy way to do this is to maintain FP divisor{1}
and multiply that by 10.0
each time we read a digit from the fractional part. And at the end, return number / divisor
.
The test program is better than nothing, but we have a ready-made source of truth we can compare against in the standard library, so we could make sure that we get the same results as std::strtod()
and similar, and make the tests self-checking. We don't test any template arguments other than the defaults, so it's quite a weak test. Also, we don't test enough invalid inputs (the empty string would generally be my first test, when I'm deciding on the interface, for example).
I'm not a big fan of the name _
for the test function.
My suggested replacement:
#include <limits>
#include <optional>
#include <string_view>
template<
typename FP = double,
int digits = std::numeric_limits<FP>::digits10,
int decimals = digits
>
std::optional<FP> to_fp(std::string_view const str) {
static_assert(std::is_floating_point_v<FP>);
static_assert(0 <= digits);
static_assert(0 <= decimals);
static_assert(digits + decimals);
static_assert(digits <= std::numeric_limits<FP>::max_exponent10);
static_assert(decimals <= std::numeric_limits<FP>::digits10-std::numeric_limits<FP>::min_exponent10);
static auto const isspace = [](char c){
return c == ' ' || c == '\t';
};
static auto const isdigit = [](char c){
return c >= '0' && c <= '9';
};
static auto const add_char = [](FP &number, char c){
number = number * 10 + (c - '0');
};
// -----
auto it = str.begin();
auto const end = str.end();
// Skip leading whitespace
while (it != end && isspace(*it)) {
++it;
}
if (it == end) { return {}; }
// Handle optional sign
FP sign = +1;
if (*it == '-') {
sign = -1;
++it;
} else if (*it == '+') {
++it;
}
FP number = 0.0;
bool digits_seen = false;
// Process integer part
for (int num_digits = 0; it != end && isdigit(*it); ++it) {
if (++num_digits > digits) { return {}; }
add_char(number, *it);
digits_seen = true;
}
// Process decimal part
if (it == end || *it != '.') { return sign * number; }
++it;
FP divisor = 1;
for (int num_digits = 0; it != end && isdigit(*it); ++it) {
if (++num_digits > decimals) { return {}; }
add_char(number, *it);
digits_seen = true;
divisor *= 10;
}
if (!digits_seen) { return {}; }
return sign * number / divisor;
}
template<
typename FP,
int digits = std::numeric_limits<FP>::digits10,
int decimals = digits
>
auto to_fp_def(std::string_view const str, FP def = 0){
return to_fp<FP, digits, decimals>(str).value_or(def);
}
There's still a tiny optimisation that can be made by tweaking the sign bit instead of multiplying by sign
.
And it still fails to round correctly in the least-significant digit.
std::strtod()
andstd::from_chars()
? \$\endgroup\$std::strtod()
, but slower thanstd::from_chars()
quick-bench.com/q/cHVL5PW9m4WSp6OvPnrF-ApCrG4 \$\endgroup\$