
Commit c811592

Optimize performance of fmod with Barrett multiplication
1 parent bf85502 commit c811592

2 files changed: +98 -8 lines changed

libm/src/math/generic/fmod.rs

Lines changed: 93 additions & 0 deletions
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: MIT OR Apache-2.0 */
 use super::super::{CastFrom, Float, Int, MinInt};
+use crate::support::{DInt, HInt, Reducer};
 
 #[inline]
 pub fn fmod<F: Float>(x: F, y: F) -> F {
@@ -59,10 +60,102 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
 
 /// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
 fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
+    // FIXME: This is a temporary hack to get around the lack of `u256 / u256`.
+    // Actually, the algorithm only needs the operation `(x << I::BITS) / y`
+    // where `x < y`. That is, a division `u256 / u128` where the quotient must
+    // not overflow `u128` would be sufficient for `f128`.
+    unsafe {
+        use core::mem::transmute_copy;
+        if I::BITS == 64 {
+            let x = transmute_copy::<I, u64>(&x);
+            let y = transmute_copy::<I, u64>(&y);
+            let r = fast_reduction::<f64, u64>(x, e, y);
+            return transmute_copy::<u64, I>(&r);
+        }
+        if I::BITS == 32 {
+            let x = transmute_copy::<I, u32>(&x);
+            let y = transmute_copy::<I, u32>(&y);
+            let r = fast_reduction::<f32, u32>(x, e, y);
+            return transmute_copy::<u32, I>(&r);
+        }
+        #[cfg(f16_enabled)]
+        if I::BITS == 16 {
+            let x = transmute_copy::<I, u16>(&x);
+            let y = transmute_copy::<I, u16>(&y);
+            let r = fast_reduction::<f16, u16>(x, e, y);
+            return transmute_copy::<u16, I>(&r);
+        }
+    }
+
     x %= y;
     for _ in 0..e {
         x <<= 1;
         x = x.checked_sub(y).unwrap_or(x);
     }
     x
 }
+
+trait SafeShift: Float {
+    // How many guaranteed leading zeros do the values have?
+    // A normalized floating point mantissa has `EXP_BITS` guaranteed leading
+    // zeros (excludes the implicit bit, but includes the now-zeroed sign bit).
+    // `-1` because we want to shift by either `BASE_SHIFT` or `BASE_SHIFT + 1`.
+    const BASE_SHIFT: u32 = Self::EXP_BITS - 1;
+}
+impl<F: Float> SafeShift for F {}
+
+fn fast_reduction<F, I>(x: I, e: u32, y: I) -> I
+where
+    F: Float<Int = I>,
+    I: Int + HInt,
+    I::D: Int + DInt<H = I>,
+{
+    let _0 = I::ZERO;
+    let _1 = I::ONE;
+
+    if y == _1 {
+        return _0;
+    }
+
+    if e <= F::BASE_SHIFT {
+        return (x << e) % y;
+    }
+
+    // Find the least `depth` s.t. `(e >> depth) < I::BITS`.
+    let depth = (I::BITS - 1)
+        .leading_zeros()
+        .saturating_sub(e.leading_zeros());
+
+    let initial = (e >> depth) - F::BASE_SHIFT;
+
+    let max_rem = y.wrapping_sub(_1);
+    let max_ilog2 = max_rem.ilog2();
+    let mut pow2 = _1 << max_ilog2.min(initial);
+    for _ in max_ilog2..initial {
+        pow2 <<= 1;
+        pow2 = pow2.checked_sub(y).unwrap_or(pow2);
+    }
+
+    // At each step `k in [depth, ..., 0]`,
+    // `p` is `(e >> k) - BASE_SHIFT`,
+    // `m` is `(1 << p) % y`.
+    let mut k = depth;
+    let mut p = initial;
+    let mut m = Reducer::new(pow2, y);
+
+    while k > 0 {
+        k -= 1;
+        p = p + p + F::BASE_SHIFT;
+        if e & (1 << k) != 0 {
+            m = m.squared_with_shift(F::BASE_SHIFT + 1);
+            p += 1;
+        } else {
+            m = m.squared_with_shift(F::BASE_SHIFT);
+        };
+
+        debug_assert!(p == (e >> k) - F::BASE_SHIFT);
+    }
+
+    // (x << BASE_SHIFT) * (1 << p) == x << e
+    m.mul_into_div_rem(x << F::BASE_SHIFT).1
}
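
The new fast path replaces the old one-bit-at-a-time shift-subtract loop with binary exponentiation over the bits of `e`: `fast_reduction` maintains the invariant `m == (1 << p) % y` and roughly doubles `p` per squaring step, so on the order of `log2(e)` Barrett squarings stand in for `e` loop iterations. The sketch below is not the committed code; it shows the same square-and-multiply idea in self-contained form for `u64` operands, using `u128` widening and hardware `%` where the commit uses the `Reducer` state, and omitting the `BASE_SHIFT` bookkeeping that keeps the commit's intermediates inside a single word.

// Compute `(x << e) % y` by square-and-multiply over the bits of `e`,
// mirroring the loop structure of `fast_reduction` above.
fn shifted_rem(x: u64, e: u32, y: u64) -> u64 {
    assert!(y > 1);
    let y_w = y as u128;
    // Invariant: `pow == 2^p % y`, where `p` is the prefix of `e`
    // processed so far (most significant bits first).
    let mut pow: u64 = 1 % y;
    for k in (0..u32::BITS - e.leading_zeros()).rev() {
        // Squaring doubles the exponent prefix: 2^p becomes 2^(2p).
        pow = ((pow as u128 * pow as u128) % y_w) as u64;
        if e & (1 << k) != 0 {
            // A set bit contributes one more factor of two.
            pow = ((pow as u128 * 2) % y_w) as u64;
        }
    }
    ((x % y) as u128 * pow as u128 % y_w) as u64
}

For example, `shifted_rem(3, 100, 1_000_003)` evaluates `3 * 2^100 mod 1_000_003` without ever materializing the 102-bit product. The committed version goes one step further and folds every `%` into a Barrett multiplication, which is the point of the `Reducer` changes below.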

libm/src/math/support/int_traits/mod_mul.rs

Lines changed: 5 additions & 8 deletions
@@ -2,14 +2,11 @@ use super::{DInt, HInt, Int};
 
 /// Barrett reduction using the constant `R == (1 << K) == (1 << U::BITS)`
 ///
-/// More specifically, implements single-word [Barrett multiplication]
-/// (https://en.wikipedia.org/wiki/Barrett_reduction#Single-word_Barrett_multiplication)
-/// and [division]
-/// (https://en.wikipedia.org/wiki/Barrett_reduction#Barrett_Division)
-/// for unsigned integers.
+/// For a more detailed description, see
+/// <https://en.wikipedia.org/wiki/Barrett_reduction>.
 ///
 /// After constructing as `Reducer::new(b, n)`,
-/// provides operations to efficiently compute
+/// has operations to efficiently compute
 /// - `(a * b) / n` and `(a * b) % n`
 /// - `Reducer::new((a * b * b) % n, n)`, as long as `a * (n - 1) < R`
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
@@ -103,7 +100,7 @@ where
         let ar_ns = a.widen_mul(self.rem) + _s.widen_mul(self.div);
         assert!(ab_tn.hi().is_zero());
         assert!(ar_ns.lo().is_zero());
-        assert_eq!(ab_tn.lo(), ar_ns.hi());
+        assert!(ab_tn.lo() == ar_ns.hi());
     }
     // Since `s < R` and `r < n`,
     // ```
@@ -124,7 +121,7 @@
     /// Requires `r * ab == ra * b`, where `r = bR % n`.
     #[inline(always)]
     fn with_scaled_num_rem(&self, ab: U, ra: U) -> Self {
-        debug_assert_eq!(ab.widen_mul(self.rem), ra.widen_mul(self.num));
+        debug_assert!(ab.widen_mul(self.rem) == ra.widen_mul(self.num));
         // The new factor `v = abb mod n`:
         let (_, v) = self.mul_into_div_rem(ab);
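
Since the doc comment above only names the technique, here is a hedged, self-contained sketch of single-word Barrett multiplication with `R = 2^64`. The `Barrett` struct and its fields are illustrative assumptions rather than the crate's actual layout: precomputing the scaled reciprocal `t = (b << 64) / n` once lets every later `(a * b) / n` and `(a * b) % n` query be answered with widening multiplies and at most one conditional correction, with no division at all.

/// Illustrative single-word Barrett state with `R = 2^64`.
/// Assumes `n > 1` and `b < n`.
struct Barrett {
    b: u64,
    n: u64,
    t: u64, // floor((b << 64) / n), a scaled reciprocal of `n`
}

impl Barrett {
    fn new(b: u64, n: u64) -> Self {
        assert!(b < n && n > 1);
        // The only division happens here, at setup time.
        let t = (((b as u128) << 64) / n as u128) as u64;
        Barrett { b, n, t }
    }

    /// Returns `(a * b / n, a * b % n)` without dividing.
    fn mul_div_rem(&self, a: u64) -> (u64, u64) {
        // `q` equals `a * b / n` or undershoots it by at most one.
        let mut q = ((a as u128 * self.t as u128) >> 64) as u64;
        // Hence `r = a * b - q * n` lies in `[0, 2n)`, and one
        // conditional subtraction finishes the reduction.
        let mut r = a as u128 * self.b as u128 - q as u128 * self.n as u128;
        if r >= self.n as u128 {
            q += 1;
            r -= self.n as u128;
        }
        (q, r as u64)
    }
}

As a quick check, `Barrett::new(7, 13).mul_div_rem(10)` returns `(5, 5)`, matching `70 / 13` and `70 % 13`. The committed `squared_with_shift` builds on the same kind of state to square the tracked remainder while staying in single-word arithmetic.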

0 commit comments