
Commit c811592

Optimize performance of fmod with Barrett multiplication
1 parent bf85502 commit c811592

2 files changed: +98 -8 lines changed

libm/src/math/generic/fmod.rs

Lines changed: 93 additions & 0 deletions
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: MIT OR Apache-2.0 */
 use super::super::{CastFrom, Float, Int, MinInt};
+use crate::support::{DInt, HInt, Reducer};
 
 #[inline]
 pub fn fmod<F: Float>(x: F, y: F) -> F {
@@ -59,10 +60,102 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
 
 /// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
 fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
+    // FIXME: This is a temporary hack to get around the lack of `u256 / u256`.
+    // Actually, the algorithm only needs the operation `(x << I::BITS) / y`
+    // where `x < y`. That is, a division `u256 / u128` where the quotient must
+    // not overflow `u128` would be sufficient for `f128`.
+    unsafe {
+        use core::mem::transmute_copy;
+        if I::BITS == 64 {
+            let x = transmute_copy::<I, u64>(&x);
+            let y = transmute_copy::<I, u64>(&y);
+            let r = fast_reduction::<f64, u64>(x, e, y);
+            return transmute_copy::<u64, I>(&r);
+        }
+        if I::BITS == 32 {
+            let x = transmute_copy::<I, u32>(&x);
+            let y = transmute_copy::<I, u32>(&y);
+            let r = fast_reduction::<f32, u32>(x, e, y);
+            return transmute_copy::<u32, I>(&r);
+        }
+        #[cfg(f16_enabled)]
+        if I::BITS == 16 {
+            let x = transmute_copy::<I, u16>(&x);
+            let y = transmute_copy::<I, u16>(&y);
+            let r = fast_reduction::<f16, u16>(x, e, y);
+            return transmute_copy::<u16, I>(&r);
+        }
+    }
+
     x %= y;
     for _ in 0..e {
         x <<= 1;
         x = x.checked_sub(y).unwrap_or(x);
     }
     x
 }
+
+trait SafeShift: Float {
+    // How many guaranteed leading zeros do the values have?
+    // A normalized floating point mantissa has `EXP_BITS` guaranteed leading
+    // zeros (excludes the implicit bit, but includes the now-zeroed sign bit).
+    // `-1` because we want to shift by either `BASE_SHIFT` or `BASE_SHIFT + 1`.
+    const BASE_SHIFT: u32 = Self::EXP_BITS - 1;
+}
+impl<F: Float> SafeShift for F {}
+
+fn fast_reduction<F, I>(x: I, e: u32, y: I) -> I
+where
+    F: Float<Int = I>,
+    I: Int + HInt,
+    I::D: Int + DInt<H = I>,
+{
+    let _0 = I::ZERO;
+    let _1 = I::ONE;
+
+    if y == _1 {
+        return _0;
+    }
+
+    if e <= F::BASE_SHIFT {
+        return (x << e) % y;
+    }
+
+    // Find the least `depth` s.t. `(e >> depth) < I::BITS`.
+    let depth = (I::BITS - 1)
+        .leading_zeros()
+        .saturating_sub(e.leading_zeros());
+
+    let initial = (e >> depth) - F::BASE_SHIFT;
+
+    let max_rem = y.wrapping_sub(_1);
+    let max_ilog2 = max_rem.ilog2();
+    let mut pow2 = _1 << max_ilog2.min(initial);
+    for _ in max_ilog2..initial {
+        pow2 <<= 1;
+        pow2 = pow2.checked_sub(y).unwrap_or(pow2);
+    }
+
+    // At each step `k in [depth, ..., 0]`,
+    // `p` is `(e >> k) - BASE_SHIFT`,
+    // `m` is `(1 << p) % y`.
+    let mut k = depth;
+    let mut p = initial;
+    let mut m = Reducer::new(pow2, y);
+
+    while k > 0 {
+        k -= 1;
+        p = p + p + F::BASE_SHIFT;
+        if e & (1 << k) != 0 {
+            m = m.squared_with_shift(F::BASE_SHIFT + 1);
+            p += 1;
+        } else {
+            m = m.squared_with_shift(F::BASE_SHIFT);
+        };
+
+        debug_assert!(p == (e >> k) - F::BASE_SHIFT);
+    }
+
+    // (x << BASE_SHIFT) * (1 << p) == x << e
+    m.mul_into_div_rem(x << F::BASE_SHIFT).1
}
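
The new fast path replaces the old one-bit-at-a-time shift-subtract loop with binary exponentiation over the bits of `e`: `fast_reduction` maintains the invariant `m == (1 << p) % y` and roughly doubles `p` per squaring step, so on the order of `log2(e)` Barrett squarings stand in for `e` loop iterations. The sketch below is not the committed code; it shows the same square-and-multiply idea in self-contained form for `u64` operands, using `u128` widening and hardware `%` where the commit uses the `Reducer` state, and omitting the `BASE_SHIFT` bookkeeping that keeps the commit's intermediates inside a single word.

// Compute `(x << e) % y` by square-and-multiply over the bits of `e`,
// mirroring the loop structure of `fast_reduction` above.
fn shifted_rem(x: u64, e: u32, y: u64) -> u64 {
    assert!(y > 1);
    let y_w = y as u128;
    // Invariant: `pow == 2^p % y`, where `p` is the prefix of `e`
    // processed so far (most significant bits first).
    let mut pow: u64 = 1 % y;
    for k in (0..u32::BITS - e.leading_zeros()).rev() {
        // Squaring doubles the exponent prefix: 2^p becomes 2^(2p).
        pow = ((pow as u128 * pow as u128) % y_w) as u64;
        if e & (1 << k) != 0 {
            // A set bit contributes one more factor of two.
            pow = ((pow as u128 * 2) % y_w) as u64;
        }
    }
    ((x % y) as u128 * pow as u128 % y_w) as u64
}

For example, `shifted_rem(3, 100, 1_000_003)` evaluates `3 * 2^100 mod 1_000_003` without ever materializing the 102-bit product. The committed version goes one step further and folds every `%` into a Barrett multiplication, which is the point of the `Reducer` changes below.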

libm/src/math/support/int_traits/mod_mul.rs

Lines changed: 5 additions & 8 deletions
@@ -2,14 +2,11 @@ use super::{DInt, HInt, Int};
 
 /// Barrett reduction using the constant `R == (1 << K) == (1 << U::BITS)`
 ///
-/// More specifically, implements single-word [Barrett multiplication]
-/// (https://en.wikipedia.org/wiki/Barrett_reduction#Single-word_Barrett_multiplication)
-/// and [division]
-/// (https://en.wikipedia.org/wiki/Barrett_reduction#Barrett_Division)
-/// for unsigned integers.
+/// For a more detailed description, see
+/// <https://en.wikipedia.org/wiki/Barrett_reduction>.
 ///
 /// After constructing as `Reducer::new(b, n)`,
-/// provides operations to efficiently compute
+/// has operations to efficiently compute
 /// - `(a * b) / n` and `(a * b) % n`
 /// - `Reducer::new((a * b * b) % n, n)`, as long as `a * (n - 1) < R`
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
@@ -103,7 +100,7 @@ where
         let ar_ns = a.widen_mul(self.rem) + _s.widen_mul(self.div);
         assert!(ab_tn.hi().is_zero());
         assert!(ar_ns.lo().is_zero());
-        assert_eq!(ab_tn.lo(), ar_ns.hi());
+        assert!(ab_tn.lo() == ar_ns.hi());
     }
     // Since `s < R` and `r < n`,
     // ```
@@ -124,7 +121,7 @@
     /// Requires `r * ab == ra * b`, where `r = bR % n`.
     #[inline(always)]
     fn with_scaled_num_rem(&self, ab: U, ra: U) -> Self {
-        debug_assert_eq!(ab.widen_mul(self.rem), ra.widen_mul(self.num));
+        debug_assert!(ab.widen_mul(self.rem) == ra.widen_mul(self.num));
         // The new factor `v = abb mod n`:
         let (_, v) = self.mul_into_div_rem(ab);
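
Since the doc comment above only names the technique, here is a hedged, self-contained sketch of single-word Barrett multiplication with `R = 2^64`. The `Barrett` struct and its fields are illustrative assumptions rather than the crate's actual layout: precomputing the scaled reciprocal `t = (b << 64) / n` once lets every later `(a * b) / n` and `(a * b) % n` query be answered with widening multiplies and at most one conditional correction, with no division at all.

/// Illustrative single-word Barrett state with `R = 2^64`.
/// Assumes `n > 1` and `b < n`.
struct Barrett {
    b: u64,
    n: u64,
    t: u64, // floor((b << 64) / n), a scaled reciprocal of `n`
}

impl Barrett {
    fn new(b: u64, n: u64) -> Self {
        assert!(b < n && n > 1);
        // The only division happens here, at setup time.
        let t = (((b as u128) << 64) / n as u128) as u64;
        Barrett { b, n, t }
    }

    /// Returns `(a * b / n, a * b % n)` without dividing.
    fn mul_div_rem(&self, a: u64) -> (u64, u64) {
        // `q` equals `a * b / n` or undershoots it by at most one.
        let mut q = ((a as u128 * self.t as u128) >> 64) as u64;
        // Hence `r = a * b - q * n` lies in `[0, 2n)`, and one
        // conditional subtraction finishes the reduction.
        let mut r = a as u128 * self.b as u128 - q as u128 * self.n as u128;
        if r >= self.n as u128 {
            q += 1;
            r -= self.n as u128;
        }
        (q, r as u64)
    }
}

As a quick check, `Barrett::new(7, 13).mul_div_rem(10)` returns `(5, 5)`, matching `70 / 13` and `70 % 13`. The committed `squared_with_shift` builds on the same kind of state to square the tracked remainder while staying in single-word arithmetic.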

0 commit comments