Files
another-boids-in-rust/vendor/pxfm/src/double_double.rs

1013 lines
32 KiB
Rust

/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::bits::get_exponent_f64;
#[allow(unused_imports)]
use crate::common::*;
use std::ops::{Mul, Neg};
// Reference for the double-word algorithms in this file:
// https://hal.science/hal-01351529v3/document
/// A value stored as two `f64` parts; `to_f64` evaluates it as `lo + hi`.
#[derive(Copy, Clone, Default, Debug)]
pub(crate) struct DoubleDouble {
/// Low-order part.
pub(crate) lo: f64,
/// High-order part.
pub(crate) hi: f64,
}
impl Neg for DoubleDouble {
type Output = Self;
#[inline]
fn neg(self) -> Self::Output {
Self {
hi: -self.hi,
lo: -self.lo,
}
}
}
impl DoubleDouble {
#[inline]
pub(crate) const fn from_bit_pair(pair: (u64, u64)) -> Self {
Self {
lo: f64::from_bits(pair.0),
hi: f64::from_bits(pair.1),
}
}
#[inline]
pub(crate) const fn new(lo: f64, hi: f64) -> Self {
DoubleDouble { lo, hi }
}
// Non FMA helper
#[allow(dead_code)]
#[inline]
pub(crate) const fn split(a: f64) -> DoubleDouble {
// CN = 2^N.
const CN: f64 = (1 << 27) as f64;
const C: f64 = CN + 1.0;
let t1 = C * a;
let t2 = a - t1;
let r_hi = t1 + t2;
let r_lo = a - r_hi;
DoubleDouble::new(r_lo, r_hi)
}
// Non FMA helper
#[allow(dead_code)]
#[inline]
fn from_exact_mult_impl_non_fma(asz: DoubleDouble, a: f64, b: f64) -> Self {
let bs = DoubleDouble::split(b);
let r_hi = a * b;
let t1 = asz.hi * bs.hi - r_hi;
let t2 = asz.hi * bs.lo + t1;
let t3 = asz.lo * bs.hi + t2;
let r_lo = asz.lo * bs.lo + t3;
DoubleDouble::new(r_lo, r_hi)
}
// valid only for |a| > b
#[inline]
pub(crate) const fn from_exact_add(a: f64, b: f64) -> DoubleDouble {
let r_hi = a + b;
let t = r_hi - a;
let r_lo = b - t;
DoubleDouble::new(r_lo, r_hi)
}
// valid only for |a| > b
#[inline]
pub(crate) const fn from_exact_sub(a: f64, b: f64) -> DoubleDouble {
let r_hi = a - b;
let t = a - r_hi;
let r_lo = t - b;
DoubleDouble::new(r_lo, r_hi)
}
#[inline]
pub(crate) const fn from_full_exact_add(a: f64, b: f64) -> DoubleDouble {
let r_hi = a + b;
let t1 = r_hi - a;
let t2 = r_hi - t1;
let t3 = b - t1;
let t4 = a - t2;
let r_lo = t3 + t4;
DoubleDouble::new(r_lo, r_hi)
}
/// Computes `a * b + c` through a double-double intermediate and rounds the
/// result back to a single `f64`.
#[allow(unused)]
#[inline]
pub(crate) fn dd_f64_mul_add(a: f64, b: f64, c: f64) -> f64 {
    let product = DoubleDouble::from_exact_mult(a, b);
    DoubleDouble::full_add_f64(product, c).to_f64()
}
#[inline]
pub(crate) const fn from_full_exact_sub(a: f64, b: f64) -> Self {
let r_hi = a - b;
let t1 = r_hi - a;
let t2 = r_hi - t1;
let t3 = -b - t1;
let t4 = a - t2;
let r_lo = t3 + t4;
DoubleDouble::new(r_lo, r_hi)
}
#[inline]
pub(crate) fn add(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
let s = a.hi + b.hi;
let d = s - a.hi;
let l = ((b.hi - d) + (a.hi + (d - s))) + (a.lo + b.lo);
DoubleDouble::new(l, s)
}
/// Adds two double-double values: exact sum of the high parts, plain sum of the
/// low parts, then one renormalization via `from_exact_add`.
#[inline]
pub(crate) fn quick_dd_add(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
    let head = DoubleDouble::from_full_exact_add(a.hi, b.hi);
    let low_parts = a.lo + b.lo;
    let tail = head.lo + low_parts;
    DoubleDouble::from_exact_add(head.hi, tail)
}
/// Subtracts `b` from `a`: exact difference of the high parts, plain difference
/// of the low parts, then one renormalization via `from_exact_add`.
#[inline]
pub(crate) fn quick_dd_sub(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
    let head = DoubleDouble::from_full_exact_sub(a.hi, b.hi);
    let low_parts = a.lo - b.lo;
    let tail = head.lo + low_parts;
    DoubleDouble::from_exact_add(head.hi, tail)
}
/// Adds two double-double values using an exact sum for both the high and the
/// low parts, followed by two renormalization steps.
#[inline]
pub(crate) fn full_dd_add(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
    let highs = DoubleDouble::from_full_exact_add(a.hi, b.hi);
    let lows = DoubleDouble::from_full_exact_add(a.lo, b.lo);
    // First pass: merge the error of the high sum with the leading low term.
    let first = DoubleDouble::from_exact_add(highs.hi, highs.lo + lows.hi);
    // Second pass: fold in what is left of the low sum.
    let remainder = lows.lo + first.lo;
    DoubleDouble::from_exact_add(first.hi, remainder)
}
#[inline]
pub(crate) fn full_dd_sub(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
DoubleDouble::full_dd_add(a, -b)
}
#[inline]
pub(crate) fn sub(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
let s = a.hi - b.hi;
let d = s - a.hi;
let l = ((-b.hi - d) + (a.hi + (d - s))) + (a.lo - b.lo);
DoubleDouble::new(l, s)
}
/// Square root of a double-double value, with special values handled explicitly.
///
/// With `a = hi + lo`: zero returns `(0, 0)`, a negative or NaN `a` returns
/// `hi = NaN`, and an infinite `a` returns `hi = +inf`. All other inputs go
/// through the single correction step implemented by `fast_sqrt`.
#[inline]
pub(crate) fn sqrt(self) -> DoubleDouble {
    let collapsed = self.hi + self.lo;
    if collapsed == 0.0 {
        return DoubleDouble::new(0.0, 0.0);
    }
    if collapsed.is_nan() || collapsed < 0.0 {
        return DoubleDouble::new(0.0, f64::NAN);
    }
    // Negative infinity was rejected above, so only +inf reaches this check.
    if collapsed.is_infinite() {
        return DoubleDouble::new(0.0, f64::INFINITY);
    }
    // Ordinary positive finite input: same refinement as the unguarded variant.
    self.fast_sqrt()
}
/// Square root of a double-double value without any special-value guards.
///
/// Takes the `f64` square root of `hi + lo` and applies one correction step
/// `dx = (self - x²) / (2x)`. Because of that division a zero input yields NaN;
/// use `sqrt` when zero, negative, NaN or infinite inputs are possible.
#[inline]
pub(crate) fn fast_sqrt(self) -> DoubleDouble {
    let root = (self.hi + self.lo).sqrt();
    let square = DoubleDouble::from_exact_mult(root, root);
    // Residual = self - root², accumulated from the largest term down.
    let residual = ((self.hi - square.hi) + self.lo) - square.lo;
    let step = residual / (2.0 * root);
    let hi = root + step;
    let lo = (root - hi) + step;
    DoubleDouble::new(lo, hi)
}
/// Computes `a * b + c` for double-double `a`, `b` and an `f64` `c`.
///
/// The product comes from `quick_mult`; its high part is added to `c` with the
/// order-independent `from_full_exact_add`. The result is not renormalized.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
#[inline]
pub(crate) fn mul_add_f64(a: DoubleDouble, b: DoubleDouble, c: f64) -> DoubleDouble {
    let product = DoubleDouble::quick_mult(a, b);
    let sum = DoubleDouble::from_full_exact_add(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
/// Computes `a * b + c` for double-double `a`, `b` and an `f64` `c`, using the
/// cheaper `from_exact_add` for the final sum.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
///
/// *Correctness*
/// Inherits the precondition of `from_exact_add(c, h)`: `c` must dominate the
/// high part of the product in magnitude.
#[inline]
pub(crate) fn quick_mul_add_f64(a: DoubleDouble, b: DoubleDouble, c: f64) -> DoubleDouble {
    let product = DoubleDouble::quick_mult(a, b);
    let sum = DoubleDouble::from_exact_add(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
/// Computes `a * b + c` where `a` is a double-double and `b`, `c` are `f64`.
///
/// The final sum uses the order-independent `from_full_exact_add`.
/// The result is not renormalized.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
#[inline]
pub(crate) fn mul_f64_add_f64(a: DoubleDouble, b: f64, c: f64) -> DoubleDouble {
    let product = DoubleDouble::quick_mult_f64(a, b);
    let sum = DoubleDouble::from_full_exact_add(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
// /// Accurate reciprocal: 1 / self
// #[inline]
// pub(crate) fn recip_raphson(self) -> DoubleDouble {
// let y0 = DoubleDouble::recip(self);
// let z = DoubleDouble::mul_add_f64(-self, y0, 1.0);
// DoubleDouble::mul_add(y0, z, y0)
// }
/// Accurate reciprocal: `1 / self`.
///
/// `hi` is `1 / self.hi`; `lo` is that quotient times the residual
/// `1 - self.hi * hi - self.lo * hi`. The residual is computed with `f_fmla` on
/// x86/x86_64 with `fma` or aarch64 with `neon`, and with exact products otherwise.
#[inline]
pub(crate) fn recip(self) -> DoubleDouble {
    let inv_hi = 1.0 / self.hi;
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    {
        let resid_hi = f_fmla(-self.hi, inv_hi, 1.0);
        let resid = f_fmla(-self.lo, inv_hi, resid_hi);
        let correction = inv_hi * resid;
        DoubleDouble::new(correction, inv_hi)
    }
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        let head = DoubleDouble::from_exact_mult(self.hi, inv_hi);
        let resid_hi = (1.0 - head.hi) - head.lo;
        let tail = DoubleDouble::from_exact_mult(self.lo, inv_hi);
        let resid = (resid_hi - tail.hi) - tail.lo;
        let correction = inv_hi * resid;
        DoubleDouble::new(correction, inv_hi)
    }
}
/// Double-double reciprocal of an `f64`: `hi = 1 / b`, `lo = (1 - hi * b) / b`.
///
/// The residual `1 - hi * b` comes from `f_fmla` on x86/x86_64 with `fma` or
/// aarch64 with `neon`, and from an exact product otherwise.
#[inline]
pub(crate) fn from_recip(b: f64) -> Self {
    let inv = 1.0 / b;
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    let residual = f_fmla(-inv, b, 1.0);
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    let residual = {
        let back = Self::from_exact_mult(inv, b);
        (1.0 - back.hi) - back.lo
    };
    Self::new(residual / b, inv)
}
/// Double-double reciprocal of an `f64` that avoids a second division:
/// `hi = 1 / b`, `lo = (1 - hi * b) * hi`.
///
/// The residual `1 - hi * b` comes from `f_fmla` on x86/x86_64 with `fma` or
/// aarch64 with `neon`, and from an exact product otherwise.
#[inline]
pub(crate) fn from_quick_recip(b: f64) -> Self {
    let inv = 1.0 / b;
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    let residual = f_fmla(inv, -b, 1.);
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    let residual = {
        let back = DoubleDouble::from_exact_mult(inv, b);
        (1.0 - back.hi) - back.lo
    };
    DoubleDouble::new(residual * inv, inv)
}
/// Double-double quotient of two `f64` values: `hi = a / b`, `lo = (a - hi * b) / b`.
///
/// The remainder `a - hi * b` comes from `f_fmla` on x86/x86_64 with `fma` or
/// aarch64 with `neon`, and from an exact product plus an exact subtraction otherwise.
#[inline]
pub(crate) fn from_exact_div(a: f64, b: f64) -> Self {
    let quotient = a / b;
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    let remainder = f_fmla(-quotient, b, a);
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    let remainder = {
        let back = DoubleDouble::from_exact_mult(quotient, b);
        let diff = DoubleDouble::from_exact_sub(a, back.hi);
        diff.hi + (diff.lo - back.lo)
    };
    Self::new(remainder / b, quotient)
}
// Resistant to overflow without FMA
/// Double-double quotient of two `f64` values: `hi = a / b`, `lo = (a - hi * b) / b`.
///
/// Unlike `from_exact_div`, the remainder always goes through `f64::mul_add`,
/// whatever the target features, so the split-based product of the non-FMA path
/// is never used.
#[inline]
pub(crate) fn from_exact_safe_div(a: f64, b: f64) -> Self {
    let quotient = a / b;
    let remainder = (-quotient).mul_add(b, a);
    Self::new(remainder / b, quotient)
}
/// Double-double square root of an `f64`: `hi = sqrt(x)`, `lo = (x - hi²) / (2 * hi)`.
///
/// The residual `x - hi²` comes from `f_fmla` on x86/x86_64 with `fma` or
/// aarch64 with `neon`, and from an exact product otherwise. There is no
/// special-casing of zero, negative or non-finite `x`.
#[inline]
pub(crate) fn from_sqrt(x: f64) -> Self {
    let root = x.sqrt();
    /* root = sqrt(x) * (1 + e1) with |e1| < 2^-52
    thus root^2 = x * (1 + e2) with |e2| < 2^-50.999 */
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    let residual = -f_fmla(root, root, -x); // exact
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    let residual = {
        let square = DoubleDouble::from_exact_mult(root, root);
        (x - square.hi) - square.lo // exact
    };
    /* residual = x - root^2 */
    DoubleDouble::new(residual / (root + root), root)
}
/// Divides a double-double by an `f64`, always using `f64::mul_add` for the
/// remainder (intended to be safe against overflow and underflow).
///
/// `hi = a.hi / b`, `lo = ((a.hi - hi * b) + a.lo) / b`.
#[inline]
#[allow(dead_code)]
pub(crate) fn div_safe_dd_f64(a: DoubleDouble, b: f64) -> Self {
    let quotient = a.hi / b;
    let remainder = (-quotient).mul_add(b, a.hi) + a.lo;
    DoubleDouble::new(remainder / b, quotient)
}
/// Divides a double-double by an `f64`:
/// `hi = a.hi / b`, `lo = ((a.hi - hi * b) + a.lo) / b`.
///
/// The remainder `a.hi - hi * b` comes from `f_fmla` on x86/x86_64 with `fma` or
/// aarch64 with `neon`, and from an exact product otherwise.
#[inline]
pub(crate) fn div_dd_f64(a: DoubleDouble, b: f64) -> Self {
    let quotient = a.hi / b;
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    let remainder_hi = f_fmla(-quotient, b, a.hi);
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    let remainder_hi = {
        let back = DoubleDouble::from_exact_mult(quotient, b);
        (a.hi - back.hi) - back.lo
    };
    let remainder = remainder_hi + a.lo;
    DoubleDouble::new(remainder / b, quotient)
}
// /// Dekker division with one refinement step
// #[inline]
// pub(crate) fn div_dd_f64_newton_raphson(a: DoubleDouble, b: f64) -> Self {
// // Initial estimate q = a / b
// let q = DoubleDouble::div_dd_f64(a, b);
//
// // One Newton-Raphson refinement step:
// // e = a - q * b
// let qb = DoubleDouble::quick_mult_f64(q, b);
// let e = DoubleDouble::sub(a, qb);
// let e_div_b = DoubleDouble::div_dd_f64(e, b);
//
// DoubleDouble::add(q, e_div_b)
// }
// /// Dekker division with two Newton-Raphson refinement steps
// #[inline]
// pub(crate) fn div_dd_f64_newton_raphson_2(a: Dekker, b: f64) -> Self {
// // First estimate: q = a / b (one round of Dekker division)
// let q1 = Dekker::div_dd_f64(a, b);
//
// // First refinement: q2 = q1 + (a - q1 * b) / b
// let qb1 = Dekker::quick_mult_f64(q1, b);
// let e1 = Dekker::sub(a, qb1);
// let dq1 = Dekker::div_dd_f64(e1, b);
// let q2 = Dekker::add(q1, dq1);
//
// // Second refinement: q3 = q2 + (a - q2 * b) / b
// let qb2 = Dekker::quick_mult_f64(q2, b);
// let e2 = Dekker::sub(a, qb2);
// let dq2 = Dekker::div_dd_f64(e2, b);
//
// Dekker::add(q2, dq2)
// }
// #[inline]
// pub(crate) fn neg(self) -> Self {
// Self {
// lo: -self.lo, hi: -self.hi,
// }
// }
#[inline]
pub(crate) fn from_f64_div_dd(a: f64, b: DoubleDouble) -> Self {
let q1 = a / b.hi;
let prod = DoubleDouble::from_exact_mult(q1, b.hi);
let prod_lo = f_fmla(q1, b.lo, prod.lo);
let rem = f_fmla(-1.0, prod.hi, a) - prod_lo;
let q2 = rem / b.hi;
DoubleDouble::new(q2, q1)
}
// #[inline]
// pub(crate) fn mla_f64(a: Dekker, b: f64, c: f64) -> Self {
// let q = Dekker::mult_f64(a, b);
// Dekker::add_f64(q, c)
// }
//
// #[inline]
// pub(crate) fn mla_dd_f64(a: Dekker, b: Dekker, c: f64) -> Self {
// let q = Dekker::quick_mult(a, b);
// Dekker::add_f64(q, c)
// }
/// Divides two double-double values using the reciprocal of `b.hi`.
///
/// `hi = a.hi * (1 / b.hi)`; `lo` is that reciprocal times the residual
/// `(a.hi - b.hi * hi) + (a.lo - b.lo * hi)`. The residual is computed with
/// `f_fmla` on x86/x86_64 with `fma` or aarch64 with `neon`, and with exact
/// products otherwise.
#[inline]
pub(crate) fn div(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
    let inv_b = 1.0 / b.hi;
    let quotient = a.hi * inv_b;
    let neg_quotient = -quotient;
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    {
        let resid_hi = f_fmla(b.hi, neg_quotient, a.hi);
        let resid_lo = f_fmla(b.lo, neg_quotient, a.lo);
        let correction = inv_b * (resid_hi + resid_lo);
        DoubleDouble::new(correction, quotient)
    }
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        let hi_term = DoubleDouble::from_exact_mult(b.hi, neg_quotient);
        let lo_term = DoubleDouble::from_exact_mult(b.lo, neg_quotient);
        let resid_hi = (a.hi + hi_term.hi) + hi_term.lo;
        let resid_lo = (a.lo + lo_term.hi) + lo_term.lo;
        let correction = inv_b * (resid_hi + resid_lo);
        DoubleDouble::new(correction, quotient)
    }
}
#[inline]
pub(crate) fn from_exact_mult(a: f64, b: f64) -> Self {
#[cfg(any(
all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "fma"
),
all(target_arch = "aarch64", target_feature = "neon")
))]
{
let r_hi = a * b;
let r_lo = f_fmla(a, b, -r_hi);
DoubleDouble::new(r_lo, r_hi)
}
#[cfg(not(any(
all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "fma"
),
all(target_arch = "aarch64", target_feature = "neon")
)))]
{
let splat = DoubleDouble::split(a);
DoubleDouble::from_exact_mult_impl_non_fma(splat, a, b)
}
}
// #[inline]
// pub(crate) fn add_f64(&self, other: f64) -> DoubleDouble {
// let r = DoubleDouble::from_exact_add(self.hi, other);
// Dekker::from_exact_add(r.hi, r.lo + self.lo)
// }
// #[inline]
// pub(crate) fn to_triple(self) -> TripleDouble {
// TripleDouble::new(0., self.lo, self.hi)
// }
/// Computes `a * b + c`.
/// `b` is an `f64`; `a` and `c` are `DoubleDouble`.
///
/// The high part of the product is added to `c` with `full_add_f64`, which has
/// no ordering requirement. The result is not renormalized.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
#[inline]
pub(crate) fn mul_f64_add(a: DoubleDouble, b: f64, c: DoubleDouble) -> Self {
    let product = DoubleDouble::quick_mult_f64(a, b);
    let sum = DoubleDouble::full_add_f64(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
/// Computes `a * b + c` with the cheaper, order-dependent final sum.
/// `b` is an `f64`; `a` and `c` are `DoubleDouble`.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
///
/// *Correctness*
/// |c.hi| > |a.hi * b|
#[inline]
pub(crate) fn quick_mul_f64_add(a: DoubleDouble, b: f64, c: DoubleDouble) -> Self {
    let product = DoubleDouble::quick_mult_f64(a, b);
    let sum = DoubleDouble::add_f64(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
/// Computes `a * b + c` with the cheaper, order-dependent final sum.
/// `a` is a `DoubleDouble`; `b` and `c` are `f64`.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
///
/// *Correctness*
/// |c| > |a.hi * b|
#[inline]
pub(crate) fn quick_mul_f64_add_f64(a: DoubleDouble, b: f64, c: f64) -> Self {
    let product = DoubleDouble::quick_mult_f64(a, b);
    let sum = DoubleDouble::from_exact_add(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
// #[inline]
// pub(crate) fn mul_f64_add_full(a: DoubleDouble, b: f64, c: DoubleDouble) -> Self {
// /*
// double _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8; \
// \
// Mul12(&_t1,&_t2,(a),(bh)); \
// Add12(_t3,_t4,(ch),_t1); \
// _t5 = (bl) * (a); \
// _t6 = (cl) + _t2; \
// _t7 = _t5 + _t6; \
// _t8 = _t7 + _t4; \
// Add12((*(resh)),(*(resl)),_t3,_t8); \
// */
// let DoubleDouble { hi: t1, lo: t2 } = DoubleDouble::from_exact_mult(a.hi, b);
// let DoubleDouble { hi: t3, lo: t4 } = DoubleDouble::from_full_exact_add(c.hi, t1);
// let t5 = a.lo * b;
// let t6 = c.lo + t2;
// let t7 = t5 + t6;
// let t8 = t7 + t4;
// DoubleDouble::from_full_exact_add(t3, t8)
// }
/// Computes `a * b + c`.
/// `a` and `b` are `f64`; `c` is a `DoubleDouble`.
///
/// The product is formed exactly with `from_exact_mult` and its high part is
/// added to `c` with `full_add_f64`. The result is not renormalized.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
#[inline]
pub(crate) fn f64_mul_f64_add(a: f64, b: f64, c: DoubleDouble) -> Self {
    let product = DoubleDouble::from_exact_mult(a, b);
    let sum = DoubleDouble::full_add_f64(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
// /// Computes `a * b + c`
// /// `b` is an `f64`, `a` and `c` are `DoubleDouble`.
// ///
// /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
// #[inline]
// pub(crate) fn single_mul_add(a: f64, b: f64, c: f64) -> Self {
// let DoubleDouble { hi: h, lo: r } = DoubleDouble::from_exact_mult(a, b);
// let DoubleDouble { hi: p, lo: q } = DoubleDouble::from_full_exact_add(c, h);
// DoubleDouble::new(r + q, p)
// }
// /// Computes `a * b + c` safe to overflow without FMA
// /// `b` is an `f64`, `a` and `c` are `DoubleDouble`.
// ///
// /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
// #[inline]
// pub(crate) fn mul_f64_safe_add(a: DoubleDouble, b: f64, c: DoubleDouble) -> Self {
// let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult_safe_f64(a, b);
// let DoubleDouble { hi: p, lo: q } = DoubleDouble::full_add_f64(c, h);
// DoubleDouble::new(r + q, p)
// }
/// Computes `a * b + c` with all three operands double-double.
///
/// The product comes from `quick_mult`; its high part is added to `c` with
/// `full_add_f64`, which has no ordering requirement. The result is not renormalized.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
#[inline]
pub(crate) fn mul_add(a: DoubleDouble, b: DoubleDouble, c: DoubleDouble) -> Self {
    let product = DoubleDouble::quick_mult(a, b);
    let sum = DoubleDouble::full_add_f64(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
/// Computes `a * b + c` with all three operands double-double, using the
/// cheaper, order-dependent final sum.
///
/// *Accurate dot product (Ogita, Rump and Oishi 2004)*
///
/// *Correctness*
/// |c.hi| > |a.hi * b.hi|
#[inline]
pub(crate) fn quick_mul_add(a: DoubleDouble, b: DoubleDouble, c: DoubleDouble) -> Self {
    let product = DoubleDouble::quick_mult(a, b);
    let sum = DoubleDouble::add_f64(c, product.hi);
    DoubleDouble::new(product.lo + sum.lo, sum.hi)
}
/// Multiplies two double-double values without renormalizing the result.
///
/// `hi` is the rounded `a.hi * b.hi`; `lo` combines that product's error with
/// the cross terms `a.hi * b.lo` and `a.lo * b.hi`. The `a.lo * b.lo` term is
/// dropped. The cross terms use `f_fmla` on x86/x86_64 with `fma` or aarch64
/// with `neon`, and plain multiply/add otherwise.
#[inline]
pub(crate) fn quick_mult(a: DoubleDouble, b: DoubleDouble) -> Self {
    let head = DoubleDouble::from_exact_mult(a.hi, b.hi);
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    let tail = {
        let partial = f_fmla(a.hi, b.lo, head.lo);
        f_fmla(a.lo, b.hi, partial)
    };
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    let tail = {
        let cross = a.hi * b.lo + a.lo * b.hi;
        head.lo + cross
    };
    DoubleDouble::new(tail, head.hi)
}
/// Multiplies two double-double values and renormalizes with `from_exact_add`.
///
/// The exact product `a.hi * b.hi` is combined with the cross terms
/// `a.hi * b.lo` and `a.lo * b.hi`. On x86/x86_64 with `fma` or aarch64 with
/// `neon` the cross terms use `f_fmla` and also include `a.lo * b.lo`; the
/// fallback path omits that last term.
#[inline]
pub(crate) fn mult(a: DoubleDouble, b: DoubleDouble) -> Self {
    let DoubleDouble {
        hi: head,
        lo: head_err,
    } = DoubleDouble::from_exact_mult(a.hi, b.hi);
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    let cross = {
        let lo_lo = a.lo * b.lo;
        let partial = f_fmla(a.hi, b.lo, lo_lo);
        f_fmla(a.lo, b.hi, partial)
    };
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    let cross = a.hi * b.lo + a.lo * b.hi;
    let tail = head_err + cross;
    DoubleDouble::from_exact_add(head, tail)
}
/// Multiplies a double-double by an `f64` and renormalizes the result.
///
/// On x86/x86_64 with `fma` or aarch64 with `neon`, `a.lo * b` is fused into the
/// error of `a.hi * b` and renormalized once. The fallback path renormalizes
/// twice: first against `a.lo * b`, then against the product error.
#[inline]
pub(crate) fn mult_f64(a: DoubleDouble, b: f64) -> Self {
    let DoubleDouble {
        hi: head,
        lo: head_err,
    } = DoubleDouble::from_exact_mult(a.hi, b);
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    ))]
    {
        let tail = f_fmla(a.lo, b, head_err);
        DoubleDouble::from_exact_add(head, tail)
    }
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        let low_product = a.lo * b;
        let partial = DoubleDouble::from_exact_add(head, low_product);
        let tail = partial.lo + head_err;
        DoubleDouble::from_exact_add(partial.hi, tail)
    }
}
#[inline]
pub(crate) fn quick_f64_mult(a: f64, b: DoubleDouble) -> DoubleDouble {
DoubleDouble::quick_mult_f64(b, a)
}
#[inline]
pub(crate) fn quick_mult_f64(a: DoubleDouble, b: f64) -> Self {
#[cfg(any(
all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "fma"
),
all(target_arch = "aarch64", target_feature = "neon")
))]
{
let h = b * a.hi;
let l = f_fmla(b, a.lo, f_fmla(b, a.hi, -h));
Self { lo: l, hi: h }
}
#[cfg(not(any(
all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "fma"
),
all(target_arch = "aarch64", target_feature = "neon")
)))]
{
let DoubleDouble { hi: ch, lo: cl1 } = DoubleDouble::from_exact_mult(a.hi, b);
let cl2 = a.lo * b;
let cl3 = cl1 + cl2;
DoubleDouble::new(cl3, ch)
}
}
// /// Double-double multiplication safe to overflow without FMA
// #[inline]
// pub(crate) fn quick_mult_safe_f64(a: DoubleDouble, b: f64) -> Self {
// let h = b * a.hi;
// let l = f64::mul_add(b, a.lo, f64::mul_add(b, a.hi, -h));
// Self { lo: l, hi: h }
// }
/// Valid only |a.hi| > |b|
#[inline]
pub(crate) fn add_f64(a: DoubleDouble, b: f64) -> Self {
let t = DoubleDouble::from_exact_add(a.hi, b);
let l = a.lo + t.lo;
Self { lo: l, hi: t.hi }
}
#[inline]
pub(crate) fn full_add_f64(a: DoubleDouble, b: f64) -> Self {
let t = DoubleDouble::from_full_exact_add(a.hi, b);
let l = a.lo + t.lo;
Self { lo: l, hi: t.hi }
}
/// Valid only |b| > |a.hi|
#[inline]
pub(crate) fn f64_add(b: f64, a: DoubleDouble) -> Self {
let t = DoubleDouble::from_exact_add(b, a.hi);
let l = a.lo + t.lo;
Self { lo: l, hi: t.hi }
}
#[inline]
pub(crate) const fn to_f64(self) -> f64 {
self.lo + self.hi
}
// #[inline]
// pub(crate) fn from_rsqrt(x: f64) -> DoubleDouble {
// let r = DoubleDouble::div_dd_f64(DoubleDouble::from_sqrt(x), x);
// let rx = DoubleDouble::quick_mult_safe_f64(r, x);
// let drx = DoubleDouble::mul_f64_safe_add(r, x, -rx);
// let h = DoubleDouble::mul_add(r, drx, DoubleDouble::mul_add_f64(r, rx, -1.0));
// let dr = DoubleDouble::quick_mult(DoubleDouble::quick_mult_f64(r, 0.5), h);
// DoubleDouble::add(r, dr)
// }
/// Reciprocal square root of an `f64` as a double-double, computed as the
/// reciprocal of `from_sqrt(x)`.
#[inline]
pub(crate) fn from_rsqrt_fast(x: f64) -> DoubleDouble {
    DoubleDouble::from_sqrt(x).recip()
}
}
impl Mul<DoubleDouble> for DoubleDouble {
type Output = Self;
#[inline]
fn mul(self, rhs: DoubleDouble) -> Self::Output {
DoubleDouble::quick_mult(self, rhs)
}
}
/// Checks whether `x` is valid for the exact product: the exponent reported by
/// `get_exponent_f64` must lie strictly between -970 and 970.
#[allow(dead_code)]
#[inline]
pub(crate) fn two_product_compatible(x: f64) -> bool {
    let exponent = get_exponent_f64(x);
    exponent > -970 && exponent < 970
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `quick_f64_mult` (f64 × double-double) against pinned reference values.
    #[test]
    fn test_f64_mult() {
        let scalar = 1.1231;
        let value = DoubleDouble::new(1e-22, 3.2341);
        let product = DoubleDouble::quick_f64_mult(scalar, value);
        assert_eq!(product.hi, 3.6322177100000004);
        assert_eq!(product.lo, -1.971941841373783e-16);
    }

    /// `mult_f64` (double-double × f64) against the same reference values.
    #[test]
    fn test_mult_64() {
        let scalar = 1.1231;
        let value = DoubleDouble::new(1e-22, 3.2341);
        let product = DoubleDouble::mult_f64(value, scalar);
        assert_eq!(product.hi, 3.6322177100000004);
        assert_eq!(product.lo, -1.971941841373783e-16);
    }

    /// `recip` keeps the plain `f64` reciprocal as `hi` and adds a non-zero correction.
    #[test]
    fn recip_test() {
        let input: f64 = 1.54352432142;
        let inverse = DoubleDouble::new(0., input).recip();
        assert_eq!(inverse.hi, input.recip());
        assert_ne!(inverse.lo, 0.);
    }

    /// `from_recip` keeps the plain `f64` reciprocal as `hi` and adds a non-zero correction.
    #[test]
    fn from_recip_test() {
        let input: f64 = 1.54352432142;
        let inverse = DoubleDouble::from_recip(input);
        assert_eq!(inverse.hi, input.recip());
        assert_ne!(inverse.lo, 0.);
    }

    /// `from_quick_recip` keeps the plain `f64` reciprocal as `hi` and adds a non-zero correction.
    #[test]
    fn from_quick_recip_test() {
        let input: f64 = 1.54352432142;
        let inverse = DoubleDouble::from_quick_recip(input);
        assert_eq!(inverse.hi, input.recip());
        assert_ne!(inverse.lo, 0.);
    }
}