/* * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved. * // * // Redistribution and use in source and binary forms, with or without modification, * // are permitted provided that the following conditions are met: * // * // 1. Redistributions of source code must retain the above copyright notice, this * // list of conditions and the following disclaimer. * // * // 2. Redistributions in binary form must reproduce the above copyright notice, * // this list of conditions and the following disclaimer in the documentation * // and/or other materials provided with the distribution. * // * // 3. Neither the name of the copyright holder nor the names of its * // contributors may be used to endorse or promote products derived from * // this software without specific prior written permission. * // * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/
use crate::bits::get_exponent_f64;
#[allow(unused_imports)]
use crate::common::*;
use std::ops::{Mul, Neg};

// https://hal.science/hal-01351529v3/document

/// Unevaluated sum of two `f64`s ("double-double"): the represented value is
/// `hi + lo` (see [`DoubleDouble::to_f64`]), with `hi` carrying the leading bits
/// and `lo` the trailing correction.
#[derive(Copy, Clone, Default, Debug)]
pub(crate) struct DoubleDouble {
    pub(crate) lo: f64,
    pub(crate) hi: f64,
}

impl Neg for DoubleDouble {
    type Output = Self;

    /// Component-wise negation: `-(hi + lo) = (-hi) + (-lo)`, exact.
    #[inline]
    fn neg(self) -> Self::Output {
        Self {
            hi: -self.hi,
            lo: -self.lo,
        }
    }
}

impl DoubleDouble {
    /// Builds a value from raw IEEE-754 bit patterns, ordered `(lo_bits, hi_bits)`.
    #[inline]
    pub(crate) const fn from_bit_pair(pair: (u64, u64)) -> Self {
        Self {
            lo: f64::from_bits(pair.0),
            hi: f64::from_bits(pair.1),
        }
    }

    /// Constructs from already-split parts; note the `(lo, hi)` argument order.
    #[inline]
    pub(crate) const fn new(lo: f64, hi: f64) -> Self {
        DoubleDouble { lo, hi }
    }

    // Non FMA helper
    /// Veltkamp/Dekker split of `a` into high and low halves using the
    /// constant `C = 2^27 + 1`; feeds the non-FMA two-product below.
    /// The operation order here is load-bearing — do not re-associate.
    #[allow(dead_code)]
    #[inline]
    pub(crate) const fn split(a: f64) -> DoubleDouble {
        // CN = 2^N.
        const CN: f64 = (1 << 27) as f64;
        const C: f64 = CN + 1.0;
        let t1 = C * a;
        let t2 = a - t1;
        let r_hi = t1 + t2;
        let r_lo = a - r_hi;
        DoubleDouble::new(r_lo, r_hi)
    }

    // Non FMA helper
    /// Dekker two-product without FMA: `asz` must be `split(a)`.
    /// Returns the rounded product in `hi` and its rounding error in `lo`.
    #[allow(dead_code)]
    #[inline]
    fn from_exact_mult_impl_non_fma(asz: DoubleDouble, a: f64, b: f64) -> Self {
        let bs = DoubleDouble::split(b);
        let r_hi = a * b;
        let t1 = asz.hi * bs.hi - r_hi;
        let t2 = asz.hi * bs.lo + t1;
        let t3 = asz.lo * bs.hi + t2;
        let r_lo = asz.lo * bs.lo + t3;
        DoubleDouble::new(r_lo, r_hi)
    }

    // valid only for |a| > b
    /// Fast2Sum: error-free addition in 3 ops; precondition `|a| >= |b|`.
    #[inline]
    pub(crate) const fn from_exact_add(a: f64, b: f64) -> DoubleDouble {
        let r_hi = a + b;
        let t = r_hi - a;
        let r_lo = b - t;
        DoubleDouble::new(r_lo, r_hi)
    }

    // valid only for |a| > b
    /// Fast2Sum variant for subtraction; same `|a| >= |b|` precondition.
    #[inline]
    pub(crate) const fn from_exact_sub(a: f64, b: f64) -> DoubleDouble {
        let r_hi = a - b;
        let t = a - r_hi;
        let r_lo = t - b;
        DoubleDouble::new(r_lo, r_hi)
    }

    /// 2Sum (Knuth/Møller): error-free addition in 6 ops, no magnitude
    /// precondition on `a` and `b`.
    #[inline]
    pub(crate) const fn from_full_exact_add(a: f64, b: f64) -> DoubleDouble {
        let r_hi = a + b;
        let t1 = r_hi - a;
        let t2 = r_hi - t1;
        let t3 = b - t1;
        let t4 = a - t2;
        let r_lo = t3 + t4;
        DoubleDouble::new(r_lo, r_hi)
    }

    /// Computes `a*b + c` through an intermediate double-double product,
    /// then collapses the result back to a single `f64`.
    #[allow(unused)]
    #[inline]
    pub(crate) fn dd_f64_mul_add(a: f64, b: f64, c: f64) -> f64 {
        let ddx2 = DoubleDouble::from_exact_mult(a, b);
        let zv = DoubleDouble::full_add_f64(ddx2, c);
        zv.to_f64()
    }

    /// 2Sum applied to `a + (-b)`: error-free subtraction, no magnitude
    /// precondition.
    #[inline]
    pub(crate) const fn from_full_exact_sub(a: f64, b: f64) -> Self {
        let r_hi = a - b;
        let t1 = r_hi - a;
        let t2 = r_hi - t1;
        let t3 = -b - t1;
        let t4 = a - t2;
        let r_lo = t3 + t4;
        DoubleDouble::new(r_lo, r_hi)
    }

    /// Double-double addition: 2Sum of the high parts (note the exact
    /// parenthesization of `l`) with both low parts folded into the error term.
    #[inline]
    pub(crate) fn add(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
        let s = a.hi + b.hi;
        let d = s - a.hi;
        let l = ((b.hi - d) + (a.hi + (d - s))) + (a.lo + b.lo);
        DoubleDouble::new(l, s)
    }

    /// Double-double addition with a final renormalization via `from_exact_add`.
    #[inline]
    pub(crate) fn quick_dd_add(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
        let DoubleDouble { hi: sh, lo: sl } = DoubleDouble::from_full_exact_add(a.hi, b.hi);
        let v = a.lo + b.lo;
        let w = sl + v;
        DoubleDouble::from_exact_add(sh, w)
    }

    /// Double-double subtraction counterpart of [`DoubleDouble::quick_dd_add`].
    #[inline]
    pub(crate) fn quick_dd_sub(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
        let DoubleDouble { hi: sh, lo: sl } = DoubleDouble::from_full_exact_sub(a.hi, b.hi);
        let v = a.lo - b.lo;
        let w = sl + v;
        DoubleDouble::from_exact_add(sh, w)
    }

    /// Accurate double-double addition: 2Sum on both component pairs, then
    /// two renormalization passes.
    #[inline]
    pub(crate) fn full_dd_add(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
        let DoubleDouble { hi: sh, lo: sl } = DoubleDouble::from_full_exact_add(a.hi, b.hi);
        let DoubleDouble { hi: th, lo: tl } = DoubleDouble::from_full_exact_add(a.lo, b.lo);
        let c = sl + th;
        let v = DoubleDouble::from_exact_add(sh, c);
        let w = tl + v.lo;
        DoubleDouble::from_exact_add(v.hi, w)
    }

    /// Accurate double-double subtraction, implemented as `a + (-b)`.
    #[inline]
    pub(crate) fn full_dd_sub(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
        DoubleDouble::full_dd_add(a, -b)
    }

    /// Double-double subtraction; mirror of [`DoubleDouble::add`] with `b` negated
    /// inside the compensated error term.
    #[inline]
    pub(crate) fn sub(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
        let s = a.hi - b.hi;
        let d = s - a.hi;
        let l = ((-b.hi - d) + (a.hi + (d - s))) + (a.lo - b.lo);
        DoubleDouble::new(l, s)
    }

    /// DoubleDouble-style square root for a double-double number
    ///
    /// Handles zero, negative/NaN and infinite inputs explicitly, then does one
    /// Newton-style correction `dx = (self - x^2) / (2x)` around `sqrt(hi + lo)`.
    #[inline]
    pub(crate) fn sqrt(self) -> DoubleDouble {
        let a = self.hi + self.lo;
        if a == 0.0 {
            return DoubleDouble { hi: 0.0, lo: 0.0 };
        }
        if a < 0.0 || a.is_nan() {
            return DoubleDouble {
                hi: f64::NAN,
                lo: 0.0,
            };
        }
        if a.is_infinite() {
            return DoubleDouble {
                hi: f64::INFINITY,
                lo: 0.0,
            };
        }
        let x = a.sqrt();
        let x2 = DoubleDouble::from_exact_mult(x, x);
        // Residual = self - x²
        let mut r = self.hi - x2.hi;
        r += self.lo;
        r -= x2.lo;
        let dx = r / (2.0 * x);
        let hi = x + dx;
        let lo = (x - hi) + dx;
        DoubleDouble { hi, lo }
    }

    /// DoubleDouble-style square root for a double-double number
    ///
    /// Same correction step as [`DoubleDouble::sqrt`] but without the special-case
    /// checks for zero/negative/NaN/infinite inputs — caller must ensure validity.
    #[inline]
    pub(crate) fn fast_sqrt(self) -> DoubleDouble {
        let a = self.hi + self.lo;
        let x = a.sqrt();
        let x2 = DoubleDouble::from_exact_mult(x, x);
        // Residual = self - x²
        let mut r = self.hi - x2.hi;
        r += self.lo;
        r -= x2.lo;
        let dx = r / (2.0 * x);
        let hi = x + dx;
        let lo = (x - hi) + dx;
        DoubleDouble { hi, lo }
    }

    /// `a*b+c`
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    #[inline]
    pub(crate) fn mul_add_f64(a: DoubleDouble, b: DoubleDouble, c: f64) -> DoubleDouble {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::from_full_exact_add(c, h);
        DoubleDouble::new(r + q, p)
    }

    /// `a*b+c`
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    ///
    /// Uses the cheaper `from_exact_add`, which assumes `|c| >= |a.hi * b.hi|`.
    #[inline]
    pub(crate) fn quick_mul_add_f64(a: DoubleDouble, b: DoubleDouble, c: f64) -> DoubleDouble {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::from_exact_add(c, h);
        DoubleDouble::new(r + q, p)
    }

    /// `a*b+c`
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    #[inline]
    pub(crate) fn mul_f64_add_f64(a: DoubleDouble, b: f64, c: f64) -> DoubleDouble {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult_f64(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::from_full_exact_add(c, h);
        DoubleDouble::new(r + q, p)
    }

    // /// Accurate reciprocal: 1 / self
    // #[inline]
    // pub(crate) fn recip_raphson(self) -> DoubleDouble {
    //     let y0 = DoubleDouble::recip(self);
    //     let z = DoubleDouble::mul_add_f64(-self, y0, 1.0);
    //     DoubleDouble::mul_add(y0, z, y0)
    // }

    /// Accurate reciprocal: 1 / self
    ///
    /// `y = 1/hi` plus a first-order correction term `y * (1 - y*hi - y*lo)`;
    /// the non-FMA path recovers the residuals via exact two-products.
    #[inline]
    pub(crate) fn recip(self) -> DoubleDouble {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let y = 1. / self.hi;
            let e1 = f_fmla(-self.hi, y, 1.0);
            let e2 = f_fmla(-self.lo, y, e1);
            let e = y * e2;
            DoubleDouble::new(e, y)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let y = 1.0 / self.hi;
            let DoubleDouble { hi: p1, lo: err1 } = DoubleDouble::from_exact_mult(self.hi, y);
            let e1 = (1.0 - p1) - err1;
            let DoubleDouble { hi: p2, lo: err2 } = DoubleDouble::from_exact_mult(self.lo, y);
            let e2 = (e1 - p2) - err2;
            let e = y * e2;
            DoubleDouble::new(e, y)
        }
    }

    /// Double-double reciprocal of a plain `f64`: `hi = 1/b` with the division
    /// residual `(1 - hi*b) / b` stored in `lo`.
    #[inline]
    pub(crate) fn from_recip(b: f64) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let x_hi = 1.0 / b;
            let err = f_fmla(-x_hi, b, 1.0);
            let x_lo = err / b;
            Self::new(x_lo, x_hi)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let x_hi = 1.0 / b;
            let prod = Self::from_exact_mult(x_hi, b);
            let err = (1.0 - prod.hi) - prod.lo;
            let x_lo = err / b;
            Self::new(x_lo, x_hi)
        }
    }

    /// Like [`DoubleDouble::from_recip`] but approximates the low word with a
    /// multiply (`err * h`) instead of a second division.
    #[inline]
    pub(crate) fn from_quick_recip(b: f64) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let h = 1.0 / b;
            let hl = f_fmla(h, -b, 1.) * h;
            DoubleDouble::new(hl, h)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let h = 1.0 / b;
            let pr = DoubleDouble::from_exact_mult(h, b);
            let err = (1.0 - pr.hi) - pr.lo;
            let hl = err * h;
            DoubleDouble::new(hl, h)
        }
    }

    /// `a / b` as a double-double: quotient plus the correctly scaled
    /// division remainder `(a - q_hi*b) / b`.
    #[inline]
    pub(crate) fn from_exact_div(a: f64, b: f64) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let q_hi = a / b;
            let r = f_fmla(-q_hi, b, a);
            let q_lo = r / b;
            Self::new(q_lo, q_hi)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let q_hi = a / b;
            let p = DoubleDouble::from_exact_mult(q_hi, b);
            let r = DoubleDouble::from_exact_sub(a, p.hi);
            let r = r.hi + (r.lo - p.lo);
            let q_lo = r / b;
            Self::new(q_lo, q_hi)
        }
    }

    // Resistant to overflow without FMA
    /// Same remainder-based division as `from_exact_div`, but always routed
    /// through `f64::mul_add` regardless of target features.
    #[inline]
    pub(crate) fn from_exact_safe_div(a: f64, b: f64) -> Self {
        let q_hi = a / b;
        let r = f64::mul_add(-q_hi, b, a);
        let q_lo = r / b;
        Self::new(q_lo, q_hi)
    }

    /// `sqrt(x)` as a double-double: hardware root in `hi`, residual
    /// `(x - h^2) / (2h)` in `lo`.
    #[inline]
    pub(crate) fn from_sqrt(x: f64) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let h = x.sqrt();
            /* h = sqrt(x) * (1 + e1) with |e1| < 2^-52
            thus h^2 = x * (1 + e2) with |e2| < 2^-50.999 */
            let e = -f_fmla(h, h, -x); // exact
            /* e = x - h^2 */
            let l = e / (h + h);
            DoubleDouble::new(l, h)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let h = x.sqrt();
            let prod_hh = DoubleDouble::from_exact_mult(h, h);
            let e = (x - prod_hh.hi) - prod_hh.lo; // exact
            /* e = x - h^2 */
            let l = e / (h + h);
            DoubleDouble::new(l, h)
        }
    }

    /// Safe to overflow underflow division using mandatory FMA.
    #[inline]
    #[allow(dead_code)]
    pub(crate) fn div_safe_dd_f64(a: DoubleDouble, b: f64) -> Self {
        let q1 = a.hi / b;
        let r = f64::mul_add(-q1, b, a.hi);
        let r = r + a.lo;
        let q2 = r / b;
        DoubleDouble::new(q2, q1)
    }

    /// Double-double divided by `f64`: quotient of the high part plus the
    /// remainder (including `a.lo`) divided out again.
    #[inline]
    pub(crate) fn div_dd_f64(a: DoubleDouble, b: f64) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let q1 = a.hi / b;
            let r = f_fmla(-q1, b, a.hi);
            let r = r + a.lo;
            let q2 = r / b;
            DoubleDouble::new(q2, q1)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let th = a.hi / b;
            let prod = DoubleDouble::from_exact_mult(th, b);
            let beta_h = a.hi - prod.hi;
            let beta_l = beta_h - prod.lo;
            let beta = beta_l + a.lo;
            let tl = beta / b;
            DoubleDouble::new(tl, th)
        }
    }

    // /// Dekker division with one refinement step
    // #[inline]
    // pub(crate) fn div_dd_f64_newton_raphson(a: DoubleDouble, b: f64) -> Self {
    //     // Initial estimate q = a / b
    //     let q = DoubleDouble::div_dd_f64(a, b);
    //
    //     // One Newton-Raphson refinement step:
    //     // e = a - q * b
    //     let qb = DoubleDouble::quick_mult_f64(q, b);
    //     let e = DoubleDouble::sub(a, qb);
    //     let e_div_b = DoubleDouble::div_dd_f64(e, b);
    //
    //     DoubleDouble::add(q, e_div_b)
    // }

    // /// Dekker division with two Newton-Raphson refinement steps
    // #[inline]
    // pub(crate) fn div_dd_f64_newton_raphson_2(a: Dekker, b: f64) -> Self {
    //     // First estimate: q = a / b (one round of Dekker division)
    //     let q1 = Dekker::div_dd_f64(a, b);
    //
    //     // First refinement: q2 = q1 + (a - q1 * b) / b
    //     let qb1 = Dekker::quick_mult_f64(q1, b);
    //     let e1 = Dekker::sub(a, qb1);
    //     let dq1 = Dekker::div_dd_f64(e1, b);
    //     let q2 = Dekker::add(q1, dq1);
    //
    //     // Second refinement: q3 = q2 + (a - q2 * b) / b
    //     let qb2 = Dekker::quick_mult_f64(q2, b);
    //     let e2 = Dekker::sub(a, qb2);
    //     let dq2 = Dekker::div_dd_f64(e2, b);
    //
    //     Dekker::add(q2, dq2)
    // }

    // #[inline]
    // pub(crate) fn neg(self) -> Self {
    //     Self {
    //         lo: -self.lo, hi: -self.hi,
    //     }
    // }

    /// `f64` divided by a double-double: one long-division step against both
    /// components of `b`, with the remainder re-divided by `b.hi`.
    /// NOTE(review): uses `f_fmla` unconditionally — unlike the cfg-gated
    /// methods above, this path assumes an FMA-capable `f_fmla`.
    #[inline]
    pub(crate) fn from_f64_div_dd(a: f64, b: DoubleDouble) -> Self {
        let q1 = a / b.hi;
        let prod = DoubleDouble::from_exact_mult(q1, b.hi);
        let prod_lo = f_fmla(q1, b.lo, prod.lo);
        let rem = f_fmla(-1.0, prod.hi, a) - prod_lo;
        let q2 = rem / b.hi;
        DoubleDouble::new(q2, q1)
    }

    // #[inline]
    // pub(crate) fn mla_f64(a: Dekker, b: f64, c: f64) -> Self {
    //     let q = Dekker::mult_f64(a, b);
    //     Dekker::add_f64(q, c)
    // }
    //
    // #[inline]
    // pub(crate) fn mla_dd_f64(a: Dekker, b: Dekker, c: f64) -> Self {
    //     let q = Dekker::quick_mult(a, b);
    //     Dekker::add_f64(q, c)
    // }

    /// Double-double division `a / b` via reciprocal estimate `q = 1/b.hi`
    /// and a compensated residual `q * ((a - r_hi*b.hi) + (a.lo - r_hi*b.lo))`.
    #[inline]
    pub(crate) fn div(a: DoubleDouble, b: DoubleDouble) -> DoubleDouble {
        let q = 1.0 / b.hi;
        let r_hi = a.hi * q;
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let e_hi = f_fmla(b.hi, -r_hi, a.hi);
            let e_lo = f_fmla(b.lo, -r_hi, a.lo);
            let r_lo = q * (e_hi + e_lo);
            DoubleDouble::new(r_lo, r_hi)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let b_hi_r_hi = DoubleDouble::from_exact_mult(b.hi, -r_hi);
            let b_lo_r_hi = DoubleDouble::from_exact_mult(b.lo, -r_hi);
            let e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo;
            let e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo;
            let r_lo = q * (e_hi + e_lo);
            DoubleDouble::new(r_lo, r_hi)
        }
    }

    /// Error-free product of two `f64`s (2Prod): `hi = round(a*b)` and `lo`
    /// the exact rounding error — via FMA when available, Dekker split otherwise.
    #[inline]
    pub(crate) fn from_exact_mult(a: f64, b: f64) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let r_hi = a * b;
            let r_lo = f_fmla(a, b, -r_hi);
            DoubleDouble::new(r_lo, r_hi)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let splat = DoubleDouble::split(a);
            DoubleDouble::from_exact_mult_impl_non_fma(splat, a, b)
        }
    }

    // #[inline]
    // pub(crate) fn add_f64(&self, other: f64) -> DoubleDouble {
    //     let r = DoubleDouble::from_exact_add(self.hi, other);
    //     Dekker::from_exact_add(r.hi, r.lo + self.lo)
    // }

    // #[inline]
    // pub(crate) fn to_triple(self) -> TripleDouble {
    //     TripleDouble::new(0., self.lo, self.hi)
    // }

    /// Computes `a * b + c`
    /// `b` is an `f64`, `a` and `c` are `DoubleDouble`.
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    #[inline]
    pub(crate) fn mul_f64_add(a: DoubleDouble, b: f64, c: DoubleDouble) -> Self {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult_f64(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::full_add_f64(c, h);
        DoubleDouble::new(r + q, p)
    }

    /// Computes `a * b + c`
    /// `b` is an `f64`, `a` and `c` are `DoubleDouble`.
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    ///
    /// *Correctness*
    /// |c.hi| > |a.hi * b.hi|
    #[inline]
    pub(crate) fn quick_mul_f64_add(a: DoubleDouble, b: f64, c: DoubleDouble) -> Self {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult_f64(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::add_f64(c, h);
        DoubleDouble::new(r + q, p)
    }

    /// Computes `a * b + c`
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    ///
    /// *Correctness*
    /// |c.hi| > |a.hi * b.hi|
    #[inline]
    pub(crate) fn quick_mul_f64_add_f64(a: DoubleDouble, b: f64, c: f64) -> Self {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult_f64(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::from_exact_add(c, h);
        DoubleDouble::new(r + q, p)
    }

    // #[inline]
    // pub(crate) fn mul_f64_add_full(a: DoubleDouble, b: f64, c: DoubleDouble) -> Self {
    //     /*
    //     double _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8; \
    //     \
    //     Mul12(&_t1,&_t2,(a),(bh)); \
    //     Add12(_t3,_t4,(ch),_t1); \
    //     _t5 = (bl) * (a); \
    //     _t6 = (cl) + _t2; \
    //     _t7 = _t5 + _t6; \
    //     _t8 = _t7 + _t4; \
    //     Add12((*(resh)),(*(resl)),_t3,_t8); \
    //     */
    //     let DoubleDouble { hi: t1, lo: t2 } = DoubleDouble::from_exact_mult(a.hi, b);
    //     let DoubleDouble { hi: t3, lo: t4 } = DoubleDouble::from_full_exact_add(c.hi, t1);
    //     let t5 = a.lo * b;
    //     let t6 = c.lo + t2;
    //     let t7 = t5 + t6;
    //     let t8 = t7 + t4;
    //     DoubleDouble::from_full_exact_add(t3, t8)
    // }

    /// Computes `a * b + c`
    /// `b` is an `f64`, `a` and `c` are `DoubleDouble`.
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    #[inline]
    pub(crate) fn f64_mul_f64_add(a: f64, b: f64, c: DoubleDouble) -> Self {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::from_exact_mult(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::full_add_f64(c, h);
        DoubleDouble::new(r + q, p)
    }

    // /// Computes `a * b + c`
    // /// `b` is an `f64`, `a` and `c` are `DoubleDouble`.
    // ///
    // /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    // #[inline]
    // pub(crate) fn single_mul_add(a: f64, b: f64, c: f64) -> Self {
    //     let DoubleDouble { hi: h, lo: r } = DoubleDouble::from_exact_mult(a, b);
    //     let DoubleDouble { hi: p, lo: q } = DoubleDouble::from_full_exact_add(c, h);
    //     DoubleDouble::new(r + q, p)
    // }

    // /// Computes `a * b + c` safe to overflow without FMA
    // /// `b` is an `f64`, `a` and `c` are `DoubleDouble`.
    // ///
    // /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    // #[inline]
    // pub(crate) fn mul_f64_safe_add(a: DoubleDouble, b: f64, c: DoubleDouble) -> Self {
    //     let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult_safe_f64(a, b);
    //     let DoubleDouble { hi: p, lo: q } = DoubleDouble::full_add_f64(c, h);
    //     DoubleDouble::new(r + q, p)
    // }

    /// `a*b+c`
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    #[inline]
    pub(crate) fn mul_add(a: DoubleDouble, b: DoubleDouble, c: DoubleDouble) -> Self {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::full_add_f64(c, h);
        DoubleDouble::new(r + q, p)
    }

    /// `a*b+c`
    ///
    /// *Accurate dot product (Ogita, Rump and Oishi 2004)*
    ///
    /// *Correctness*
    /// |c.hi| > |a.hi * b.hi|
    #[inline]
    pub(crate) fn quick_mul_add(a: DoubleDouble, b: DoubleDouble, c: DoubleDouble) -> Self {
        let DoubleDouble { hi: h, lo: r } = DoubleDouble::quick_mult(a, b);
        let DoubleDouble { hi: p, lo: q } = DoubleDouble::add_f64(c, h);
        DoubleDouble::new(r + q, p)
    }

    /// Double-double product without final renormalization: 2Prod of the high
    /// parts plus the cross terms `a.hi*b.lo` and `a.lo*b.hi` (the `a.lo*b.lo`
    /// term is dropped).
    #[inline]
    pub(crate) fn quick_mult(a: DoubleDouble, b: DoubleDouble) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let mut r = DoubleDouble::from_exact_mult(a.hi, b.hi);
            let t1 = f_fmla(a.hi, b.lo, r.lo);
            let t2 = f_fmla(a.lo, b.hi, t1);
            r.lo = t2;
            r
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let DoubleDouble { hi: ch, lo: cl1 } = DoubleDouble::from_exact_mult(a.hi, b.hi);
            let tl1 = a.hi * b.lo;
            let tl2 = a.lo * b.hi;
            let cl2 = tl1 + tl2;
            let cl3 = cl1 + cl2;
            DoubleDouble::new(cl3, ch)
        }
    }

    /// Double-double product with a final `from_exact_add` renormalization;
    /// the FMA path also folds in the `a.lo*b.lo` term.
    #[inline]
    pub(crate) fn mult(a: DoubleDouble, b: DoubleDouble) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let DoubleDouble { hi: ch, lo: cl1 } = DoubleDouble::from_exact_mult(a.hi, b.hi);
            let tl0 = a.lo * b.lo;
            let tl1 = f_fmla(a.hi, b.lo, tl0);
            let cl2 = f_fmla(a.lo, b.hi, tl1);
            let cl3 = cl1 + cl2;
            DoubleDouble::from_exact_add(ch, cl3)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let DoubleDouble { hi: ch, lo: cl1 } = DoubleDouble::from_exact_mult(a.hi, b.hi);
            let tl1 = a.hi * b.lo;
            let tl2 = a.lo * b.hi;
            let cl2 = tl1 + tl2;
            let cl3 = cl1 + cl2;
            DoubleDouble::from_exact_add(ch, cl3)
        }
    }

    /// Double-double times `f64` with renormalized result.
    #[inline]
    pub(crate) fn mult_f64(a: DoubleDouble, b: f64) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let DoubleDouble { hi: ch, lo: cl1 } = DoubleDouble::from_exact_mult(a.hi, b);
            let cl3 = f_fmla(a.lo, b, cl1);
            DoubleDouble::from_exact_add(ch, cl3)
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let DoubleDouble { hi: ch, lo: cl1 } = DoubleDouble::from_exact_mult(a.hi, b);
            let cl2 = a.lo * b;
            let t = DoubleDouble::from_exact_add(ch, cl2);
            let tl2 = t.lo + cl1;
            DoubleDouble::from_exact_add(t.hi, tl2)
        }
    }

    /// Commuted convenience wrapper around [`DoubleDouble::quick_mult_f64`].
    #[inline]
    pub(crate) fn quick_f64_mult(a: f64, b: DoubleDouble) -> DoubleDouble {
        DoubleDouble::quick_mult_f64(b, a)
    }

    /// Double-double times `f64` without final renormalization.
    #[inline]
    pub(crate) fn quick_mult_f64(a: DoubleDouble, b: f64) -> Self {
        #[cfg(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        ))]
        {
            let h = b * a.hi;
            let l = f_fmla(b, a.lo, f_fmla(b, a.hi, -h));
            Self { lo: l, hi: h }
        }
        #[cfg(not(any(
            all(
                any(target_arch = "x86", target_arch = "x86_64"),
                target_feature = "fma"
            ),
            all(target_arch = "aarch64", target_feature = "neon")
        )))]
        {
            let DoubleDouble { hi: ch, lo: cl1 } = DoubleDouble::from_exact_mult(a.hi, b);
            let cl2 = a.lo * b;
            let cl3 = cl1 + cl2;
            DoubleDouble::new(cl3, ch)
        }
    }

    // /// Double-double multiplication safe to overflow without FMA
    // #[inline]
    // pub(crate) fn quick_mult_safe_f64(a: DoubleDouble, b: f64) -> Self {
    //     let h = b * a.hi;
    //     let l = f64::mul_add(b, a.lo, f64::mul_add(b, a.hi, -h));
    //     Self { lo: l, hi: h }
    // }

    /// Valid only |a.hi| > |b|
    #[inline]
    pub(crate) fn add_f64(a: DoubleDouble, b: f64) -> Self {
        let t = DoubleDouble::from_exact_add(a.hi, b);
        let l = a.lo + t.lo;
        Self { lo: l, hi: t.hi }
    }

    /// Adds an `f64` using the branch-free 2Sum — no magnitude precondition.
    #[inline]
    pub(crate) fn full_add_f64(a: DoubleDouble, b: f64) -> Self {
        let t = DoubleDouble::from_full_exact_add(a.hi, b);
        let l = a.lo + t.lo;
        Self { lo: l, hi: t.hi }
    }

    /// Valid only |b| > |a.hi|
    #[inline]
    pub(crate) fn f64_add(b: f64, a: DoubleDouble) -> Self {
        let t = DoubleDouble::from_exact_add(b, a.hi);
        let l = a.lo + t.lo;
        Self { lo: l, hi: t.hi }
    }

    /// Collapses the pair to a single `f64` by summing the components.
    #[inline]
    pub(crate) const fn to_f64(self) -> f64 {
        self.lo + self.hi
    }

    // #[inline]
    // pub(crate) fn from_rsqrt(x: f64) -> DoubleDouble {
    //     let r = DoubleDouble::div_dd_f64(DoubleDouble::from_sqrt(x), x);
    //     let rx = DoubleDouble::quick_mult_safe_f64(r, x);
    //     let drx = DoubleDouble::mul_f64_safe_add(r, x, -rx);
    //     let h = DoubleDouble::mul_add(r, drx, DoubleDouble::mul_add_f64(r, rx, -1.0));
    //     let dr = DoubleDouble::quick_mult(DoubleDouble::quick_mult_f64(r, 0.5), h);
    //     DoubleDouble::add(r, dr)
    // }

    /// `1 / sqrt(x)` computed as the reciprocal of the double-double root.
    #[inline]
    pub(crate) fn from_rsqrt_fast(x: f64) -> DoubleDouble {
        let sqrt_x = DoubleDouble::from_sqrt(x);
        sqrt_x.recip()
    }
}

impl Mul for DoubleDouble {
    type Output = Self;

    /// `*` delegates to the fast, non-renormalizing [`DoubleDouble::quick_mult`].
    #[inline]
    fn mul(self, rhs: DoubleDouble) -> Self::Output {
        DoubleDouble::quick_mult(self, rhs)
    }
}

/// check if number is valid for Exact mult
///
/// Rejects values whose exponent magnitude reaches 970, where the intermediate
/// products of the exact-multiplication algorithms could overflow/underflow.
#[allow(dead_code)]
#[inline]
pub(crate) fn two_product_compatible(x: f64) -> bool {
    let exp = get_exponent_f64(x);
    !(exp >= 970 || exp <= -970)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_f64_mult() {
        let d1 = 1.1231;
        let d2 = DoubleDouble::new(1e-22, 3.2341);
        let p = DoubleDouble::quick_f64_mult(d1, d2);
        assert_eq!(p.hi, 3.6322177100000004);
        assert_eq!(p.lo, -1.971941841373783e-16);
    }

    #[test]
    fn test_mult_64() {
        let d1 = 1.1231;
        let d2 = DoubleDouble::new(1e-22, 3.2341);
        let p = DoubleDouble::mult_f64(d2, d1);
        assert_eq!(p.hi, 3.6322177100000004);
        assert_eq!(p.lo, -1.971941841373783e-16);
    }

    #[test]
    fn recip_test() {
        let d1 = 1.54352432142;
        let recip = DoubleDouble::new(0., d1).recip();
        assert_eq!(recip.hi, d1.recip());
        assert_ne!(recip.lo, 0.);
    }

    #[test]
    fn from_recip_test() {
        let d1 = 1.54352432142;
        let recip = DoubleDouble::from_recip(d1);
        assert_eq!(recip.hi, d1.recip());
        assert_ne!(recip.lo, 0.);
    }

    #[test]
    fn from_quick_recip_test() {
        let d1 = 1.54352432142;
        let recip = DoubleDouble::from_quick_recip(d1);
        assert_eq!(recip.hi, d1.recip());
        assert_ne!(recip.lo, 0.);
    }
}