/*
 * // Copyright (c) Radzivon Bartoshyk 8/2025. All rights reserved.
 * //
 * // Redistribution and use in source and binary forms, with or without modification,
 * // are permitted provided that the following conditions are met:
 * //
 * // 1. Redistributions of source code must retain the above copyright notice, this
 * // list of conditions and the following disclaimer.
 * //
 * // 2. Redistributions in binary form must reproduce the above copyright notice,
 * // this list of conditions and the following disclaimer in the documentation
 * // and/or other materials provided with the distribution.
 * //
 * // 3. Neither the name of the copyright holder nor the names of its
 * // contributors may be used to endorse or promote products derived from
 * // this software without specific prior written permission.
 * //
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
use crate::common::{dd_fmla, is_integerf};
use crate::double_double::DoubleDouble;
use crate::round_ties_even::RoundTiesEven;
use std::hint::black_box;

#[cold]
#[inline(never)]
fn as_compoundf_special(x: f32, y: f32) -> f32 {
    let nx = x.to_bits();
    let ny = y.to_bits();
    let ax: u32 = nx.wrapping_shl(1);
    let ay: u32 = ny.wrapping_shl(1);
    if ax == 0 || ay == 0 {
        // x or y is 0
        if ax == 0 {
            // compound(0,y) = 1 except for y = sNaN
            return if y.is_nan() { x + y } else { 1.0 };
        }
        if ay == 0 {
            // compound (x, 0)
            if x.is_nan() {
                return x + y; // x = sNaN
            }
            return if x < -1.0 {
                f32::NAN // rule (g)
            } else {
                1.0 // rule (a)
            };
        }
    }
    let mone = (-1.0f32).to_bits();
    if ay >= 0xffu32 << 24 {
        // y=Inf/NaN
        // the case x=0 was already checked above
        if ax > 0xffu32 << 24 {
            return x + y; // x=NaN
        }
        if ay == 0xffu32 << 24 {
            // y = +/-Inf
            if nx > mone {
                return f32::NAN; // rule (g)
            }
            let sy = ny >> 31; // sign bit of y
            if nx == mone {
                return if sy == 0 {
                    0.0 // Rule (c)
                } else {
                    f32::INFINITY // Rule (b)
                };
            }
            if x < 0.0 {
                return if sy == 0 { 0.0 } else { f32::INFINITY };
            }
            if x > 0.0 {
                return if sy != 0 { 0.0 } else { f32::INFINITY };
            }
            return 1.0;
        }
        return x + y; // case y=NaN
    }
    if nx >= mone || nx >= 0xffu32 << 23 {
        // x is Inf, NaN or <= -1
        if ax == 0xffu32 << 24 {
            // x is +Inf or -Inf
            if (nx >> 31) != 0 {
                return f32::NAN; // x = -Inf, rule (g)
            }
            // (1 + Inf)^y = +Inf for y > 0, +0 for y < 0
            return if (ny >> 31) != 0 { 1.0 / x } else { x };
        }
        if ax > 0xffu32 << 24 {
            return x + y; // x is NaN
        }
        if nx > mone {
            return f32::NAN; // x < -1.0: rule (g)
        }
        // now x = -1
        return if (ny >> 31) != 0 {
            // y < 0
            f32::INFINITY
        } else {
            // y > 0
            0.0
        };
    }
    0.0
}

#[inline]
pub(crate) fn log2p1_polyeval_1(z: f64) -> f64 {
    // we include P[0] = 0 so that P[i] corresponds to degree i
    // this degree-8 polynomial generated by Sollya (cf p1.sollya)
    // has relative error < 2^-50.98
    const P: [u64; 8] = [
        0x0000000000000000,
        0x3ff71547652b82fe,
        0xbfe71547652b8d11,
        0x3fdec709dc3a5014,
        0xbfd715475b144983,
        0x3fd2776c3fda300e,
        0xbfcec990162358ce,
        0x3fca645337c29e27,
    ];
    let z2 = z * z;
    let mut c5 = dd_fmla(f64::from_bits(P[6]), z, f64::from_bits(P[5]));
    let c3 = dd_fmla(f64::from_bits(P[4]), z, f64::from_bits(P[3]));
    let mut c1 = dd_fmla(f64::from_bits(P[2]), z, f64::from_bits(P[1]));
    let z4 = z2 * z2;
    c5 = dd_fmla(f64::from_bits(P[7]), z2, c5);
    c1 = dd_fmla(c3, z2, c1);
    c1 = dd_fmla(c5, z4, c1);
    z * c1
}

// for 0<=i<46, inv[i] approximates 1/t for 1/2+(i+13)/64 <= t < 1/2+(i+14)/64
pub(crate) static LOG2P1_COMPOUNDF_INV: [u64; 46] = [
    0x3ff6800000000000, 0x3ff6000000000000, 0x3ff5800000000000, 0x3ff5000000000000,
    0x3ff4c00000000000, 0x3ff4400000000000, 0x3ff4000000000000, 0x3ff3800000000000,
    0x3ff3400000000000, 0x3ff2c00000000000, 0x3ff2800000000000, 0x3ff2000000000000,
    0x3ff1c00000000000, 0x3ff1800000000000, 0x3ff1400000000000, 0x3ff1000000000000,
    0x3ff0c00000000000, 0x3ff0800000000000, 0x3ff0000000000000, 0x3ff0000000000000,
    0x3fef400000000000, 0x3feec00000000000, 0x3fee400000000000, 0x3fee000000000000,
    0x3fed800000000000, 0x3fed000000000000, 0x3feca00000000000, 0x3fec400000000000,
    0x3febe00000000000, 0x3feb800000000000, 0x3feb200000000000, 0x3feac00000000000,
    0x3fea800000000000, 0x3fea200000000000, 0x3fe9c00000000000, 0x3fe9800000000000,
    0x3fe9200000000000, 0x3fe8c00000000000, 0x3fe8800000000000, 0x3fe8400000000000,
    0x3fe8000000000000, 0x3fe7c00000000000, 0x3fe7600000000000, 0x3fe7200000000000,
    0x3fe6e00000000000, 0x3fe6a00000000000,
];

/* log2inv[i][0]+log2inv[i][1] is a double-double approximation of -log2(inv[i]),
with log2inv[i][0] having absolute error < 2^-54.462, and
log2inv[i][0]+log2inv[i][1] absolute error < 2^-109.101 */
pub(crate) static LOG2P1_COMPOUNDF_LOG2_INV: [(u64, u64); 46] = [
    (0x3c68f3673ffdd785, 0xbfdf7a8568cb06cf),
    (0x3c1c141e66faaaad, 0xbfdd6753e032ea0f),
    (0x3c76fae441c09d76, 0xbfdb47ebf73882a1),
    (0x3c72d352bea51e59, 0xbfd91bba891f1709),
    (0xbc69575b04fa6fbd, 0xbfd800a563161c54),
    (0x3c7817fd3b7d7e5d, 0xbfd5c01a39fbd688),
    (0x3c1b6d40900b2502, 0xbfd49a784bcd1b8b),
    (0x3c7f6e91ad16ecff, 0xbfd24407ab0e073a),
    (0x3c6a7b47d2c352d9, 0xbfd11307dad30b76),
    (0x3c5b85a54d7ee2fd, 0xbfcd49ee4c325970),
    (0x3c401ee1343fe7ca, 0xbfcacf5e2db4ec94),
    (0x3c6817fd3b7d7e5d, 0xbfc5c01a39fbd688),
    (0xbc4f51f2c075a74c, 0xbfc32ae9e278ae1a),
    (0x3c6a7610e40bd6ab, 0xbfc08c588cda79e4),
    (0xbc58ecb169b9465f, 0xbfbbc84240adabba),
    (0xbc5f3314e0985116, 0xbfb663f6fac91316),
    (0x3c530c22d15199b8, 0xbfb0eb389fa29f9b),
    (0xbc389b03784b5be1, 0xbfa6bad3758efd87),
    (0x0000000000000000, 0x0000000000000000),
    (0x0000000000000000, 0x0000000000000000),
    (0x3c3491f06c085bc2, 0x3fa184b8e4c56af8),
    (0x3c0155660710eb2a, 0x3fad6ebd1f1febfe),
    (0x3c2c141e66faaaad, 0x3fb4c560fe68af88),
    (0x3c59ced1447e30ad, 0x3fb7d60496cfbb4c),
    (0x3c592ce9636c90a0, 0x3fbe0b1ae8f2fd56),
    (0xbc5696e2866c718e, 0x3fc22dadc2ab3497),
    (0xbc61562d61af73f8, 0x3fc494f863b8df35),
    (0xbc60798d1aa21694, 0x3fc7046031c79f85),
    (0xbc6e95734abd2fcc, 0x3fc97c1cb13c7ec1),
    (0x3c2bc0af7b82e7d7, 0x3fcbfc67a7fff4cc),
    (0xbc6086fce864a1f6, 0x3fce857d3d361368),
    (0xbc53d56efe4338fe, 0x3fd08bce0d95fa38),
    (0x3c7c8d43e017579b, 0x3fd169c05363f158),
    (0xbc50132ae5e417cd, 0x3fd2baa0c34be1ec),
    (0xbc7c658d602e66b0, 0x3fd4106017c3eca3),
    (0x3c7e393a16b94b52, 0x3fd4f6fbb2cec598),
    (0x3c7ac9080333c605, 0x3fd6552b49986277),
    (0x3c68f89e2eb553b2, 0x3fd7b89f02cf2aad),
    (0x3c799aa6df8b7d83, 0x3fd8a8980abfbd32),
    (0x3c7bca36fd02def0, 0x3fd99b072a96c6b2),
    (0x3c5817fd3b7d7e5d, 0x3fda8ff971810a5e),
    (0xbc501d98c3531027, 0x3fdb877c57b1b070),
    (0x3c78a38b4175d665, 0x3fdcffae611ad12b),
    (0x3c438c8946414c6a, 0x3fddfdd89d586e2b),
    (0x3c76d261f1753e0b, 0x3fdefec61b011f85),
    (0xbc87398fe685f171, 0x3fe0014332be0033),
];

/* for |z| <= 2^-6, returns an approximation of 2^z with absolute error < 2^-43.540 */
#[inline]
fn compoundf_expf_poly(z: f64) -> f64 {
    /* Q is a degree-4 polynomial generated by Sollya (cf q1.sollya)
    with absolute error < 2^-43.549 */
    const Q: [u64; 5] = [
        0x3ff0000000000000,
        0x3fe62e42fef6d01a,
        0x3fcebfbdff7feeba,
        0x3fac6b167e579bee,
        0x3f83b2b3428d06de,
    ];
    let z2 = z * z;
    let c3 = dd_fmla(f64::from_bits(Q[4]), z, f64::from_bits(Q[3]));
    let c0 = dd_fmla(f64::from_bits(Q[1]), z, f64::from_bits(Q[0]));
    let c2 = dd_fmla(c3, z, f64::from_bits(Q[2]));
    dd_fmla(c2, z2, c0)
}

pub(crate) fn compoundf_log2p1_fast(x: f64) -> f64 {
    /* for x > 0, 1+x is exact when 2^-29 <= x < 2^53
    for x < 0, 1+x is exact when -1 < x <= 2^-30 */
    // double u = (x >= 0x1p53) ? x : 1.0 + x;
    let u = 1.0 + x;
    /* For x < 0x1p53, x + 1 is exact thus u = x+1.
    For x >= 2^53, we estimate log2(x) instead of log2(1+x), since
    log2(1+x) = log2(x) + log2(1+1/x), log2(x) >= 53 and |log2(1+1/x)| < 2^-52.471,
    the additional relative error is bounded by 2^-52.471/53 < 2^-58.198 */
    let mut v = u.to_bits();
    let m: u64 = v & 0xfffffffffffffu64;
    let e: i64 = (v >> 52) as i64 - 0x3ff + (m >= 0x6a09e667f3bcdu64) as i64;
    // 2^e/sqrt(2) < u < 2^e*sqrt(2), with -29 <= e <= 128
    v = v.wrapping_sub((e << 52) as u64);
    let t = f64::from_bits(v);
    // u = 2^e*t with 1/sqrt(2) < t < sqrt(2)
    // thus log2(u) = e + log2(t)
    v = (f64::from_bits(v) + 2.0).to_bits();
    // add 2 so that v.f is always in the binade [2, 4)
    let i = (v >> 45) as i32 - 0x2002d; // 0 <= i <= 45
    let r = f64::from_bits(LOG2P1_COMPOUNDF_INV[i as usize]);
    let z = dd_fmla(r, t, -1.0); // exact, -1/64 <= z <= 1/64
    // we approximate log2(t) by -log2(r) + log2(r*t)
    let p = log2p1_polyeval_1(z);
    // p approximates log2(r*t) with rel.
    // error < 2^-49.642, and |p| < 2^-5.459
    e as f64 + (f64::from_bits(LOG2P1_COMPOUNDF_LOG2_INV[i as usize].1) + p)
}

pub(crate) static COMPOUNDF_EXP2_T: [u64; 33] = [
    0xbfe0000000000000, 0xbfde000000000000, 0xbfdc000000000000, 0xbfda000000000000,
    0xbfd8000000000000, 0xbfd6000000000000, 0xbfd4000000000000, 0xbfd2000000000000,
    0xbfd0000000000000, 0xbfcc000000000000, 0xbfc8000000000000, 0xbfc4000000000000,
    0xbfc0000000000000, 0xbfb8000000000000, 0xbfb0000000000000, 0xbfa0000000000000,
    0x0000000000000000, 0x3fa0000000000000, 0x3fb0000000000000, 0x3fb8000000000000,
    0x3fc0000000000000, 0x3fc4000000000000, 0x3fc8000000000000, 0x3fcc000000000000,
    0x3fd0000000000000, 0x3fd2000000000000, 0x3fd4000000000000, 0x3fd6000000000000,
    0x3fd8000000000000, 0x3fda000000000000, 0x3fdc000000000000, 0x3fde000000000000,
    0x3fe0000000000000,
];

/* exp2_U[i] is a double-double approximation h+l of 2^exp2_T[i] so that h
approximates 2^exp2_T[i] with absolute error < 2^-53.092, and h+l approximates
2^exp2_T[i] with absolute error < 2^-107.385 */
pub(crate) static COMPOUNDF_EXP2_U: [(u64, u64); 33] = [
    (0xbc8bdd3413b26456, 0x3fe6a09e667f3bcd),
    (0xbc716e4786887a99, 0x3fe71f75e8ec5f74),
    (0xbc741577ee04992f, 0x3fe7a11473eb0187),
    (0xbc8d4c1dd41532d8, 0x3fe82589994cce13),
    (0x3c86e9f156864b27, 0x3fe8ace5422aa0db),
    (0xbc575fc781b57ebc, 0x3fe93737b0cdc5e5),
    (0x3c6c7c46b071f2be, 0x3fe9c49182a3f090),
    (0xbc8d2f6edb8d41e1, 0x3fea5503b23e255d),
    (0x3c87a1cd345dcc81, 0x3feae89f995ad3ad),
    (0xbc65584f7e54ac3b, 0x3feb7f76f2fb5e47),
    (0x3c711065895048dd, 0x3fec199bdd85529c),
    (0x3c6503cbd1e949db, 0x3fecb720dcef9069),
    (0x3c72ed02d75b3707, 0x3fed5818dcfba487),
    (0xbc81a5cd4f184b5c, 0x3fedfc97337b9b5f),
    (0xbc8e9c23179c2893, 0x3feea4afa2a490da),
    (0x3c89d3e12dd8a18b, 0x3fef50765b6e4540),
    (0x0000000000000000, 0x3ff0000000000000),
    (0x3c8d73e2a475b465, 0x3ff059b0d3158574),
    (0x3c98a62e4adc610b, 0x3ff0b5586cf9890f),
    (0xbc96c51039449b3a, 0x3ff11301d0125b51),
    (0xbc819041b9d78a76, 0x3ff172b83c7d517b),
    (0x3c9e016e00a2643c, 0x3ff1d4873168b9aa),
    (0x3c99b07eb6c70573, 0x3ff2387a6e756238),
    (0x3c8612e8afad1255, 0x3ff29e9df51fdee1),
    (0x3c86f46ad23182e4, 0x3ff306fe0a31b715),
    (0xbc963aeabf42eae2, 0x3ff371a7373aa9cb),
    (0x3c8ada0911f09ebc, 0x3ff3dea64c123422),
    (0x3c489b7a04ef80d0, 0x3ff44e086061892d),
    (0x3c7d4397afec42e2, 0x3ff4bfdad5362a27),
    (0xbc807abe1db13cad, 0x3ff5342b569d4f82),
    (0x3c96324c054647ad, 0x3ff5ab07dd485429),
    (0xbc9383c17e40b497, 0x3ff6247eb03a5585),
    (0xbc9bdd3413b26456, 0x3ff6a09e667f3bcd),
];

/* return the correct rounding of (1+x)^y, otherwise -1.0
where t is an approximation of y*log2(1+x) with absolute error < 2^-40.680,
assuming 0x1.7154759a0df53p-24 <= |t| <= 150
exact is non-zero iff (1+x)^y is exact or midpoint */
fn exp2_fast(t: f64) -> f64 {
    let k = t.round_ties_even_finite(); // 0 <= |k| <= 150
    let mut r = t - k; // |r| <= 1/2, exact
    let mut v: u64 = (3.015625 + r).to_bits(); // 2.5 <= v <= 3.5015625
    // we add 2^-6 so that i is rounded to nearest
    let i: i32 = (v >> 46) as i32 - 0x10010; // 0 <= i <= 32
    r -= f64::from_bits(COMPOUNDF_EXP2_T[i as usize]); // exact
    // now |r| <= 2^-6
    // 2^t = 2^k * exp2_U[i][0] * 2^r
    v = (f64::from_bits(COMPOUNDF_EXP2_U[i as usize].1) * compoundf_expf_poly(r)).to_bits();
    /* the absolute error on exp2_U[i][0] is bounded by 2^-53.092, with
    exp2_U[i][0] < 2^0.5, and that on q1(r) is bounded by 2^-43.540, with
    |q1(r)| < 1.011, thus |v| < 1.43, and the absolute error on v is bounded by
    ulp(v) + 2^0.5 * 2^-43.540 + 2^-53.092 * 1.011 < 2^-43.035.
    Now t approximates u := y*log2(1+x) with |t-u| < 2^-40.680 thus
    2^u = 2^t * (1 + eps) with eps < 2^(2^-40.680)-1 < 2^-41.208.
    The total absolute error is thus bounded by 2^-43.035 + 2^-41.208 < 2^-40.849. */
    let mut err: u64 = 0x3d61d00000000000; // 2^-40.849 < 0x1.1dp-41
    v = unsafe { v.wrapping_add(k.to_int_unchecked::<i64>().wrapping_shl(52) as u64) };
    // scale v by 2^k, k is already integer
    // in case of potential underflow, we defer to the accurate path
    if f64::from_bits(v) < f64::from_bits(0x38100000000008e2) {
        return -1.0;
    }
    err = unsafe { err.wrapping_add((k.to_int_unchecked::<i64>() << 52) as u64) };
    // scale the error by 2^k too
    let lb = (f64::from_bits(v) - f64::from_bits(err)) as f32;
    let rb = (f64::from_bits(v) + f64::from_bits(err)) as f32;
    if lb != rb {
        return -1.0; // rounding test failed
    }
    f64::from_bits(v)
}

// 2^e/sqrt(2) < h < 2^e*sqrt(2), with -29 <= e <= 128
// divide h, l by 2^e
pub(crate) static LOG2P1_SCALE: [u64; 158] = [
    0x41c0000000000000, 0x41b0000000000000, 0x41a0000000000000, 0x4190000000000000,
    0x4180000000000000, 0x4170000000000000, 0x4160000000000000, 0x4150000000000000,
    0x4140000000000000, 0x4130000000000000, 0x4120000000000000, 0x4110000000000000,
    0x4100000000000000, 0x40f0000000000000, 0x40e0000000000000, 0x40d0000000000000,
    0x40c0000000000000, 0x40b0000000000000, 0x40a0000000000000, 0x4090000000000000,
    0x4080000000000000, 0x4070000000000000, 0x4060000000000000, 0x4050000000000000,
    0x4040000000000000, 0x4030000000000000, 0x4020000000000000, 0x4010000000000000,
    0x4000000000000000, 0x3ff0000000000000, 0x3fe0000000000000, 0x3fd0000000000000,
    0x3fc0000000000000, 0x3fb0000000000000, 0x3fa0000000000000, 0x3f90000000000000,
    0x3f80000000000000, 0x3f70000000000000, 0x3f60000000000000, 0x3f50000000000000,
    0x3f40000000000000, 0x3f30000000000000, 0x3f20000000000000, 0x3f10000000000000,
    0x3f00000000000000, 0x3ef0000000000000, 0x3ee0000000000000, 0x3ed0000000000000,
    0x3ec0000000000000, 0x3eb0000000000000, 0x3ea0000000000000, 0x3e90000000000000,
    0x3e80000000000000, 0x3e70000000000000, 0x3e60000000000000, 0x3e50000000000000,
    0x3e40000000000000, 0x3e30000000000000, 0x3e20000000000000, 0x3e10000000000000,
    0x3e00000000000000, 0x3df0000000000000, 0x3de0000000000000, 0x3dd0000000000000,
    0x3dc0000000000000, 0x3db0000000000000, 0x3da0000000000000, 0x3d90000000000000,
    0x3d80000000000000, 0x3d70000000000000, 0x3d60000000000000, 0x3d50000000000000,
    0x3d40000000000000, 0x3d30000000000000, 0x3d20000000000000, 0x3d10000000000000,
    0x3d00000000000000, 0x3cf0000000000000, 0x3ce0000000000000, 0x3cd0000000000000,
    0x3cc0000000000000, 0x3cb0000000000000, 0x3ca0000000000000, 0x3c90000000000000,
    0x3c80000000000000, 0x3c70000000000000, 0x3c60000000000000, 0x3c50000000000000,
    0x3c40000000000000, 0x3c30000000000000, 0x3c20000000000000, 0x3c10000000000000,
    0x3c00000000000000, 0x3bf0000000000000, 0x3be0000000000000, 0x3bd0000000000000,
    0x3bc0000000000000, 0x3bb0000000000000, 0x3ba0000000000000, 0x3b90000000000000,
    0x3b80000000000000, 0x3b70000000000000, 0x3b60000000000000, 0x3b50000000000000,
    0x3b40000000000000, 0x3b30000000000000, 0x3b20000000000000, 0x3b10000000000000,
    0x3b00000000000000, 0x3af0000000000000, 0x3ae0000000000000, 0x3ad0000000000000,
    0x3ac0000000000000, 0x3ab0000000000000, 0x3aa0000000000000, 0x3a90000000000000,
    0x3a80000000000000, 0x3a70000000000000, 0x3a60000000000000, 0x3a50000000000000,
    0x3a40000000000000, 0x3a30000000000000, 0x3a20000000000000, 0x3a10000000000000,
    0x3a00000000000000, 0x39f0000000000000, 0x39e0000000000000, 0x39d0000000000000,
    0x39c0000000000000, 0x39b0000000000000,
    0x39a0000000000000, 0x3990000000000000, 0x3980000000000000, 0x3970000000000000,
    0x3960000000000000, 0x3950000000000000, 0x3940000000000000, 0x3930000000000000,
    0x3920000000000000, 0x3910000000000000, 0x3900000000000000, 0x38f0000000000000,
    0x38e0000000000000, 0x38d0000000000000, 0x38c0000000000000, 0x38b0000000000000,
    0x38a0000000000000, 0x3890000000000000, 0x3880000000000000, 0x3870000000000000,
    0x3860000000000000, 0x3850000000000000, 0x3840000000000000, 0x3830000000000000,
    0x3820000000000000, 0x3810000000000000, 0x3800000000000000, 0x37f0000000000000,
];

/* put in h+l an approximation of log2(1+zh+zl) for |zh| <= 1/64 + 2^-51.508,
|zl| < 2^-58 and |zl| < ulp(zh). We have |h|, |h+l| < 2^-5.459, |l| < 2^-56.162,
the relative error is bounded by 2^-91.196, and |l| < 2^-50.523 |h|
(see analyze_p2() in compoundf.sage). */
/* degree-13 polynomial generated by Sollya which approximates log2(1+z)
for |z| <= 1/64 with relative error < 2^-93.777 (cf file p2.sollya) */
static LOG2P1_LOG2_POLY: [u64; 18] = [
    0x3ff71547652b82fe, 0x3c7777d0ffda0d80,
    0xbfe71547652b82fe, 0xbc6777d0fd20b49c,
    0x3fdec709dc3a03fd, 0x3c7d27f05171b74a,
    0xbfd71547652b82fe, 0xbc57814e70b828b0,
    0x3fd2776c50ef9bfe, 0x3c7e4f63e12bff83,
    0xbfcec709dc3a03f4, 0x3fca61762a7adecc,
    0xbfc71547652d8849, 0x3fc484b13d7e7029,
    0xbfc2776c1b2a40fd, 0x3fc0c9a80f9b7c1c,
    0xbfbecc6801121200, 0x3fbc6e4b91fd10e5,
];

fn log2_poly2(z: DoubleDouble) -> DoubleDouble {
    /* since we can't expect a relative accuracy better than 2^-93.777, the lower
    part of the double-double approximation only needs to have about 94-53 = 41
    accurate bits. Since |p7*z^7/p1| < 2^-44, we evaluate terms of degree 7 or
    more in double precision only. */
    let mut h = f64::from_bits(LOG2P1_LOG2_POLY[4 + 13]); // degree 13
    for i in (7..=12).rev() {
        // Horner step: fold in the coefficient of degree i
        h = dd_fmla(h, z.hi, f64::from_bits(LOG2P1_LOG2_POLY[4 + i]));
    }
    let mut v = DoubleDouble::quick_mult_f64(z, h);
    let t = DoubleDouble::from_exact_add(v.hi, f64::from_bits(LOG2P1_LOG2_POLY[10]));
    v.hi = t.hi;
    v.lo += t.lo;
    v = DoubleDouble::quick_mult(v, z);
    let t = DoubleDouble::from_exact_add(v.hi, f64::from_bits(LOG2P1_LOG2_POLY[8]));
    v.hi = t.hi;
    v.lo += t.lo + f64::from_bits(LOG2P1_LOG2_POLY[9]);
    v = DoubleDouble::quick_mult(v, z);
    let t = DoubleDouble::from_exact_add(v.hi, f64::from_bits(LOG2P1_LOG2_POLY[6]));
    v.hi = t.hi;
    v.lo += t.lo + f64::from_bits(LOG2P1_LOG2_POLY[7]);
    v = DoubleDouble::quick_mult(v, z);
    let t = DoubleDouble::from_exact_add(v.hi, f64::from_bits(LOG2P1_LOG2_POLY[4]));
    v.hi = t.hi;
    v.lo += t.lo + f64::from_bits(LOG2P1_LOG2_POLY[5]);
    v = DoubleDouble::quick_mult(v, z);
    let t = DoubleDouble::from_exact_add(v.hi, f64::from_bits(LOG2P1_LOG2_POLY[2]));
    v.hi = t.hi;
    v.lo += t.lo + f64::from_bits(LOG2P1_LOG2_POLY[3]);
    v = DoubleDouble::quick_mult(v, z);
    let t = DoubleDouble::from_exact_add(v.hi, f64::from_bits(LOG2P1_LOG2_POLY[0]));
    v.hi = t.hi;
    v.lo += t.lo + f64::from_bits(LOG2P1_LOG2_POLY[1]);
    v = DoubleDouble::quick_mult(v, z);
    v
}

/* assuming -1 < x < 2^128, and x is representable in binary32, put in h+l a
double-double approximation of log2(1+x), with relative error bounded by
2^-91.123, and |l| < 2^-48.574 |h|
(see analyze_log2p1_accurate() in compoundf.sage) */
pub(crate) fn compoundf_log2p1_accurate(x: f64) -> DoubleDouble {
    let mut v_dd = if 1.0 >= x {
        // then 1.0 >= |x| since x > -1
        if (x as f32).abs() >= f32::from_bits(0x25000000) {
            DoubleDouble::from_exact_add(1.0, x)
        } else {
            DoubleDouble::new(x, 1.0)
        }
    } else {
        // fast_two_sum() is exact when |x| < 2^54 by Lemma 1 condition (ii) of [1]
        DoubleDouble::from_exact_add(x, 1.0)
    };
    // now h + l = 1 + x + eps with |eps| <= 2^-105 |h| and |l| <= ulp(h)
    let mut v = v_dd.hi.to_bits();
    let m = v & 0xfffffffffffffu64;
    let e: i64 = (v >> 52) as i64 - 0x3ff + (m >= 0x6a09e667f3bcdu64) as i64;
    let scale = f64::from_bits(LOG2P1_SCALE[e.wrapping_add(29) as usize]);
    v_dd.hi *= scale;
    v_dd.lo *= scale;
    // now |h| < sqrt(2) and |l| <= ulp(h) <= 2^-52
    // now 1 + x ~ 2^e * (h + l) thus log2(1+x) ~ e + log2(h+l)
    v = (2.0 + v_dd.hi).to_bits();
    // add 2 so that v.f is always in the binade [2, 4)
    let i: i32 = (v >> 45) as i32 - 0x2002d; // h is near 1/2+(i+13)/64
    let r = f64::from_bits(LOG2P1_COMPOUNDF_INV[i as usize]);
    let mut z_dd = DoubleDouble::new(r * v_dd.lo, dd_fmla(r, v_dd.hi, -1.0));
    // exact, -1/64 <= zh <= 1/64
    // since |r| <= 0x1.68p+0 and |l| <= 2^-52, |zl| <= 2^-51.508
    // zh + zl = r*(h+l)-1
    // log2(h+l) = -log2(r) + log2(r*(h+l)) = -log2(r) + log2(1+zh+zl)
    z_dd = DoubleDouble::from_exact_add(z_dd.hi, z_dd.lo);
    // now |zh| <= 1/64 + 2^-51.508 and |zl| < 2^-58
    /* the relative error of fast_two_sum() is bounded by 2^-105, this amplifies
    the relative error on p2() as follows:
    (1+2^-91.196)*(1+2^-105)-1 < 2^-91.195. */
    let log_p = log2_poly2(z_dd);
    // ph + pl approximates log2(1+zh+zl) with relative error < 2^-93.471
    /* since |log2inv[i][0]| < 1 and e is integer, the precondition of
    fast_two_sum is fulfilled: either |e| >= 1, or e=0 and fast_two_sum is exact */
    let log2_inv = LOG2P1_COMPOUNDF_LOG2_INV[i as usize];
    v_dd = DoubleDouble::from_exact_add(e as f64, f64::from_bits(log2_inv.1));
    v_dd.lo += f64::from_bits(log2_inv.0);
    let mut p = DoubleDouble::from_exact_add(v_dd.hi, log_p.hi);
    p.lo += v_dd.lo + log_p.lo;
    p
}

pub(crate) fn compoundf_exp2_poly2(z: DoubleDouble) -> DoubleDouble {
    /* Q2 is a degree-8 polynomial generated by Sollya (cf q2.sollya)
    with absolute error < 2^-85.218 */
    static Q2: [u64; 12] = [
        0x3ff0000000000000,
        0x3fe62e42fefa39ef,
        0x3c7abc9d45534d06,
        0x3fcebfbdff82c58f,
        0xbc65e4383cf9ddf7,
        0x3fac6b08d704a0c0,
        0xbc46cbc55586c8f1,
        0x3f83b2ab6fba4e77,
        0x3f55d87fe789aec5,
        0x3f2430912f879daa,
        0x3eeffcc774b2367a,
        0x3eb62c017b9bdfe6,
    ];
    let h2 = z.hi * z.hi;
    let c7 = dd_fmla(f64::from_bits(Q2[11]), z.hi, f64::from_bits(Q2[10]));
    let mut c5 = dd_fmla(f64::from_bits(Q2[9]), z.hi, f64::from_bits(Q2[8]));
    c5 = dd_fmla(c7, h2, c5);
    // since ulp(c5*h^5) <= 2^-86, we still compute c5*z as double
    let z_vqh = c5 * z.hi;
    let mut q = DoubleDouble::from_exact_add(f64::from_bits(Q2[7]), z_vqh);
    // multiply by z
    q = DoubleDouble::quick_mult(q, z);
    // add coefficient of degree 3
    let t = DoubleDouble::from_exact_add(f64::from_bits(Q2[5]), q.hi);
    q.hi = t.hi;
    q.lo += t.lo + f64::from_bits(Q2[6]);
    // multiply by z and add coefficient of degree 2
    q = DoubleDouble::quick_mult(q, z);
    let t = DoubleDouble::from_exact_add(f64::from_bits(Q2[3]), q.hi);
    q.hi = t.hi;
    q.lo += t.lo + f64::from_bits(Q2[4]);
    // multiply by h+l and add coefficient of degree 1
    q = DoubleDouble::quick_mult(q, z);
    let t = DoubleDouble::from_exact_add(f64::from_bits(Q2[1]), q.hi);
    q.hi = t.hi;
    q.lo += t.lo + f64::from_bits(Q2[2]);
    // multiply by h+l and add coefficient of degree 0
    q = DoubleDouble::quick_mult(q, z);
    let t = DoubleDouble::from_exact_add(f64::from_bits(Q2[0]), q.hi);
    q.hi = t.hi;
    q.lo += t.lo;
    q
}

/* return the correct rounding of (1+x)^y or -1 if the
rounding test failed, where h+l is an approximation of y*log2(1+x).
We assume |h+l| < 150, |l| < 2^-48.445 |h|, and the relative error between
h+l and y*log2(1+x) is < 2^-91.120.
x and y are the original inputs of compound. */
fn compoundf_exp2_accurate(x_dd: DoubleDouble, x: f32, y: f32) -> f32 {
    if y == 1.0 {
        let res = 1.0 + x;
        return res;
    }
    let k = x_dd.hi.round_ties_even_finite(); // |k| <= 150
    // check easy cases h+l is tiny thus 2^(h+l) rounds to 1, 1- or 1+
    if k == 0. && x_dd.hi.abs() <= f64::from_bits(0x3e6715476af0d4c8) {
        /* the relative error between h and y*log2(1+x) is bounded by
        (1 + 2^-48.445) * (1 + 2^-91.120) - 1 < 2^-48.444.
        2^h rounds to 1 to nearest for |h| <= H0 := 0x1.715476af0d4d9p-25.
        The above threshold is such that h*(1+2^-48.444) < H0. */
        return (1.0 + x_dd.hi * 0.5) as f32;
    }
    let r = x_dd.hi - k; // |r| <= 1/2, exact
    // since r is an integer multiple of ulp(h), fast_two_sum() below is exact
    let mut v_dd = DoubleDouble::from_exact_add(r, x_dd.lo);
    let mut v = (3.015625 + v_dd.hi).to_bits(); // 2.5 <= v <= 3.5015625
    // we add 2^-6 so that i is rounded to nearest
    let i: i32 = ((v >> 46) as i32).wrapping_sub(0x10010); // 0 <= i <= 32
    // h is near (i-16)/2^5
    v_dd.hi -= f64::from_bits(COMPOUNDF_EXP2_T[i as usize]); // exact
    // now |h| <= 2^-6
    // 2^(h+l) = 2^k * exp2_U[i] * 2^(h+l)
    v_dd = DoubleDouble::from_exact_add(v_dd.hi, v_dd.lo);
    let q = compoundf_exp2_poly2(v_dd);
    /* we have 0.989 < qh < 1.011, |ql| < 2^-51.959, and
    |qh + ql - 2^(h+l)| < 2^-85.210 */
    let exp2u = DoubleDouble::from_bit_pair(COMPOUNDF_EXP2_U[i as usize]);
    let mut q = DoubleDouble::quick_mult(exp2u, q);
    q = DoubleDouble::from_exact_add(q.hi, q.lo);
    /* Total error:
    * at input we have a relative error between h+l and y*log2(1+x) bounded by
      2^-91.120: h + l = y*log2(1+x) * (1 + eps1) with |eps1| < 2^-91.120.
      Since |h+l| <= 150, this yields an absolute error bounded by
      150*2^-91.120 < 2^-83.891: h + l = y*log2(1+x) + eps2 with
      |eps2| <= 150*2^-91.120 < 2^-83.891.
    * the absolute error in q2() is bounded by 2^-85.210 and is multiplied by
      exp2_U[i] < 1.415
    * the absolute d_mul() error is bounded by 2^-102.199
    * the fast_two_sum() error is bounded by 2^-105
    All this yields an absolute error on qh+ql bounded by:
    2^-83.891 + 2^-85.210*1.415 + 2^-102.199 + 2^-105 < 2^-83.242.
    We distinguish the "small" case when at input |h+l| <= 2^-9.
    In this case k=0, i=16, thus exp2_T[i]=0, exp2_U[i]=1, and the absolute error
    in q2() is bounded by 2^-102.646, and remains unchanged since the d_mul()
    call does not change qh, ql. */
    /* Rounding test: since |ql| < ulp(qh), and the error is less than ulp(qh),
    the rounding test can fail only when the last 53-25 = 28 bits of qh
    represent a signed number in [-1,1] (when it is -2 or 2, adding ql and the
    error cannot cross a rounding boundary). */
    let mut w = q.hi.to_bits();
    if ((w.wrapping_add(1)) & 0xfffffffu64) <= 2 {
        static ERR: [u64; 2] = [0x3abb100000000000, 0x3a2d800000000000];
        let small: bool = k == 0. && i == 16 && x_dd.hi <= f64::from_bits(0x3f60000000000000);
        let err = f64::from_bits(ERR[small as usize]);
        w = (q.hi + (q.lo + err)).to_bits();
        w = unsafe { w.wrapping_add(k.to_int_unchecked::<i64>().wrapping_shl(52) as u64) };
    }
    /* multiply qh+ql by 2^k: since 0.989 < qh_in < 1.011 and
    0.707 < exp2_U[i] < 1.415, we have 0.69 < qh+ql < 1.44 */
    v = (q.hi + q.lo).to_bits();
    /* For RNDN, if qh fits exactly in 25 bits, and ql is tiny, so that qh + ql
    rounds to qh, then we might have a double-rounding issue. */
    if (w.wrapping_shl(36)) == 0 && f64::from_bits(v) == q.hi && q.lo != 0.
    {
        v = v.wrapping_add((if q.lo > 0. { 1i64 } else { -1i64 }) as u64); // simulate round to odd
    }
    v = unsafe { v.wrapping_add(k.to_int_unchecked::<i64>().wrapping_shl(52) as u64) };
    // there is no underflow/overflow in the scaling by 2^k since |k| <= 150
    f64::from_bits(v) as f32
}

// at input, exact is non-zero iff (1+x)^y is exact
// x,y=0x1.0f6f1ap+1,0x1.c643bp+5: 49 identical bits after round bit
// x,y=0x1.ef272cp+15,-0x1.746ab2p+1: 55 identical bits after round bit
// x,y=0x1.07ffcp+0,-0x1.921a8ap+4: 47 identical bits after round bit
#[cold]
#[inline(never)]
fn compoundf_accurate(x: f32, y: f32) -> f32 {
    let mut v = compoundf_log2p1_accurate(x as f64);
    /* h + l is a double-double approximation of log2(1+x), with relative error
    bounded by 2^-91.123, and |l| < 2^-48.574 |h| */
    v = DoubleDouble::quick_mult_f64(v, y as f64);
    /* h + l is a double-double approximation of y*log2(1+x).
    Since 2^-149 <= |h_in+l_in| < 128 and 2^-149 <= |y| < 2^128, we have
    2^-298 <= |h+l| < 2^135, thus no underflow/overflow in double is possible.
    The s_mul() error is bounded by ulp(l). Since |l_in| < 2^-48.574 |h_in|,
    and the intermediate variable lo in s_mul() satisfies |lo| < ulp(h), we
    have |l| < ulp(h) + |y l_in| <= ulp(h) + 2^-48.574 |y h_in|
    < (2^-52 + 2^-48.574) |h| < 2^-48.445 |h|.
    The s_mul() error is thus bounded by 2^-48.445*2^-52 = 2^-100.445 |h|.
    This yields a total relative error bounded by
    (1+2^-91.123)*(1+2^-100.445)-1 < 2^-91.120. */
    compoundf_exp2_accurate(v, x, y)
}

/// Computes compound function (1.0 + x)^y
///
/// Max ULP 0.5
#[inline]
pub fn f_compoundf(x: f32, y: f32) -> f32 {
    /* Rules from IEEE 754-2019 for compound (x, n) with n integer:
    (a) compound (x, 0) is 1 for x >= -1 or quiet NaN
    (b) compound (-1, n) is +Inf and signals the divideByZero exception for n < 0
    (c) compound (-1, n) is +0 for n > 0
    (d) compound (+/-0, n) is 1
    (e) compound (+Inf, n) is +Inf for n > 0
    (f) compound (+Inf, n) is +0 for n < 0
    (g) compound (x, n) is qNaN and signals the invalid exception for x < -1
    (h) compound (qNaN, n) is qNaN for n <> 0. */
    let mone = (-1.0f32).to_bits();
    let nx = x.to_bits();
    let ny = y.to_bits();
    if nx >= mone {
        // x <= -1
        return as_compoundf_special(x, y);
    }
    // now x > -1
    let ax: u32 = nx.wrapping_shl(1);
    let ay: u32 = ny.wrapping_shl(1);
    if ax == 0 || ax >= 0xffu32 << 24 || ay == 0 || ay >= 0xffu32 << 24 {
        // x=+-0 || x=+-inf/nan || y=+-0 || y=+-inf/nan
        return as_compoundf_special(x, y);
    }
    // evaluate (1+x)^y explicitly for integer y in [-16,16] range and |x|<2^64
    if is_integerf(y) && ay <= 0x83000000u32 && ax <= 0xbefffffeu32 {
        if ax <= 0x62000000u32 {
            return 1.0 + y * x; // does it work for |x|<2^-29 and |y|<=16?
        }
        let mut s = x as f64 + 1.;
        let mut iter_count = unsafe { y.abs().to_int_unchecked::<u32>() };
        // exponentiation by squaring: O(log(y)) complexity
        let mut acc = if iter_count % 2 != 0 { s } else { 1. };
        while { iter_count >>= 1; iter_count } != 0 {
            s = s * s;
            if iter_count % 2 != 0 {
                acc *= s;
            }
        }
        let dz = if y.is_sign_negative() { 1. / acc } else { acc };
        return dz as f32;
    }
    let xd = x as f64;
    let yd = y as f64;
    let tx = xd.to_bits();
    let ty = yd.to_bits();
    let l: f64 = compoundf_log2p1_fast(f64::from_bits(tx));
    /* l approximates log2(1+x) with relative error < 2^-47.997,
    and 2^-149 <= |l| < 128 */
    let t: u64 = (l * f64::from_bits(ty)).to_bits();
    /* since 2^-149 <= |l| < 128 and 2^-149 <= |y| < 2^128, we have
    2^-298 <= |t| < 2^135, thus no underflow/overflow in double is possible.
    The relative error is bounded by (1+2^-47.997)*(1+2^-52)-1 < 2^-47.909 */
    // detect overflow/underflow
    if (t.wrapping_shl(1)) >= (0x406u64 << 53) {
        // |t| >= 128
        if t >= 0x3018bu64 << 46 {
            // t <= -150
            return black_box(f32::from_bits(0x00800000)) * black_box(f32::from_bits(0x00800000));
        } else if (t >> 63) == 0 {
            // t >= 128: overflow
            return black_box(f32::from_bits(0x7e800000)) * black_box(f32::from_bits(0x7e800000));
        }
    }
    /* since |t| < 150, the absolute error on t is bounded by
    150*2^-47.909 < 2^-40.680 */
    // 2^t rounds to 1 to nearest when |t| <= 0x1.715476ba97f14p-25
    if (t.wrapping_shl(1)) <= (0x3e6715476ba97f14u64 << 1) {
        return if (t >> 63) != 0 {
            black_box(1.0) - black_box(f32::from_bits(0x33000000))
        } else {
            black_box(1.0) + black_box(f32::from_bits(0x33000000))
        };
    }
    let res = exp2_fast(f64::from_bits(t));
    if res != -1.0 {
        return res as f32;
    }
    compoundf_accurate(x, y)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compoundf() {
        assert_eq!(
            f_compoundf(
                0.000000000000000000000000000000000000011754944,
                -170502050000000000000000000000000000000.
            ),
            1.
        );
        assert_eq!(f_compoundf(1.235, 1.432), 3.1634824);
        assert_eq!(f_compoundf(2., 3.0), 27.);
        assert!(f_compoundf(-2., 5.0).is_nan());
        assert_eq!(f_compoundf(1., f32::INFINITY), f32::INFINITY);
        assert_eq!(f_compoundf(1., f32::NEG_INFINITY), 0.0);
    }
}
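
// The checks below are an added sketch, not part of the original test suite: they
// exercise the IEEE 754-2019 special-case rules (a)-(g) listed in the f_compoundf
// documentation, plus the exact small-integer-exponent path, using only values whose
// expected results follow directly from those rules or from exact binary arithmetic.
#[cfg(test)]
mod special_case_tests {
    use super::*;

    #[test]
    fn test_compoundf_ieee_special_cases() {
        // rule (d): compound(+/-0, n) is 1
        assert_eq!(f_compoundf(0.0, 3.0), 1.0);
        assert_eq!(f_compoundf(-0.0, -3.0), 1.0);
        // rule (a): compound(x, 0) is 1 for x >= -1
        assert_eq!(f_compoundf(0.5, 0.0), 1.0);
        // rule (b): compound(-1, n) is +Inf for n < 0
        assert_eq!(f_compoundf(-1.0, -2.0), f32::INFINITY);
        // rule (c): compound(-1, n) is +0 for n > 0
        assert_eq!(f_compoundf(-1.0, 2.0), 0.0);
        // rules (e)/(f): compound(+Inf, n)
        assert_eq!(f_compoundf(f32::INFINITY, 2.0), f32::INFINITY);
        assert_eq!(f_compoundf(f32::INFINITY, -2.0), 0.0);
        // rule (g): x < -1 yields NaN
        assert!(f_compoundf(-1.5, 2.0).is_nan());
    }

    #[test]
    fn test_compoundf_small_integer_exponents() {
        // (1+3)^2 = 16 and (1+0.5)^2 = 2.25 are exact in binary32 and go through
        // the exponentiation-by-squaring path for integer y with |y| <= 16
        assert_eq!(f_compoundf(3.0, 2.0), 16.0);
        assert_eq!(f_compoundf(0.5, 2.0), 2.25);
        // negative integer exponent: (1+1)^-3 = 0.125 exactly
        assert_eq!(f_compoundf(1.0, -3.0), 0.125);
    }
}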