/*
 * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
 * //
 * // Redistribution and use in source and binary forms, with or without modification,
 * // are permitted provided that the following conditions are met:
 * //
 * // 1.  Redistributions of source code must retain the above copyright notice, this
 * // list of conditions and the following disclaimer.
 * //
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
 * // this list of conditions and the following disclaimer in the documentation
 * // and/or other materials provided with the distribution.
 * //
 * // 3.  Neither the name of the copyright holder nor the names of its
 * // contributors may be used to endorse or promote products derived from
 * // this software without specific prior written permission.
 * //
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
use crate::common::{dd_fmla, dyad_fmla, f_fmla, is_odd_integer};
use crate::double_double::DoubleDouble;
use crate::polyeval::{f_polyeval3, f_polyeval4};
use crate::round::RoundFinite;
use crate::sin::SinCos;
use crate::sincospi_tables::SINPI_K_PI_OVER_64;

/**
Cospi(x) on [0; 0.000244140625]

Generated by Sollya:
```text
d = [0, 0.000244140625];
f_cos = cos(y*pi);
Q = fpminimax(f_cos, [|0, 2, 4, 6, 8, 10|], [|107...|], d, relative, floating);
```

See ./notes/cospi_zero_dd.sollya
**/
#[cold]
fn as_cospi_zero(x: f64) -> f64 {
    const C: [(u64, u64); 5] = [
        (0xbcb692b71366cc04, 0xc013bd3cc9be45de),
        (0xbcb32b33fb803bd5, 0x40103c1f081b5ac4),
        (0xbc9f5b752e98b088, 0xbff55d3c7e3cbff9),
        (0x3c30023d540b9350, 0x3fce1f506446cb66),
        (0x3c1a5d47937787d2, 0xbf8a9b062a36ba1c),
    ];
    let x2 = DoubleDouble::from_exact_mult(x, x);
    let mut p = DoubleDouble::quick_mul_add(
        x2,
        DoubleDouble::from_bit_pair(C[3]),
        DoubleDouble::from_bit_pair(C[3]),
    );
    p = DoubleDouble::quick_mul_add(x2, p, DoubleDouble::from_bit_pair(C[2]));
    p = DoubleDouble::quick_mul_add(x2, p, DoubleDouble::from_bit_pair(C[1]));
    p = DoubleDouble::quick_mul_add(x2, p, DoubleDouble::from_bit_pair(C[0]));
    p = DoubleDouble::mul_add_f64(x2, p, 1.);
    p.to_f64()
}

/**
Sinpi on range [0.0, 0.03515625]

Generated poly by Sollya:
```text
d = [0, 0.03515625];

f_sin = sin(y*pi)/y;
Q = fpminimax(f_sin, [|0, 2, 4, 6, 8, 10|], [|107...|], d, relative, floating);
```
See ./notes/sinpi_zero_dd.sollya
**/
#[cold]
fn as_sinpi_zero(x: f64) -> f64 {
    const C: [(u64, u64); 6] = [
        (0x3ca1a626311d9056, 0x400921fb54442d18),
        (0x3cb055f12c462211, 0xc014abbce625be53),
        (0xbc9789ea63534250, 0x400466bc6775aae1),
        (0xbc78b86de6962184, 0xbfe32d2cce62874e),
        (0x3c4eddf7cd887302, 0x3fb507833e2b781f),
        (0x3bf180c9d4af2894, 0xbf7e2ea4e143707e),
    ];
    let x2 = DoubleDouble::from_exact_mult(x, x);
    let mut p = DoubleDouble::quick_mul_add(
        x2,
        DoubleDouble::from_bit_pair(C[5]),
        DoubleDouble::from_bit_pair(C[4]),
    );
    p = DoubleDouble::quick_mul_add(x2, p, DoubleDouble::from_bit_pair(C[3]));
    p = DoubleDouble::quick_mul_add(x2, p, DoubleDouble::from_bit_pair(C[2]));
    p = DoubleDouble::quick_mul_add(x2, p, DoubleDouble::from_bit_pair(C[1]));
    p = DoubleDouble::quick_mul_add(x2, p, DoubleDouble::from_bit_pair(C[0]));
    p = DoubleDouble::quick_mult_f64(p, x);
    p.to_f64()
}

// Return k and y, where
// k = round(x * 64 / pi) and y = (x * 64 / pi) - k.
#[inline]
pub(crate) fn reduce_pi_64(x: f64) -> (f64, i64) {
    let kd = (x * 64.).round_finite();
    let y = dd_fmla(kd, -1. / 64., x);
    (y, unsafe {
        kd.to_int_unchecked::<i64>() // indeterminate values is always filtered out before this call, as well only lowest bits are used
    })
}

#[inline]
pub(crate) fn sincospi_eval(x: f64) -> SinCos {
    let x2 = x * x;
    /*
        sinpi(pi*x) poly generated by Sollya:
        d = [0, 0.0078128];
        f_sin = sin(y*pi)/y;
        Q = fpminimax(f_sin, [|0, 2, 4, 6|], [|107, D...|], d, relative, floating);
        See ./notes/sinpi.sollya
    */
    let sin_lop = f_polyeval3(
        x2,
        f64::from_bits(0xc014abbce625be4d),
        f64::from_bits(0x400466bc6767f259),
        f64::from_bits(0xbfe32d176b0b3baf),
    ) * x2;
    // We're splitting polynomial in two parts, since first term dominates
    // we compute: (a0_lo + a0_hi) * x + x * (a1 * x^2 + a2 + x^4) ...
    let sin_lo = dd_fmla(f64::from_bits(0x3ca1a5c04563817a), x, sin_lop * x);
    let sin_hi = x * f64::from_bits(0x400921fb54442d18);

    /*
       cospi(pi*x) poly generated by Sollya:
       d = [0, 0.015625];
       f_cos = cos(y*pi);
       Q = fpminimax(f_cos, [|0, 2, 4, 6, 8|], [|107, D...|], d, relative, floating);
       See ./notes/cospi.sollya
    */
    let p = f_polyeval3(
        x2,
        f64::from_bits(0xc013bd3cc9be45cf),
        f64::from_bits(0x40103c1f08085ad1),
        f64::from_bits(0xbff55d1e43463fc3),
    );

    // We're splitting polynomial in two parts, since first term dominates
    // we compute: (a0_lo + a0_hi) + (a1 * x^2 + a2 + x^4)...
    let cos_lo = dd_fmla(p, x2, f64::from_bits(0xbbdf72adefec0800));
    let cos_hi = f64::from_bits(0x3ff0000000000000);

    let err = f_fmla(
        x2,
        f64::from_bits(0x3cb0000000000000), // 2^-52
        f64::from_bits(0x3c40000000000000), // 2^-59
    );
    SinCos {
        v_sin: DoubleDouble::from_exact_add(sin_hi, sin_lo),
        v_cos: DoubleDouble::from_exact_add(cos_hi, cos_lo),
        err,
    }
}

#[inline]
pub(crate) fn sincospi_eval_dd(x: f64) -> SinCos {
    let x2 = DoubleDouble::from_exact_mult(x, x);
    // Sin coeffs
    // poly sin(pi*x) generated by Sollya:
    // d = [0, 0.0078128];
    // f_sin = sin(y*pi)/y;
    // Q = fpminimax(f_sin, [|0, 2, 4, 6, 8|], [|107...|], d, relative, floating);
    // see ./notes/sinpi_dd.sollya
    const SC: [(u64, u64); 5] = [
        (0x3ca1a626330ccf19, 0x400921fb54442d18),
        (0x3cb05540f6323de9, 0xc014abbce625be53),
        (0xbc9050fdd1229756, 0x400466bc6775aadf),
        (0xbc780d406f3472e8, 0xbfe32d2cce5a7bf1),
        (0x3c4cfcf8b6b817f2, 0x3fb5077069d8a182),
    ];

    let mut sin_y = DoubleDouble::quick_mul_add(
        x2,
        DoubleDouble::from_bit_pair(SC[4]),
        DoubleDouble::from_bit_pair(SC[3]),
    );
    sin_y = DoubleDouble::quick_mul_add(x2, sin_y, DoubleDouble::from_bit_pair(SC[2]));
    sin_y = DoubleDouble::quick_mul_add(x2, sin_y, DoubleDouble::from_bit_pair(SC[1]));
    sin_y = DoubleDouble::quick_mul_add(x2, sin_y, DoubleDouble::from_bit_pair(SC[0]));
    sin_y = DoubleDouble::quick_mult_f64(sin_y, x);

    // Cos coeffs
    // d = [0, 0.0078128];
    // f_cos = cos(y*pi);
    // Q = fpminimax(f_cos, [|0, 2, 4, 6, 8|], [|107...|], d, relative, floating);
    // See ./notes/cospi_dd.sollya
    const CC: [(u64, u64); 5] = [
        (0xbaaa70a580000000, 0x3ff0000000000000),
        (0xbcb69211d8dd1237, 0xc013bd3cc9be45de),
        (0xbcbd96cfd637eeb7, 0x40103c1f081b5abf),
        (0x3c994d75c577f029, 0xbff55d3c7e2e4ba5),
        (0xbc5c542d998a4e48, 0x3fce1f2f5f747411),
    ];

    let mut cos_y = DoubleDouble::quick_mul_add(
        x2,
        DoubleDouble::from_bit_pair(CC[4]),
        DoubleDouble::from_bit_pair(CC[3]),
    );
    cos_y = DoubleDouble::quick_mul_add(x2, cos_y, DoubleDouble::from_bit_pair(CC[2]));
    cos_y = DoubleDouble::quick_mul_add(x2, cos_y, DoubleDouble::from_bit_pair(CC[1]));
    cos_y = DoubleDouble::quick_mul_add(x2, cos_y, DoubleDouble::from_bit_pair(CC[0]));
    SinCos {
        v_sin: sin_y,
        v_cos: cos_y,
        err: 0.,
    }
}

#[cold]
fn sinpi_dd(x: f64, sin_k: DoubleDouble, cos_k: DoubleDouble) -> f64 {
    let r_sincos = sincospi_eval_dd(x);
    let cos_k_sin_y = DoubleDouble::quick_mult(cos_k, r_sincos.v_sin);
    let rr = DoubleDouble::mul_add(sin_k, r_sincos.v_cos, cos_k_sin_y);
    rr.to_f64()
}

#[cold]
fn sincospi_dd(
    x: f64,
    sin_sin_k: DoubleDouble,
    sin_cos_k: DoubleDouble,
    cos_sin_k: DoubleDouble,
    cos_cos_k: DoubleDouble,
) -> (f64, f64) {
    let r_sincos = sincospi_eval_dd(x);

    let cos_k_sin_y = DoubleDouble::quick_mult(sin_cos_k, r_sincos.v_sin);
    let rr_sin = DoubleDouble::mul_add(sin_sin_k, r_sincos.v_cos, cos_k_sin_y);

    let cos_k_sin_y = DoubleDouble::quick_mult(cos_cos_k, r_sincos.v_sin);
    let rr_cos = DoubleDouble::mul_add(cos_sin_k, r_sincos.v_cos, cos_k_sin_y);

    (rr_sin.to_f64(), rr_cos.to_f64())
}

// [sincospi_eval] gives precision around 2^-66 what is not enough for DD case this method gives 2^-84
#[inline]
fn sincospi_eval_extended(x: f64) -> SinCos {
    let x2 = DoubleDouble::from_exact_mult(x, x);
    /*
        sinpi(pi*x) poly generated by Sollya:
        d = [0, 0.0078128];
        f_sin = sin(y*pi)/y;
        Q = fpminimax(f_sin, [|0, 2, 4, 6, 8|], [|107, 107, D...|], d, relative, floating);
        See ./notes/sinpi.sollya
    */
    let sin_lop = f_polyeval3(
        x2.hi,
        f64::from_bits(0x400466bc67763662),
        f64::from_bits(0xbfe32d2cce5aad86),
        f64::from_bits(0x3fb5077099a1f35b),
    );
    let mut v_sin = DoubleDouble::mul_f64_add(
        x2,
        sin_lop,
        DoubleDouble::from_bit_pair((0x3cb0553d6ee5e8ec, 0xc014abbce625be53)),
    );
    v_sin = DoubleDouble::mul_add(
        x2,
        v_sin,
        DoubleDouble::from_bit_pair((0x3ca1a626330dd130, 0x400921fb54442d18)),
    );
    v_sin = DoubleDouble::quick_mult_f64(v_sin, x);

    /*
       cospi(pi*x) poly generated by Sollya:
       d = [0, 0.015625];
       f_cos = cos(y*pi);
       Q = fpminimax(f_cos, [|0, 2, 4, 6, 8|], [|107, 107, D...|], d, relative, floating);
       See ./notes/cospi_fast_dd.sollya
    */
    let p = f_polyeval3(
        x2.hi,
        f64::from_bits(0x40103c1f081b5abf),
        f64::from_bits(0xbff55d3c7e2edd89),
        f64::from_bits(0x3fce1f2fd9d79484),
    );

    let mut v_cos = DoubleDouble::mul_f64_add(
        x2,
        p,
        DoubleDouble::from_bit_pair((0xbcb69236a9b3ed73, 0xc013bd3cc9be45de)),
    );
    v_cos = DoubleDouble::mul_add_f64(x2, v_cos, f64::from_bits(0x3ff0000000000000));

    SinCos {
        v_sin: DoubleDouble::from_exact_add(v_sin.hi, v_sin.lo),
        v_cos: DoubleDouble::from_exact_add(v_cos.hi, v_cos.lo),
        err: 0.,
    }
}

pub(crate) fn f_fast_sinpi_dd(x: f64) -> DoubleDouble {
    let ix = x.to_bits();
    let ax = ix & 0x7fff_ffff_ffff_ffff;
    if ax == 0 {
        return DoubleDouble::new(0., 0.);
    }
    let e: i32 = (ax >> 52) as i32;
    let m0 = (ix & 0x000fffffffffffff) | (1u64 << 52);
    let sgn: i64 = (ix as i64) >> 63;
    let m = ((m0 as i64) ^ sgn).wrapping_sub(sgn);
    let mut s: i32 = 1063i32.wrapping_sub(e);
    if s < 0 {
        s = -s - 1;
        if s > 10 {
            return DoubleDouble::new(0., f64::copysign(0.0, x));
        }
        let iq: u64 = (m as u64).wrapping_shl(s as u32);
        if (iq & 2047) == 0 {
            return DoubleDouble::new(0., f64::copysign(0.0, x));
        }
    }

    if ax <= 0x3fa2000000000000u64 {
        // |x| <= 0.03515625
        const PI: DoubleDouble = DoubleDouble::new(
            f64::from_bits(0x3ca1a62633145c07),
            f64::from_bits(0x400921fb54442d18),
        );

        if ax < 0x3c90000000000000 {
            // for x near zero, sinpi(x) = pi*x + O(x^3), thus worst cases are those
            // of the function pi*x, and if x is a worst case, then 2*x is another
            // one in the next binade. For this reason, worst cases are only included
            // for the binade [2^-1022, 2^-1021). For larger binades,
            // up to [2^-54,2^-53), worst cases should be deduced by multiplying
            // by some power of 2.
            if ax < 0x0350000000000000 {
                let t = x * f64::from_bits(0x4690000000000000);
                let z = DoubleDouble::quick_mult_f64(PI, t);
                let r = z.to_f64();
                let rs = r * f64::from_bits(0x3950000000000000);
                let rt = rs * f64::from_bits(0x4690000000000000);
                return DoubleDouble::new(
                    0.,
                    dyad_fmla((z.hi - rt) + z.lo, f64::from_bits(0x3950000000000000), rs),
                );
            }
            let z = DoubleDouble::quick_mult_f64(PI, x);
            return z;
        }

        /*
           Poly generated by Sollya:
           d = [0, 0.03515625];
           f_sin = sin(y*pi)/y;
           Q = fpminimax(f_sin, [|0, 2, 4, 6, 8, 10|], [|107, 107, D...|], d, relative, floating);

           See ./notes/sinpi_zero_fast_dd.sollya
        */
        const C: [u64; 4] = [
            0xbfe32d2cce62bd85,
            0x3fb50783487eb73d,
            0xbf7e3074f120ad1f,
            0x3f3e8d9011340e5a,
        ];

        let x2 = DoubleDouble::from_exact_mult(x, x);

        const C_PI: DoubleDouble =
            DoubleDouble::from_bit_pair((0x3ca1a626331457a4, 0x400921fb54442d18));

        let p = f_polyeval4(
            x2.hi,
            f64::from_bits(C[0]),
            f64::from_bits(C[1]),
            f64::from_bits(C[2]),
            f64::from_bits(C[3]),
        );
        let mut r = DoubleDouble::mul_f64_add(
            x2,
            p,
            DoubleDouble::from_bit_pair((0xbc96dd7ae221e58c, 0x400466bc6775aae2)),
        );
        r = DoubleDouble::mul_add(
            x2,
            r,
            DoubleDouble::from_bit_pair((0x3cb05511c8a6c478, 0xc014abbce625be53)),
        );
        r = DoubleDouble::mul_add(r, x2, C_PI);
        r = DoubleDouble::quick_mult_f64(r, x);
        let k = DoubleDouble::from_exact_add(r.hi, r.lo);
        return k;
    }

    let si = e.wrapping_sub(1011);
    if si >= 0 && (m0.wrapping_shl(si.wrapping_add(1) as u32)) == 0 {
        // x is integer or half-integer
        if (m0.wrapping_shl(si as u32)) == 0 {
            return DoubleDouble::new(0., f64::copysign(0.0, x)); // x is integer
        }
        let t = (m0.wrapping_shl((si - 1) as u32)) >> 63;
        // t = 0 if |x| = 1/2 mod 2, t = 1 if |x| = 3/2 mod 2
        return DoubleDouble::new(
            0.,
            if t == 0 {
                f64::copysign(1.0, x)
            } else {
                -f64::copysign(1.0, x)
            },
        );
    }

    let (y, k) = reduce_pi_64(x);

    // // cos(k * pi/64) = sin(k * pi/64 + pi/2) = sin((k + 32) * pi/64).
    let sin_k = DoubleDouble::from_bit_pair(SINPI_K_PI_OVER_64[((k as u64) & 127) as usize]);
    let cos_k = DoubleDouble::from_bit_pair(
        SINPI_K_PI_OVER_64[((k as u64).wrapping_add(32) & 127) as usize],
    );

    let r_sincos = sincospi_eval_extended(y);

    let sin_k_cos_y = DoubleDouble::quick_mult(sin_k, r_sincos.v_cos);
    let cos_k_sin_y = DoubleDouble::quick_mult(cos_k, r_sincos.v_sin);

    // sin_k_cos_y is always >> cos_k_sin_y
    let mut rr = DoubleDouble::from_exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi);
    rr.lo += sin_k_cos_y.lo + cos_k_sin_y.lo;
    DoubleDouble::from_exact_add(rr.hi, rr.lo)
}

/// Computes sin(PI*x)
///
/// Max ULP 0.5
pub fn f_sinpi(x: f64) -> f64 {
    let ix = x.to_bits();
    let ax = ix & 0x7fff_ffff_ffff_ffff;
    if ax == 0 {
        return x;
    }
    let e: i32 = (ax >> 52) as i32;
    let m0 = (ix & 0x000fffffffffffff) | (1u64 << 52);
    let sgn: i64 = (ix as i64) >> 63;
    let m = ((m0 as i64) ^ sgn).wrapping_sub(sgn);
    let mut s: i32 = 1063i32.wrapping_sub(e);
    if s < 0 {
        if e == 0x7ff {
            if (ix << 12) == 0 {
                return f64::NAN;
            }
            return x + x; // case x=NaN
        }
        s = -s - 1;
        if s > 10 {
            return f64::copysign(0.0, x);
        }
        let iq: u64 = (m as u64).wrapping_shl(s as u32);
        if (iq & 2047) == 0 {
            return f64::copysign(0.0, x);
        }
    }

    if ax <= 0x3fa2000000000000u64 {
        // |x| <= 0.03515625
        const PI: DoubleDouble = DoubleDouble::new(
            f64::from_bits(0x3ca1a62633145c07),
            f64::from_bits(0x400921fb54442d18),
        );

        if ax < 0x3c90000000000000 {
            // for x near zero, sinpi(x) = pi*x + O(x^3), thus worst cases are those
            // of the function pi*x, and if x is a worst case, then 2*x is another
            // one in the next binade. For this reason, worst cases are only included
            // for the binade [2^-1022, 2^-1021). For larger binades,
            // up to [2^-54,2^-53), worst cases should be deduced by multiplying
            // by some power of 2.
            if ax < 0x0350000000000000 {
                let t = x * f64::from_bits(0x4690000000000000);
                let z = DoubleDouble::quick_mult_f64(PI, t);
                let r = z.to_f64();
                let rs = r * f64::from_bits(0x3950000000000000);
                let rt = rs * f64::from_bits(0x4690000000000000);
                return dyad_fmla((z.hi - rt) + z.lo, f64::from_bits(0x3950000000000000), rs);
            }
            let z = DoubleDouble::quick_mult_f64(PI, x);
            return z.to_f64();
        }

        /*
           Poly generated by Sollya:
           d = [0, 0.03515625];
           f_sin = sin(y*pi)/y;
           Q = fpminimax(f_sin, [|0, 2, 4, 6, 8, 10|], [|107, D...|], d, relative, floating);

           See ./notes/sinpi_zero.sollya
        */

        let x2 = x * x;
        let x3 = x2 * x;
        let x4 = x2 * x2;

        let eps = x * f_fmla(
            x2,
            f64::from_bits(0x3d00000000000000), // 2^-47
            f64::from_bits(0x3bd0000000000000), // 2^-66
        );

        const C: [u64; 4] = [
            0xc014abbce625be51,
            0x400466bc67754b46,
            0xbfe32d2cc12a51f4,
            0x3fb5060540058476,
        ];

        const C_PI: DoubleDouble =
            DoubleDouble::from_bit_pair((0x3ca1a67088eb1a46, 0x400921fb54442d18));

        let mut z = DoubleDouble::quick_mult_f64(C_PI, x);

        let zl0 = f_fmla(x2, f64::from_bits(C[1]), f64::from_bits(C[0]));
        let zl1 = f_fmla(x2, f64::from_bits(C[3]), f64::from_bits(C[2]));

        z.lo = f_fmla(x3, f_fmla(x4, zl1, zl0), z.lo);
        let lb = z.hi + (z.lo - eps);
        let ub = z.hi + (z.lo + eps);
        if lb == ub {
            return lb;
        }
        return as_sinpi_zero(x);
    }

    let si = e.wrapping_sub(1011);
    if si >= 0 && (m0.wrapping_shl(si.wrapping_add(1) as u32)) == 0 {
        // x is integer or half-integer
        if (m0.wrapping_shl(si as u32)) == 0 {
            return f64::copysign(0.0, x); // x is integer
        }
        let t = (m0.wrapping_shl((si - 1) as u32)) >> 63;
        // t = 0 if |x| = 1/2 mod 2, t = 1 if |x| = 3/2 mod 2
        return if t == 0 {
            f64::copysign(1.0, x)
        } else {
            -f64::copysign(1.0, x)
        };
    }

    let (y, k) = reduce_pi_64(x);

    // cos(k * pi/64) = sin(k * pi/64 + pi/2) = sin((k + 32) * pi/64).
    let sin_k = DoubleDouble::from_bit_pair(SINPI_K_PI_OVER_64[((k as u64) & 127) as usize]);
    let cos_k = DoubleDouble::from_bit_pair(
        SINPI_K_PI_OVER_64[((k as u64).wrapping_add(32) & 127) as usize],
    );

    let r_sincos = sincospi_eval(y);

    let sin_k_cos_y = DoubleDouble::quick_mult(sin_k, r_sincos.v_cos);
    let cos_k_sin_y = DoubleDouble::quick_mult(cos_k, r_sincos.v_sin);

    // sin_k_cos_y is always >> cos_k_sin_y
    let mut rr = DoubleDouble::from_exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi);
    rr.lo += sin_k_cos_y.lo + cos_k_sin_y.lo;

    let ub = rr.hi + (rr.lo + r_sincos.err); // (rr.lo + ERR);
    let lb = rr.hi + (rr.lo - r_sincos.err); // (rr.lo - ERR);

    if ub == lb {
        return rr.to_f64();
    }
    sinpi_dd(y, sin_k, cos_k)
}

/// Computes cos(PI*x)
///
/// Max found ULP 0.5
pub fn f_cospi(x: f64) -> f64 {
    let ix = x.to_bits();
    let ax = ix & 0x7fff_ffff_ffff_ffff;
    if ax == 0 {
        return 1.0;
    }
    let e: i32 = (ax >> 52) as i32;
    // e is the unbiased exponent, we have 2^(e-1023) <= |x| < 2^(e-1022)
    let m: i64 = ((ix & 0x000fffffffffffff) | (1u64 << 52)) as i64;
    let mut s = 1063i32.wrapping_sub(e); // 2^(40-s) <= |x| < 2^(41-s)
    if s < 0 {
        // |x| >= 2^41
        if e == 0x7ff {
            // NaN or Inf
            if ix.wrapping_shl(12) == 0 {
                return f64::NAN;
            }
            return x + x; // NaN
        }
        s = -s - 1; // now 2^(41+s) <= |x| < 2^(42+s)
        if s > 11 {
            return 1.0;
        } // |x| >= 2^53
        let iq: u64 = (m as u64).wrapping_shl(s as u32).wrapping_add(1024);
        if (iq & 2047) == 0 {
            return 0.0;
        }
    }
    if ax <= 0x3f30000000000000u64 {
        // |x| <= 2^-12, |x| <= 0.000244140625
        if ax <= 0x3e2ccf6429be6621u64 {
            return 1.0 - f64::from_bits(0x3c80000000000000);
        }
        let x2 = x * x;
        let x4 = x2 * x2;
        let eps = x2 * f64::from_bits(0x3cfa000000000000);

        /*
            Generated by Sollya:
            d = [0, 0.000244140625];
            f_cos = cos(y*pi);
            Q = fpminimax(f_cos, [|0, 2, 4, 6, 8|], [|107, 107, D...|], d, relative, floating);

            See ./notes/cospi.sollya
        */

        const C: [u64; 4] = [
            0xc013bd3cc9be45de,
            0x40103c1f081b5ac4,
            0xbff55d3c7ff79b60,
            0x3fd24c7b6f7d0690,
        ];

        let p0 = f_fmla(x2, f64::from_bits(C[3]), f64::from_bits(C[2]));
        let p1 = f_fmla(x2, f64::from_bits(C[1]), f64::from_bits(C[0]));

        let p = x2 * f_fmla(x4, p0, p1);
        let lb = (p - eps) + 1.;
        let ub = (p + eps) + 1.;
        if lb == ub {
            return lb;
        }
        return as_cospi_zero(x);
    }

    let si: i32 = e.wrapping_sub(1011);
    if si >= 0 && ((m as u64).wrapping_shl(si as u32) ^ 0x8000000000000000u64) == 0 {
        return 0.0;
    }

    let (y, k) = reduce_pi_64(x);

    // cos(k * pi/64) = sin(k * pi/64 + pi/2) = sin((k + 32) * pi/64).
    let msin_k = DoubleDouble::from_bit_pair(
        SINPI_K_PI_OVER_64[((k as u64).wrapping_add(64) & 127) as usize],
    );
    let cos_k = DoubleDouble::from_bit_pair(
        SINPI_K_PI_OVER_64[((k as u64).wrapping_add(32) & 127) as usize],
    );

    let r_sincos = sincospi_eval(y);

    let cos_k_cos_y = DoubleDouble::quick_mult(r_sincos.v_cos, cos_k);
    let cos_k_msin_y = DoubleDouble::quick_mult(r_sincos.v_sin, msin_k);

    // cos_k_cos_y is always >> cos_k_msin_y
    let mut rr = DoubleDouble::from_exact_add(cos_k_cos_y.hi, cos_k_msin_y.hi);
    rr.lo += cos_k_cos_y.lo + cos_k_msin_y.lo;

    let ub = rr.hi + (rr.lo + r_sincos.err); // (rr.lo + ERR);
    let lb = rr.hi + (rr.lo - r_sincos.err); // (rr.lo - ERR);

    if ub == lb {
        return rr.to_f64();
    }
    sinpi_dd(y, cos_k, msin_k)
}

/// Computes sin(PI*x) and cos(PI*x)
///
/// Max found ULP 0.5
pub fn f_sincospi(x: f64) -> (f64, f64) {
    let ix = x.to_bits();
    let ax = ix & 0x7fff_ffff_ffff_ffff;
    if ax == 0 {
        return (x, 1.0);
    }
    let e: i32 = (ax >> 52) as i32;
    // e is the unbiased exponent, we have 2^(e-1023) <= |x| < 2^(e-1022)
    let m0 = (ix & 0x000fffffffffffff) | (1u64 << 52);
    let m: i64 = ((ix & 0x000fffffffffffff) | (1u64 << 52)) as i64;
    let mut s = 1063i32.wrapping_sub(e); // 2^(40-s) <= |x| < 2^(41-s)
    if s < 0 {
        // |x| >= 2^41
        if e == 0x7ff {
            // NaN or Inf
            if ix.wrapping_shl(12) == 0 {
                return (f64::NAN, f64::NAN);
            }
            return (x + x, x + x); // NaN
        }
        s = -s - 1;
        if s > 10 {
            static CF: [f64; 2] = [1., -1.];
            let is_odd = is_odd_integer(f64::from_bits(ax));
            let cos_x = CF[is_odd as usize];
            return (f64::copysign(0.0, x), cos_x);
        } // |x| >= 2^53
        let iq: u64 = (m as u64).wrapping_shl(s as u32);

        // sinpi = 0 when multiple of 2048
        let sin_zero = (iq & 2047) == 0;

        // cospi = 0 when offset-by-half multiple of 2048
        let cos_zero = ((m as u64).wrapping_shl(s as u32).wrapping_add(1024) & 2047) == 0;

        if sin_zero && cos_zero {
            // both zero (only possible if NaN or something degenerate)
        } else if sin_zero {
            static CF: [f64; 2] = [1., -1.];
            let is_odd = is_odd_integer(f64::from_bits(ax));
            let cos_x = CF[is_odd as usize];
            return (0.0, cos_x); // sin = 0, cos = ±1
        } else if cos_zero {
            // x = k / 2 * PI
            let si = e.wrapping_sub(1011);
            let t = (m0.wrapping_shl((si - 1) as u32)) >> 63;
            // making sin decision based on quadrant
            return if t == 0 {
                (f64::copysign(1.0, x), 0.0)
            } else {
                (-f64::copysign(1.0, x), 0.0)
            }; // sin = ±1, cos = 0
        }
    }

    if ax <= 0x3f30000000000000u64 {
        // |x| <= 2^-12, |x| <= 0.000244140625
        if ax <= 0x3c90000000000000u64 {
            const PI: DoubleDouble = DoubleDouble::new(
                f64::from_bits(0x3ca1a62633145c07),
                f64::from_bits(0x400921fb54442d18),
            );
            let sin_x = if ax < 0x0350000000000000 {
                let t = x * f64::from_bits(0x4690000000000000);
                let z = DoubleDouble::quick_mult_f64(PI, t);
                let r = z.to_f64();
                let rs = r * f64::from_bits(0x3950000000000000);
                let rt = rs * f64::from_bits(0x4690000000000000);
                dyad_fmla((z.hi - rt) + z.lo, f64::from_bits(0x3950000000000000), rs)
            } else {
                let z = DoubleDouble::quick_mult_f64(PI, x);
                z.to_f64()
            };
            return (sin_x, 1.0 - f64::from_bits(0x3c80000000000000));
        }
        let x2 = x * x;
        let x4 = x2 * x2;
        let cos_eps = x2 * f64::from_bits(0x3cfa000000000000);

        /*
            Generated by Sollya:
            d = [0, 0.000244140625];
            f_cos = cos(y*pi);
            Q = fpminimax(f_cos, [|0, 2, 4, 6, 8|], [|107, 107, D...|], d, relative, floating);

            See ./notes/cospi.sollya
        */

        const COS_C: [u64; 4] = [
            0xc013bd3cc9be45de,
            0x40103c1f081b5ac4,
            0xbff55d3c7ff79b60,
            0x3fd24c7b6f7d0690,
        ];

        let p0 = f_fmla(x2, f64::from_bits(COS_C[3]), f64::from_bits(COS_C[2]));
        let p1 = f_fmla(x2, f64::from_bits(COS_C[1]), f64::from_bits(COS_C[0]));

        let p = x2 * f_fmla(x4, p0, p1);
        let cos_lb = (p - cos_eps) + 1.;
        let cos_ub = (p + cos_eps) + 1.;
        let cos_x = if cos_lb == cos_ub {
            cos_lb
        } else {
            as_cospi_zero(x)
        };

        /*
            Poly generated by Sollya:
            d = [0, 0.03515625];
            f_sin = sin(y*pi)/y;
            Q = fpminimax(f_sin, [|0, 2, 4, 6, 8, 10|], [|107, D...|], d, relative, floating);

            See ./notes/sinpi_zero.sollya
        */

        const SIN_C: [u64; 4] = [
            0xc014abbce625be51,
            0x400466bc67754b46,
            0xbfe32d2cc12a51f4,
            0x3fb5060540058476,
        ];

        const C_PI: DoubleDouble =
            DoubleDouble::from_bit_pair((0x3ca1a67088eb1a46, 0x400921fb54442d18));

        let mut z = DoubleDouble::quick_mult_f64(C_PI, x);

        let x3 = x2 * x;

        let zl0 = f_fmla(x2, f64::from_bits(SIN_C[1]), f64::from_bits(SIN_C[0]));
        let zl1 = f_fmla(x2, f64::from_bits(SIN_C[3]), f64::from_bits(SIN_C[2]));

        let sin_eps = x * f_fmla(
            x2,
            f64::from_bits(0x3d00000000000000), // 2^-47
            f64::from_bits(0x3bd0000000000000), // 2^-66
        );

        z.lo = f_fmla(x3, f_fmla(x4, zl1, zl0), z.lo);
        let sin_lb = z.hi + (z.lo - sin_eps);
        let sin_ub = z.hi + (z.lo + sin_eps);
        let sin_x = if sin_lb == sin_ub {
            sin_lb
        } else {
            as_sinpi_zero(x)
        };
        return (sin_x, cos_x);
    }

    let si = e.wrapping_sub(1011);
    if si >= 0 && (m0.wrapping_shl(si.wrapping_add(1) as u32)) == 0 {
        // x is integer or half-integer
        if (m0.wrapping_shl(si as u32)) == 0 {
            static CF: [f64; 2] = [1., -1.];
            let is_odd = is_odd_integer(f64::from_bits(ax));
            let cos_x = CF[is_odd as usize];
            return (f64::copysign(0.0, x), cos_x); // x is integer
        }
        // x is half-integer
        let t = (m0.wrapping_shl((si - 1) as u32)) >> 63;
        // t = 0 if |x| = 1/2 mod 2, t = 1 if |x| = 3/2 mod 2
        return if t == 0 {
            (f64::copysign(1.0, x), 0.0)
        } else {
            (-f64::copysign(1.0, x), 0.0)
        };
    }

    let (y, k) = reduce_pi_64(x);

    // cos(k * pi/64) = sin(k * pi/64 + pi/2) = sin((k + 32) * pi/64).
    let sin_k = DoubleDouble::from_bit_pair(SINPI_K_PI_OVER_64[((k as u64) & 127) as usize]);
    let cos_k = DoubleDouble::from_bit_pair(
        SINPI_K_PI_OVER_64[((k as u64).wrapping_add(32) & 127) as usize],
    );
    let msin_k = -sin_k;

    let r_sincos = sincospi_eval(y);

    let sin_k_cos_y = DoubleDouble::quick_mult(sin_k, r_sincos.v_cos);
    let cos_k_sin_y = DoubleDouble::quick_mult(cos_k, r_sincos.v_sin);

    let cos_k_cos_y = DoubleDouble::quick_mult(r_sincos.v_cos, cos_k);
    let msin_k_sin_y = DoubleDouble::quick_mult(r_sincos.v_sin, msin_k);

    // sin_k_cos_y is always >> cos_k_sin_y
    let mut rr_sin = DoubleDouble::from_exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi);
    rr_sin.lo += sin_k_cos_y.lo + cos_k_sin_y.lo;

    let sin_ub = rr_sin.hi + (rr_sin.lo + r_sincos.err); // (rr.lo + ERR);
    let sin_lb = rr_sin.hi + (rr_sin.lo - r_sincos.err); // (rr.lo - ERR);

    let mut rr_cos = DoubleDouble::from_exact_add(cos_k_cos_y.hi, msin_k_sin_y.hi);
    rr_cos.lo += cos_k_cos_y.lo + msin_k_sin_y.lo;

    let cos_ub = rr_cos.hi + (rr_cos.lo + r_sincos.err); // (rr.lo + ERR);
    let cos_lb = rr_cos.hi + (rr_cos.lo - r_sincos.err); // (rr.lo - ERR);

    if sin_ub == sin_lb && cos_lb == cos_ub {
        return (rr_sin.to_f64(), rr_cos.to_f64());
    }

    sincospi_dd(y, sin_k, cos_k, cos_k, msin_k)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sinpi() {
        assert_eq!(f_sinpi(262143.50006870925), -0.9999999767029883);
        assert_eq!(f_sinpi(7124076477593855.), 0.);
        assert_eq!(f_sinpi(-11235582092889474000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.), -0.);
        assert_eq!(f_sinpi(-2.7430620343968443e303), -0.0);
        assert_eq!(f_sinpi(0.00003195557007273919), 0.00010039138401316004);
        assert_eq!(f_sinpi(-0.038357843137253766), -0.12021328061499763);
        assert_eq!(f_sinpi(1.0156097449358867), -0.04901980680173724);
        assert_eq!(f_sinpi(74.8593852519989), 0.42752597787896457);
        assert_eq!(f_sinpi(0.500091552734375), 0.9999999586369661);
        assert_eq!(f_sinpi(0.5307886532952182), 0.9953257438106751);
        assert_eq!(f_sinpi(3.1415926535897936), -0.43030121700009316);
        assert_eq!(f_sinpi(-0.5305172747685276), -0.9954077178320563);
        assert_eq!(f_sinpi(-0.03723630312089732), -0.1167146713267927);
        assert_eq!(
            f_sinpi(0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000022946074000077123),
            0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007208721750737005
        );
        assert_eq!(
            f_sinpi(0.000000000000000000000000000000000000007413093439574428),
            2.3288919890141717e-38
        );
        assert_eq!(f_sinpi(0.0031909299901270445), 0.0100244343161398578);
        assert_eq!(f_sinpi(0.11909245901270445), 0.36547215190661003);
        assert_eq!(f_sinpi(0.99909245901270445), 0.0028511202357662186);
        assert!(f_sinpi(f64::INFINITY).is_nan());
        assert!(f_sinpi(f64::NEG_INFINITY).is_nan());
        assert!(f_sinpi(f64::NAN).is_nan());
    }

    #[test]
    fn test_sincospi() {
        let v0 = f_sincospi(1.0156097449358867);
        assert_eq!(v0.0, f_sinpi(1.0156097449358867));
        assert_eq!(v0.1, f_cospi(1.0156097449358867));

        let v1 = f_sincospi(4503599627370496.);
        assert_eq!(v1.0, f_sinpi(4503599627370496.));
        assert_eq!(v1.1, f_cospi(4503599627370496.));

        let v1 = f_sincospi(-108.);
        assert_eq!(v1.0, f_sinpi(-108.));
        assert_eq!(v1.1, f_cospi(-108.));

        let v1 = f_sincospi(3.);
        assert_eq!(v1.0, f_sinpi(3.));
        assert_eq!(v1.1, f_cospi(3.));

        let v1 = f_sincospi(13.5);
        assert_eq!(v1.0, f_sinpi(13.5));
        assert_eq!(v1.1, f_cospi(13.5));

        let v1 = f_sincospi(7124076477593855.);
        assert_eq!(v1.0, f_sinpi(7124076477593855.));
        assert_eq!(v1.1, f_cospi(7124076477593855.));

        let v1 = f_sincospi(2533419148247186.5);
        assert_eq!(v1.0, f_sinpi(2533419148247186.5));
        assert_eq!(v1.1, f_cospi(2533419148247186.5));

        let v1 = f_sincospi(2.2250653705240375E-308);
        assert_eq!(v1.0, f_sinpi(2.2250653705240375E-308));
        assert_eq!(v1.1, f_cospi(2.2250653705240375E-308));

        let v1 = f_sincospi(2533420818956351.);
        assert_eq!(v1.0, f_sinpi(2533420818956351.));
        assert_eq!(v1.1, f_cospi(2533420818956351.));

        let v1 = f_sincospi(2533822406803233.5);
        assert_eq!(v1.0, f_sinpi(2533822406803233.5));
        assert_eq!(v1.1, f_cospi(2533822406803233.5));

        let v1 = f_sincospi(-3040685725640478.5);
        assert_eq!(v1.0, f_sinpi(-3040685725640478.5));
        assert_eq!(v1.1, f_cospi(-3040685725640478.5));

        let v1 = f_sincospi(2533419148247186.5);
        assert_eq!(v1.0, f_sinpi(2533419148247186.5));
        assert_eq!(v1.1, f_cospi(2533419148247186.5));

        let v1 = f_sincospi(2533420819267583.5);
        assert_eq!(v1.0, f_sinpi(2533420819267583.5));
        assert_eq!(v1.1, f_cospi(2533420819267583.5));

        let v1 = f_sincospi(6979704728846336.);
        assert_eq!(v1.0, f_sinpi(6979704728846336.));
        assert_eq!(v1.1, f_cospi(6979704728846336.));

        let v1 = f_sincospi(7124076477593855.);
        assert_eq!(v1.0, f_sinpi(7124076477593855.));
        assert_eq!(v1.1, f_cospi(7124076477593855.));

        let v1 = f_sincospi(-0.00000000002728839192371484);
        assert_eq!(v1.0, f_sinpi(-0.00000000002728839192371484));
        assert_eq!(v1.1, f_cospi(-0.00000000002728839192371484));

        let v1 = f_sincospi(0.00002465398569495569);
        assert_eq!(v1.0, f_sinpi(0.00002465398569495569));
        assert_eq!(v1.1, f_cospi(0.00002465398569495569));
    }

    #[test]
    fn test_cospi() {
        assert_eq!(0.9999497540959953, f_cospi(0.0031909299901270445));
        assert_eq!(0.9308216542079669, f_cospi(0.11909299901270445));
        assert_eq!(-0.1536194873288318, f_cospi(0.54909299901270445));
        assert!(f_cospi(f64::INFINITY).is_nan());
        assert!(f_cospi(f64::NEG_INFINITY).is_nan());
        assert!(f_cospi(f64::NAN).is_nan());
    }
}