//! A simple big-integer type for slow path algorithms.
//!
//! This includes a minimal stackvector for use in big-integer arithmetic.

#![doc(hidden)]

#[cfg(feature = "alloc")]
use crate::heapvec::HeapVec;
use crate::num::Float;
#[cfg(not(feature = "alloc"))]
use crate::stackvec::StackVec;
#[cfg(not(feature = "compact"))]
use crate::table::{LARGE_POW5, LARGE_POW5_STEP};
use core::{cmp, ops, ptr};

/// Number of bits in a Bigint.
///
/// This needs to be at least the number of bits required to store
/// a Bigint, which is `log2(radix**digits)`.
/// ≅ 3600 for base-10, rounded-up.
pub const BIGINT_BITS: usize = 4000;

/// The number of limbs for the bigint.
pub const BIGINT_LIMBS: usize = BIGINT_BITS / LIMB_BITS;
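
// Sanity-check sketch (added for illustration, not part of the original file):
// 4000 bits comfortably covers the worst case quoted above, since 1091 decimal
// digits need about `ceil(1091 * log2(10)) ≈ 3625` bits. The constant 3322
// below is `log2(10) * 1000`, rounded up, to keep the check in integers.
#[cfg(test)]
mod bigint_bits_sketch {
    use super::BIGINT_BITS;

    #[test]
    fn bigint_bits_covers_decimal_worst_case() {
        // 1091 * log2(10) <= 1091 * 3.322 = 3624.3, so 3625 bits suffice.
        assert!(BIGINT_BITS * 1000 >= 1091 * 3322);
    }
}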

#[cfg(feature = "alloc")]
pub type VecType = HeapVec;

#[cfg(not(feature = "alloc"))]
pub type VecType = StackVec;

/// Storage for a big integer type.
///
/// This is used for algorithms when we have a finite number of digits.
/// Specifically, it stores all the significant digits scaled to the
/// proper exponent, as an integral type, and then directly compares
/// these digits.
///
/// This requires us to store the number of significant bits, plus the
/// number of exponent bits (required) since we scale everything
/// to the same exponent.
#[derive(Clone, PartialEq, Eq)]
pub struct Bigint {
    /// Significant digits for the float, stored in a big integer in LE order.
    ///
    /// This is pretty much the same number of digits for any radix, since the
    /// significant digits balance out the zeros from the exponent:
    /// 1. Decimal is 1091 digits, 767 mantissa digits + 324 exponent zeros.
    /// 2. Base 6 is 1097 digits, or 680 mantissa digits + 417 exponent zeros.
    /// 3. Base 36 is 1086 digits, or 877 mantissa digits + 209 exponent zeros.
    ///
    /// However, the number of bytes required is larger for large radixes:
    /// for decimal, we need `log2(10**1091) ≅ 3600`, while for base 36
    /// we need `log2(36**1086) ≅ 5600`. Since we use uninitialized data,
    /// we avoid a major performance hit from the large buffer size.
    pub data: VecType,
}

#[allow(clippy::new_without_default)]
impl Bigint {
    /// Construct a bigint representing 0.
    #[inline(always)]
    pub fn new() -> Self {
        Self {
            data: VecType::new(),
        }
    }

    /// Construct a bigint from an integer.
    #[inline(always)]
    pub fn from_u64(value: u64) -> Self {
        Self {
            data: VecType::from_u64(value),
        }
    }

    #[inline(always)]
    pub fn hi64(&self) -> (u64, bool) {
        self.data.hi64()
    }

    /// Multiply and assign as if by exponentiation by a power.
    #[inline]
    pub fn pow(&mut self, base: u32, exp: u32) -> Option<()> {
        debug_assert!(base == 2 || base == 5 || base == 10);
        if base % 5 == 0 {
            pow(&mut self.data, exp)?;
        }
        if base % 2 == 0 {
            shl(&mut self.data, exp as usize)?;
        }
        Some(())
    }

    /// Calculate the bit-length of the big-integer.
    #[inline]
    pub fn bit_length(&self) -> u32 {
        bit_length(&self.data)
    }
}
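
// Illustrative sketch (not in the original file): `pow` factors the base into
// its 2s and 5s, so `pow(10, e)` is "multiply by 5^e, then shift left by e".
// The test below only relies on items defined in this module.
#[cfg(test)]
mod bigint_pow_sketch {
    use super::Bigint;

    #[test]
    fn pow_10_matches_plain_multiplication() {
        let mut x = Bigint::from_u64(3);
        x.pow(10, 5).unwrap();
        // 3 * 10^5 == 300_000.
        assert!(x == Bigint::from_u64(300_000));
    }
}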

impl ops::MulAssign<&Bigint> for Bigint {
    fn mul_assign(&mut self, rhs: &Bigint) {
        self.data *= &rhs.data;
    }
}

// REVERSE VIEW
// ------------

/// Reverse, immutable view of a sequence.
pub struct ReverseView<'a, T: 'a> {
    inner: &'a [T],
}

impl<'a, T> ops::Index<usize> for ReverseView<'a, T> {
    type Output = T;

    #[inline]
    fn index(&self, index: usize) -> &T {
        let len = self.inner.len();
        &(*self.inner)[len - index - 1]
    }
}

/// Create a reverse view of the vector for indexing.
#[inline]
pub fn rview(x: &[Limb]) -> ReverseView<Limb> {
    ReverseView {
        inner: x,
    }
}
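
// Illustrative sketch (not in the original file): `rview` indexes from the
// most-significant limb, so index 0 is the *last* element of the slice.
#[cfg(test)]
mod rview_sketch {
    use super::{rview, Limb};

    #[test]
    fn rview_indexes_from_the_back() {
        let x: [Limb; 3] = [1, 2, 3];
        let rv = rview(&x);
        assert!(rv[0] == 3 && rv[1] == 2 && rv[2] == 1);
    }
}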

// COMPARE
// -------

/// Compare `x` to `y`, in little-endian order.
#[inline]
pub fn compare(x: &[Limb], y: &[Limb]) -> cmp::Ordering {
    match x.len().cmp(&y.len()) {
        cmp::Ordering::Equal => {
            let iter = x.iter().rev().zip(y.iter().rev());
            for (&xi, yi) in iter {
                match xi.cmp(yi) {
                    cmp::Ordering::Equal => (),
                    ord => return ord,
                }
            }
            // Equal case.
            cmp::Ordering::Equal
        },
        ord => ord,
    }
}
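
// Illustrative sketch (not in the original file): limbs are stored in
// little-endian order, so `compare` checks lengths first and then walks the
// limbs from most-significant to least-significant.
#[cfg(test)]
mod compare_sketch {
    use super::{compare, Limb};
    use core::cmp::Ordering;

    #[test]
    fn compare_is_little_endian() {
        let small: [Limb; 2] = [9, 1]; // 1 * 2^LIMB_BITS + 9
        let large: [Limb; 2] = [0, 2]; // 2 * 2^LIMB_BITS
        assert!(compare(&small, &large) == Ordering::Less);
        assert!(compare(&large, &small) == Ordering::Greater);
        assert!(compare(&small, &small) == Ordering::Equal);
    }
}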

// NORMALIZE
// ---------

/// Normalize the integer, so any leading zero values are removed.
#[inline]
pub fn normalize(x: &mut VecType) {
    // We don't care if this wraps: the index is bounds-checked.
    while let Some(&value) = x.get(x.len().wrapping_sub(1)) {
        if value == 0 {
            unsafe { x.set_len(x.len() - 1) };
        } else {
            break;
        }
    }
}

/// Get if the big integer is normalized.
#[inline]
#[allow(clippy::match_like_matches_macro)]
pub fn is_normalized(x: &[Limb]) -> bool {
    // We don't care if this wraps: the index is bounds-checked.
    match x.get(x.len().wrapping_sub(1)) {
        Some(&0) => false,
        _ => true,
    }
}
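
// Illustrative sketch (not in the original file): trailing zero limbs (the
// most-significant positions in this little-endian layout) are what
// `normalize` strips.
#[cfg(test)]
mod normalize_sketch {
    use super::{is_normalized, normalize, VecType};

    #[test]
    fn normalize_strips_high_zero_limbs() {
        let mut x = VecType::new();
        x.try_push(5).unwrap();
        x.try_push(0).unwrap();
        assert!(!is_normalized(&x));
        normalize(&mut x);
        assert!(is_normalized(&x) && x.len() == 1);
    }
}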

// FROM
// ----

/// Create StackVec from u64 value.
#[inline(always)]
#[allow(clippy::branches_sharing_code)]
pub fn from_u64(x: u64) -> VecType {
    let mut vec = VecType::new();
    debug_assert!(vec.capacity() >= 2);
    if LIMB_BITS == 32 {
        vec.try_push(x as Limb).unwrap();
        vec.try_push((x >> 32) as Limb).unwrap();
    } else {
        vec.try_push(x as Limb).unwrap();
    }
    vec.normalize();
    vec
}

// HI
// --

/// Check if any of the remaining bits are non-zero.
///
/// # Safety
///
/// Safe as long as `rindex <= x.len()`.
#[inline]
pub fn nonzero(x: &[Limb], rindex: usize) -> bool {
    debug_assert!(rindex <= x.len());

    let len = x.len();
    let slc = &x[..len - rindex];
    slc.iter().rev().any(|&x| x != 0)
}

// These return the high X bits and if the bits were truncated.

/// Shift 32-bit integer to high 64-bits.
#[inline]
pub fn u32_to_hi64_1(r0: u32) -> (u64, bool) {
    u64_to_hi64_1(r0 as u64)
}

/// Shift 2 32-bit integers to high 64-bits.
#[inline]
pub fn u32_to_hi64_2(r0: u32, r1: u32) -> (u64, bool) {
    let r0 = (r0 as u64) << 32;
    let r1 = r1 as u64;
    u64_to_hi64_1(r0 | r1)
}

/// Shift 3 32-bit integers to high 64-bits.
#[inline]
pub fn u32_to_hi64_3(r0: u32, r1: u32, r2: u32) -> (u64, bool) {
    let r0 = r0 as u64;
    let r1 = (r1 as u64) << 32;
    let r2 = r2 as u64;
    u64_to_hi64_2(r0, r1 | r2)
}

/// Shift 64-bit integer to high 64-bits.
#[inline]
pub fn u64_to_hi64_1(r0: u64) -> (u64, bool) {
    let ls = r0.leading_zeros();
    (r0 << ls, false)
}

/// Shift 2 64-bit integers to high 64-bits.
#[inline]
pub fn u64_to_hi64_2(r0: u64, r1: u64) -> (u64, bool) {
    let ls = r0.leading_zeros();
    let rs = 64 - ls;
    let v = match ls {
        0 => r0,
        _ => (r0 << ls) | (r1 >> rs),
    };
    let n = r1 << ls != 0;
    (v, n)
}
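
// Worked example (not in the original file): `u64_to_hi64_2` left-aligns the
// most-significant limb and reports whether any shifted-out bits of `r1` were
// lost (the "truncated" flag).
#[cfg(test)]
mod hi64_helpers_sketch {
    use super::u64_to_hi64_2;

    #[test]
    fn u64_to_hi64_2_left_aligns_and_flags_truncation() {
        // r0 = 1 has 63 leading zeros, so the result is `1 << 63`; r1 = 0
        // contributes nothing and nothing is lost.
        assert!(u64_to_hi64_2(1, 0) == (1 << 63, false));
        // The low bit of r1 = 1 cannot fit after the shift, so it is flagged.
        assert!(u64_to_hi64_2(1, 1) == (1 << 63, true));
    }
}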

/// Extract the hi bits from the buffer.
macro_rules! hi {
    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 1`.
    (@1 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        $fn($rview[0] as $t)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 2`.
    (@2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let r0 = $rview[0] as $t;
        let r1 = $rview[1] as $t;
        $fn(r0, r1)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 2`.
    (@nonzero2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let (v, n) = hi!(@2 $self, $rview, $t, $fn);
        (v, n || nonzero($self, 2))
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 3`.
    (@3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let r0 = $rview[0] as $t;
        let r1 = $rview[1] as $t;
        let r2 = $rview[2] as $t;
        $fn(r0, r1, r2)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 3`.
    (@nonzero3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let (v, n) = hi!(@3 $self, $rview, $t, $fn);
        (v, n || nonzero($self, 3))
    }};
}

/// Get the high 64 bits from the vector.
#[inline(always)]
pub fn hi64(x: &[Limb]) -> (u64, bool) {
    let rslc = rview(x);
    // SAFETY: the buffer must be at least `x.len()` limbs long.
    match x.len() {
        0 => (0, false),
        1 if LIMB_BITS == 32 => hi!(@1 x, rslc, u32, u32_to_hi64_1),
        1 => hi!(@1 x, rslc, u64, u64_to_hi64_1),
        2 if LIMB_BITS == 32 => hi!(@2 x, rslc, u32, u32_to_hi64_2),
        2 => hi!(@2 x, rslc, u64, u64_to_hi64_2),
        _ if LIMB_BITS == 32 => hi!(@nonzero3 x, rslc, u32, u32_to_hi64_3),
        _ => hi!(@nonzero2 x, rslc, u64, u64_to_hi64_2),
    }
}

// POWERS
// ------

/// MulAssign by a power of 5.
///
/// Theoretically...
///
/// Use an exponentiation by squaring method, since it reduces the time
/// complexity of the multiplication to ~`O(log(n))` for the squaring,
/// and `O(n*m)` for the result. Since `m` is typically a lower-order
/// factor, this significantly reduces the number of multiplications
/// we need to do. Iteratively multiplying by small powers follows
/// the nth triangular number series, which scales as `O(p^2)`, but
/// where `p` is `n+m`. In short, it scales very poorly.
///
/// Practically...
///
/// Exponentiation by Squaring:
///     running 2 tests
///     test bigcomp_f32_lexical ... bench: 1,018 ns/iter (+/- 78)
///     test bigcomp_f64_lexical ... bench: 3,639 ns/iter (+/- 1,007)
///
/// Exponentiation by Iterative Small Powers:
///     running 2 tests
///     test bigcomp_f32_lexical ... bench: 518 ns/iter (+/- 31)
///     test bigcomp_f64_lexical ... bench: 583 ns/iter (+/- 47)
///
/// Exponentiation by Iterative Large Powers (of 2):
///     running 2 tests
///     test bigcomp_f32_lexical ... bench: 671 ns/iter (+/- 31)
///     test bigcomp_f64_lexical ... bench: 1,394 ns/iter (+/- 47)
///
/// The following benchmarks were run on `1 * 5^300`, using native `pow`,
/// a version with only small powers, and one with pre-computed powers
/// of `5^(3 * max_exp)`, rather than `5^(5 * max_exp)`:
///
///     pow/default     time: [426.20 ns 427.96 ns 429.89 ns]
///     pow/small       time: [2.9270 us 2.9411 us 2.9565 us]
///     pow/large:3     time: [838.51 ns 842.21 ns 846.27 ns]
///
/// However, using large powers is crucial for good performance at higher
/// exponents.
///
/// Even using worst-case scenarios, exponentiation by squaring is
/// significantly slower for our workloads. Just multiply by small powers,
/// in simple cases, and use precalculated large powers in other cases.
///
/// Furthermore, using sufficiently big large powers is also crucial for
/// performance. This is a tradeoff of binary size and performance, and
/// using a single value at ~`5^(5 * max_exp)` seems optimal.
pub fn pow(x: &mut VecType, mut exp: u32) -> Option<()> {
    // Minimize the number of iterations for large exponents: just
    // do a few steps with large powers.
    #[cfg(not(feature = "compact"))]
    {
        while exp >= LARGE_POW5_STEP {
            large_mul(x, &LARGE_POW5)?;
            exp -= LARGE_POW5_STEP;
        }
    }

    // Now use our pre-computed small powers iteratively.
    // This is calculated as `⌊log(2^BITS - 1, 5)⌋`.
    let small_step = if LIMB_BITS == 32 {
        13
    } else {
        27
    };
    let max_native = (5 as Limb).pow(small_step);
    while exp >= small_step {
        small_mul(x, max_native)?;
        exp -= small_step;
    }
    if exp != 0 {
        // SAFETY: safe, since `exp < small_step`.
        let small_power = unsafe { f64::int_pow_fast_path(exp as usize, 5) };
        small_mul(x, small_power as Limb)?;
    }
    Some(())
}
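
// Illustrative sketch (not in the original file): `pow` only multiplies by
// powers of 5; the caller handles the power-of-2 part with `shl` (as
// `Bigint::pow` above does).
#[cfg(test)]
mod pow_sketch {
    use super::{compare, from_u64, pow};
    use core::cmp::Ordering;

    #[test]
    fn pow_multiplies_by_powers_of_five() {
        let mut x = from_u64(2);
        pow(&mut x, 3).unwrap();
        // 2 * 5^3 == 250.
        assert!(compare(&x, &from_u64(250)) == Ordering::Equal);
    }
}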

// SCALAR
// ------

/// Add two small integers and return the resulting value and if overflow happens.
#[inline(always)]
pub fn scalar_add(x: Limb, y: Limb) -> (Limb, bool) {
    x.overflowing_add(y)
}

/// Multiply two small integers (with carry) (and return the overflow contribution).
///
/// Returns the (low, high) components.
#[inline(always)]
pub fn scalar_mul(x: Limb, y: Limb, carry: Limb) -> (Limb, Limb) {
    // Cannot overflow, as long as wide is 2x as wide. This is because
    // the following is always true:
    // `Wide::MAX - (Narrow::MAX * Narrow::MAX) >= Narrow::MAX`
    let z: Wide = (x as Wide) * (y as Wide) + (carry as Wide);
    (z as Limb, (z >> LIMB_BITS) as Limb)
}
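
// Worked example (not in the original file) of the overflow argument above:
// even with every input at its maximum, the wide product plus carry still
// fits, and the result is exactly (0, Limb::MAX).
#[cfg(test)]
mod scalar_mul_sketch {
    use super::{scalar_mul, Limb};

    #[test]
    fn scalar_mul_worst_case_does_not_overflow() {
        // MAX * MAX + MAX == MAX * (MAX + 1) == MAX << LIMB_BITS,
        // i.e. a low limb of 0 and a high limb of MAX.
        let max = Limb::MAX;
        assert!(scalar_mul(max, max, max) == (0, max));
    }
}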

// SMALL
// -----

/// Add small integer to bigint starting from offset.
#[inline]
pub fn small_add_from(x: &mut VecType, y: Limb, start: usize) -> Option<()> {
    let mut index = start;
    let mut carry = y;
    while carry != 0 && index < x.len() {
        let result = scalar_add(x[index], carry);
        x[index] = result.0;
        carry = result.1 as Limb;
        index += 1;
    }
    // If we carried past all the elements, add to the end of the buffer.
    if carry != 0 {
        x.try_push(carry)?;
    }
    Some(())
}

/// Add small integer to bigint.
#[inline(always)]
pub fn small_add(x: &mut VecType, y: Limb) -> Option<()> {
    small_add_from(x, y, 0)
}

/// Multiply bigint by small integer.
#[inline]
pub fn small_mul(x: &mut VecType, y: Limb) -> Option<()> {
    let mut carry = 0;
    for xi in x.iter_mut() {
        let result = scalar_mul(*xi, y, carry);
        *xi = result.0;
        carry = result.1;
    }
    // If we carried past all the elements, add to the end of the buffer.
    if carry != 0 {
        x.try_push(carry)?;
    }
    Some(())
}

// LARGE
// -----

/// Add bigint to bigint starting from offset.
pub fn large_add_from(x: &mut VecType, y: &[Limb], start: usize) -> Option<()> {
    // The effective x buffer is from `xstart..x.len()`, so we need to treat
    // that as the current range. If the effective y buffer is longer, need
    // to resize to that, + the start index.
    if y.len() > x.len().saturating_sub(start) {
        // If we can't extend the buffer, return `None`: this avoids any
        // out-of-bounds access afterwards.
        x.try_resize(y.len() + start, 0)?;
    }

    // Iteratively add elements from y to x.
    let mut carry = false;
    for (index, &yi) in y.iter().enumerate() {
        // The `try_resize` above guarantees this index is within bounds.
        let xi = x.get_mut(start + index).unwrap();

        // Only one op of the two ops can overflow, since we added at max
        // Limb::max_value() + Limb::max_value(). Add the previous carry,
        // and store the current carry for the next.
        let result = scalar_add(*xi, yi);
        *xi = result.0;
        let mut tmp = result.1;
        if carry {
            let result = scalar_add(*xi, 1);
            *xi = result.0;
            tmp |= result.1;
        }
        carry = tmp;
    }

    // Handle overflow.
    if carry {
        small_add_from(x, 1, y.len() + start)?;
    }
    Some(())
}

/// Add bigint to bigint.
#[inline(always)]
pub fn large_add(x: &mut VecType, y: &[Limb]) -> Option<()> {
    large_add_from(x, y, 0)
}
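
// Illustrative sketch (not in the original file): carries from `large_add`
// propagate past the end of `y`, growing `x` by one limb when needed.
#[cfg(test)]
mod large_add_sketch {
    use super::{bit_length, from_u64, large_add};

    #[test]
    fn large_add_propagates_the_final_carry() {
        let mut x = from_u64(u64::MAX);
        let y = from_u64(1);
        large_add(&mut x, &y).unwrap();
        // u64::MAX + 1 == 2^64, which needs 65 bits.
        assert!(bit_length(&x) == 65);
    }
}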

/// Grade-school multiplication algorithm.
///
/// Slow, naive algorithm, using limb-bit bases and just shifting left for
/// each iteration. This could be optimized with numerous other algorithms,
/// but it's extremely simple, and works in O(n*m) time, which is fine
/// by me. Each iteration, of which there are `m` iterations, requires
/// `n` multiplications, and `n` additions, or grade-school multiplication.
///
/// Don't use Karatsuba multiplication, since our implementation seems to
/// be slower asymptotically, which is likely just due to the small sizes
/// we deal with here. For example, running on the following data:
///
/// ```text
/// const SMALL_X: &[u32] = &[
///     766857581, 3588187092, 1583923090, 2204542082, 1564708913, 2695310100, 3676050286,
///     1022770393, 468044626, 446028186
/// ];
/// const SMALL_Y: &[u32] = &[
///     3945492125, 3250752032, 1282554898, 1708742809, 1131807209, 3171663979, 1353276095,
///     1678845844, 2373924447, 3640713171
/// ];
/// const LARGE_X: &[u32] = &[
///     3647536243, 2836434412, 2154401029, 1297917894, 137240595, 790694805, 2260404854,
///     3872698172, 690585094, 99641546, 3510774932, 1672049983, 2313458559, 2017623719,
///     638180197, 1140936565, 1787190494, 1797420655, 14113450, 2350476485, 3052941684,
///     1993594787, 2901001571, 4156930025, 1248016552, 848099908, 2660577483, 4030871206,
///     692169593, 2835966319, 1781364505, 4266390061, 1813581655, 4210899844, 2137005290,
///     2346701569, 3715571980, 3386325356, 1251725092, 2267270902, 474686922, 2712200426,
///     197581715, 3087636290, 1379224439, 1258285015, 3230794403, 2759309199, 1494932094,
///     326310242
/// ];
/// const LARGE_Y: &[u32] = &[
///     1574249566, 868970575, 76716509, 3198027972, 1541766986, 1095120699, 3891610505,
///     2322545818, 1677345138, 865101357, 2650232883, 2831881215, 3985005565, 2294283760,
///     3468161605, 393539559, 3665153349, 1494067812, 106699483, 2596454134, 797235106,
///     705031740, 1209732933, 2732145769, 4122429072, 141002534, 790195010, 4014829800,
///     1303930792, 3649568494, 308065964, 1233648836, 2807326116, 79326486, 1262500691,
///     621809229, 2258109428, 3819258501, 171115668, 1139491184, 2979680603, 1333372297,
///     1657496603, 2790845317, 4090236532, 4220374789, 601876604, 1828177209, 2372228171,
///     2247372529
/// ];
/// ```
///
/// We get the following results:
///
/// ```text
/// mul/small:long          time: [220.23 ns 221.47 ns 222.81 ns]
/// Found 4 outliers among 100 measurements (4.00%)
///   2 (2.00%) high mild
///   2 (2.00%) high severe
/// mul/small:karatsuba     time: [233.88 ns 234.63 ns 235.44 ns]
/// Found 11 outliers among 100 measurements (11.00%)
///   8 (8.00%) high mild
///   3 (3.00%) high severe
/// mul/large:long          time: [1.9365 us 1.9455 us 1.9558 us]
/// Found 12 outliers among 100 measurements (12.00%)
///   7 (7.00%) high mild
///   5 (5.00%) high severe
/// mul/large:karatsuba     time: [4.4250 us 4.4515 us 4.4812 us]
/// ```
///
/// In short, Karatsuba multiplication is never worthwhile for our use-case.
pub fn long_mul(x: &[Limb], y: &[Limb]) -> Option<VecType> {
    // Using the immutable value, multiply by all the scalars in y, using
    // the algorithm defined above. Use a single buffer to avoid
    // frequent reallocations. Handle the first case to avoid a redundant
    // addition, since we know y.len() >= 1.
    let mut z = VecType::try_from(x)?;
    if !y.is_empty() {
        let y0 = y[0];
        small_mul(&mut z, y0)?;

        for (index, &yi) in y.iter().enumerate().skip(1) {
            if yi != 0 {
                let mut zi = VecType::try_from(x)?;
                small_mul(&mut zi, yi)?;
                large_add_from(&mut z, &zi, index)?;
            }
        }
    }

    z.normalize();
    Some(z)
}
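
// Illustrative sketch (not in the original file): grade-school multiplication
// on two single-limb values, checked against the plain scalar result.
#[cfg(test)]
mod long_mul_sketch {
    use super::{compare, from_u64, long_mul};
    use core::cmp::Ordering;

    #[test]
    fn long_mul_matches_scalar_multiplication() {
        let x = from_u64(123_456);
        let y = from_u64(789);
        let z = long_mul(&x, &y).unwrap();
        assert!(compare(&z, &from_u64(123_456 * 789)) == Ordering::Equal);
    }
}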

/// Multiply bigint by bigint using grade-school multiplication algorithm.
#[inline(always)]
pub fn large_mul(x: &mut VecType, y: &[Limb]) -> Option<()> {
    // Karatsuba multiplication never makes sense, so just use grade school
    // multiplication.
    if y.len() == 1 {
        // SAFETY: safe since `y.len() == 1`.
        small_mul(x, y[0])?;
    } else {
        *x = long_mul(y, x)?;
    }
    Some(())
}

// SHIFT
// -----

/// Shift-left `n` bits inside a buffer.
#[inline]
pub fn shl_bits(x: &mut VecType, n: usize) -> Option<()> {
    debug_assert!(n != 0);

    // Internally, for each item, we shift left by n, and add the previous
    // right shifted limb-bits.
    // For example, we transform (for u8) shifted left 2, to:
    //      b10100100 b01000010
    //      b10 b10010001 b00001000
    debug_assert!(n < LIMB_BITS);
    let rshift = LIMB_BITS - n;
    let lshift = n;
    let mut prev: Limb = 0;
    for xi in x.iter_mut() {
        let tmp = *xi;
        *xi <<= lshift;
        *xi |= prev >> rshift;
        prev = tmp;
    }

    // Always push the carry, even if it creates a non-normal result.
    let carry = prev >> rshift;
    if carry != 0 {
        x.try_push(carry)?;
    }

    Some(())
}

/// Shift-left `n` limbs inside a buffer.
#[inline]
pub fn shl_limbs(x: &mut VecType, n: usize) -> Option<()> {
    debug_assert!(n != 0);
    if n + x.len() > x.capacity() {
        None
    } else if !x.is_empty() {
        let len = n + x.len();
        // SAFE: since x is not empty, and `x.len() + n <= x.capacity()`.
        unsafe {
            // Move the elements.
            let src = x.as_ptr();
            let dst = x.as_mut_ptr().add(n);
            ptr::copy(src, dst, x.len());
            // Write our 0s.
            ptr::write_bytes(x.as_mut_ptr(), 0, n);
            x.set_len(len);
        }
        Some(())
    } else {
        Some(())
    }
}

/// Shift-left buffer by n bits.
#[inline]
pub fn shl(x: &mut VecType, n: usize) -> Option<()> {
    let rem = n % LIMB_BITS;
    let div = n / LIMB_BITS;
    if rem != 0 {
        shl_bits(x, rem)?;
    }
    if div != 0 {
        shl_limbs(x, div)?;
    }
    Some(())
}
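
// Illustrative sketch (not in the original file): `shl` splits the shift into
// whole-limb moves and a sub-limb bit shift, so the bit-length grows by
// exactly `n` regardless of the limb width.
#[cfg(test)]
mod shl_sketch {
    use super::{bit_length, from_u64, shl};

    #[test]
    fn shl_grows_bit_length_by_n() {
        let mut x = from_u64(1);
        shl(&mut x, 100).unwrap();
        // 1 << 100 has a bit-length of 101.
        assert!(bit_length(&x) == 101);
    }
}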

/// Get number of leading zero bits in the storage.
#[inline]
pub fn leading_zeros(x: &[Limb]) -> u32 {
    let length = x.len();
    // wrapping_sub is fine, since it'll just return None.
    if let Some(&value) = x.get(length.wrapping_sub(1)) {
        value.leading_zeros()
    } else {
        0
    }
}

/// Calculate the bit-length of the big-integer.
#[inline]
pub fn bit_length(x: &[Limb]) -> u32 {
    let nlz = leading_zeros(x);
    LIMB_BITS as u32 * x.len() as u32 - nlz
}
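
// Illustrative sketch (not in the original file): an empty (zero) bigint has a
// bit-length of 0; otherwise the bit-length counts up to the highest set bit
// of the most-significant limb.
#[cfg(test)]
mod bit_length_sketch {
    use super::{bit_length, from_u64};

    #[test]
    fn bit_length_counts_to_the_highest_set_bit() {
        assert!(bit_length(&from_u64(0)) == 0);
        assert!(bit_length(&from_u64(5)) == 3);
    }
}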

// LIMB
// ----

// Type for a single limb of the big integer.
//
// A limb is analogous to a digit in base10, except it stores 32-bit
// or 64-bit numbers instead. We want types where 64-bit multiplication
// is well-supported by the architecture, rather than emulated in 3
// instructions. The quickest way to check this support is using a
// cross-compiler for numerous architectures, along with the following
// source file and command:
//
// Compile with `gcc main.c -c -S -O3 -masm=intel`
//
// And the source code is:
// ```text
// #include <stdint.h>
//
// struct i128 {
//     uint64_t hi;
//     uint64_t lo;
// };
//
// // Type your code here, or load an example.
// struct i128 square(uint64_t x, uint64_t y) {
//     __int128 prod = (__int128)x * (__int128)y;
//     struct i128 z;
//     z.hi = (uint64_t)(prod >> 64);
//     z.lo = (uint64_t)prod;
//     return z;
// }
// ```
//
// If the result contains `call __multi3`, then the multiplication
// is emulated by the compiler. Otherwise, it's natively supported.
//
// This should cover all known 64-bit platforms supported by Rust.
// https://forge.rust-lang.org/platform-support.html
//
// # Supported
//
// Platforms where native 128-bit multiplication is explicitly supported:
// - x86_64 (Supported via `MUL`).
// - mips64 (Supported via `DMULTU`, which `HI` and `LO` can be read-from).
// - s390x (Supported via `MLGR`).
//
// # Efficient
//
// Platforms where native 64-bit multiplication is supported and
// you can extract hi-lo for 64-bit multiplications.
// - aarch64 (Requires `UMULH` and `MUL` to capture high and low bits).
// - powerpc64 (Requires `MULHDU` and `MULLD` to capture high and low bits).
// - riscv64 (Requires `MUL` and `MULH` to capture high and low bits).
//
// # Unsupported
//
// Platforms where native 128-bit multiplication is not supported,
// requiring software emulation.
// - sparc64 (`UMUL` only supports double-word arguments).
// - sparcv9 (Same as sparc64).
//
// These tests are run via `xcross`, my own library for C cross-compiling,
// which supports numerous targets (far in excess of Rust's tier 1 support,
// or rust-embedded/cross's list). xcross may be found here:
// https://github.com/Alexhuszagh/xcross
//
// To compile for the given target, run:
// `xcross gcc main.c -c -S -O3 --target $target`
//
// All 32-bit architectures inherently do not have support. That means
// we can essentially look for 64-bit architectures that are not SPARC.

#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Limb = u64;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Wide = u128;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub const LIMB_BITS: usize = 64;

#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Limb = u32;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Wide = u64;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub const LIMB_BITS: usize = 32;