Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

vendor/half/src/bfloat.rs (vendored new file, 1880 lines; diff suppressed because it is too large)

vendor/half/src/bfloat/convert.rs (vendored new file, 152 lines)

@@ -0,0 +1,152 @@
use crate::leading_zeros::leading_zeros_u16;
use core::mem;
#[inline]
pub(crate) const fn f32_to_bf16(value: f32) -> u16 {
// TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
// Convert to raw bytes
let x: u32 = unsafe { mem::transmute::<f32, u32>(value) };
// check for NaN
if x & 0x7FFF_FFFFu32 > 0x7F80_0000u32 {
// Keep high part of current mantissa but also set most significant mantissa bit
return ((x >> 16) | 0x0040u32) as u16;
}
// round and shift
let round_bit = 0x0000_8000u32;
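// Round to nearest, ties to even: the mask 3 * round_bit - 1 covers the retained LSB
// (0x0001_0000) plus the bits below the round bit, so it is nonzero exactly when the
// removed part exceeds a tie, or ties with an odd retained value (see binary16/arch.rs).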
if (x & round_bit) != 0 && (x & (3 * round_bit - 1)) != 0 {
(x >> 16) as u16 + 1
} else {
(x >> 16) as u16
}
}
#[inline]
pub(crate) const fn f64_to_bf16(value: f64) -> u16 {
// TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
// Convert to raw bytes, truncating the last 32 bits of mantissa; that precision will always
// be lost when converting to bfloat16.
let val: u64 = unsafe { mem::transmute::<f64, u64>(value) };
let x = (val >> 32) as u32;
// Extract IEEE754 components
let sign = x & 0x8000_0000u32;
let exp = x & 0x7FF0_0000u32;
let man = x & 0x000F_FFFFu32;
// Check for all exponent bits being set, which is Infinity or NaN
if exp == 0x7FF0_0000u32 {
// Set mantissa MSB for NaN (and also keep shifted mantissa bits).
// We also have to check the last 32 bits.
let nan_bit = if man == 0 && (val as u32 == 0) {
0
} else {
0x0040u32
};
return ((sign >> 16) | 0x7F80u32 | nan_bit | (man >> 13)) as u16;
}
// The number is normalized, start assembling the bfloat16 version
let half_sign = sign >> 16;
// Unbias the exponent, then bias for bfloat16 precision
let unbiased_exp = ((exp >> 20) as i64) - 1023;
let half_exp = unbiased_exp + 127;
// Check for exponent overflow, return +infinity
if half_exp >= 0xFF {
return (half_sign | 0x7F80u32) as u16;
}
// Check for underflow
if half_exp <= 0 {
// Check mantissa for what we can do
if 7 - half_exp > 21 {
// No rounding possibility, so this is a full underflow, return signed zero
return half_sign as u16;
}
// Don't forget about hidden leading mantissa bit when assembling mantissa
let man = man | 0x0010_0000u32;
let mut half_man = man >> (14 - half_exp);
// Check for rounding
let round_bit = 1 << (13 - half_exp);
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
half_man += 1;
}
// No exponent for subnormals
return (half_sign | half_man) as u16;
}
// Rebias the exponent
let half_exp = (half_exp as u32) << 7;
let half_man = man >> 13;
// Check for rounding
let round_bit = 0x0000_1000u32;
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
// Round it
((half_sign | half_exp | half_man) + 1) as u16
} else {
(half_sign | half_exp | half_man) as u16
}
}
#[inline]
pub(crate) const fn bf16_to_f32(i: u16) -> f32 {
// TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
// If NaN, keep current mantissa but also set most significant mantissa bit
if i & 0x7FFFu16 > 0x7F80u16 {
unsafe { mem::transmute::<u32, f32>((i as u32 | 0x0040u32) << 16) }
} else {
unsafe { mem::transmute::<u32, f32>((i as u32) << 16) }
}
}
#[inline]
pub(crate) const fn bf16_to_f64(i: u16) -> f64 {
// TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
// Check for signed zero
if i & 0x7FFFu16 == 0 {
return unsafe { mem::transmute::<u64, f64>((i as u64) << 48) };
}
let half_sign = (i & 0x8000u16) as u64;
let half_exp = (i & 0x7F80u16) as u64;
let half_man = (i & 0x007Fu16) as u64;
// Check for an infinity or NaN when all exponent bits set
if half_exp == 0x7F80u64 {
// Check for signed infinity if mantissa is zero
if half_man == 0 {
return unsafe {
mem::transmute::<u64, f64>((half_sign << 48) | 0x7FF0_0000_0000_0000u64)
};
} else {
// NaN, keep current mantissa but also set most significant mantissa bit
return unsafe {
mem::transmute::<u64, f64>(
(half_sign << 48) | 0x7FF8_0000_0000_0000u64 | (half_man << 45),
)
};
}
}
// Calculate double-precision components with adjusted exponent
let sign = half_sign << 48;
// Unbias exponent
let unbiased_exp = ((half_exp as i64) >> 7) - 127;
// Check for subnormals, which will be normalized by adjusting exponent
if half_exp == 0 {
// Calculate how much to adjust the exponent by
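// half_man fits in the low 7 bits of the u16 (masked with 0x007F above), so
// leading_zeros_u16 returns at least 9; subtracting 9 gives the left shift needed
// to normalize the subnormal mantissa.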
let e = leading_zeros_u16(half_man as u16) - 9;
// Rebias and adjust exponent
let exp = ((1023 - 127 - e) as u64) << 52;
let man = (half_man << (46 + e)) & 0xF_FFFF_FFFF_FFFFu64;
return unsafe { mem::transmute::<u64, f64>(sign | exp | man) };
}
// Rebias exponent for a normalized normal
let exp = ((unbiased_exp + 1023) as u64) << 52;
let man = (half_man & 0x007Fu64) << 45;
unsafe { mem::transmute::<u64, f64>(sign | exp | man) }
}

vendor/half/src/binary16.rs (vendored new file, 1964 lines; diff suppressed because it is too large)

vendor/half/src/binary16/arch.rs (vendored new file, 847 lines)

@@ -0,0 +1,847 @@
#![allow(dead_code, unused_imports)]
use crate::leading_zeros::leading_zeros_u16;
use core::mem;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;
#[cfg(target_arch = "aarch64")]
mod aarch64;
macro_rules! convert_fn {
(if x86_feature("f16c") { $f16c:expr }
else if aarch64_feature("fp16") { $aarch64:expr }
else { $fallback:expr }) => {
cfg_if::cfg_if! {
// Use the intrinsics directly when the target feature is enabled at compile time or when using no_std
if #[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "f16c"
))] {
$f16c
}
else if #[cfg(all(
target_arch = "aarch64",
target_feature = "fp16"
))] {
$aarch64
}
// Use CPU feature detection if using std
else if #[cfg(all(
feature = "std",
any(target_arch = "x86", target_arch = "x86_64")
))] {
use std::arch::is_x86_feature_detected;
if is_x86_feature_detected!("f16c") {
$f16c
} else {
$fallback
}
}
else if #[cfg(all(
feature = "std",
target_arch = "aarch64",
))] {
use std::arch::is_aarch64_feature_detected;
if is_aarch64_feature_detected!("fp16") {
$aarch64
} else {
$fallback
}
}
// Fallback to software
else {
$fallback
}
}
};
}
#[inline]
pub(crate) fn f32_to_f16(f: f32) -> u16 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32_to_f16_x86_f16c(f) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f32_to_f16_fp16(f) }
} else {
f32_to_f16_fallback(f)
}
}
}
#[inline]
pub(crate) fn f64_to_f16(f: f64) -> u16 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32_to_f16_x86_f16c(f as f32) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f64_to_f16_fp16(f) }
} else {
f64_to_f16_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16_to_f32(i: u16) -> f32 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16_to_f32_x86_f16c(i) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16_to_f32_fp16(i) }
} else {
f16_to_f32_fallback(i)
}
}
}
#[inline]
pub(crate) fn f16_to_f64(i: u16) -> f64 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16_to_f32_x86_f16c(i) as f64 }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16_to_f64_fp16(i) }
} else {
f16_to_f64_fallback(i)
}
}
}
#[inline]
pub(crate) fn f32x4_to_f16x4(f: &[f32; 4]) -> [u16; 4] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32x4_to_f16x4_x86_f16c(f) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f32x4_to_f16x4_fp16(f) }
} else {
f32x4_to_f16x4_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16x4_to_f32x4(i: &[u16; 4]) -> [f32; 4] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16x4_to_f32x4_x86_f16c(i) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16x4_to_f32x4_fp16(i) }
} else {
f16x4_to_f32x4_fallback(i)
}
}
}
#[inline]
pub(crate) fn f64x4_to_f16x4(f: &[f64; 4]) -> [u16; 4] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f64x4_to_f16x4_x86_f16c(f) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f64x4_to_f16x4_fp16(f) }
} else {
f64x4_to_f16x4_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16x4_to_f64x4(i: &[u16; 4]) -> [f64; 4] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16x4_to_f64x4_x86_f16c(i) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16x4_to_f64x4_fp16(i) }
} else {
f16x4_to_f64x4_fallback(i)
}
}
}
#[inline]
pub(crate) fn f32x8_to_f16x8(f: &[f32; 8]) -> [u16; 8] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32x8_to_f16x8_x86_f16c(f) }
} else if aarch64_feature("fp16") {
{
let mut result = [0u16; 8];
convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
aarch64::f32x4_to_f16x4_fp16);
result
}
} else {
f32x8_to_f16x8_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16x8_to_f32x8(i: &[u16; 8]) -> [f32; 8] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16x8_to_f32x8_x86_f16c(i) }
} else if aarch64_feature("fp16") {
{
let mut result = [0f32; 8];
convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
aarch64::f16x4_to_f32x4_fp16);
result
}
} else {
f16x8_to_f32x8_fallback(i)
}
}
}
#[inline]
pub(crate) fn f64x8_to_f16x8(f: &[f64; 8]) -> [u16; 8] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f64x8_to_f16x8_x86_f16c(f) }
} else if aarch64_feature("fp16") {
{
let mut result = [0u16; 8];
convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
aarch64::f64x4_to_f16x4_fp16);
result
}
} else {
f64x8_to_f16x8_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16x8_to_f64x8(i: &[u16; 8]) -> [f64; 8] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16x8_to_f64x8_x86_f16c(i) }
} else if aarch64_feature("fp16") {
{
let mut result = [0f64; 8];
convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
aarch64::f16x4_to_f64x4_fp16);
result
}
} else {
f16x8_to_f64x8_fallback(i)
}
}
}
#[inline]
pub(crate) fn f32_to_f16_slice(src: &[f32], dst: &mut [u16]) {
convert_fn! {
if x86_feature("f16c") {
convert_chunked_slice_8(src, dst, x86::f32x8_to_f16x8_x86_f16c,
x86::f32x4_to_f16x4_x86_f16c)
} else if aarch64_feature("fp16") {
convert_chunked_slice_4(src, dst, aarch64::f32x4_to_f16x4_fp16)
} else {
slice_fallback(src, dst, f32_to_f16_fallback)
}
}
}
#[inline]
pub(crate) fn f16_to_f32_slice(src: &[u16], dst: &mut [f32]) {
convert_fn! {
if x86_feature("f16c") {
convert_chunked_slice_8(src, dst, x86::f16x8_to_f32x8_x86_f16c,
x86::f16x4_to_f32x4_x86_f16c)
} else if aarch64_feature("fp16") {
convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f32x4_fp16)
} else {
slice_fallback(src, dst, f16_to_f32_fallback)
}
}
}
#[inline]
pub(crate) fn f64_to_f16_slice(src: &[f64], dst: &mut [u16]) {
convert_fn! {
if x86_feature("f16c") {
convert_chunked_slice_8(src, dst, x86::f64x8_to_f16x8_x86_f16c,
x86::f64x4_to_f16x4_x86_f16c)
} else if aarch64_feature("fp16") {
convert_chunked_slice_4(src, dst, aarch64::f64x4_to_f16x4_fp16)
} else {
slice_fallback(src, dst, f64_to_f16_fallback)
}
}
}
#[inline]
pub(crate) fn f16_to_f64_slice(src: &[u16], dst: &mut [f64]) {
convert_fn! {
if x86_feature("f16c") {
convert_chunked_slice_8(src, dst, x86::f16x8_to_f64x8_x86_f16c,
x86::f16x4_to_f64x4_x86_f16c)
} else if aarch64_feature("fp16") {
convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f64x4_fp16)
} else {
slice_fallback(src, dst, f16_to_f64_fallback)
}
}
}
macro_rules! math_fn {
(if aarch64_feature("fp16") { $aarch64:expr }
else { $fallback:expr }) => {
cfg_if::cfg_if! {
// Use the intrinsics directly when the target feature is enabled at compile time or when using no_std
if #[cfg(all(
target_arch = "aarch64",
target_feature = "fp16"
))] {
$aarch64
}
// Use CPU feature detection if using std
else if #[cfg(all(
feature = "std",
target_arch = "aarch64",
not(target_feature = "fp16")
))] {
use std::arch::is_aarch64_feature_detected;
if is_aarch64_feature_detected!("fp16") {
$aarch64
} else {
$fallback
}
}
// Fallback to software
else {
$fallback
}
}
};
}
#[inline]
pub(crate) fn add_f16(a: u16, b: u16) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
unsafe { aarch64::add_f16_fp16(a, b) }
} else {
add_f16_fallback(a, b)
}
}
}
#[inline]
pub(crate) fn subtract_f16(a: u16, b: u16) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
unsafe { aarch64::subtract_f16_fp16(a, b) }
} else {
subtract_f16_fallback(a, b)
}
}
}
#[inline]
pub(crate) fn multiply_f16(a: u16, b: u16) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
unsafe { aarch64::multiply_f16_fp16(a, b) }
} else {
multiply_f16_fallback(a, b)
}
}
}
#[inline]
pub(crate) fn divide_f16(a: u16, b: u16) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
unsafe { aarch64::divide_f16_fp16(a, b) }
} else {
divide_f16_fallback(a, b)
}
}
}
#[inline]
pub(crate) fn remainder_f16(a: u16, b: u16) -> u16 {
remainder_f16_fallback(a, b)
}
#[inline]
pub(crate) fn product_f16<I: Iterator<Item = u16>>(iter: I) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
iter.fold(0, |acc, x| unsafe { aarch64::multiply_f16_fp16(acc, x) })
} else {
product_f16_fallback(iter)
}
}
}
#[inline]
pub(crate) fn sum_f16<I: Iterator<Item = u16>>(iter: I) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
iter.fold(0, |acc, x| unsafe { aarch64::add_f16_fp16(acc, x) })
} else {
sum_f16_fallback(iter)
}
}
}
/// Converts a slice in chunks of 8 elements, falling back to a 4-element chunk for the remainder
#[inline]
fn convert_chunked_slice_8<S: Copy + Default, D: Copy>(
src: &[S],
dst: &mut [D],
fn8: unsafe fn(&[S; 8]) -> [D; 8],
fn4: unsafe fn(&[S; 4]) -> [D; 4],
) {
assert_eq!(src.len(), dst.len());
// TODO: Can be further optimized with array_chunks when it becomes stabilized
let src_chunks = src.chunks_exact(8);
let mut dst_chunks = dst.chunks_exact_mut(8);
let src_remainder = src_chunks.remainder();
for (s, d) in src_chunks.zip(&mut dst_chunks) {
let chunk: &[S; 8] = s.try_into().unwrap();
d.copy_from_slice(unsafe { &fn8(chunk) });
}
// Process remainder
if src_remainder.len() > 4 {
let mut buf: [S; 8] = Default::default();
buf[..src_remainder.len()].copy_from_slice(src_remainder);
let vec = unsafe { fn8(&buf) };
let dst_remainder = dst_chunks.into_remainder();
dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
} else if !src_remainder.is_empty() {
let mut buf: [S; 4] = Default::default();
buf[..src_remainder.len()].copy_from_slice(src_remainder);
let vec = unsafe { fn4(&buf) };
let dst_remainder = dst_chunks.into_remainder();
dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
}
}
/// Converts a slice in chunks of 4 elements
#[inline]
fn convert_chunked_slice_4<S: Copy + Default, D: Copy>(
src: &[S],
dst: &mut [D],
f: unsafe fn(&[S; 4]) -> [D; 4],
) {
assert_eq!(src.len(), dst.len());
// TODO: Can be further optimized with array_chunks when it becomes stabilized
let src_chunks = src.chunks_exact(4);
let mut dst_chunks = dst.chunks_exact_mut(4);
let src_remainder = src_chunks.remainder();
for (s, d) in src_chunks.zip(&mut dst_chunks) {
let chunk: &[S; 4] = s.try_into().unwrap();
d.copy_from_slice(unsafe { &f(chunk) });
}
// Process remainder
if !src_remainder.is_empty() {
let mut buf: [S; 4] = Default::default();
buf[..src_remainder.len()].copy_from_slice(src_remainder);
let vec = unsafe { f(&buf) };
let dst_remainder = dst_chunks.into_remainder();
dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
}
}
/////////////// Fallbacks ////////////////
// In the below functions, round to nearest, with ties to even.
// Let us call the most significant bit that will be shifted out the round_bit.
//
// Round up if either
// a) Removed part > tie.
// (mantissa & round_bit) != 0 && (mantissa & (round_bit - 1)) != 0
// b) Removed part == tie, and retained part is odd.
// (mantissa & round_bit) != 0 && (mantissa & (2 * round_bit)) != 0
// (If removed part == tie and retained part is even, do not round up.)
// These two conditions can be combined into one:
// (mantissa & round_bit) != 0 && (mantissa & ((round_bit - 1) | (2 * round_bit))) != 0
// which can be simplified into
// (mantissa & round_bit) != 0 && (mantissa & (3 * round_bit - 1)) != 0
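//
// Worked example (illustrative only), using the f32 -> f16 case below where
// round_bit = 0x1000 and the low 13 mantissa bits are discarded:
//   mantissa & 0x1FFF == 0x1000, bit 13 clear -> exact tie, retained LSB even -> keep
//   mantissa & 0x1FFF == 0x1000, bit 13 set   -> exact tie, retained LSB odd  -> round up
//   mantissa & 0x1FFF == 0x1001               -> removed part above the tie   -> round up
// Here 3 * round_bit - 1 == 0x2FFF, i.e. the retained LSB (0x2000) together with the
// bits below the round bit (0x0FFF).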
#[inline]
pub(crate) const fn f32_to_f16_fallback(value: f32) -> u16 {
// TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
// Convert to raw bytes
let x: u32 = unsafe { mem::transmute::<f32, u32>(value) };
// Extract IEEE754 components
let sign = x & 0x8000_0000u32;
let exp = x & 0x7F80_0000u32;
let man = x & 0x007F_FFFFu32;
// Check for all exponent bits being set, which is Infinity or NaN
if exp == 0x7F80_0000u32 {
// Set mantissa MSB for NaN (and also keep shifted mantissa bits)
let nan_bit = if man == 0 { 0 } else { 0x0200u32 };
return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 13)) as u16;
}
// The number is normalized, start assembling half precision version
let half_sign = sign >> 16;
// Unbias the exponent, then bias for half precision
let unbiased_exp = ((exp >> 23) as i32) - 127;
let half_exp = unbiased_exp + 15;
// Check for exponent overflow, return +infinity
if half_exp >= 0x1F {
return (half_sign | 0x7C00u32) as u16;
}
// Check for underflow
if half_exp <= 0 {
// Check mantissa for what we can do
if 14 - half_exp > 24 {
// No rounding possibility, so this is a full underflow, return signed zero
return half_sign as u16;
}
// Don't forget about hidden leading mantissa bit when assembling mantissa
let man = man | 0x0080_0000u32;
let mut half_man = man >> (14 - half_exp);
// Check for rounding (see comment above functions)
let round_bit = 1 << (13 - half_exp);
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
half_man += 1;
}
// No exponent for subnormals
return (half_sign | half_man) as u16;
}
// Rebias the exponent
let half_exp = (half_exp as u32) << 10;
let half_man = man >> 13;
// Check for rounding (see comment above functions)
let round_bit = 0x0000_1000u32;
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
// Round it
((half_sign | half_exp | half_man) + 1) as u16
} else {
(half_sign | half_exp | half_man) as u16
}
}
#[inline]
pub(crate) const fn f64_to_f16_fallback(value: f64) -> u16 {
// Convert to raw bytes, truncating the last 32-bits of mantissa; that precision will always
// be lost on half-precision.
// TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
let val: u64 = unsafe { mem::transmute::<f64, u64>(value) };
let x = (val >> 32) as u32;
// Extract IEEE754 components
let sign = x & 0x8000_0000u32;
let exp = x & 0x7FF0_0000u32;
let man = x & 0x000F_FFFFu32;
// Check for all exponent bits being set, which is Infinity or NaN
if exp == 0x7FF0_0000u32 {
// Set mantissa MSB for NaN (and also keep shifted mantissa bits).
// We also have to check the last 32 bits.
let nan_bit = if man == 0 && (val as u32 == 0) {
0
} else {
0x0200u32
};
return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 10)) as u16;
}
// The number is normalized, start assembling half precision version
let half_sign = sign >> 16;
// Unbias the exponent, then bias for half precision
let unbiased_exp = ((exp >> 20) as i64) - 1023;
let half_exp = unbiased_exp + 15;
// Check for exponent overflow, return +infinity
if half_exp >= 0x1F {
return (half_sign | 0x7C00u32) as u16;
}
// Check for underflow
if half_exp <= 0 {
// Check mantissa for what we can do
if 10 - half_exp > 21 {
// No rounding possibility, so this is a full underflow, return signed zero
return half_sign as u16;
}
// Don't forget about hidden leading mantissa bit when assembling mantissa
let man = man | 0x0010_0000u32;
let mut half_man = man >> (11 - half_exp);
// Check for rounding (see comment above functions)
let round_bit = 1 << (10 - half_exp);
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
half_man += 1;
}
// No exponent for subnormals
return (half_sign | half_man) as u16;
}
// Rebias the exponent
let half_exp = (half_exp as u32) << 10;
let half_man = man >> 10;
// Check for rounding (see comment above functions)
let round_bit = 0x0000_0200u32;
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
// Round it
((half_sign | half_exp | half_man) + 1) as u16
} else {
(half_sign | half_exp | half_man) as u16
}
}
#[inline]
pub(crate) const fn f16_to_f32_fallback(i: u16) -> f32 {
// Check for signed zero
// TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
if i & 0x7FFFu16 == 0 {
return unsafe { mem::transmute::<u32, f32>((i as u32) << 16) };
}
let half_sign = (i & 0x8000u16) as u32;
let half_exp = (i & 0x7C00u16) as u32;
let half_man = (i & 0x03FFu16) as u32;
// Check for an infinity or NaN when all exponent bits set
if half_exp == 0x7C00u32 {
// Check for signed infinity if mantissa is zero
if half_man == 0 {
return unsafe { mem::transmute::<u32, f32>((half_sign << 16) | 0x7F80_0000u32) };
} else {
// NaN, keep current mantissa but also set most significant mantissa bit
return unsafe {
mem::transmute::<u32, f32>((half_sign << 16) | 0x7FC0_0000u32 | (half_man << 13))
};
}
}
// Calculate single-precision components with adjusted exponent
let sign = half_sign << 16;
// Unbias exponent
let unbiased_exp = ((half_exp as i32) >> 10) - 15;
// Check for subnormals, which will be normalized by adjusting exponent
if half_exp == 0 {
// Calculate how much to adjust the exponent by
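// half_man fits in the low 10 bits of the u16 (masked with 0x03FF above), so
// leading_zeros_u16 returns at least 6; subtracting 6 gives the left shift needed
// to normalize the subnormal mantissa.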
let e = leading_zeros_u16(half_man as u16) - 6;
// Rebias and adjust exponent
let exp = (127 - 15 - e) << 23;
let man = (half_man << (14 + e)) & 0x7F_FF_FFu32;
return unsafe { mem::transmute::<u32, f32>(sign | exp | man) };
}
// Rebias exponent for a normalized normal
let exp = ((unbiased_exp + 127) as u32) << 23;
let man = (half_man & 0x03FFu32) << 13;
unsafe { mem::transmute::<u32, f32>(sign | exp | man) }
}
#[inline]
pub(crate) const fn f16_to_f64_fallback(i: u16) -> f64 {
// Check for signed zero
// TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
if i & 0x7FFFu16 == 0 {
return unsafe { mem::transmute::<u64, f64>((i as u64) << 48) };
}
let half_sign = (i & 0x8000u16) as u64;
let half_exp = (i & 0x7C00u16) as u64;
let half_man = (i & 0x03FFu16) as u64;
// Check for an infinity or NaN when all exponent bits set
if half_exp == 0x7C00u64 {
// Check for signed infinity if mantissa is zero
if half_man == 0 {
return unsafe {
mem::transmute::<u64, f64>((half_sign << 48) | 0x7FF0_0000_0000_0000u64)
};
} else {
// NaN, keep current mantissa but also set most significant mantissa bit
return unsafe {
mem::transmute::<u64, f64>(
(half_sign << 48) | 0x7FF8_0000_0000_0000u64 | (half_man << 42),
)
};
}
}
// Calculate double-precision components with adjusted exponent
let sign = half_sign << 48;
// Unbias exponent
let unbiased_exp = ((half_exp as i64) >> 10) - 15;
// Check for subnormals, which will be normalized by adjusting exponent
if half_exp == 0 {
// Calculate how much to adjust the exponent by
let e = leading_zeros_u16(half_man as u16) - 6;
// Rebias and adjust exponent
let exp = ((1023 - 15 - e) as u64) << 52;
let man = (half_man << (43 + e)) & 0xF_FFFF_FFFF_FFFFu64;
return unsafe { mem::transmute::<u64, f64>(sign | exp | man) };
}
// Rebias exponent for a normalized normal
let exp = ((unbiased_exp + 1023) as u64) << 52;
let man = (half_man & 0x03FFu64) << 42;
unsafe { mem::transmute::<u64, f64>(sign | exp | man) }
}
#[inline]
fn f16x4_to_f32x4_fallback(v: &[u16; 4]) -> [f32; 4] {
[
f16_to_f32_fallback(v[0]),
f16_to_f32_fallback(v[1]),
f16_to_f32_fallback(v[2]),
f16_to_f32_fallback(v[3]),
]
}
#[inline]
fn f32x4_to_f16x4_fallback(v: &[f32; 4]) -> [u16; 4] {
[
f32_to_f16_fallback(v[0]),
f32_to_f16_fallback(v[1]),
f32_to_f16_fallback(v[2]),
f32_to_f16_fallback(v[3]),
]
}
#[inline]
fn f16x4_to_f64x4_fallback(v: &[u16; 4]) -> [f64; 4] {
[
f16_to_f64_fallback(v[0]),
f16_to_f64_fallback(v[1]),
f16_to_f64_fallback(v[2]),
f16_to_f64_fallback(v[3]),
]
}
#[inline]
fn f64x4_to_f16x4_fallback(v: &[f64; 4]) -> [u16; 4] {
[
f64_to_f16_fallback(v[0]),
f64_to_f16_fallback(v[1]),
f64_to_f16_fallback(v[2]),
f64_to_f16_fallback(v[3]),
]
}
#[inline]
fn f16x8_to_f32x8_fallback(v: &[u16; 8]) -> [f32; 8] {
[
f16_to_f32_fallback(v[0]),
f16_to_f32_fallback(v[1]),
f16_to_f32_fallback(v[2]),
f16_to_f32_fallback(v[3]),
f16_to_f32_fallback(v[4]),
f16_to_f32_fallback(v[5]),
f16_to_f32_fallback(v[6]),
f16_to_f32_fallback(v[7]),
]
}
#[inline]
fn f32x8_to_f16x8_fallback(v: &[f32; 8]) -> [u16; 8] {
[
f32_to_f16_fallback(v[0]),
f32_to_f16_fallback(v[1]),
f32_to_f16_fallback(v[2]),
f32_to_f16_fallback(v[3]),
f32_to_f16_fallback(v[4]),
f32_to_f16_fallback(v[5]),
f32_to_f16_fallback(v[6]),
f32_to_f16_fallback(v[7]),
]
}
#[inline]
fn f16x8_to_f64x8_fallback(v: &[u16; 8]) -> [f64; 8] {
[
f16_to_f64_fallback(v[0]),
f16_to_f64_fallback(v[1]),
f16_to_f64_fallback(v[2]),
f16_to_f64_fallback(v[3]),
f16_to_f64_fallback(v[4]),
f16_to_f64_fallback(v[5]),
f16_to_f64_fallback(v[6]),
f16_to_f64_fallback(v[7]),
]
}
#[inline]
fn f64x8_to_f16x8_fallback(v: &[f64; 8]) -> [u16; 8] {
[
f64_to_f16_fallback(v[0]),
f64_to_f16_fallback(v[1]),
f64_to_f16_fallback(v[2]),
f64_to_f16_fallback(v[3]),
f64_to_f16_fallback(v[4]),
f64_to_f16_fallback(v[5]),
f64_to_f16_fallback(v[6]),
f64_to_f16_fallback(v[7]),
]
}
#[inline]
fn slice_fallback<S: Copy, D>(src: &[S], dst: &mut [D], f: fn(S) -> D) {
assert_eq!(src.len(), dst.len());
for (s, d) in src.iter().copied().zip(dst.iter_mut()) {
*d = f(s);
}
}
#[inline]
fn add_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) + f16_to_f32(b))
}
#[inline]
fn subtract_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) - f16_to_f32(b))
}
#[inline]
fn multiply_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) * f16_to_f32(b))
}
#[inline]
fn divide_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) / f16_to_f32(b))
}
#[inline]
fn remainder_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) % f16_to_f32(b))
}
#[inline]
fn product_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 {
f32_to_f16(iter.map(f16_to_f32).product())
}
#[inline]
fn sum_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 {
f32_to_f16(iter.map(f16_to_f32).sum())
}
// TODO SIMD arithmetic

vendor/half/src/binary16/arch/aarch64.rs (vendored new file, 175 lines)

@@ -0,0 +1,175 @@
use core::{
arch::{
aarch64::{float32x4_t, float64x2_t, uint16x4_t},
asm,
},
mem::MaybeUninit,
ptr,
};
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f16_to_f32_fp16(i: u16) -> f32 {
let result: f32;
asm!(
"fcvt {0:s}, {1:h}",
out(vreg) result,
in(vreg) i,
options(pure, nomem, nostack, preserves_flags));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f16_to_f64_fp16(i: u16) -> f64 {
let result: f64;
asm!(
"fcvt {0:d}, {1:h}",
out(vreg) result,
in(vreg) i,
options(pure, nomem, nostack, preserves_flags));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f32_to_f16_fp16(f: f32) -> u16 {
let result: u16;
asm!(
"fcvt {0:h}, {1:s}",
out(vreg) result,
in(vreg) f,
options(pure, nomem, nostack, preserves_flags));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f64_to_f16_fp16(f: f64) -> u16 {
let result: u16;
asm!(
"fcvt {0:h}, {1:d}",
out(vreg) result,
in(vreg) f,
options(pure, nomem, nostack, preserves_flags));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f16x4_to_f32x4_fp16(v: &[u16; 4]) -> [f32; 4] {
let mut vec = MaybeUninit::<uint16x4_t>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let result: float32x4_t;
asm!(
"fcvtl {0:v}.4s, {1:v}.4h",
out(vreg) result,
in(vreg) vec.assume_init(),
options(pure, nomem, nostack));
*(&result as *const float32x4_t).cast()
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f32x4_to_f16x4_fp16(v: &[f32; 4]) -> [u16; 4] {
let mut vec = MaybeUninit::<float32x4_t>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let result: uint16x4_t;
asm!(
"fcvtn {0:v}.4h, {1:v}.4s",
out(vreg) result,
in(vreg) vec.assume_init(),
options(pure, nomem, nostack));
*(&result as *const uint16x4_t).cast()
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f16x4_to_f64x4_fp16(v: &[u16; 4]) -> [f64; 4] {
let mut vec = MaybeUninit::<uint16x4_t>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let low: float64x2_t;
let high: float64x2_t;
asm!(
"fcvtl {2:v}.4s, {3:v}.4h", // Convert to f32
"fcvtl {0:v}.2d, {2:v}.2s", // Convert low part to f64
"fcvtl2 {1:v}.2d, {2:v}.4s", // Convert high part to f64
lateout(vreg) low,
lateout(vreg) high,
out(vreg) _,
in(vreg) vec.assume_init(),
options(pure, nomem, nostack));
*[low, high].as_ptr().cast()
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f64x4_to_f16x4_fp16(v: &[f64; 4]) -> [u16; 4] {
let mut low = MaybeUninit::<float64x2_t>::uninit();
let mut high = MaybeUninit::<float64x2_t>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), low.as_mut_ptr().cast(), 2);
ptr::copy_nonoverlapping(v[2..].as_ptr(), high.as_mut_ptr().cast(), 2);
let result: uint16x4_t;
asm!(
"fcvtn {1:v}.2s, {2:v}.2d", // Convert low to f32
"fcvtn2 {1:v}.4s, {3:v}.2d", // Convert high to f32
"fcvtn {0:v}.4h, {1:v}.4s", // Convert to f16
lateout(vreg) result,
out(vreg) _,
in(vreg) low.assume_init(),
in(vreg) high.assume_init(),
options(pure, nomem, nostack));
*(&result as *const uint16x4_t).cast()
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn add_f16_fp16(a: u16, b: u16) -> u16 {
let result: u16;
asm!(
"fadd {0:h}, {1:h}, {2:h}",
out(vreg) result,
in(vreg) a,
in(vreg) b,
options(pure, nomem, nostack));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn subtract_f16_fp16(a: u16, b: u16) -> u16 {
let result: u16;
asm!(
"fsub {0:h}, {1:h}, {2:h}",
out(vreg) result,
in(vreg) a,
in(vreg) b,
options(pure, nomem, nostack));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn multiply_f16_fp16(a: u16, b: u16) -> u16 {
let result: u16;
asm!(
"fmul {0:h}, {1:h}, {2:h}",
out(vreg) result,
in(vreg) a,
in(vreg) b,
options(pure, nomem, nostack));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn divide_f16_fp16(a: u16, b: u16) -> u16 {
let result: u16;
asm!(
"fdiv {0:h}, {1:h}, {2:h}",
out(vreg) result,
in(vreg) a,
in(vreg) b,
options(pure, nomem, nostack));
result
}

vendor/half/src/binary16/arch/x86.rs (vendored new file, 132 lines)

@@ -0,0 +1,132 @@
use core::{mem::MaybeUninit, ptr};
#[cfg(target_arch = "x86")]
use core::arch::x86::{
__m128, __m128i, __m256, _mm256_cvtph_ps, _mm256_cvtps_ph, _mm_cvtph_ps,
_MM_FROUND_TO_NEAREST_INT,
};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
__m128, __m128i, __m256, _mm256_cvtph_ps, _mm256_cvtps_ph, _mm_cvtph_ps, _mm_cvtps_ph,
_MM_FROUND_TO_NEAREST_INT,
};
#[cfg(target_arch = "x86")]
use core::arch::x86::_mm_cvtps_ph;
use super::convert_chunked_slice_8;
/////////////// x86/x86_64 f16c ////////////////
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16_to_f32_x86_f16c(i: u16) -> f32 {
let mut vec = MaybeUninit::<__m128i>::zeroed();
vec.as_mut_ptr().cast::<u16>().write(i);
let retval = _mm_cvtph_ps(vec.assume_init());
*(&retval as *const __m128).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 {
let mut vec = MaybeUninit::<__m128>::zeroed();
vec.as_mut_ptr().cast::<f32>().write(f);
let retval = _mm_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
*(&retval as *const __m128i).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16x4_to_f32x4_x86_f16c(v: &[u16; 4]) -> [f32; 4] {
let mut vec = MaybeUninit::<__m128i>::zeroed();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let retval = _mm_cvtph_ps(vec.assume_init());
*(&retval as *const __m128).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f32x4_to_f16x4_x86_f16c(v: &[f32; 4]) -> [u16; 4] {
let mut vec = MaybeUninit::<__m128>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let retval = _mm_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
*(&retval as *const __m128i).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16x4_to_f64x4_x86_f16c(v: &[u16; 4]) -> [f64; 4] {
let array = f16x4_to_f32x4_x86_f16c(v);
// Let compiler vectorize this regular cast for now.
// TODO: investigate auto-detecting sse2/avx convert features
[
array[0] as f64,
array[1] as f64,
array[2] as f64,
array[3] as f64,
]
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f64x4_to_f16x4_x86_f16c(v: &[f64; 4]) -> [u16; 4] {
// Let compiler vectorize this regular cast for now.
// TODO: investigate auto-detecting sse2/avx convert features
let v = [v[0] as f32, v[1] as f32, v[2] as f32, v[3] as f32];
f32x4_to_f16x4_x86_f16c(&v)
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16x8_to_f32x8_x86_f16c(v: &[u16; 8]) -> [f32; 8] {
let mut vec = MaybeUninit::<__m128i>::zeroed();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 8);
let retval = _mm256_cvtph_ps(vec.assume_init());
*(&retval as *const __m256).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f32x8_to_f16x8_x86_f16c(v: &[f32; 8]) -> [u16; 8] {
let mut vec = MaybeUninit::<__m256>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 8);
let retval = _mm256_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
*(&retval as *const __m128i).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16x8_to_f64x8_x86_f16c(v: &[u16; 8]) -> [f64; 8] {
let array = f16x8_to_f32x8_x86_f16c(v);
// Let compiler vectorize this regular cast for now.
// TODO: investigate auto-detecting sse2/avx convert features
[
array[0] as f64,
array[1] as f64,
array[2] as f64,
array[3] as f64,
array[4] as f64,
array[5] as f64,
array[6] as f64,
array[7] as f64,
]
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f64x8_to_f16x8_x86_f16c(v: &[f64; 8]) -> [u16; 8] {
// Let compiler vectorize this regular cast for now.
// TODO: investigate auto-detecting sse2/avx convert features
let v = [
v[0] as f32,
v[1] as f32,
v[2] as f32,
v[3] as f32,
v[4] as f32,
v[5] as f32,
v[6] as f32,
v[7] as f32,
];
f32x8_to_f16x8_x86_f16c(&v)
}

vendor/half/src/leading_zeros.rs (vendored new file, 65 lines)

@@ -0,0 +1,65 @@
// https://doc.rust-lang.org/std/primitive.u16.html#method.leading_zeros
#[cfg(not(any(all(
target_arch = "spirv",
not(all(
target_feature = "IntegerFunctions2INTEL",
target_feature = "SPV_INTEL_shader_integer_functions2"
))
))))]
#[inline]
pub(crate) const fn leading_zeros_u16(x: u16) -> u32 {
x.leading_zeros()
}
#[cfg(all(
target_arch = "spirv",
not(all(
target_feature = "IntegerFunctions2INTEL",
target_feature = "SPV_INTEL_shader_integer_functions2"
))
))]
#[inline]
pub(crate) const fn leading_zeros_u16(x: u16) -> u32 {
leading_zeros_u16_fallback(x)
}
#[cfg(any(
test,
all(
target_arch = "spirv",
not(all(
target_feature = "IntegerFunctions2INTEL",
target_feature = "SPV_INTEL_shader_integer_functions2"
))
)
))]
#[inline]
const fn leading_zeros_u16_fallback(mut x: u16) -> u32 {
use crunchy::unroll;
let mut c = 0;
let msb = 1 << 15;
unroll! { for i in 0 .. 16 {
if x & msb == 0 {
c += 1;
} else {
return c;
}
#[allow(unused_assignments)]
if i < 15 {
x <<= 1;
}
}}
c
}
#[cfg(test)]
mod test {
#[test]
fn leading_zeros_u16_fallback() {
for x in [44, 97, 304, 1179, 23571] {
assert_eq!(super::leading_zeros_u16_fallback(x), x.leading_zeros());
}
}
}

vendor/half/src/lib.rs (vendored new file, 271 lines)

@@ -0,0 +1,271 @@
//! A crate that provides support for half-precision 16-bit floating point types.
//!
//! This crate provides the [`struct@f16`] type, which is an implementation of the IEEE 754-2008 standard
//! [`binary16`] a.k.a "half" floating point type. This 16-bit floating point type is intended for
//! efficient storage where the full range and precision of a larger floating point value is not
//! required. This is especially useful for image storage formats.
//!
//! This crate also provides a [`struct@bf16`] type, an alternative 16-bit floating point format. The
//! [`bfloat16`] format is a truncated IEEE 754 standard `binary32` float that preserves the
//! exponent to allow the same range as [`f32`] but with only 8 bits of precision (instead of 11
//! bits for [`struct@f16`]). See the [`struct@bf16`] type for details.
//!
//! Because [`struct@f16`] and [`struct@bf16`] are primarily for efficient storage, floating point operations such
//! as addition, multiplication, etc. are not always implemented by hardware. When hardware does not
//! support these operations, this crate emulates them by converting the value to
//! [`f32`] before performing the operation and then back afterward.
//!
//! Note that conversions from [`f32`]/[`f64`] to both [`struct@f16`] and [`struct@bf16`] are lossy operations.
//! Just as converting an [`f64`] to an [`f32`] is lossy and has no `Into`/`From` trait
//! implementations, these smaller types do not provide those trait implementations either.
//! Instead, use the `from_f32`/`from_f64` functions for the types in this crate. If lossy
//! conversions are acceptable and trait-based conversions are needed, use the appropriate
//! [`num-traits`] traits that are implemented.
//!
//! This crate also provides a [`slice`][mod@slice] module for zero-copy in-place conversions of
//! [`u16`] slices to both [`struct@f16`] and [`struct@bf16`], as well as efficient vectorized conversions of
//! larger buffers of floating point values to and from these half formats.
//!
//! The crate supports `#![no_std]` when the `std` cargo feature is not enabled, so it can be used in
//! embedded environments without using the Rust [`std`] library. The `std` feature enables support
//! for the standard library and is enabled by default, see the [Cargo Features](#cargo-features)
//! section below.
//!
//! A [`prelude`] module is provided for easy importing of available utility traits.
//!
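//! A quick round-trip sketch using the conversion functions described above:
//!
//! ```rust
//! use half::f16;
//!
//! let half = f16::from_f32(1.5);
//! // 1.5 is exactly representable in binary16, so the round trip is lossless.
//! assert_eq!(half.to_f32(), 1.5);
//! ```
//!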
//! # Serialization
//!
//! When the `serde` feature is enabled, [`struct@f16`] and [`struct@bf16`] will be serialized as a newtype of
//! [`u16`] by default. In binary formats this is ideal, as it will generally use just two bytes for
//! storage. For string formats like JSON, however, this isn't as useful, and due to design
//! limitations of serde, it's not possible for the default `Serialize` implementation to support
//! different serialization for different formats.
//!
//! Instead, it's up to the container type of the floats to control how it is serialized. This can
//! easily be controlled when using the derive macros using `#[serde(serialize_with="")]`
//! attributes. For both [`struct@f16`] and [`struct@bf16`] a `serialize_as_f32` and `serialize_as_string` are
//! provided for use with this attribute.
//!
//! Deserialization of both float types supports deserializing from the default serialization,
//! strings, and `f32`/`f64` values, so no additional work is required.
//!
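//! For illustration, a hedged sketch of the `serialize_with` attribute (this assumes the
//! helpers are exposed as associated functions on [`struct@f16`]; adjust the path to match
//! the version in use):
//!
//! ```ignore
//! use half::f16;
//! use serde::Serialize;
//!
//! #[derive(Serialize)]
//! struct Reading {
//!     // Emit the value as an f32 so text formats such as JSON stay readable.
//!     #[serde(serialize_with = "f16::serialize_as_f32")]
//!     temperature: f16,
//! }
//! ```
//!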
//! # Hardware support
//!
//! Hardware support for these conversions and arithmetic will be used
//! whenever hardware support is available—either through intrinsics or targeted assembly—although
//! a nightly Rust toolchain may be required for some hardware. When hardware supports it the
//! functions and traits in the [`slice`][mod@slice] and [`vec`] modules will also use vectorized
//! SIMD instructions for increased efficiency.
//!
//! The following list details hardware support for floating point types in this crate. When using
//! the `std` cargo feature, runtime CPU feature detection will be used. To get the most performance
//! benefits, compile for specific CPU features, which avoids the runtime overhead and also works in
//! a `no_std` environment.
//!
//! | Architecture | CPU Target Feature | Notes |
//! | ------------ | ------------------ | ----- |
//! | `x86`/`x86_64` | `f16c` | This supports conversion to/from [`struct@f16`] only (including vector SIMD) and does not support any [`struct@bf16`] or arithmetic operations. |
//! | `aarch64` | `fp16` | This supports all operations on [`struct@f16`] only. |
//!
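//! For example, to opt into these code paths at compile time instead of relying on runtime
//! detection (an illustrative sketch; pick the feature matching the target architecture):
//!
//! ```text
//! RUSTFLAGS="-C target-feature=+f16c" cargo build --release   # x86/x86_64
//! RUSTFLAGS="-C target-feature=+fp16" cargo build --release   # aarch64
//! ```
//!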
//! # Cargo Features
//!
//! This crate supports a number of optional cargo features. None of these features are enabled by
//! default, even `std`.
//!
//! - **`alloc`** — Enable use of the [`alloc`] crate when not using the `std` library.
//!
//! Among other functions, this enables the [`vec`] module, which contains zero-copy
//! conversions for the [`Vec`] type. This allows fast conversion between raw `Vec<u16>` bits and
//! `Vec<f16>` or `Vec<bf16>` arrays, and vice versa.
//!
//! - **`std`** — Enable features that depend on the Rust [`std`] library. This also enables the
//! `alloc` feature automatically.
//!
//! Enabling the `std` feature enables runtime CPU feature detection of hardware support.
//! Without this feature detection, hardware support is only used when the compile target enables them.
//!
//! - **`serde`** — Adds support for the [`serde`] crate by implementing [`Serialize`] and
//! [`Deserialize`] traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`num-traits`** — Adds support for the [`num-traits`] crate by implementing [`ToPrimitive`],
//! [`FromPrimitive`], [`ToBytes`], `FromBytes`, [`AsPrimitive`], [`Num`], [`Float`],
//! [`FloatCore`], and [`Bounded`] traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`bytemuck`** — Adds support for the [`bytemuck`] crate by implementing [`Zeroable`] and
//! [`Pod`] traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`zerocopy`** — Adds support for the [`zerocopy`] crate by implementing [`IntoBytes`] and
//! [`FromBytes`] traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`rand_distr`** — Adds support for the [`rand_distr`] crate by implementing [`Distribution`]
//! and other traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`rkyv`** -- Enable zero-copy deserialization with the [`rkyv`] crate.
//!
//! - **`arbitrary`** -- Enable fuzzing support with the [`arbitrary`] crate by implementing
//! [`Arbitrary`] trait.
//!
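//! As an example, a minimal `Cargo.toml` entry enabling a common set of features (the
//! version number is illustrative):
//!
//! ```toml
//! [dependencies]
//! half = { version = "2", features = ["std", "serde", "num-traits"] }
//! ```
//!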
//! [`alloc`]: https://doc.rust-lang.org/alloc/
//! [`std`]: https://doc.rust-lang.org/std/
//! [`binary16`]: https://en.wikipedia.org/wiki/Half-precision_floating-point_format
//! [`bfloat16`]: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
//! [`serde`]: https://crates.io/crates/serde
//! [`bytemuck`]: https://crates.io/crates/bytemuck
//! [`num-traits`]: https://crates.io/crates/num-traits
//! [`zerocopy`]: https://crates.io/crates/zerocopy
//! [`rand_distr`]: https://crates.io/crates/rand_distr
//! [`rkyv`]: https://crates.io/crates/rkyv
//! [`arbitrary`]: https://crates.io/crates/arbitrary
#![cfg_attr(
feature = "alloc",
doc = "
[`vec`]: mod@vec"
)]
#![cfg_attr(
not(feature = "alloc"),
doc = "
[`vec`]: #
[`Vec`]: https://docs.rust-lang.org/stable/alloc/vec/struct.Vec.html"
)]
#![cfg_attr(
feature = "serde",
doc = "
[`Serialize`]: serde::Serialize
[`Deserialize`]: serde::Deserialize"
)]
#![cfg_attr(
not(feature = "serde"),
doc = "
[`Serialize`]: https://docs.rs/serde/*/serde/trait.Serialize.html
[`Deserialize`]: https://docs.rs/serde/*/serde/trait.Deserialize.html"
)]
#![cfg_attr(
feature = "num-traits",
doc = "
[`ToPrimitive`]: ::num_traits::ToPrimitive
[`FromPrimitive`]: ::num_traits::FromPrimitive
[`ToBytes`]: ::num_traits::ToBytes
[`AsPrimitive`]: ::num_traits::AsPrimitive
[`Num`]: ::num_traits::Num
[`Float`]: ::num_traits::Float
[`FloatCore`]: ::num_traits::float::FloatCore
[`Bounded`]: ::num_traits::Bounded"
)]
#![cfg_attr(
not(feature = "num-traits"),
doc = "
[`ToPrimitive`]: https://docs.rs/num-traits/*/num_traits/cast/trait.ToPrimitive.html
[`FromPrimitive`]: https://docs.rs/num-traits/*/num_traits/cast/trait.FromPrimitive.html
[`ToBytes`]: https://docs.rs/num-traits/*/num_traits/ops/bytes/trait.ToBytes.html
[`AsPrimitive`]: https://docs.rs/num-traits/*/num_traits/cast/trait.AsPrimitive.html
[`Num`]: https://docs.rs/num-traits/*/num_traits/trait.Num.html
[`Float`]: https://docs.rs/num-traits/*/num_traits/float/trait.Float.html
[`FloatCore`]: https://docs.rs/num-traits/*/num_traits/float/trait.FloatCore.html
[`Bounded`]: https://docs.rs/num-traits/*/num_traits/bounds/trait.Bounded.html"
)]
#![cfg_attr(
feature = "bytemuck",
doc = "
[`Zeroable`]: bytemuck::Zeroable
[`Pod`]: bytemuck::Pod"
)]
#![cfg_attr(
not(feature = "bytemuck"),
doc = "
[`Zeroable`]: https://docs.rs/bytemuck/*/bytemuck/trait.Zeroable.html
[`Pod`]: https://docs.rs/bytemuck/*/bytemuck/trait.Pod.html"
)]
#![cfg_attr(
feature = "zerocopy",
doc = "
[`IntoBytes`]: zerocopy::IntoBytes
[`FromBytes`]: zerocopy::FromBytes"
)]
#![cfg_attr(
not(feature = "zerocopy"),
doc = "
[`IntoBytes`]: https://docs.rs/zerocopy/*/zerocopy/trait.IntoBytes.html
[`FromBytes`]: https://docs.rs/zerocopy/*/zerocopy/trait.FromBytes.html"
)]
#![cfg_attr(
feature = "rand_distr",
doc = "
[`Distribution`]: rand::distr::Distribution"
)]
#![cfg_attr(
not(feature = "rand_distr"),
doc = "
[`Distribution`]: https://docs.rs/rand/*/rand/distr/trait.Distribution.html"
)]
#![cfg_attr(
feature = "arbitrary",
doc = "
[`Arbitrary`]: arbitrary::Arbitrary"
)]
#![cfg_attr(
not(feature = "arbitrary"),
doc = "
[`Arbitrary`]: https://docs.rs/arbitrary/*/arbitrary/trait.Arbitrary.html"
)]
#![warn(
missing_docs,
missing_copy_implementations,
trivial_numeric_casts,
future_incompatible
)]
#![cfg_attr(not(target_arch = "spirv"), warn(missing_debug_implementations))]
#![allow(clippy::verbose_bit_mask, clippy::cast_lossless, unexpected_cfgs)]
#![cfg_attr(not(feature = "std"), no_std)]
#![doc(html_root_url = "https://docs.rs/half/2.6.0")]
#![doc(test(attr(deny(warnings), allow(unused))))]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
#[cfg(feature = "alloc")]
extern crate alloc;
mod bfloat;
mod binary16;
mod leading_zeros;
#[cfg(feature = "num-traits")]
mod num_traits;
#[cfg(not(target_arch = "spirv"))]
pub mod slice;
#[cfg(feature = "alloc")]
pub mod vec;
pub use bfloat::bf16;
pub use binary16::f16;
#[cfg(feature = "rand_distr")]
mod rand_distr;
/// A collection of the most used items and traits in this crate for easy importing.
///
/// # Examples
///
/// ```rust
/// use half::prelude::*;
/// ```
pub mod prelude {
#[doc(no_inline)]
pub use crate::{bf16, f16};
#[cfg(not(target_arch = "spirv"))]
#[doc(no_inline)]
pub use crate::slice::{HalfBitsSliceExt, HalfFloatSliceExt};
#[cfg(feature = "alloc")]
#[doc(no_inline)]
pub use crate::vec::{HalfBitsVecExt, HalfFloatVecExt};
}
// Keep this module private to crate
mod private {
use crate::{bf16, f16};
pub trait SealedHalf {}
impl SealedHalf for f16 {}
impl SealedHalf for bf16 {}
}

vendor/half/src/num_traits.rs (vendored new file, 1550 lines; diff suppressed because it is too large)

vendor/half/src/rand_distr.rs (vendored new file, 125 lines)

@@ -0,0 +1,125 @@
use crate::{bf16, f16};
use rand::{distr::Distribution, Rng};
use rand_distr::uniform::UniformFloat;
macro_rules! impl_distribution_via_f32 {
($Ty:ty, $Distr:ty) => {
impl Distribution<$Ty> for $Distr {
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $Ty {
<$Ty>::from_f32(<Self as Distribution<f32>>::sample(self, rng))
}
}
};
}
impl_distribution_via_f32!(f16, rand_distr::StandardUniform);
impl_distribution_via_f32!(f16, rand_distr::StandardNormal);
impl_distribution_via_f32!(f16, rand_distr::Exp1);
impl_distribution_via_f32!(f16, rand_distr::Open01);
impl_distribution_via_f32!(f16, rand_distr::OpenClosed01);
impl_distribution_via_f32!(bf16, rand_distr::StandardUniform);
impl_distribution_via_f32!(bf16, rand_distr::StandardNormal);
impl_distribution_via_f32!(bf16, rand_distr::Exp1);
impl_distribution_via_f32!(bf16, rand_distr::Open01);
impl_distribution_via_f32!(bf16, rand_distr::OpenClosed01);
#[derive(Debug, Clone, Copy)]
pub struct Float16Sampler(UniformFloat<f32>);
impl rand_distr::uniform::SampleUniform for f16 {
type Sampler = Float16Sampler;
}
impl rand_distr::uniform::UniformSampler for Float16Sampler {
type X = f16;
fn new<B1, B2>(low: B1, high: B2) -> Result<Self, rand_distr::uniform::Error>
where
B1: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
B2: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
{
Ok(Self(UniformFloat::new(
low.borrow().to_f32(),
high.borrow().to_f32(),
)?))
}
fn new_inclusive<B1, B2>(low: B1, high: B2) -> Result<Self, rand_distr::uniform::Error>
where
B1: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
B2: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
{
Ok(Self(UniformFloat::new_inclusive(
low.borrow().to_f32(),
high.borrow().to_f32(),
)?))
}
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
f16::from_f32(self.0.sample(rng))
}
}
#[derive(Debug, Clone, Copy)]
pub struct BFloat16Sampler(UniformFloat<f32>);
impl rand_distr::uniform::SampleUniform for bf16 {
type Sampler = BFloat16Sampler;
}
impl rand_distr::uniform::UniformSampler for BFloat16Sampler {
type X = bf16;
fn new<B1, B2>(low: B1, high: B2) -> Result<Self, rand_distr::uniform::Error>
where
B1: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
B2: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
{
Ok(Self(UniformFloat::new(
low.borrow().to_f32(),
high.borrow().to_f32(),
)?))
}
fn new_inclusive<B1, B2>(low: B1, high: B2) -> Result<Self, rand_distr::uniform::Error>
where
B1: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
B2: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
{
Ok(Self(UniformFloat::new_inclusive(
low.borrow().to_f32(),
high.borrow().to_f32(),
)?))
}
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
bf16::from_f32(self.0.sample(rng))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[allow(unused_imports)]
use rand::{rng, Rng};
use rand_distr::{StandardNormal, StandardUniform, Uniform};
#[test]
fn test_sample_f16() {
let mut rng = rng();
let _: f16 = rng.sample(StandardUniform);
let _: f16 = rng.sample(StandardNormal);
let _: f16 = rng.sample(Uniform::new(f16::from_f32(0.0), f16::from_f32(1.0)).unwrap());
#[cfg(feature = "num-traits")]
let _: f16 =
rng.sample(rand_distr::Normal::new(f16::from_f32(0.0), f16::from_f32(1.0)).unwrap());
}
#[test]
fn test_sample_bf16() {
let mut rng = rng();
let _: bf16 = rng.sample(StandardUniform);
let _: bf16 = rng.sample(StandardNormal);
let _: bf16 = rng.sample(Uniform::new(bf16::from_f32(0.0), bf16::from_f32(1.0)).unwrap());
#[cfg(feature = "num-traits")]
let _: bf16 =
rng.sample(rand_distr::Normal::new(bf16::from_f32(0.0), bf16::from_f32(1.0)).unwrap());
}
}

vendor/half/src/slice.rs (vendored new file, 845 lines)

@@ -0,0 +1,845 @@
//! Contains utility functions and traits to convert between slices of [`u16`] bits and [`struct@f16`] or
//! [`struct@bf16`] numbers.
//!
//! The utility [`HalfBitsSliceExt`] sealed extension trait is implemented for `[u16]` slices,
//! while the utility [`HalfFloatSliceExt`] sealed extension trait is implemented for both `[f16]`
//! and `[bf16]` slices. These traits provide efficient conversions and reinterpret casting of
//! larger buffers of floating point values, and are automatically included in the
//! [`prelude`][crate::prelude] module.
use crate::{bf16, binary16::arch, f16};
#[cfg(feature = "alloc")]
#[allow(unused_imports)]
use alloc::{vec, vec::Vec};
use core::slice;
/// Extensions to `[f16]` and `[bf16]` slices to support conversion and reinterpret operations.
///
/// This trait is sealed and cannot be implemented outside of this crate.
pub trait HalfFloatSliceExt: private::SealedHalfFloatSlice {
/// Reinterprets a slice of [`struct@f16`] or [`struct@bf16`] numbers as a slice of [`u16`] bits.
///
/// This is a zero-copy operation. The reinterpreted slice has the same lifetime and memory
/// location as `self`.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let float_buffer = [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)];
/// let int_buffer = float_buffer.reinterpret_cast();
///
/// assert_eq!(int_buffer, [float_buffer[0].to_bits(), float_buffer[1].to_bits(), float_buffer[2].to_bits()]);
/// ```
#[must_use]
fn reinterpret_cast(&self) -> &[u16];
/// Reinterprets a mutable slice of [`struct@f16`] or [`struct@bf16`] numbers as a mutable slice of [`u16`]
/// bits.
///
/// This is a zero-copy operation. The transmuted slice has the same lifetime as the original,
/// which prevents mutating `self` as long as the returned `&mut [u16]` is borrowed.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let mut float_buffer = [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)];
///
/// {
/// let int_buffer = float_buffer.reinterpret_cast_mut();
///
/// assert_eq!(int_buffer, [f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()]);
///
/// // Mutating the u16 slice also mutates the original float slice
/// int_buffer[0] = 0;
/// }
///
/// // Note that we need to drop int_buffer before using float_buffer again or we will get a borrow error.
/// assert_eq!(float_buffer, [f16::from_f32(0.), f16::from_f32(2.), f16::from_f32(3.)]);
/// ```
#[must_use]
fn reinterpret_cast_mut(&mut self) -> &mut [u16];
/// Converts all of the elements of a `[f32]` slice into [`struct@f16`] or [`struct@bf16`] values in `self`.
///
/// The length of `src` must be the same as `self`.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// # Panics
///
/// This function will panic if the two slices have different lengths.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// // Initialize an empty buffer
/// let mut buffer = [0u16; 4];
/// let buffer = buffer.reinterpret_cast_mut::<f16>();
///
/// let float_values = [1., 2., 3., 4.];
///
/// // Now convert
/// buffer.convert_from_f32_slice(&float_values);
///
/// assert_eq!(buffer, [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.), f16::from_f32(4.)]);
/// ```
fn convert_from_f32_slice(&mut self, src: &[f32]);
/// Converts all of the elements of a `[f64]` slice into [`struct@f16`] or [`struct@bf16`] values in `self`.
///
/// The length of `src` must be the same as `self`.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// # Panics
///
/// This function will panic if the two slices have different lengths.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// // Initialize an empty buffer
/// let mut buffer = [0u16; 4];
/// let buffer = buffer.reinterpret_cast_mut::<f16>();
///
/// let float_values = [1., 2., 3., 4.];
///
/// // Now convert
/// buffer.convert_from_f64_slice(&float_values);
///
/// assert_eq!(buffer, [f16::from_f64(1.), f16::from_f64(2.), f16::from_f64(3.), f16::from_f64(4.)]);
/// ```
fn convert_from_f64_slice(&mut self, src: &[f64]);
/// Converts all of the [`struct@f16`] or [`struct@bf16`] elements of `self` into [`f32`] values in `dst`.
///
/// The length of `dst` must be the same as `self`.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// # Panics
///
/// This function will panic if the two slices have different lengths.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// // Initialize an empty buffer
/// let mut buffer = [0f32; 4];
///
/// let half_values = [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.), f16::from_f32(4.)];
///
/// // Now convert
/// half_values.convert_to_f32_slice(&mut buffer);
///
/// assert_eq!(buffer, [1., 2., 3., 4.]);
/// ```
fn convert_to_f32_slice(&self, dst: &mut [f32]);
/// Converts all of the [`struct@f16`] or [`struct@bf16`] elements of `self` into [`f64`] values in `dst`.
///
/// The length of `dst` must be the same as `self`.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// # Panics
///
/// This function will panic if the two slices have different lengths.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// // Initialize an empty buffer
/// let mut buffer = [0f64; 4];
///
/// let half_values = [f16::from_f64(1.), f16::from_f64(2.), f16::from_f64(3.), f16::from_f64(4.)];
///
/// // Now convert
/// half_values.convert_to_f64_slice(&mut buffer);
///
/// assert_eq!(buffer, [1., 2., 3., 4.]);
/// ```
fn convert_to_f64_slice(&self, dst: &mut [f64]);
// Because the trait is sealed, we can get away with different interfaces between features.
/// Converts all of the [`struct@f16`] or [`struct@bf16`] elements of `self` into [`f32`] values in a new
/// vector
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// This method is only available with the `std` or `alloc` feature.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// let half_values = [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.), f16::from_f32(4.)];
/// let vec = half_values.to_f32_vec();
///
/// assert_eq!(vec, vec![1., 2., 3., 4.]);
/// ```
#[cfg(any(feature = "alloc", feature = "std"))]
#[must_use]
fn to_f32_vec(&self) -> Vec<f32>;
/// Converts all of the [`struct@f16`] or [`struct@bf16`] elements of `self` into [`f64`] values in a new
/// vector.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// This method is only available with the `std` or `alloc` feature.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// let half_values = [f16::from_f64(1.), f16::from_f64(2.), f16::from_f64(3.), f16::from_f64(4.)];
/// let vec = half_values.to_f64_vec();
///
/// assert_eq!(vec, vec![1., 2., 3., 4.]);
/// ```
    #[cfg(any(feature = "alloc", feature = "std"))]
#[must_use]
fn to_f64_vec(&self) -> Vec<f64>;
}
/// Extensions to `[u16]` slices to support reinterpret operations.
///
/// This trait is sealed and cannot be implemented outside of this crate.
pub trait HalfBitsSliceExt: private::SealedHalfBitsSlice {
/// Reinterprets a slice of [`u16`] bits as a slice of [`struct@f16`] or [`struct@bf16`] numbers.
///
/// `H` is the type to cast to, and must be either the [`struct@f16`] or [`struct@bf16`] type.
///
/// This is a zero-copy operation. The reinterpreted slice has the same lifetime and memory
/// location as `self`.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let int_buffer = [f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()];
/// let float_buffer: &[f16] = int_buffer.reinterpret_cast();
///
/// assert_eq!(float_buffer, [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)]);
///
/// // You may have to specify the cast type directly if the compiler can't infer the type.
/// // The following is also valid in Rust.
/// let typed_buffer = int_buffer.reinterpret_cast::<f16>();
/// ```
#[must_use]
fn reinterpret_cast<H>(&self) -> &[H]
where
H: crate::private::SealedHalf;
/// Reinterprets a mutable slice of [`u16`] bits as a mutable slice of [`struct@f16`] or [`struct@bf16`]
/// numbers.
///
/// `H` is the type to cast to, and must be either the [`struct@f16`] or [`struct@bf16`] type.
///
/// This is a zero-copy operation. The transmuted slice has the same lifetime as the original,
/// which prevents mutating `self` as long as the returned `&mut [f16]` is borrowed.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let mut int_buffer = [f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()];
///
/// {
/// let float_buffer: &mut [f16] = int_buffer.reinterpret_cast_mut();
///
/// assert_eq!(float_buffer, [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)]);
///
    ///     // Mutating the f16 slice will also mutate the original
/// float_buffer[0] = f16::from_f32(0.);
/// }
///
/// // Note that we need to drop float_buffer before using int_buffer again or we will get a borrow error.
/// assert_eq!(int_buffer, [f16::from_f32(0.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()]);
///
/// // You may have to specify the cast type directly if the compiler can't infer the type.
/// // The following is also valid in Rust.
/// let typed_buffer = int_buffer.reinterpret_cast_mut::<f16>();
/// ```
#[must_use]
fn reinterpret_cast_mut<H>(&mut self) -> &mut [H]
where
H: crate::private::SealedHalf;
}
mod private {
use crate::{bf16, f16};
pub trait SealedHalfFloatSlice {}
impl SealedHalfFloatSlice for [f16] {}
impl SealedHalfFloatSlice for [bf16] {}
pub trait SealedHalfBitsSlice {}
impl SealedHalfBitsSlice for [u16] {}
}
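// Note: the sealed-trait pattern above is what keeps these extension traits closed to outside
// implementations: `HalfFloatSliceExt` and `HalfBitsSliceExt` each require a supertrait that
// lives in this private module, so downstream crates can call the methods but cannot implement
// the traits themselves.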
impl HalfFloatSliceExt for [f16] {
#[inline]
fn reinterpret_cast(&self) -> &[u16] {
let pointer = self.as_ptr() as *const u16;
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts(pointer, length) }
}
#[inline]
fn reinterpret_cast_mut(&mut self) -> &mut [u16] {
let pointer = self.as_mut_ptr().cast::<u16>();
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts_mut(pointer, length) }
}
#[inline]
fn convert_from_f32_slice(&mut self, src: &[f32]) {
assert_eq!(
self.len(),
src.len(),
"destination and source slices have different lengths"
);
arch::f32_to_f16_slice(src, self.reinterpret_cast_mut())
}
#[inline]
fn convert_from_f64_slice(&mut self, src: &[f64]) {
assert_eq!(
self.len(),
src.len(),
"destination and source slices have different lengths"
);
arch::f64_to_f16_slice(src, self.reinterpret_cast_mut())
}
#[inline]
fn convert_to_f32_slice(&self, dst: &mut [f32]) {
assert_eq!(
self.len(),
dst.len(),
"destination and source slices have different lengths"
);
arch::f16_to_f32_slice(self.reinterpret_cast(), dst)
}
#[inline]
fn convert_to_f64_slice(&self, dst: &mut [f64]) {
assert_eq!(
self.len(),
dst.len(),
"destination and source slices have different lengths"
);
arch::f16_to_f64_slice(self.reinterpret_cast(), dst)
}
#[cfg(any(feature = "alloc", feature = "std"))]
#[inline]
#[allow(clippy::uninit_vec)]
fn to_f32_vec(&self) -> Vec<f32> {
let mut vec = vec![0f32; self.len()];
self.convert_to_f32_slice(&mut vec);
vec
}
#[cfg(any(feature = "alloc", feature = "std"))]
#[inline]
#[allow(clippy::uninit_vec)]
fn to_f64_vec(&self) -> Vec<f64> {
let mut vec = vec![0f64; self.len()];
self.convert_to_f64_slice(&mut vec);
vec
}
}
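// The `[bf16]` implementation below mirrors the `[f16]` one above, except that the slice
// conversions fall back to plain scalar loops, since no bf16 SIMD path is wired into `arch`
// yet (see the comments inside each conversion method).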
impl HalfFloatSliceExt for [bf16] {
#[inline]
fn reinterpret_cast(&self) -> &[u16] {
let pointer = self.as_ptr() as *const u16;
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts(pointer, length) }
}
#[inline]
fn reinterpret_cast_mut(&mut self) -> &mut [u16] {
let pointer = self.as_mut_ptr().cast::<u16>();
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts_mut(pointer, length) }
}
#[inline]
fn convert_from_f32_slice(&mut self, src: &[f32]) {
assert_eq!(
self.len(),
src.len(),
"destination and source slices have different lengths"
);
        // Just use a regular loop here until there's bf16 SIMD support.
for (i, f) in src.iter().enumerate() {
self[i] = bf16::from_f32(*f);
}
}
#[inline]
fn convert_from_f64_slice(&mut self, src: &[f64]) {
assert_eq!(
self.len(),
src.len(),
"destination and source slices have different lengths"
);
        // Just use a regular loop here until there's bf16 SIMD support.
for (i, f) in src.iter().enumerate() {
self[i] = bf16::from_f64(*f);
}
}
#[inline]
fn convert_to_f32_slice(&self, dst: &mut [f32]) {
assert_eq!(
self.len(),
dst.len(),
"destination and source slices have different lengths"
);
        // Just use a regular loop here until there's bf16 SIMD support.
for (i, f) in self.iter().enumerate() {
dst[i] = f.to_f32();
}
}
#[inline]
fn convert_to_f64_slice(&self, dst: &mut [f64]) {
assert_eq!(
self.len(),
dst.len(),
"destination and source slices have different lengths"
);
        // Just use a regular loop here until there's bf16 SIMD support.
for (i, f) in self.iter().enumerate() {
dst[i] = f.to_f64();
}
}
#[cfg(any(feature = "alloc", feature = "std"))]
#[inline]
#[allow(clippy::uninit_vec)]
fn to_f32_vec(&self) -> Vec<f32> {
let mut vec = vec![0f32; self.len()];
self.convert_to_f32_slice(&mut vec);
vec
}
#[cfg(any(feature = "alloc", feature = "std"))]
#[inline]
#[allow(clippy::uninit_vec)]
fn to_f64_vec(&self) -> Vec<f64> {
let mut vec = vec![0f64; self.len()];
self.convert_to_f64_slice(&mut vec);
vec
}
}
impl HalfBitsSliceExt for [u16] {
// Since we sealed all the traits involved, these are safe.
#[inline]
fn reinterpret_cast<H>(&self) -> &[H]
where
H: crate::private::SealedHalf,
{
let pointer = self.as_ptr() as *const H;
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts(pointer, length) }
}
#[inline]
fn reinterpret_cast_mut<H>(&mut self) -> &mut [H]
where
H: crate::private::SealedHalf,
{
let pointer = self.as_mut_ptr() as *mut H;
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts_mut(pointer, length) }
}
}
#[allow(clippy::float_cmp)]
#[cfg(test)]
mod test {
use super::{HalfBitsSliceExt, HalfFloatSliceExt};
use crate::{bf16, f16};
#[test]
fn test_slice_conversions_f16() {
let bits = &[
f16::E.to_bits(),
f16::PI.to_bits(),
f16::EPSILON.to_bits(),
f16::FRAC_1_SQRT_2.to_bits(),
];
let numbers = &[f16::E, f16::PI, f16::EPSILON, f16::FRAC_1_SQRT_2];
// Convert from bits to numbers
let from_bits = bits.reinterpret_cast::<f16>();
assert_eq!(from_bits, numbers);
// Convert from numbers back to bits
let to_bits = from_bits.reinterpret_cast();
assert_eq!(to_bits, bits);
}
#[test]
    fn test_mutability_f16() {
let mut bits_array = [f16::PI.to_bits()];
let bits = &mut bits_array[..];
{
// would not compile without these braces
let numbers = bits.reinterpret_cast_mut();
numbers[0] = f16::E;
}
assert_eq!(bits, &[f16::E.to_bits()]);
bits[0] = f16::LN_2.to_bits();
assert_eq!(bits, &[f16::LN_2.to_bits()]);
}
#[test]
fn test_slice_conversions_bf16() {
let bits = &[
bf16::E.to_bits(),
bf16::PI.to_bits(),
bf16::EPSILON.to_bits(),
bf16::FRAC_1_SQRT_2.to_bits(),
];
let numbers = &[bf16::E, bf16::PI, bf16::EPSILON, bf16::FRAC_1_SQRT_2];
// Convert from bits to numbers
let from_bits = bits.reinterpret_cast::<bf16>();
assert_eq!(from_bits, numbers);
// Convert from numbers back to bits
let to_bits = from_bits.reinterpret_cast();
assert_eq!(to_bits, bits);
}
#[test]
    fn test_mutability_bf16() {
let mut bits_array = [bf16::PI.to_bits()];
let bits = &mut bits_array[..];
{
// would not compile without these braces
let numbers = bits.reinterpret_cast_mut();
numbers[0] = bf16::E;
}
assert_eq!(bits, &[bf16::E.to_bits()]);
bits[0] = bf16::LN_2.to_bits();
assert_eq!(bits, &[bf16::LN_2.to_bits()]);
}
#[test]
fn slice_convert_f16_f32() {
// Exact chunks
let vf32 = [1., 2., 3., 4., 5., 6., 7., 8.];
let vf16 = [
f16::from_f32(1.),
f16::from_f32(2.),
f16::from_f32(3.),
f16::from_f32(4.),
f16::from_f32(5.),
f16::from_f32(6.),
f16::from_f32(7.),
f16::from_f32(8.),
];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
// Partial with chunks
let vf32 = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
let vf16 = [
f16::from_f32(1.),
f16::from_f32(2.),
f16::from_f32(3.),
f16::from_f32(4.),
f16::from_f32(5.),
f16::from_f32(6.),
f16::from_f32(7.),
f16::from_f32(8.),
f16::from_f32(9.),
];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
        // Partial without a complete chunk
let vf32 = [1., 2.];
let vf16 = [f16::from_f32(1.), f16::from_f32(2.)];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
}
#[test]
fn slice_convert_bf16_f32() {
// Exact chunks
let vf32 = [1., 2., 3., 4., 5., 6., 7., 8.];
let vf16 = [
bf16::from_f32(1.),
bf16::from_f32(2.),
bf16::from_f32(3.),
bf16::from_f32(4.),
bf16::from_f32(5.),
bf16::from_f32(6.),
bf16::from_f32(7.),
bf16::from_f32(8.),
];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
// Partial with chunks
let vf32 = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
let vf16 = [
bf16::from_f32(1.),
bf16::from_f32(2.),
bf16::from_f32(3.),
bf16::from_f32(4.),
bf16::from_f32(5.),
bf16::from_f32(6.),
bf16::from_f32(7.),
bf16::from_f32(8.),
bf16::from_f32(9.),
];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
        // Partial without a complete chunk
let vf32 = [1., 2.];
let vf16 = [bf16::from_f32(1.), bf16::from_f32(2.)];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
}
#[test]
fn slice_convert_f16_f64() {
// Exact chunks
let vf64 = [1., 2., 3., 4., 5., 6., 7., 8.];
let vf16 = [
f16::from_f64(1.),
f16::from_f64(2.),
f16::from_f64(3.),
f16::from_f64(4.),
f16::from_f64(5.),
f16::from_f64(6.),
f16::from_f64(7.),
f16::from_f64(8.),
];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
// Partial with chunks
let vf64 = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
let vf16 = [
f16::from_f64(1.),
f16::from_f64(2.),
f16::from_f64(3.),
f16::from_f64(4.),
f16::from_f64(5.),
f16::from_f64(6.),
f16::from_f64(7.),
f16::from_f64(8.),
f16::from_f64(9.),
];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
        // Partial without a complete chunk
let vf64 = [1., 2.];
let vf16 = [f16::from_f64(1.), f16::from_f64(2.)];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
}
#[test]
fn slice_convert_bf16_f64() {
// Exact chunks
let vf64 = [1., 2., 3., 4., 5., 6., 7., 8.];
let vf16 = [
bf16::from_f64(1.),
bf16::from_f64(2.),
bf16::from_f64(3.),
bf16::from_f64(4.),
bf16::from_f64(5.),
bf16::from_f64(6.),
bf16::from_f64(7.),
bf16::from_f64(8.),
];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
// Partial with chunks
let vf64 = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
let vf16 = [
bf16::from_f64(1.),
bf16::from_f64(2.),
bf16::from_f64(3.),
bf16::from_f64(4.),
bf16::from_f64(5.),
bf16::from_f64(6.),
bf16::from_f64(7.),
bf16::from_f64(8.),
bf16::from_f64(9.),
];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
        // Partial without a complete chunk
let vf64 = [1., 2.];
let vf16 = [bf16::from_f64(1.), bf16::from_f64(2.)];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
}
#[test]
#[should_panic]
fn convert_from_f32_slice_len_mismatch_panics() {
let mut slice1 = [f16::ZERO; 3];
let slice2 = [0f32; 4];
slice1.convert_from_f32_slice(&slice2);
}
#[test]
#[should_panic]
fn convert_from_f64_slice_len_mismatch_panics() {
let mut slice1 = [f16::ZERO; 3];
let slice2 = [0f64; 4];
slice1.convert_from_f64_slice(&slice2);
}
#[test]
#[should_panic]
fn convert_to_f32_slice_len_mismatch_panics() {
let slice1 = [f16::ZERO; 3];
let mut slice2 = [0f32; 4];
slice1.convert_to_f32_slice(&mut slice2);
}
#[test]
#[should_panic]
fn convert_to_f64_slice_len_mismatch_panics() {
let slice1 = [f16::ZERO; 3];
let mut slice2 = [0f64; 4];
slice1.convert_to_f64_slice(&mut slice2);
}
}

260
vendor/half/src/vec.rs vendored Normal file
View File

@@ -0,0 +1,260 @@
//! Contains utility functions and traits to convert between vectors of [`u16`] bits and [`struct@f16`] or
//! [`bf16`] vectors.
//!
//! The utility [`HalfBitsVecExt`] sealed extension trait is implemented for [`Vec<u16>`] vectors,
//! while the utility [`HalfFloatVecExt`] sealed extension trait is implemented for both
//! [`Vec<f16>`] and [`Vec<bf16>`] vectors. These traits provide efficient conversions and
//! reinterpret casting of larger buffers of floating point values, and are automatically included
//! in the [`prelude`][crate::prelude] module.
//!
//! This module is only available with the `std` or `alloc` feature.
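//!
//! As a quick, illustrative sketch of how these traits compose (each operation is documented
//! in detail on the traits below), a round trip between f32 values, `Vec<f16>`, and raw
//! `Vec<u16>` bit patterns might look like this:
//!
//! ```rust
//! # use half::prelude::*;
//! // Convert a slice of f32 values into a new Vec<f16>.
//! let halves: Vec<f16> = Vec::from_f32_slice(&[1., 2., 3.]);
//! // Reinterpret the same allocation as raw u16 bit patterns, and back again.
//! let bits: Vec<u16> = halves.reinterpret_into();
//! let halves_again = bits.reinterpret_into::<f16>();
//! assert_eq!(halves_again, vec![f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)]);
//! ```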
use super::{bf16, f16, slice::HalfFloatSliceExt};
#[cfg(feature = "alloc")]
#[allow(unused_imports)]
use alloc::{vec, vec::Vec};
use core::mem;
/// Extensions to [`Vec<f16>`] and [`Vec<bf16>`] to support reinterpret operations.
///
/// This trait is sealed and cannot be implemented outside of this crate.
pub trait HalfFloatVecExt: private::SealedHalfFloatVec {
    /// Reinterprets a vector of [`struct@f16`] or [`bf16`] numbers as a vector of [`u16`] bits.
///
/// This is a zero-copy operation. The reinterpreted vector has the same memory location as
/// `self`.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let float_buffer = vec![f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)];
/// let int_buffer = float_buffer.reinterpret_into();
///
/// assert_eq!(int_buffer, [f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()]);
/// ```
#[must_use]
fn reinterpret_into(self) -> Vec<u16>;
/// Converts all of the elements of a `[f32]` slice into a new [`struct@f16`] or [`bf16`] vector.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation][crate] for more information on hardware conversion
/// support.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// let float_values = [1., 2., 3., 4.];
/// let vec: Vec<f16> = Vec::from_f32_slice(&float_values);
///
/// assert_eq!(vec, vec![f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.), f16::from_f32(4.)]);
/// ```
#[must_use]
fn from_f32_slice(slice: &[f32]) -> Self;
/// Converts all of the elements of a `[f64]` slice into a new [`struct@f16`] or [`bf16`] vector.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation][crate] for more information on hardware conversion
/// support.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// let float_values = [1., 2., 3., 4.];
/// let vec: Vec<f16> = Vec::from_f64_slice(&float_values);
///
/// assert_eq!(vec, vec![f16::from_f64(1.), f16::from_f64(2.), f16::from_f64(3.), f16::from_f64(4.)]);
/// ```
#[must_use]
fn from_f64_slice(slice: &[f64]) -> Self;
}
/// Extensions to [`Vec<u16>`] to support reinterpret operations.
///
/// This trait is sealed and cannot be implemented outside of this crate.
pub trait HalfBitsVecExt: private::SealedHalfBitsVec {
/// Reinterprets a vector of [`u16`] bits as a vector of [`struct@f16`] or [`bf16`] numbers.
///
/// `H` is the type to cast to, and must be either the [`struct@f16`] or [`bf16`] type.
///
/// This is a zero-copy operation. The reinterpreted vector has the same memory location as
/// `self`.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let int_buffer = vec![f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()];
/// let float_buffer = int_buffer.reinterpret_into::<f16>();
///
/// assert_eq!(float_buffer, [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)]);
/// ```
#[must_use]
fn reinterpret_into<H>(self) -> Vec<H>
where
H: crate::private::SealedHalf;
}
mod private {
use crate::{bf16, f16};
#[cfg(feature = "alloc")]
#[allow(unused_imports)]
use alloc::vec::Vec;
pub trait SealedHalfFloatVec {}
impl SealedHalfFloatVec for Vec<f16> {}
impl SealedHalfFloatVec for Vec<bf16> {}
pub trait SealedHalfBitsVec {}
impl SealedHalfBitsVec for Vec<u16> {}
}
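// Note on the `reinterpret_into` implementations below: each one forgets the source vector and
// rebuilds it with `Vec::from_raw_parts`, relying on `f16` and `bf16` being single-field
// wrappers around `u16` (identical size and alignment), as the inline comments spell out. With
// that layout guarantee, the length, capacity, and allocation carry over unchanged.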
impl HalfFloatVecExt for Vec<f16> {
#[inline]
fn reinterpret_into(mut self) -> Vec<u16> {
        // An f16 vector has the same length and capacity as a u16 vector
let length = self.len();
let capacity = self.capacity();
// Actually reinterpret the contents of the Vec<f16> as u16,
// knowing that structs are represented as only their members in memory,
// which is the u16 part of `f16(u16)`
let pointer = self.as_mut_ptr() as *mut u16;
        // Prevent running a destructor on the old Vec<f16>, so its allocation won't be freed
mem::forget(self);
        // Finally construct a new Vec<u16> from the raw pointer
        // SAFETY: We are reconstructing the full length and capacity of the original vector,
        // using its original pointer, and the element size is identical.
unsafe { Vec::from_raw_parts(pointer, length, capacity) }
}
#[allow(clippy::uninit_vec)]
fn from_f32_slice(slice: &[f32]) -> Self {
let mut vec = vec![f16::from_bits(0); slice.len()];
vec.convert_from_f32_slice(slice);
vec
}
#[allow(clippy::uninit_vec)]
fn from_f64_slice(slice: &[f64]) -> Self {
let mut vec = vec![f16::from_bits(0); slice.len()];
vec.convert_from_f64_slice(slice);
vec
}
}
impl HalfFloatVecExt for Vec<bf16> {
#[inline]
fn reinterpret_into(mut self) -> Vec<u16> {
        // A bf16 vector has the same length and capacity as a u16 vector
let length = self.len();
let capacity = self.capacity();
        // Actually reinterpret the contents of the Vec<bf16> as u16,
        // knowing that structs are represented as only their members in memory,
        // which is the u16 part of `bf16(u16)`
let pointer = self.as_mut_ptr() as *mut u16;
        // Prevent running a destructor on the old Vec<bf16>, so its allocation won't be freed
mem::forget(self);
        // Finally construct a new Vec<u16> from the raw pointer
        // SAFETY: We are reconstructing the full length and capacity of the original vector,
        // using its original pointer, and the element size is identical.
unsafe { Vec::from_raw_parts(pointer, length, capacity) }
}
#[allow(clippy::uninit_vec)]
fn from_f32_slice(slice: &[f32]) -> Self {
let mut vec = vec![bf16::from_bits(0); slice.len()];
vec.convert_from_f32_slice(slice);
vec
}
#[allow(clippy::uninit_vec)]
fn from_f64_slice(slice: &[f64]) -> Self {
let mut vec = vec![bf16::from_bits(0); slice.len()];
vec.convert_from_f64_slice(slice);
vec
}
}
impl HalfBitsVecExt for Vec<u16> {
// This is safe because all traits are sealed
#[inline]
fn reinterpret_into<H>(mut self) -> Vec<H>
where
H: crate::private::SealedHalf,
{
        // An f16 or bf16 vector has the same length and capacity as a u16 vector
let length = self.len();
let capacity = self.capacity();
        // Actually reinterpret the contents of the Vec<u16> as the target half type,
        // knowing that these structs are represented as only their u16 member in memory,
        // i.e. the u16 part of `f16(u16)` or `bf16(u16)`
let pointer = self.as_mut_ptr() as *mut H;
        // Prevent running a destructor on the old Vec<u16>, so its allocation won't be freed
mem::forget(self);
        // Finally construct a new Vec<H> from the raw pointer
        // SAFETY: We are reconstructing the full length and capacity of the original vector,
        // using its original pointer, and the element size is identical.
unsafe { Vec::from_raw_parts(pointer, length, capacity) }
}
}
#[cfg(test)]
mod test {
use super::{HalfBitsVecExt, HalfFloatVecExt};
use crate::{bf16, f16};
#[cfg(all(feature = "alloc", not(feature = "std")))]
use alloc::vec;
#[test]
fn test_vec_conversions_f16() {
let numbers = vec![f16::E, f16::PI, f16::EPSILON, f16::FRAC_1_SQRT_2];
let bits = vec![
f16::E.to_bits(),
f16::PI.to_bits(),
f16::EPSILON.to_bits(),
f16::FRAC_1_SQRT_2.to_bits(),
];
let bits_cloned = bits.clone();
// Convert from bits to numbers
let from_bits = bits.reinterpret_into::<f16>();
assert_eq!(&from_bits[..], &numbers[..]);
// Convert from numbers back to bits
let to_bits = from_bits.reinterpret_into();
assert_eq!(&to_bits[..], &bits_cloned[..]);
}
#[test]
fn test_vec_conversions_bf16() {
let numbers = vec![bf16::E, bf16::PI, bf16::EPSILON, bf16::FRAC_1_SQRT_2];
let bits = vec![
bf16::E.to_bits(),
bf16::PI.to_bits(),
bf16::EPSILON.to_bits(),
bf16::FRAC_1_SQRT_2.to_bits(),
];
let bits_cloned = bits.clone();
// Convert from bits to numbers
let from_bits = bits.reinterpret_into::<bf16>();
assert_eq!(&from_bits[..], &numbers[..]);
// Convert from numbers back to bits
let to_bits = from_bits.reinterpret_into();
assert_eq!(&to_bits[..], &bits_cloned[..]);
}
}