Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

vendor/half/src/bfloat.rs (vendored new file, 1880 lines; diff suppressed because it is too large)

vendor/half/src/bfloat/convert.rs (vendored new file, 152 lines)

@@ -0,0 +1,152 @@
use crate::leading_zeros::leading_zeros_u16;
use core::mem;
#[inline]
pub(crate) const fn f32_to_bf16(value: f32) -> u16 {
// TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
// Convert to raw bytes
let x: u32 = unsafe { mem::transmute::<f32, u32>(value) };
// check for NaN
if x & 0x7FFF_FFFFu32 > 0x7F80_0000u32 {
// Keep high part of current mantissa but also set most significant mantissa bit
return ((x >> 16) | 0x0040u32) as u16;
}
// round and shift
let round_bit = 0x0000_8000u32;
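// Round to nearest, ties to even: the mask 3 * round_bit - 1 covers the retained LSB
// (0x0001_0000) plus the bits below the round bit, so it is nonzero exactly when the
// removed part exceeds a tie, or ties with an odd retained value (see binary16/arch.rs).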
if (x & round_bit) != 0 && (x & (3 * round_bit - 1)) != 0 {
(x >> 16) as u16 + 1
} else {
(x >> 16) as u16
}
}
#[inline]
pub(crate) const fn f64_to_bf16(value: f64) -> u16 {
// TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
// Convert to raw bytes, truncating the last 32 bits of mantissa; that precision will always
// be lost when converting to bfloat16.
let val: u64 = unsafe { mem::transmute::<f64, u64>(value) };
let x = (val >> 32) as u32;
// Extract IEEE754 components
let sign = x & 0x8000_0000u32;
let exp = x & 0x7FF0_0000u32;
let man = x & 0x000F_FFFFu32;
// Check for all exponent bits being set, which is Infinity or NaN
if exp == 0x7FF0_0000u32 {
// Set mantissa MSB for NaN (and also keep shifted mantissa bits).
// We also have to check the last 32 bits.
let nan_bit = if man == 0 && (val as u32 == 0) {
0
} else {
0x0040u32
};
return ((sign >> 16) | 0x7F80u32 | nan_bit | (man >> 13)) as u16;
}
// The number is normalized, start assembling the bfloat16 version
let half_sign = sign >> 16;
// Unbias the exponent, then bias for bfloat16 precision
let unbiased_exp = ((exp >> 20) as i64) - 1023;
let half_exp = unbiased_exp + 127;
// Check for exponent overflow, return +infinity
if half_exp >= 0xFF {
return (half_sign | 0x7F80u32) as u16;
}
// Check for underflow
if half_exp <= 0 {
// Check mantissa for what we can do
if 7 - half_exp > 21 {
// No rounding possibility, so this is a full underflow, return signed zero
return half_sign as u16;
}
// Don't forget about hidden leading mantissa bit when assembling mantissa
let man = man | 0x0010_0000u32;
let mut half_man = man >> (14 - half_exp);
// Check for rounding
let round_bit = 1 << (13 - half_exp);
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
half_man += 1;
}
// No exponent for subnormals
return (half_sign | half_man) as u16;
}
// Rebias the exponent
let half_exp = (half_exp as u32) << 7;
let half_man = man >> 13;
// Check for rounding
let round_bit = 0x0000_1000u32;
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
// Round it
((half_sign | half_exp | half_man) + 1) as u16
} else {
(half_sign | half_exp | half_man) as u16
}
}
#[inline]
pub(crate) const fn bf16_to_f32(i: u16) -> f32 {
// TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
// If NaN, keep current mantissa but also set most significant mantissa bit
if i & 0x7FFFu16 > 0x7F80u16 {
unsafe { mem::transmute::<u32, f32>((i as u32 | 0x0040u32) << 16) }
} else {
unsafe { mem::transmute::<u32, f32>((i as u32) << 16) }
}
}
#[inline]
pub(crate) const fn bf16_to_f64(i: u16) -> f64 {
// TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
// Check for signed zero
if i & 0x7FFFu16 == 0 {
return unsafe { mem::transmute::<u64, f64>((i as u64) << 48) };
}
let half_sign = (i & 0x8000u16) as u64;
let half_exp = (i & 0x7F80u16) as u64;
let half_man = (i & 0x007Fu16) as u64;
// Check for an infinity or NaN when all exponent bits set
if half_exp == 0x7F80u64 {
// Check for signed infinity if mantissa is zero
if half_man == 0 {
return unsafe {
mem::transmute::<u64, f64>((half_sign << 48) | 0x7FF0_0000_0000_0000u64)
};
} else {
// NaN, keep current mantissa but also set most significant mantissa bit
return unsafe {
mem::transmute::<u64, f64>(
(half_sign << 48) | 0x7FF8_0000_0000_0000u64 | (half_man << 45),
)
};
}
}
// Calculate double-precision components with adjusted exponent
let sign = half_sign << 48;
// Unbias exponent
let unbiased_exp = ((half_exp as i64) >> 7) - 127;
// Check for subnormals, which will be normalized by adjusting exponent
if half_exp == 0 {
// Calculate how much to adjust the exponent by
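// half_man fits in the low 7 bits of the u16 (masked with 0x007F above), so
// leading_zeros_u16 returns at least 9; subtracting 9 gives the left shift needed
// to normalize the subnormal mantissa.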
let e = leading_zeros_u16(half_man as u16) - 9;
// Rebias and adjust exponent
let exp = ((1023 - 127 - e) as u64) << 52;
let man = (half_man << (46 + e)) & 0xF_FFFF_FFFF_FFFFu64;
return unsafe { mem::transmute::<u64, f64>(sign | exp | man) };
}
// Rebias exponent for a normalized normal
let exp = ((unbiased_exp + 1023) as u64) << 52;
let man = (half_man & 0x007Fu64) << 45;
unsafe { mem::transmute::<u64, f64>(sign | exp | man) }
}

vendor/half/src/binary16.rs (vendored new file, 1964 lines; diff suppressed because it is too large)

vendor/half/src/binary16/arch.rs (vendored new file, 847 lines)

@@ -0,0 +1,847 @@
#![allow(dead_code, unused_imports)]
use crate::leading_zeros::leading_zeros_u16;
use core::mem;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;
#[cfg(target_arch = "aarch64")]
mod aarch64;
macro_rules! convert_fn {
(if x86_feature("f16c") { $f16c:expr }
else if aarch64_feature("fp16") { $aarch64:expr }
else { $fallback:expr }) => {
cfg_if::cfg_if! {
// Use the intrinsics directly when the target feature is enabled at compile time or when using no_std
if #[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "f16c"
))] {
$f16c
}
else if #[cfg(all(
target_arch = "aarch64",
target_feature = "fp16"
))] {
$aarch64
}
// Use CPU feature detection if using std
else if #[cfg(all(
feature = "std",
any(target_arch = "x86", target_arch = "x86_64")
))] {
use std::arch::is_x86_feature_detected;
if is_x86_feature_detected!("f16c") {
$f16c
} else {
$fallback
}
}
else if #[cfg(all(
feature = "std",
target_arch = "aarch64",
))] {
use std::arch::is_aarch64_feature_detected;
if is_aarch64_feature_detected!("fp16") {
$aarch64
} else {
$fallback
}
}
// Fallback to software
else {
$fallback
}
}
};
}
#[inline]
pub(crate) fn f32_to_f16(f: f32) -> u16 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32_to_f16_x86_f16c(f) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f32_to_f16_fp16(f) }
} else {
f32_to_f16_fallback(f)
}
}
}
#[inline]
pub(crate) fn f64_to_f16(f: f64) -> u16 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32_to_f16_x86_f16c(f as f32) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f64_to_f16_fp16(f) }
} else {
f64_to_f16_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16_to_f32(i: u16) -> f32 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16_to_f32_x86_f16c(i) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16_to_f32_fp16(i) }
} else {
f16_to_f32_fallback(i)
}
}
}
#[inline]
pub(crate) fn f16_to_f64(i: u16) -> f64 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16_to_f32_x86_f16c(i) as f64 }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16_to_f64_fp16(i) }
} else {
f16_to_f64_fallback(i)
}
}
}
#[inline]
pub(crate) fn f32x4_to_f16x4(f: &[f32; 4]) -> [u16; 4] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32x4_to_f16x4_x86_f16c(f) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f32x4_to_f16x4_fp16(f) }
} else {
f32x4_to_f16x4_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16x4_to_f32x4(i: &[u16; 4]) -> [f32; 4] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16x4_to_f32x4_x86_f16c(i) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16x4_to_f32x4_fp16(i) }
} else {
f16x4_to_f32x4_fallback(i)
}
}
}
#[inline]
pub(crate) fn f64x4_to_f16x4(f: &[f64; 4]) -> [u16; 4] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f64x4_to_f16x4_x86_f16c(f) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f64x4_to_f16x4_fp16(f) }
} else {
f64x4_to_f16x4_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16x4_to_f64x4(i: &[u16; 4]) -> [f64; 4] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16x4_to_f64x4_x86_f16c(i) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16x4_to_f64x4_fp16(i) }
} else {
f16x4_to_f64x4_fallback(i)
}
}
}
#[inline]
pub(crate) fn f32x8_to_f16x8(f: &[f32; 8]) -> [u16; 8] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32x8_to_f16x8_x86_f16c(f) }
} else if aarch64_feature("fp16") {
{
let mut result = [0u16; 8];
convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
aarch64::f32x4_to_f16x4_fp16);
result
}
} else {
f32x8_to_f16x8_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16x8_to_f32x8(i: &[u16; 8]) -> [f32; 8] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16x8_to_f32x8_x86_f16c(i) }
} else if aarch64_feature("fp16") {
{
let mut result = [0f32; 8];
convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
aarch64::f16x4_to_f32x4_fp16);
result
}
} else {
f16x8_to_f32x8_fallback(i)
}
}
}
#[inline]
pub(crate) fn f64x8_to_f16x8(f: &[f64; 8]) -> [u16; 8] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f64x8_to_f16x8_x86_f16c(f) }
} else if aarch64_feature("fp16") {
{
let mut result = [0u16; 8];
convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
aarch64::f64x4_to_f16x4_fp16);
result
}
} else {
f64x8_to_f16x8_fallback(f)
}
}
}
#[inline]
pub(crate) fn f16x8_to_f64x8(i: &[u16; 8]) -> [f64; 8] {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16x8_to_f64x8_x86_f16c(i) }
} else if aarch64_feature("fp16") {
{
let mut result = [0f64; 8];
convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
aarch64::f16x4_to_f64x4_fp16);
result
}
} else {
f16x8_to_f64x8_fallback(i)
}
}
}
#[inline]
pub(crate) fn f32_to_f16_slice(src: &[f32], dst: &mut [u16]) {
convert_fn! {
if x86_feature("f16c") {
convert_chunked_slice_8(src, dst, x86::f32x8_to_f16x8_x86_f16c,
x86::f32x4_to_f16x4_x86_f16c)
} else if aarch64_feature("fp16") {
convert_chunked_slice_4(src, dst, aarch64::f32x4_to_f16x4_fp16)
} else {
slice_fallback(src, dst, f32_to_f16_fallback)
}
}
}
#[inline]
pub(crate) fn f16_to_f32_slice(src: &[u16], dst: &mut [f32]) {
convert_fn! {
if x86_feature("f16c") {
convert_chunked_slice_8(src, dst, x86::f16x8_to_f32x8_x86_f16c,
x86::f16x4_to_f32x4_x86_f16c)
} else if aarch64_feature("fp16") {
convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f32x4_fp16)
} else {
slice_fallback(src, dst, f16_to_f32_fallback)
}
}
}
#[inline]
pub(crate) fn f64_to_f16_slice(src: &[f64], dst: &mut [u16]) {
convert_fn! {
if x86_feature("f16c") {
convert_chunked_slice_8(src, dst, x86::f64x8_to_f16x8_x86_f16c,
x86::f64x4_to_f16x4_x86_f16c)
} else if aarch64_feature("fp16") {
convert_chunked_slice_4(src, dst, aarch64::f64x4_to_f16x4_fp16)
} else {
slice_fallback(src, dst, f64_to_f16_fallback)
}
}
}
#[inline]
pub(crate) fn f16_to_f64_slice(src: &[u16], dst: &mut [f64]) {
convert_fn! {
if x86_feature("f16c") {
convert_chunked_slice_8(src, dst, x86::f16x8_to_f64x8_x86_f16c,
x86::f16x4_to_f64x4_x86_f16c)
} else if aarch64_feature("fp16") {
convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f64x4_fp16)
} else {
slice_fallback(src, dst, f16_to_f64_fallback)
}
}
}
macro_rules! math_fn {
(if aarch64_feature("fp16") { $aarch64:expr }
else { $fallback:expr }) => {
cfg_if::cfg_if! {
// Use the intrinsics directly when the target feature is enabled at compile time or when using no_std
if #[cfg(all(
target_arch = "aarch64",
target_feature = "fp16"
))] {
$aarch64
}
// Use CPU feature detection if using std
else if #[cfg(all(
feature = "std",
target_arch = "aarch64",
not(target_feature = "fp16")
))] {
use std::arch::is_aarch64_feature_detected;
if is_aarch64_feature_detected!("fp16") {
$aarch64
} else {
$fallback
}
}
// Fallback to software
else {
$fallback
}
}
};
}
#[inline]
pub(crate) fn add_f16(a: u16, b: u16) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
unsafe { aarch64::add_f16_fp16(a, b) }
} else {
add_f16_fallback(a, b)
}
}
}
#[inline]
pub(crate) fn subtract_f16(a: u16, b: u16) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
unsafe { aarch64::subtract_f16_fp16(a, b) }
} else {
subtract_f16_fallback(a, b)
}
}
}
#[inline]
pub(crate) fn multiply_f16(a: u16, b: u16) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
unsafe { aarch64::multiply_f16_fp16(a, b) }
} else {
multiply_f16_fallback(a, b)
}
}
}
#[inline]
pub(crate) fn divide_f16(a: u16, b: u16) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
unsafe { aarch64::divide_f16_fp16(a, b) }
} else {
divide_f16_fallback(a, b)
}
}
}
#[inline]
pub(crate) fn remainder_f16(a: u16, b: u16) -> u16 {
remainder_f16_fallback(a, b)
}
#[inline]
pub(crate) fn product_f16<I: Iterator<Item = u16>>(iter: I) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
iter.fold(0, |acc, x| unsafe { aarch64::multiply_f16_fp16(acc, x) })
} else {
product_f16_fallback(iter)
}
}
}
#[inline]
pub(crate) fn sum_f16<I: Iterator<Item = u16>>(iter: I) -> u16 {
math_fn! {
if aarch64_feature("fp16") {
iter.fold(0, |acc, x| unsafe { aarch64::add_f16_fp16(acc, x) })
} else {
sum_f16_fallback(iter)
}
}
}
/// Converts a slice in chunks of 8 elements, falling back to a 4-element chunk for the remainder
#[inline]
fn convert_chunked_slice_8<S: Copy + Default, D: Copy>(
src: &[S],
dst: &mut [D],
fn8: unsafe fn(&[S; 8]) -> [D; 8],
fn4: unsafe fn(&[S; 4]) -> [D; 4],
) {
assert_eq!(src.len(), dst.len());
// TODO: Can be further optimized with array_chunks when it becomes stabilized
let src_chunks = src.chunks_exact(8);
let mut dst_chunks = dst.chunks_exact_mut(8);
let src_remainder = src_chunks.remainder();
for (s, d) in src_chunks.zip(&mut dst_chunks) {
let chunk: &[S; 8] = s.try_into().unwrap();
d.copy_from_slice(unsafe { &fn8(chunk) });
}
// Process remainder
if src_remainder.len() > 4 {
let mut buf: [S; 8] = Default::default();
buf[..src_remainder.len()].copy_from_slice(src_remainder);
let vec = unsafe { fn8(&buf) };
let dst_remainder = dst_chunks.into_remainder();
dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
} else if !src_remainder.is_empty() {
let mut buf: [S; 4] = Default::default();
buf[..src_remainder.len()].copy_from_slice(src_remainder);
let vec = unsafe { fn4(&buf) };
let dst_remainder = dst_chunks.into_remainder();
dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
}
}
/// Converts a slice in chunks of 4 elements
#[inline]
fn convert_chunked_slice_4<S: Copy + Default, D: Copy>(
src: &[S],
dst: &mut [D],
f: unsafe fn(&[S; 4]) -> [D; 4],
) {
assert_eq!(src.len(), dst.len());
// TODO: Can be further optimized with array_chunks when it becomes stabilized
let src_chunks = src.chunks_exact(4);
let mut dst_chunks = dst.chunks_exact_mut(4);
let src_remainder = src_chunks.remainder();
for (s, d) in src_chunks.zip(&mut dst_chunks) {
let chunk: &[S; 4] = s.try_into().unwrap();
d.copy_from_slice(unsafe { &f(chunk) });
}
// Process remainder
if !src_remainder.is_empty() {
let mut buf: [S; 4] = Default::default();
buf[..src_remainder.len()].copy_from_slice(src_remainder);
let vec = unsafe { f(&buf) };
let dst_remainder = dst_chunks.into_remainder();
dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
}
}
/////////////// Fallbacks ////////////////
// In the below functions, round to nearest, with ties to even.
// Let us call the most significant bit that will be shifted out the round_bit.
//
// Round up if either
// a) Removed part > tie.
// (mantissa & round_bit) != 0 && (mantissa & (round_bit - 1)) != 0
// b) Removed part == tie, and retained part is odd.
// (mantissa & round_bit) != 0 && (mantissa & (2 * round_bit)) != 0
// (If removed part == tie and retained part is even, do not round up.)
// These two conditions can be combined into one:
// (mantissa & round_bit) != 0 && (mantissa & ((round_bit - 1) | (2 * round_bit))) != 0
// which can be simplified into
// (mantissa & round_bit) != 0 && (mantissa & (3 * round_bit - 1)) != 0
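//
// Worked example (illustrative only), using the f32 -> f16 case below where
// round_bit = 0x1000 and the low 13 mantissa bits are discarded:
//   mantissa & 0x1FFF == 0x1000, bit 13 clear -> exact tie, retained LSB even -> keep
//   mantissa & 0x1FFF == 0x1000, bit 13 set   -> exact tie, retained LSB odd  -> round up
//   mantissa & 0x1FFF == 0x1001               -> removed part above the tie   -> round up
// Here 3 * round_bit - 1 == 0x2FFF, i.e. the retained LSB (0x2000) together with the
// bits below the round bit (0x0FFF).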
#[inline]
pub(crate) const fn f32_to_f16_fallback(value: f32) -> u16 {
// TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
// Convert to raw bytes
let x: u32 = unsafe { mem::transmute::<f32, u32>(value) };
// Extract IEEE754 components
let sign = x & 0x8000_0000u32;
let exp = x & 0x7F80_0000u32;
let man = x & 0x007F_FFFFu32;
// Check for all exponent bits being set, which is Infinity or NaN
if exp == 0x7F80_0000u32 {
// Set mantissa MSB for NaN (and also keep shifted mantissa bits)
let nan_bit = if man == 0 { 0 } else { 0x0200u32 };
return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 13)) as u16;
}
// The number is normalized, start assembling half precision version
let half_sign = sign >> 16;
// Unbias the exponent, then bias for half precision
let unbiased_exp = ((exp >> 23) as i32) - 127;
let half_exp = unbiased_exp + 15;
// Check for exponent overflow, return +infinity
if half_exp >= 0x1F {
return (half_sign | 0x7C00u32) as u16;
}
// Check for underflow
if half_exp <= 0 {
// Check mantissa for what we can do
if 14 - half_exp > 24 {
// No rounding possibility, so this is a full underflow, return signed zero
return half_sign as u16;
}
// Don't forget about hidden leading mantissa bit when assembling mantissa
let man = man | 0x0080_0000u32;
let mut half_man = man >> (14 - half_exp);
// Check for rounding (see comment above functions)
let round_bit = 1 << (13 - half_exp);
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
half_man += 1;
}
// No exponent for subnormals
return (half_sign | half_man) as u16;
}
// Rebias the exponent
let half_exp = (half_exp as u32) << 10;
let half_man = man >> 13;
// Check for rounding (see comment above functions)
let round_bit = 0x0000_1000u32;
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
// Round it
((half_sign | half_exp | half_man) + 1) as u16
} else {
(half_sign | half_exp | half_man) as u16
}
}
#[inline]
pub(crate) const fn f64_to_f16_fallback(value: f64) -> u16 {
// Convert to raw bytes, truncating the last 32-bits of mantissa; that precision will always
// be lost on half-precision.
// TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
let val: u64 = unsafe { mem::transmute::<f64, u64>(value) };
let x = (val >> 32) as u32;
// Extract IEEE754 components
let sign = x & 0x8000_0000u32;
let exp = x & 0x7FF0_0000u32;
let man = x & 0x000F_FFFFu32;
// Check for all exponent bits being set, which is Infinity or NaN
if exp == 0x7FF0_0000u32 {
// Set mantissa MSB for NaN (and also keep shifted mantissa bits).
// We also have to check the last 32 bits.
let nan_bit = if man == 0 && (val as u32 == 0) {
0
} else {
0x0200u32
};
return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 10)) as u16;
}
// The number is normalized, start assembling half precision version
let half_sign = sign >> 16;
// Unbias the exponent, then bias for half precision
let unbiased_exp = ((exp >> 20) as i64) - 1023;
let half_exp = unbiased_exp + 15;
// Check for exponent overflow, return +infinity
if half_exp >= 0x1F {
return (half_sign | 0x7C00u32) as u16;
}
// Check for underflow
if half_exp <= 0 {
// Check mantissa for what we can do
if 10 - half_exp > 21 {
// No rounding possibility, so this is a full underflow, return signed zero
return half_sign as u16;
}
// Don't forget about hidden leading mantissa bit when assembling mantissa
let man = man | 0x0010_0000u32;
let mut half_man = man >> (11 - half_exp);
// Check for rounding (see comment above functions)
let round_bit = 1 << (10 - half_exp);
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
half_man += 1;
}
// No exponent for subnormals
return (half_sign | half_man) as u16;
}
// Rebias the exponent
let half_exp = (half_exp as u32) << 10;
let half_man = man >> 10;
// Check for rounding (see comment above functions)
let round_bit = 0x0000_0200u32;
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
// Round it
((half_sign | half_exp | half_man) + 1) as u16
} else {
(half_sign | half_exp | half_man) as u16
}
}
#[inline]
pub(crate) const fn f16_to_f32_fallback(i: u16) -> f32 {
// Check for signed zero
// TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
if i & 0x7FFFu16 == 0 {
return unsafe { mem::transmute::<u32, f32>((i as u32) << 16) };
}
let half_sign = (i & 0x8000u16) as u32;
let half_exp = (i & 0x7C00u16) as u32;
let half_man = (i & 0x03FFu16) as u32;
// Check for an infinity or NaN when all exponent bits set
if half_exp == 0x7C00u32 {
// Check for signed infinity if mantissa is zero
if half_man == 0 {
return unsafe { mem::transmute::<u32, f32>((half_sign << 16) | 0x7F80_0000u32) };
} else {
// NaN, keep current mantissa but also set most significant mantissa bit
return unsafe {
mem::transmute::<u32, f32>((half_sign << 16) | 0x7FC0_0000u32 | (half_man << 13))
};
}
}
// Calculate single-precision components with adjusted exponent
let sign = half_sign << 16;
// Unbias exponent
let unbiased_exp = ((half_exp as i32) >> 10) - 15;
// Check for subnormals, which will be normalized by adjusting exponent
if half_exp == 0 {
// Calculate how much to adjust the exponent by
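// half_man fits in the low 10 bits of the u16 (masked with 0x03FF above), so
// leading_zeros_u16 returns at least 6; subtracting 6 gives the left shift needed
// to normalize the subnormal mantissa.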
let e = leading_zeros_u16(half_man as u16) - 6;
// Rebias and adjust exponent
let exp = (127 - 15 - e) << 23;
let man = (half_man << (14 + e)) & 0x7F_FF_FFu32;
return unsafe { mem::transmute::<u32, f32>(sign | exp | man) };
}
// Rebias exponent for a normalized normal
let exp = ((unbiased_exp + 127) as u32) << 23;
let man = (half_man & 0x03FFu32) << 13;
unsafe { mem::transmute::<u32, f32>(sign | exp | man) }
}
#[inline]
pub(crate) const fn f16_to_f64_fallback(i: u16) -> f64 {
// Check for signed zero
// TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
if i & 0x7FFFu16 == 0 {
return unsafe { mem::transmute::<u64, f64>((i as u64) << 48) };
}
let half_sign = (i & 0x8000u16) as u64;
let half_exp = (i & 0x7C00u16) as u64;
let half_man = (i & 0x03FFu16) as u64;
// Check for an infinity or NaN when all exponent bits set
if half_exp == 0x7C00u64 {
// Check for signed infinity if mantissa is zero
if half_man == 0 {
return unsafe {
mem::transmute::<u64, f64>((half_sign << 48) | 0x7FF0_0000_0000_0000u64)
};
} else {
// NaN, keep current mantissa but also set most significant mantissa bit
return unsafe {
mem::transmute::<u64, f64>(
(half_sign << 48) | 0x7FF8_0000_0000_0000u64 | (half_man << 42),
)
};
}
}
// Calculate double-precision components with adjusted exponent
let sign = half_sign << 48;
// Unbias exponent
let unbiased_exp = ((half_exp as i64) >> 10) - 15;
// Check for subnormals, which will be normalized by adjusting exponent
if half_exp == 0 {
// Calculate how much to adjust the exponent by
let e = leading_zeros_u16(half_man as u16) - 6;
// Rebias and adjust exponent
let exp = ((1023 - 15 - e) as u64) << 52;
let man = (half_man << (43 + e)) & 0xF_FFFF_FFFF_FFFFu64;
return unsafe { mem::transmute::<u64, f64>(sign | exp | man) };
}
// Rebias exponent for a normalized normal
let exp = ((unbiased_exp + 1023) as u64) << 52;
let man = (half_man & 0x03FFu64) << 42;
unsafe { mem::transmute::<u64, f64>(sign | exp | man) }
}
#[inline]
fn f16x4_to_f32x4_fallback(v: &[u16; 4]) -> [f32; 4] {
[
f16_to_f32_fallback(v[0]),
f16_to_f32_fallback(v[1]),
f16_to_f32_fallback(v[2]),
f16_to_f32_fallback(v[3]),
]
}
#[inline]
fn f32x4_to_f16x4_fallback(v: &[f32; 4]) -> [u16; 4] {
[
f32_to_f16_fallback(v[0]),
f32_to_f16_fallback(v[1]),
f32_to_f16_fallback(v[2]),
f32_to_f16_fallback(v[3]),
]
}
#[inline]
fn f16x4_to_f64x4_fallback(v: &[u16; 4]) -> [f64; 4] {
[
f16_to_f64_fallback(v[0]),
f16_to_f64_fallback(v[1]),
f16_to_f64_fallback(v[2]),
f16_to_f64_fallback(v[3]),
]
}
#[inline]
fn f64x4_to_f16x4_fallback(v: &[f64; 4]) -> [u16; 4] {
[
f64_to_f16_fallback(v[0]),
f64_to_f16_fallback(v[1]),
f64_to_f16_fallback(v[2]),
f64_to_f16_fallback(v[3]),
]
}
#[inline]
fn f16x8_to_f32x8_fallback(v: &[u16; 8]) -> [f32; 8] {
[
f16_to_f32_fallback(v[0]),
f16_to_f32_fallback(v[1]),
f16_to_f32_fallback(v[2]),
f16_to_f32_fallback(v[3]),
f16_to_f32_fallback(v[4]),
f16_to_f32_fallback(v[5]),
f16_to_f32_fallback(v[6]),
f16_to_f32_fallback(v[7]),
]
}
#[inline]
fn f32x8_to_f16x8_fallback(v: &[f32; 8]) -> [u16; 8] {
[
f32_to_f16_fallback(v[0]),
f32_to_f16_fallback(v[1]),
f32_to_f16_fallback(v[2]),
f32_to_f16_fallback(v[3]),
f32_to_f16_fallback(v[4]),
f32_to_f16_fallback(v[5]),
f32_to_f16_fallback(v[6]),
f32_to_f16_fallback(v[7]),
]
}
#[inline]
fn f16x8_to_f64x8_fallback(v: &[u16; 8]) -> [f64; 8] {
[
f16_to_f64_fallback(v[0]),
f16_to_f64_fallback(v[1]),
f16_to_f64_fallback(v[2]),
f16_to_f64_fallback(v[3]),
f16_to_f64_fallback(v[4]),
f16_to_f64_fallback(v[5]),
f16_to_f64_fallback(v[6]),
f16_to_f64_fallback(v[7]),
]
}
#[inline]
fn f64x8_to_f16x8_fallback(v: &[f64; 8]) -> [u16; 8] {
[
f64_to_f16_fallback(v[0]),
f64_to_f16_fallback(v[1]),
f64_to_f16_fallback(v[2]),
f64_to_f16_fallback(v[3]),
f64_to_f16_fallback(v[4]),
f64_to_f16_fallback(v[5]),
f64_to_f16_fallback(v[6]),
f64_to_f16_fallback(v[7]),
]
}
#[inline]
fn slice_fallback<S: Copy, D>(src: &[S], dst: &mut [D], f: fn(S) -> D) {
assert_eq!(src.len(), dst.len());
for (s, d) in src.iter().copied().zip(dst.iter_mut()) {
*d = f(s);
}
}
#[inline]
fn add_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) + f16_to_f32(b))
}
#[inline]
fn subtract_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) - f16_to_f32(b))
}
#[inline]
fn multiply_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) * f16_to_f32(b))
}
#[inline]
fn divide_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) / f16_to_f32(b))
}
#[inline]
fn remainder_f16_fallback(a: u16, b: u16) -> u16 {
f32_to_f16(f16_to_f32(a) % f16_to_f32(b))
}
#[inline]
fn product_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 {
f32_to_f16(iter.map(f16_to_f32).product())
}
#[inline]
fn sum_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 {
f32_to_f16(iter.map(f16_to_f32).sum())
}
// TODO SIMD arithmetic

vendor/half/src/binary16/arch/aarch64.rs (vendored new file, 175 lines)

@@ -0,0 +1,175 @@
use core::{
arch::{
aarch64::{float32x4_t, float64x2_t, uint16x4_t},
asm,
},
mem::MaybeUninit,
ptr,
};
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f16_to_f32_fp16(i: u16) -> f32 {
let result: f32;
asm!(
"fcvt {0:s}, {1:h}",
out(vreg) result,
in(vreg) i,
options(pure, nomem, nostack, preserves_flags));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f16_to_f64_fp16(i: u16) -> f64 {
let result: f64;
asm!(
"fcvt {0:d}, {1:h}",
out(vreg) result,
in(vreg) i,
options(pure, nomem, nostack, preserves_flags));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f32_to_f16_fp16(f: f32) -> u16 {
let result: u16;
asm!(
"fcvt {0:h}, {1:s}",
out(vreg) result,
in(vreg) f,
options(pure, nomem, nostack, preserves_flags));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f64_to_f16_fp16(f: f64) -> u16 {
let result: u16;
asm!(
"fcvt {0:h}, {1:d}",
out(vreg) result,
in(vreg) f,
options(pure, nomem, nostack, preserves_flags));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f16x4_to_f32x4_fp16(v: &[u16; 4]) -> [f32; 4] {
let mut vec = MaybeUninit::<uint16x4_t>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let result: float32x4_t;
asm!(
"fcvtl {0:v}.4s, {1:v}.4h",
out(vreg) result,
in(vreg) vec.assume_init(),
options(pure, nomem, nostack));
*(&result as *const float32x4_t).cast()
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f32x4_to_f16x4_fp16(v: &[f32; 4]) -> [u16; 4] {
let mut vec = MaybeUninit::<float32x4_t>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let result: uint16x4_t;
asm!(
"fcvtn {0:v}.4h, {1:v}.4s",
out(vreg) result,
in(vreg) vec.assume_init(),
options(pure, nomem, nostack));
*(&result as *const uint16x4_t).cast()
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f16x4_to_f64x4_fp16(v: &[u16; 4]) -> [f64; 4] {
let mut vec = MaybeUninit::<uint16x4_t>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let low: float64x2_t;
let high: float64x2_t;
asm!(
"fcvtl {2:v}.4s, {3:v}.4h", // Convert to f32
"fcvtl {0:v}.2d, {2:v}.2s", // Convert low part to f64
"fcvtl2 {1:v}.2d, {2:v}.4s", // Convert high part to f64
lateout(vreg) low,
lateout(vreg) high,
out(vreg) _,
in(vreg) vec.assume_init(),
options(pure, nomem, nostack));
*[low, high].as_ptr().cast()
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn f64x4_to_f16x4_fp16(v: &[f64; 4]) -> [u16; 4] {
let mut low = MaybeUninit::<float64x2_t>::uninit();
let mut high = MaybeUninit::<float64x2_t>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), low.as_mut_ptr().cast(), 2);
ptr::copy_nonoverlapping(v[2..].as_ptr(), high.as_mut_ptr().cast(), 2);
let result: uint16x4_t;
asm!(
"fcvtn {1:v}.2s, {2:v}.2d", // Convert low to f32
"fcvtn2 {1:v}.4s, {3:v}.2d", // Convert high to f32
"fcvtn {0:v}.4h, {1:v}.4s", // Convert to f16
lateout(vreg) result,
out(vreg) _,
in(vreg) low.assume_init(),
in(vreg) high.assume_init(),
options(pure, nomem, nostack));
*(&result as *const uint16x4_t).cast()
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn add_f16_fp16(a: u16, b: u16) -> u16 {
let result: u16;
asm!(
"fadd {0:h}, {1:h}, {2:h}",
out(vreg) result,
in(vreg) a,
in(vreg) b,
options(pure, nomem, nostack));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn subtract_f16_fp16(a: u16, b: u16) -> u16 {
let result: u16;
asm!(
"fsub {0:h}, {1:h}, {2:h}",
out(vreg) result,
in(vreg) a,
in(vreg) b,
options(pure, nomem, nostack));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn multiply_f16_fp16(a: u16, b: u16) -> u16 {
let result: u16;
asm!(
"fmul {0:h}, {1:h}, {2:h}",
out(vreg) result,
in(vreg) a,
in(vreg) b,
options(pure, nomem, nostack));
result
}
#[target_feature(enable = "fp16")]
#[inline]
pub(super) unsafe fn divide_f16_fp16(a: u16, b: u16) -> u16 {
let result: u16;
asm!(
"fdiv {0:h}, {1:h}, {2:h}",
out(vreg) result,
in(vreg) a,
in(vreg) b,
options(pure, nomem, nostack));
result
}

vendor/half/src/binary16/arch/x86.rs (vendored new file, 132 lines)

@@ -0,0 +1,132 @@
use core::{mem::MaybeUninit, ptr};
#[cfg(target_arch = "x86")]
use core::arch::x86::{
__m128, __m128i, __m256, _mm256_cvtph_ps, _mm256_cvtps_ph, _mm_cvtph_ps,
_MM_FROUND_TO_NEAREST_INT,
};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
__m128, __m128i, __m256, _mm256_cvtph_ps, _mm256_cvtps_ph, _mm_cvtph_ps, _mm_cvtps_ph,
_MM_FROUND_TO_NEAREST_INT,
};
#[cfg(target_arch = "x86")]
use core::arch::x86::_mm_cvtps_ph;
use super::convert_chunked_slice_8;
/////////////// x86/x86_64 f16c ////////////////
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16_to_f32_x86_f16c(i: u16) -> f32 {
let mut vec = MaybeUninit::<__m128i>::zeroed();
vec.as_mut_ptr().cast::<u16>().write(i);
let retval = _mm_cvtph_ps(vec.assume_init());
*(&retval as *const __m128).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 {
let mut vec = MaybeUninit::<__m128>::zeroed();
vec.as_mut_ptr().cast::<f32>().write(f);
let retval = _mm_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
*(&retval as *const __m128i).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16x4_to_f32x4_x86_f16c(v: &[u16; 4]) -> [f32; 4] {
let mut vec = MaybeUninit::<__m128i>::zeroed();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let retval = _mm_cvtph_ps(vec.assume_init());
*(&retval as *const __m128).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f32x4_to_f16x4_x86_f16c(v: &[f32; 4]) -> [u16; 4] {
let mut vec = MaybeUninit::<__m128>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 4);
let retval = _mm_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
*(&retval as *const __m128i).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16x4_to_f64x4_x86_f16c(v: &[u16; 4]) -> [f64; 4] {
let array = f16x4_to_f32x4_x86_f16c(v);
// Let compiler vectorize this regular cast for now.
// TODO: investigate auto-detecting sse2/avx convert features
[
array[0] as f64,
array[1] as f64,
array[2] as f64,
array[3] as f64,
]
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f64x4_to_f16x4_x86_f16c(v: &[f64; 4]) -> [u16; 4] {
// Let compiler vectorize this regular cast for now.
// TODO: investigate auto-detecting sse2/avx convert features
let v = [v[0] as f32, v[1] as f32, v[2] as f32, v[3] as f32];
f32x4_to_f16x4_x86_f16c(&v)
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16x8_to_f32x8_x86_f16c(v: &[u16; 8]) -> [f32; 8] {
let mut vec = MaybeUninit::<__m128i>::zeroed();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 8);
let retval = _mm256_cvtph_ps(vec.assume_init());
*(&retval as *const __m256).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f32x8_to_f16x8_x86_f16c(v: &[f32; 8]) -> [u16; 8] {
let mut vec = MaybeUninit::<__m256>::uninit();
ptr::copy_nonoverlapping(v.as_ptr(), vec.as_mut_ptr().cast(), 8);
let retval = _mm256_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
*(&retval as *const __m128i).cast()
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f16x8_to_f64x8_x86_f16c(v: &[u16; 8]) -> [f64; 8] {
let array = f16x8_to_f32x8_x86_f16c(v);
// Let compiler vectorize this regular cast for now.
// TODO: investigate auto-detecting sse2/avx convert features
[
array[0] as f64,
array[1] as f64,
array[2] as f64,
array[3] as f64,
array[4] as f64,
array[5] as f64,
array[6] as f64,
array[7] as f64,
]
}
#[target_feature(enable = "f16c")]
#[inline]
pub(super) unsafe fn f64x8_to_f16x8_x86_f16c(v: &[f64; 8]) -> [u16; 8] {
// Let compiler vectorize this regular cast for now.
// TODO: investigate auto-detecting sse2/avx convert features
let v = [
v[0] as f32,
v[1] as f32,
v[2] as f32,
v[3] as f32,
v[4] as f32,
v[5] as f32,
v[6] as f32,
v[7] as f32,
];
f32x8_to_f16x8_x86_f16c(&v)
}

vendor/half/src/leading_zeros.rs (vendored new file, 65 lines)

@@ -0,0 +1,65 @@
// https://doc.rust-lang.org/std/primitive.u16.html#method.leading_zeros
#[cfg(not(any(all(
target_arch = "spirv",
not(all(
target_feature = "IntegerFunctions2INTEL",
target_feature = "SPV_INTEL_shader_integer_functions2"
))
))))]
#[inline]
pub(crate) const fn leading_zeros_u16(x: u16) -> u32 {
x.leading_zeros()
}
#[cfg(all(
target_arch = "spirv",
not(all(
target_feature = "IntegerFunctions2INTEL",
target_feature = "SPV_INTEL_shader_integer_functions2"
))
))]
#[inline]
pub(crate) const fn leading_zeros_u16(x: u16) -> u32 {
leading_zeros_u16_fallback(x)
}
#[cfg(any(
test,
all(
target_arch = "spirv",
not(all(
target_feature = "IntegerFunctions2INTEL",
target_feature = "SPV_INTEL_shader_integer_functions2"
))
)
))]
#[inline]
const fn leading_zeros_u16_fallback(mut x: u16) -> u32 {
use crunchy::unroll;
let mut c = 0;
let msb = 1 << 15;
unroll! { for i in 0 .. 16 {
if x & msb == 0 {
c += 1;
} else {
return c;
}
#[allow(unused_assignments)]
if i < 15 {
x <<= 1;
}
}}
c
}
#[cfg(test)]
mod test {
#[test]
fn leading_zeros_u16_fallback() {
for x in [44, 97, 304, 1179, 23571] {
assert_eq!(super::leading_zeros_u16_fallback(x), x.leading_zeros());
}
}
}

vendor/half/src/lib.rs (vendored new file, 271 lines)

@@ -0,0 +1,271 @@
//! A crate that provides support for half-precision 16-bit floating point types.
//!
//! This crate provides the [`struct@f16`] type, which is an implementation of the IEEE 754-2008 standard
//! [`binary16`] a.k.a "half" floating point type. This 16-bit floating point type is intended for
//! efficient storage where the full range and precision of a larger floating point value is not
//! required. This is especially useful for image storage formats.
//!
//! This crate also provides a [`struct@bf16`] type, an alternative 16-bit floating point format. The
//! [`bfloat16`] format is a truncated IEEE 754 standard `binary32` float that preserves the
//! exponent to allow the same range as [`f32`] but with only 8 bits of precision (instead of 11
//! bits for [`struct@f16`]). See the [`struct@bf16`] type for details.
//!
//! Because [`struct@f16`] and [`struct@bf16`] are primarily for efficient storage, floating point operations such
//! as addition, multiplication, etc. are not always implemented by hardware. When hardware does not
//! support these operations, this crate emulates them by converting the value to
//! [`f32`] before performing the operation and then back afterward.
//!
//! Note that conversions from [`f32`]/[`f64`] to both [`struct@f16`] and [`struct@bf16`] are lossy operations.
//! Just as converting an [`f64`] to an [`f32`] is lossy and has no `Into`/`From` trait
//! implementations, these smaller types do not provide those trait implementations either.
//! Instead, use the `from_f32`/`from_f64` functions for the types in this crate. If lossy
//! conversions are acceptable and trait-based conversions are needed, use the appropriate
//! [`num-traits`] traits that are implemented.
//!
//! This crate also provides a [`slice`][mod@slice] module for zero-copy in-place conversions of
//! [`u16`] slices to both [`struct@f16`] and [`struct@bf16`], as well as efficient vectorized conversions of
//! larger buffers of floating point values to and from these half formats.
//!
//! The crate supports `#![no_std]` when the `std` cargo feature is not enabled, so it can be used in
//! embedded environments without using the Rust [`std`] library. The `std` feature enables support
//! for the standard library and is enabled by default, see the [Cargo Features](#cargo-features)
//! section below.
//!
//! A [`prelude`] module is provided for easy importing of available utility traits.
//!
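//! A quick round-trip sketch using the conversion functions described above:
//!
//! ```rust
//! use half::f16;
//!
//! let half = f16::from_f32(1.5);
//! // 1.5 is exactly representable in binary16, so the round trip is lossless.
//! assert_eq!(half.to_f32(), 1.5);
//! ```
//!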
//! # Serialization
//!
//! When the `serde` feature is enabled, [`struct@f16`] and [`struct@bf16`] will be serialized as a newtype of
//! [`u16`] by default. In binary formats this is ideal, as it will generally use just two bytes for
//! storage. For string formats like JSON, however, this isn't as useful, and due to design
//! limitations of serde, it's not possible for the default `Serialize` implementation to support
//! different serialization for different formats.
//!
//! Instead, it's up to the container type of the floats to control how it is serialized. This can
//! easily be controlled when using the derive macros using `#[serde(serialize_with="")]`
//! attributes. For both [`struct@f16`] and [`struct@bf16`] a `serialize_as_f32` and `serialize_as_string` are
//! provided for use with this attribute.
//!
//! Deserialization of both float types supports deserializing from the default serialization,
//! strings, and `f32`/`f64` values, so no additional work is required.
//!
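//! For illustration, a hedged sketch of the `serialize_with` attribute (this assumes the
//! helpers are exposed as associated functions on [`struct@f16`]; adjust the path to match
//! the version in use):
//!
//! ```ignore
//! use half::f16;
//! use serde::Serialize;
//!
//! #[derive(Serialize)]
//! struct Reading {
//!     // Emit the value as an f32 so text formats such as JSON stay readable.
//!     #[serde(serialize_with = "f16::serialize_as_f32")]
//!     temperature: f16,
//! }
//! ```
//!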
//! # Hardware support
//!
//! Hardware support for these conversions and arithmetic will be used
//! whenever hardware support is available—either through intrinsics or targeted assembly—although
//! a nightly Rust toolchain may be required for some hardware. When hardware supports it the
//! functions and traits in the [`slice`][mod@slice] and [`vec`] modules will also use vectorized
//! SIMD instructions for increased efficiency.
//!
//! The following list details hardware support for floating point types in this crate. When using
//! the `std` cargo feature, runtime CPU feature detection will be used. To get the most performance
//! benefits, compile for specific CPU features, which avoids the runtime overhead and also works in
//! a `no_std` environment.
//!
//! | Architecture | CPU Target Feature | Notes |
//! | ------------ | ------------------ | ----- |
//! | `x86`/`x86_64` | `f16c` | This supports conversion to/from [`struct@f16`] only (including vector SIMD) and does not support any [`struct@bf16`] or arithmetic operations. |
//! | `aarch64` | `fp16` | This supports all operations on [`struct@f16`] only. |
//!
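//! For example, to opt into these code paths at compile time instead of relying on runtime
//! detection (an illustrative sketch; pick the feature matching the target architecture):
//!
//! ```text
//! RUSTFLAGS="-C target-feature=+f16c" cargo build --release   # x86/x86_64
//! RUSTFLAGS="-C target-feature=+fp16" cargo build --release   # aarch64
//! ```
//!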
//! # Cargo Features
//!
//! This crate supports a number of optional cargo features. None of these features are enabled by
//! default, even `std`.
//!
//! - **`alloc`** — Enable use of the [`alloc`] crate when not using the `std` library.
//!
//! Among other functions, this enables the [`vec`] module, which contains zero-copy
//! conversions for the [`Vec`] type. This allows fast conversion between raw `Vec<u16>` bits and
//! `Vec<f16>` or `Vec<bf16>` arrays, and vice versa.
//!
//! - **`std`** — Enable features that depend on the Rust [`std`] library. This also enables the
//! `alloc` feature automatically.
//!
//! Enabling the `std` feature enables runtime CPU feature detection of hardware support.
//! Without this feature detection, hardware support is only used when the compile target enables them.
//!
//! - **`serde`** — Adds support for the [`serde`] crate by implementing [`Serialize`] and
//! [`Deserialize`] traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`num-traits`** — Adds support for the [`num-traits`] crate by implementing [`ToPrimitive`],
//! [`FromPrimitive`], [`ToBytes`], `FromBytes`, [`AsPrimitive`], [`Num`], [`Float`],
//! [`FloatCore`], and [`Bounded`] traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`bytemuck`** — Adds support for the [`bytemuck`] crate by implementing [`Zeroable`] and
//! [`Pod`] traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`zerocopy`** — Adds support for the [`zerocopy`] crate by implementing [`IntoBytes`] and
//! [`FromBytes`] traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`rand_distr`** — Adds support for the [`rand_distr`] crate by implementing [`Distribution`]
//! and other traits for both [`struct@f16`] and [`struct@bf16`].
//!
//! - **`rkyv`** -- Enable zero-copy deserialization with the [`rkyv`] crate.
//!
//! - **`arbitrary`** -- Enable fuzzing support with the [`arbitrary`] crate by implementing
//! [`Arbitrary`] trait.
//!
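//! As an example, a minimal `Cargo.toml` entry enabling a common set of features (the
//! version number is illustrative):
//!
//! ```toml
//! [dependencies]
//! half = { version = "2", features = ["std", "serde", "num-traits"] }
//! ```
//!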
//! [`alloc`]: https://doc.rust-lang.org/alloc/
//! [`std`]: https://doc.rust-lang.org/std/
//! [`binary16`]: https://en.wikipedia.org/wiki/Half-precision_floating-point_format
//! [`bfloat16`]: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
//! [`serde`]: https://crates.io/crates/serde
//! [`bytemuck`]: https://crates.io/crates/bytemuck
//! [`num-traits`]: https://crates.io/crates/num-traits
//! [`zerocopy`]: https://crates.io/crates/zerocopy
//! [`rand_distr`]: https://crates.io/crates/rand_distr
//! [`rkyv`]: https://crates.io/crates/rkyv
//! [`arbitrary`]: https://crates.io/crates/arbitrary
#![cfg_attr(
feature = "alloc",
doc = "
[`vec`]: mod@vec"
)]
#![cfg_attr(
not(feature = "alloc"),
doc = "
[`vec`]: #
[`Vec`]: https://docs.rust-lang.org/stable/alloc/vec/struct.Vec.html"
)]
#![cfg_attr(
feature = "serde",
doc = "
[`Serialize`]: serde::Serialize
[`Deserialize`]: serde::Deserialize"
)]
#![cfg_attr(
not(feature = "serde"),
doc = "
[`Serialize`]: https://docs.rs/serde/*/serde/trait.Serialize.html
[`Deserialize`]: https://docs.rs/serde/*/serde/trait.Deserialize.html"
)]
#![cfg_attr(
feature = "num-traits",
doc = "
[`ToPrimitive`]: ::num_traits::ToPrimitive
[`FromPrimitive`]: ::num_traits::FromPrimitive
[`ToBytes`]: ::num_traits::ToBytes
[`AsPrimitive`]: ::num_traits::AsPrimitive
[`Num`]: ::num_traits::Num
[`Float`]: ::num_traits::Float
[`FloatCore`]: ::num_traits::float::FloatCore
[`Bounded`]: ::num_traits::Bounded"
)]
#![cfg_attr(
not(feature = "num-traits"),
doc = "
[`ToPrimitive`]: https://docs.rs/num-traits/*/num_traits/cast/trait.ToPrimitive.html
[`FromPrimitive`]: https://docs.rs/num-traits/*/num_traits/cast/trait.FromPrimitive.html
[`ToBytes`]: https://docs.rs/num-traits/*/num_traits/ops/bytes/trait.ToBytes.html
[`AsPrimitive`]: https://docs.rs/num-traits/*/num_traits/cast/trait.AsPrimitive.html
[`Num`]: https://docs.rs/num-traits/*/num_traits/trait.Num.html
[`Float`]: https://docs.rs/num-traits/*/num_traits/float/trait.Float.html
[`FloatCore`]: https://docs.rs/num-traits/*/num_traits/float/trait.FloatCore.html
[`Bounded`]: https://docs.rs/num-traits/*/num_traits/bounds/trait.Bounded.html"
)]
#![cfg_attr(
feature = "bytemuck",
doc = "
[`Zeroable`]: bytemuck::Zeroable
[`Pod`]: bytemuck::Pod"
)]
#![cfg_attr(
not(feature = "bytemuck"),
doc = "
[`Zeroable`]: https://docs.rs/bytemuck/*/bytemuck/trait.Zeroable.html
[`Pod`]: https://docs.rs/bytemuck/*/bytemuck/trait.Pod.html"
)]
#![cfg_attr(
feature = "zerocopy",
doc = "
[`IntoBytes`]: zerocopy::IntoBytes
[`FromBytes`]: zerocopy::FromBytes"
)]
#![cfg_attr(
not(feature = "zerocopy"),
doc = "
[`IntoBytes`]: https://docs.rs/zerocopy/*/zerocopy/trait.IntoBytes.html
[`FromBytes`]: https://docs.rs/zerocopy/*/zerocopy/trait.FromBytes.html"
)]
#![cfg_attr(
feature = "rand_distr",
doc = "
[`Distribution`]: rand::distr::Distribution"
)]
#![cfg_attr(
not(feature = "rand_distr"),
doc = "
[`Distribution`]: https://docs.rs/rand/*/rand/distr/trait.Distribution.html"
)]
#![cfg_attr(
feature = "arbitrary",
doc = "
[`Arbitrary`]: arbitrary::Arbitrary"
)]
#![cfg_attr(
not(feature = "arbitrary"),
doc = "
[`Arbitrary`]: https://docs.rs/arbitrary/*/arbitrary/trait.Arbitrary.html"
)]
#![warn(
missing_docs,
missing_copy_implementations,
trivial_numeric_casts,
future_incompatible
)]
#![cfg_attr(not(target_arch = "spirv"), warn(missing_debug_implementations))]
#![allow(clippy::verbose_bit_mask, clippy::cast_lossless, unexpected_cfgs)]
#![cfg_attr(not(feature = "std"), no_std)]
#![doc(html_root_url = "https://docs.rs/half/2.6.0")]
#![doc(test(attr(deny(warnings), allow(unused))))]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
#[cfg(feature = "alloc")]
extern crate alloc;
mod bfloat;
mod binary16;
mod leading_zeros;
#[cfg(feature = "num-traits")]
mod num_traits;
#[cfg(not(target_arch = "spirv"))]
pub mod slice;
#[cfg(feature = "alloc")]
pub mod vec;
pub use bfloat::bf16;
pub use binary16::f16;
#[cfg(feature = "rand_distr")]
mod rand_distr;
/// A collection of the most used items and traits in this crate for easy importing.
///
/// # Examples
///
/// ```rust
/// use half::prelude::*;
/// ```
pub mod prelude {
#[doc(no_inline)]
pub use crate::{bf16, f16};
#[cfg(not(target_arch = "spirv"))]
#[doc(no_inline)]
pub use crate::slice::{HalfBitsSliceExt, HalfFloatSliceExt};
#[cfg(feature = "alloc")]
#[doc(no_inline)]
pub use crate::vec::{HalfBitsVecExt, HalfFloatVecExt};
}
// Keep this module private to crate
mod private {
use crate::{bf16, f16};
pub trait SealedHalf {}
impl SealedHalf for f16 {}
impl SealedHalf for bf16 {}
}

vendor/half/src/num_traits.rs (vendored new file, 1550 lines; diff suppressed because it is too large)

vendor/half/src/rand_distr.rs (vendored new file, 125 lines)

@@ -0,0 +1,125 @@
use crate::{bf16, f16};
use rand::{distr::Distribution, Rng};
use rand_distr::uniform::UniformFloat;
macro_rules! impl_distribution_via_f32 {
($Ty:ty, $Distr:ty) => {
impl Distribution<$Ty> for $Distr {
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $Ty {
<$Ty>::from_f32(<Self as Distribution<f32>>::sample(self, rng))
}
}
};
}
impl_distribution_via_f32!(f16, rand_distr::StandardUniform);
impl_distribution_via_f32!(f16, rand_distr::StandardNormal);
impl_distribution_via_f32!(f16, rand_distr::Exp1);
impl_distribution_via_f32!(f16, rand_distr::Open01);
impl_distribution_via_f32!(f16, rand_distr::OpenClosed01);
impl_distribution_via_f32!(bf16, rand_distr::StandardUniform);
impl_distribution_via_f32!(bf16, rand_distr::StandardNormal);
impl_distribution_via_f32!(bf16, rand_distr::Exp1);
impl_distribution_via_f32!(bf16, rand_distr::Open01);
impl_distribution_via_f32!(bf16, rand_distr::OpenClosed01);
#[derive(Debug, Clone, Copy)]
pub struct Float16Sampler(UniformFloat<f32>);
impl rand_distr::uniform::SampleUniform for f16 {
type Sampler = Float16Sampler;
}
impl rand_distr::uniform::UniformSampler for Float16Sampler {
type X = f16;
fn new<B1, B2>(low: B1, high: B2) -> Result<Self, rand_distr::uniform::Error>
where
B1: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
B2: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
{
Ok(Self(UniformFloat::new(
low.borrow().to_f32(),
high.borrow().to_f32(),
)?))
}
fn new_inclusive<B1, B2>(low: B1, high: B2) -> Result<Self, rand_distr::uniform::Error>
where
B1: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
B2: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
{
Ok(Self(UniformFloat::new_inclusive(
low.borrow().to_f32(),
high.borrow().to_f32(),
)?))
}
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
f16::from_f32(self.0.sample(rng))
}
}
#[derive(Debug, Clone, Copy)]
pub struct BFloat16Sampler(UniformFloat<f32>);
impl rand_distr::uniform::SampleUniform for bf16 {
type Sampler = BFloat16Sampler;
}
impl rand_distr::uniform::UniformSampler for BFloat16Sampler {
type X = bf16;
fn new<B1, B2>(low: B1, high: B2) -> Result<Self, rand_distr::uniform::Error>
where
B1: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
B2: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
{
Ok(Self(UniformFloat::new(
low.borrow().to_f32(),
high.borrow().to_f32(),
)?))
}
fn new_inclusive<B1, B2>(low: B1, high: B2) -> Result<Self, rand_distr::uniform::Error>
where
B1: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
B2: rand_distr::uniform::SampleBorrow<Self::X> + Sized,
{
Ok(Self(UniformFloat::new_inclusive(
low.borrow().to_f32(),
high.borrow().to_f32(),
)?))
}
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
bf16::from_f32(self.0.sample(rng))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[allow(unused_imports)]
use rand::{rng, Rng};
use rand_distr::{StandardNormal, StandardUniform, Uniform};
#[test]
fn test_sample_f16() {
let mut rng = rng();
let _: f16 = rng.sample(StandardUniform);
let _: f16 = rng.sample(StandardNormal);
let _: f16 = rng.sample(Uniform::new(f16::from_f32(0.0), f16::from_f32(1.0)).unwrap());
#[cfg(feature = "num-traits")]
let _: f16 =
rng.sample(rand_distr::Normal::new(f16::from_f32(0.0), f16::from_f32(1.0)).unwrap());
}
#[test]
fn test_sample_bf16() {
let mut rng = rng();
let _: bf16 = rng.sample(StandardUniform);
let _: bf16 = rng.sample(StandardNormal);
let _: bf16 = rng.sample(Uniform::new(bf16::from_f32(0.0), bf16::from_f32(1.0)).unwrap());
#[cfg(feature = "num-traits")]
let _: bf16 =
rng.sample(rand_distr::Normal::new(bf16::from_f32(0.0), bf16::from_f32(1.0)).unwrap());
}
}

vendor/half/src/slice.rs (vendored new file, 845 lines)

@@ -0,0 +1,845 @@
//! Contains utility functions and traits to convert between slices of [`u16`] bits and [`struct@f16`] or
//! [`struct@bf16`] numbers.
//!
//! The utility [`HalfBitsSliceExt`] sealed extension trait is implemented for `[u16]` slices,
//! while the utility [`HalfFloatSliceExt`] sealed extension trait is implemented for both `[f16]`
//! and `[bf16]` slices. These traits provide efficient conversions and reinterpret casting of
//! larger buffers of floating point values, and are automatically included in the
//! [`prelude`][crate::prelude] module.
use crate::{bf16, binary16::arch, f16};
#[cfg(feature = "alloc")]
#[allow(unused_imports)]
use alloc::{vec, vec::Vec};
use core::slice;
/// Extensions to `[f16]` and `[bf16]` slices to support conversion and reinterpret operations.
///
/// This trait is sealed and cannot be implemented outside of this crate.
pub trait HalfFloatSliceExt: private::SealedHalfFloatSlice {
/// Reinterprets a slice of [`struct@f16`] or [`struct@bf16`] numbers as a slice of [`u16`] bits.
///
/// This is a zero-copy operation. The reinterpreted slice has the same lifetime and memory
/// location as `self`.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let float_buffer = [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)];
/// let int_buffer = float_buffer.reinterpret_cast();
///
/// assert_eq!(int_buffer, [float_buffer[0].to_bits(), float_buffer[1].to_bits(), float_buffer[2].to_bits()]);
/// ```
#[must_use]
fn reinterpret_cast(&self) -> &[u16];
/// Reinterprets a mutable slice of [`struct@f16`] or [`struct@bf16`] numbers as a mutable slice of [`u16`]
/// bits.
///
/// This is a zero-copy operation. The transmuted slice has the same lifetime as the original,
/// which prevents mutating `self` as long as the returned `&mut [u16]` is borrowed.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let mut float_buffer = [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)];
///
/// {
/// let int_buffer = float_buffer.reinterpret_cast_mut();
///
/// assert_eq!(int_buffer, [f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()]);
///
/// // Mutating the u16 slice also mutates the original float slice
/// int_buffer[0] = 0;
/// }
///
/// // Note that we need to drop int_buffer before using float_buffer again or we will get a borrow error.
/// assert_eq!(float_buffer, [f16::from_f32(0.), f16::from_f32(2.), f16::from_f32(3.)]);
/// ```
#[must_use]
fn reinterpret_cast_mut(&mut self) -> &mut [u16];
/// Converts all of the elements of a `[f32]` slice into [`struct@f16`] or [`struct@bf16`] values in `self`.
///
/// The length of `src` must be the same as `self`.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// # Panics
///
/// This function will panic if the two slices have different lengths.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// // Initialize an empty buffer
/// let mut buffer = [0u16; 4];
/// let buffer = buffer.reinterpret_cast_mut::<f16>();
///
/// let float_values = [1., 2., 3., 4.];
///
/// // Now convert
/// buffer.convert_from_f32_slice(&float_values);
///
/// assert_eq!(buffer, [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.), f16::from_f32(4.)]);
/// ```
fn convert_from_f32_slice(&mut self, src: &[f32]);
/// Converts all of the elements of a `[f64]` slice into [`struct@f16`] or [`struct@bf16`] values in `self`.
///
/// The length of `src` must be the same as `self`.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// # Panics
///
/// This function will panic if the two slices have different lengths.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// // Initialize an empty buffer
/// let mut buffer = [0u16; 4];
/// let buffer = buffer.reinterpret_cast_mut::<f16>();
///
/// let float_values = [1., 2., 3., 4.];
///
/// // Now convert
/// buffer.convert_from_f64_slice(&float_values);
///
/// assert_eq!(buffer, [f16::from_f64(1.), f16::from_f64(2.), f16::from_f64(3.), f16::from_f64(4.)]);
/// ```
fn convert_from_f64_slice(&mut self, src: &[f64]);
/// Converts all of the [`struct@f16`] or [`struct@bf16`] elements of `self` into [`f32`] values in `dst`.
///
/// The length of `dst` must be the same as `self`.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// # Panics
///
/// This function will panic if the two slices have different lengths.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// // Initialize an empty buffer
/// let mut buffer = [0f32; 4];
///
/// let half_values = [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.), f16::from_f32(4.)];
///
/// // Now convert
/// half_values.convert_to_f32_slice(&mut buffer);
///
/// assert_eq!(buffer, [1., 2., 3., 4.]);
/// ```
fn convert_to_f32_slice(&self, dst: &mut [f32]);
/// Converts all of the [`struct@f16`] or [`struct@bf16`] elements of `self` into [`f64`] values in `dst`.
///
/// The length of `dst` must be the same as `self`.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// # Panics
///
/// This function will panic if the two slices have different lengths.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// // Initialize an empty buffer
/// let mut buffer = [0f64; 4];
///
/// let half_values = [f16::from_f64(1.), f16::from_f64(2.), f16::from_f64(3.), f16::from_f64(4.)];
///
/// // Now convert
/// half_values.convert_to_f64_slice(&mut buffer);
///
/// assert_eq!(buffer, [1., 2., 3., 4.]);
/// ```
fn convert_to_f64_slice(&self, dst: &mut [f64]);
// Because the trait is sealed, we can get away with different interfaces between features.
/// Converts all of the [`struct@f16`] or [`struct@bf16`] elements of `self` into [`f32`] values in a new
/// vector
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// This method is only available with the `std` or `alloc` feature.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// let half_values = [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.), f16::from_f32(4.)];
/// let vec = half_values.to_f32_vec();
///
/// assert_eq!(vec, vec![1., 2., 3., 4.]);
/// ```
#[cfg(any(feature = "alloc", feature = "std"))]
#[must_use]
fn to_f32_vec(&self) -> Vec<f32>;
/// Converts all of the [`struct@f16`] or [`struct@bf16`] elements of `self` into [`f64`] values in a new
/// vector.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation](crate) for more information on hardware conversion
/// support.
///
/// This method is only available with the `std` or `alloc` feature.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// let half_values = [f16::from_f64(1.), f16::from_f64(2.), f16::from_f64(3.), f16::from_f64(4.)];
/// let vec = half_values.to_f64_vec();
///
/// assert_eq!(vec, vec![1., 2., 3., 4.]);
/// ```
    #[cfg(any(feature = "alloc", feature = "std"))]
#[must_use]
fn to_f64_vec(&self) -> Vec<f64>;
}
/// Extensions to `[u16]` slices to support reinterpret operations.
///
/// This trait is sealed and cannot be implemented outside of this crate.
pub trait HalfBitsSliceExt: private::SealedHalfBitsSlice {
/// Reinterprets a slice of [`u16`] bits as a slice of [`struct@f16`] or [`struct@bf16`] numbers.
///
/// `H` is the type to cast to, and must be either the [`struct@f16`] or [`struct@bf16`] type.
///
/// This is a zero-copy operation. The reinterpreted slice has the same lifetime and memory
/// location as `self`.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let int_buffer = [f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()];
/// let float_buffer: &[f16] = int_buffer.reinterpret_cast();
///
/// assert_eq!(float_buffer, [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)]);
///
/// // You may have to specify the cast type directly if the compiler can't infer the type.
/// // The following is also valid in Rust.
/// let typed_buffer = int_buffer.reinterpret_cast::<f16>();
/// ```
#[must_use]
fn reinterpret_cast<H>(&self) -> &[H]
where
H: crate::private::SealedHalf;
/// Reinterprets a mutable slice of [`u16`] bits as a mutable slice of [`struct@f16`] or [`struct@bf16`]
/// numbers.
///
/// `H` is the type to cast to, and must be either the [`struct@f16`] or [`struct@bf16`] type.
///
/// This is a zero-copy operation. The transmuted slice has the same lifetime as the original,
/// which prevents mutating `self` as long as the returned `&mut [f16]` is borrowed.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let mut int_buffer = [f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()];
///
/// {
/// let float_buffer: &mut [f16] = int_buffer.reinterpret_cast_mut();
///
/// assert_eq!(float_buffer, [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)]);
///
    ///     // Mutating the f16 slice will also mutate the original
/// float_buffer[0] = f16::from_f32(0.);
/// }
///
/// // Note that we need to drop float_buffer before using int_buffer again or we will get a borrow error.
/// assert_eq!(int_buffer, [f16::from_f32(0.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()]);
///
/// // You may have to specify the cast type directly if the compiler can't infer the type.
/// // The following is also valid in Rust.
/// let typed_buffer = int_buffer.reinterpret_cast_mut::<f16>();
/// ```
#[must_use]
fn reinterpret_cast_mut<H>(&mut self) -> &mut [H]
where
H: crate::private::SealedHalf;
}
mod private {
use crate::{bf16, f16};
pub trait SealedHalfFloatSlice {}
impl SealedHalfFloatSlice for [f16] {}
impl SealedHalfFloatSlice for [bf16] {}
pub trait SealedHalfBitsSlice {}
impl SealedHalfBitsSlice for [u16] {}
}
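// Note: the sealed-trait pattern above is what keeps these extension traits closed to outside
// implementations: `HalfFloatSliceExt` and `HalfBitsSliceExt` each require a supertrait that
// lives in this private module, so downstream crates can call the methods but cannot implement
// the traits themselves.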
impl HalfFloatSliceExt for [f16] {
#[inline]
fn reinterpret_cast(&self) -> &[u16] {
let pointer = self.as_ptr() as *const u16;
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts(pointer, length) }
}
#[inline]
fn reinterpret_cast_mut(&mut self) -> &mut [u16] {
let pointer = self.as_mut_ptr().cast::<u16>();
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts_mut(pointer, length) }
}
#[inline]
fn convert_from_f32_slice(&mut self, src: &[f32]) {
assert_eq!(
self.len(),
src.len(),
"destination and source slices have different lengths"
);
arch::f32_to_f16_slice(src, self.reinterpret_cast_mut())
}
#[inline]
fn convert_from_f64_slice(&mut self, src: &[f64]) {
assert_eq!(
self.len(),
src.len(),
"destination and source slices have different lengths"
);
arch::f64_to_f16_slice(src, self.reinterpret_cast_mut())
}
#[inline]
fn convert_to_f32_slice(&self, dst: &mut [f32]) {
assert_eq!(
self.len(),
dst.len(),
"destination and source slices have different lengths"
);
arch::f16_to_f32_slice(self.reinterpret_cast(), dst)
}
#[inline]
fn convert_to_f64_slice(&self, dst: &mut [f64]) {
assert_eq!(
self.len(),
dst.len(),
"destination and source slices have different lengths"
);
arch::f16_to_f64_slice(self.reinterpret_cast(), dst)
}
#[cfg(any(feature = "alloc", feature = "std"))]
#[inline]
#[allow(clippy::uninit_vec)]
fn to_f32_vec(&self) -> Vec<f32> {
let mut vec = vec![0f32; self.len()];
self.convert_to_f32_slice(&mut vec);
vec
}
#[cfg(any(feature = "alloc", feature = "std"))]
#[inline]
#[allow(clippy::uninit_vec)]
fn to_f64_vec(&self) -> Vec<f64> {
let mut vec = vec![0f64; self.len()];
self.convert_to_f64_slice(&mut vec);
vec
}
}
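// The `[bf16]` implementation below mirrors the `[f16]` one above, except that the slice
// conversions fall back to plain scalar loops, since no bf16 SIMD path is wired into `arch`
// yet (see the comments inside each conversion method).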
impl HalfFloatSliceExt for [bf16] {
#[inline]
fn reinterpret_cast(&self) -> &[u16] {
let pointer = self.as_ptr() as *const u16;
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts(pointer, length) }
}
#[inline]
fn reinterpret_cast_mut(&mut self) -> &mut [u16] {
let pointer = self.as_mut_ptr().cast::<u16>();
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts_mut(pointer, length) }
}
#[inline]
fn convert_from_f32_slice(&mut self, src: &[f32]) {
assert_eq!(
self.len(),
src.len(),
"destination and source slices have different lengths"
);
        // Just use a regular loop here until there's bf16 SIMD support.
for (i, f) in src.iter().enumerate() {
self[i] = bf16::from_f32(*f);
}
}
#[inline]
fn convert_from_f64_slice(&mut self, src: &[f64]) {
assert_eq!(
self.len(),
src.len(),
"destination and source slices have different lengths"
);
        // Just use a regular loop here until there's bf16 SIMD support.
for (i, f) in src.iter().enumerate() {
self[i] = bf16::from_f64(*f);
}
}
#[inline]
fn convert_to_f32_slice(&self, dst: &mut [f32]) {
assert_eq!(
self.len(),
dst.len(),
"destination and source slices have different lengths"
);
        // Just use a regular loop here until there's bf16 SIMD support.
for (i, f) in self.iter().enumerate() {
dst[i] = f.to_f32();
}
}
#[inline]
fn convert_to_f64_slice(&self, dst: &mut [f64]) {
assert_eq!(
self.len(),
dst.len(),
"destination and source slices have different lengths"
);
        // Just use a regular loop here until there's bf16 SIMD support.
for (i, f) in self.iter().enumerate() {
dst[i] = f.to_f64();
}
}
#[cfg(any(feature = "alloc", feature = "std"))]
#[inline]
#[allow(clippy::uninit_vec)]
fn to_f32_vec(&self) -> Vec<f32> {
let mut vec = vec![0f32; self.len()];
self.convert_to_f32_slice(&mut vec);
vec
}
#[cfg(any(feature = "alloc", feature = "std"))]
#[inline]
#[allow(clippy::uninit_vec)]
fn to_f64_vec(&self) -> Vec<f64> {
let mut vec = vec![0f64; self.len()];
self.convert_to_f64_slice(&mut vec);
vec
}
}
impl HalfBitsSliceExt for [u16] {
// Since we sealed all the traits involved, these are safe.
#[inline]
fn reinterpret_cast<H>(&self) -> &[H]
where
H: crate::private::SealedHalf,
{
let pointer = self.as_ptr() as *const H;
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts(pointer, length) }
}
#[inline]
fn reinterpret_cast_mut<H>(&mut self) -> &mut [H]
where
H: crate::private::SealedHalf,
{
let pointer = self.as_mut_ptr() as *mut H;
let length = self.len();
        // SAFETY: We are reconstructing the full length of the original slice, reusing its
        // lifetime, and the element size is identical
unsafe { slice::from_raw_parts_mut(pointer, length) }
}
}
#[allow(clippy::float_cmp)]
#[cfg(test)]
mod test {
use super::{HalfBitsSliceExt, HalfFloatSliceExt};
use crate::{bf16, f16};
#[test]
fn test_slice_conversions_f16() {
let bits = &[
f16::E.to_bits(),
f16::PI.to_bits(),
f16::EPSILON.to_bits(),
f16::FRAC_1_SQRT_2.to_bits(),
];
let numbers = &[f16::E, f16::PI, f16::EPSILON, f16::FRAC_1_SQRT_2];
// Convert from bits to numbers
let from_bits = bits.reinterpret_cast::<f16>();
assert_eq!(from_bits, numbers);
// Convert from numbers back to bits
let to_bits = from_bits.reinterpret_cast();
assert_eq!(to_bits, bits);
}
#[test]
    fn test_mutability_f16() {
let mut bits_array = [f16::PI.to_bits()];
let bits = &mut bits_array[..];
{
// would not compile without these braces
let numbers = bits.reinterpret_cast_mut();
numbers[0] = f16::E;
}
assert_eq!(bits, &[f16::E.to_bits()]);
bits[0] = f16::LN_2.to_bits();
assert_eq!(bits, &[f16::LN_2.to_bits()]);
}
#[test]
fn test_slice_conversions_bf16() {
let bits = &[
bf16::E.to_bits(),
bf16::PI.to_bits(),
bf16::EPSILON.to_bits(),
bf16::FRAC_1_SQRT_2.to_bits(),
];
let numbers = &[bf16::E, bf16::PI, bf16::EPSILON, bf16::FRAC_1_SQRT_2];
// Convert from bits to numbers
let from_bits = bits.reinterpret_cast::<bf16>();
assert_eq!(from_bits, numbers);
// Convert from numbers back to bits
let to_bits = from_bits.reinterpret_cast();
assert_eq!(to_bits, bits);
}
#[test]
    fn test_mutability_bf16() {
let mut bits_array = [bf16::PI.to_bits()];
let bits = &mut bits_array[..];
{
// would not compile without these braces
let numbers = bits.reinterpret_cast_mut();
numbers[0] = bf16::E;
}
assert_eq!(bits, &[bf16::E.to_bits()]);
bits[0] = bf16::LN_2.to_bits();
assert_eq!(bits, &[bf16::LN_2.to_bits()]);
}
#[test]
fn slice_convert_f16_f32() {
// Exact chunks
let vf32 = [1., 2., 3., 4., 5., 6., 7., 8.];
let vf16 = [
f16::from_f32(1.),
f16::from_f32(2.),
f16::from_f32(3.),
f16::from_f32(4.),
f16::from_f32(5.),
f16::from_f32(6.),
f16::from_f32(7.),
f16::from_f32(8.),
];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
// Partial with chunks
let vf32 = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
let vf16 = [
f16::from_f32(1.),
f16::from_f32(2.),
f16::from_f32(3.),
f16::from_f32(4.),
f16::from_f32(5.),
f16::from_f32(6.),
f16::from_f32(7.),
f16::from_f32(8.),
f16::from_f32(9.),
];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
        // Partial without a complete chunk
let vf32 = [1., 2.];
let vf16 = [f16::from_f32(1.), f16::from_f32(2.)];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
}
#[test]
fn slice_convert_bf16_f32() {
// Exact chunks
let vf32 = [1., 2., 3., 4., 5., 6., 7., 8.];
let vf16 = [
bf16::from_f32(1.),
bf16::from_f32(2.),
bf16::from_f32(3.),
bf16::from_f32(4.),
bf16::from_f32(5.),
bf16::from_f32(6.),
bf16::from_f32(7.),
bf16::from_f32(8.),
];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
// Partial with chunks
let vf32 = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
let vf16 = [
bf16::from_f32(1.),
bf16::from_f32(2.),
bf16::from_f32(3.),
bf16::from_f32(4.),
bf16::from_f32(5.),
bf16::from_f32(6.),
bf16::from_f32(7.),
bf16::from_f32(8.),
bf16::from_f32(9.),
];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
        // Partial without a complete chunk
let vf32 = [1., 2.];
let vf16 = [bf16::from_f32(1.), bf16::from_f32(2.)];
let mut buf32 = vf32;
let mut buf16 = vf16;
vf16.convert_to_f32_slice(&mut buf32);
assert_eq!(&vf32, &buf32);
buf16.convert_from_f32_slice(&vf32);
assert_eq!(&vf16, &buf16);
}
#[test]
fn slice_convert_f16_f64() {
// Exact chunks
let vf64 = [1., 2., 3., 4., 5., 6., 7., 8.];
let vf16 = [
f16::from_f64(1.),
f16::from_f64(2.),
f16::from_f64(3.),
f16::from_f64(4.),
f16::from_f64(5.),
f16::from_f64(6.),
f16::from_f64(7.),
f16::from_f64(8.),
];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
// Partial with chunks
let vf64 = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
let vf16 = [
f16::from_f64(1.),
f16::from_f64(2.),
f16::from_f64(3.),
f16::from_f64(4.),
f16::from_f64(5.),
f16::from_f64(6.),
f16::from_f64(7.),
f16::from_f64(8.),
f16::from_f64(9.),
];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
        // Partial without a complete chunk
let vf64 = [1., 2.];
let vf16 = [f16::from_f64(1.), f16::from_f64(2.)];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
}
#[test]
fn slice_convert_bf16_f64() {
// Exact chunks
let vf64 = [1., 2., 3., 4., 5., 6., 7., 8.];
let vf16 = [
bf16::from_f64(1.),
bf16::from_f64(2.),
bf16::from_f64(3.),
bf16::from_f64(4.),
bf16::from_f64(5.),
bf16::from_f64(6.),
bf16::from_f64(7.),
bf16::from_f64(8.),
];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
// Partial with chunks
let vf64 = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
let vf16 = [
bf16::from_f64(1.),
bf16::from_f64(2.),
bf16::from_f64(3.),
bf16::from_f64(4.),
bf16::from_f64(5.),
bf16::from_f64(6.),
bf16::from_f64(7.),
bf16::from_f64(8.),
bf16::from_f64(9.),
];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
        // Partial without a complete chunk
let vf64 = [1., 2.];
let vf16 = [bf16::from_f64(1.), bf16::from_f64(2.)];
let mut buf64 = vf64;
let mut buf16 = vf16;
vf16.convert_to_f64_slice(&mut buf64);
assert_eq!(&vf64, &buf64);
buf16.convert_from_f64_slice(&vf64);
assert_eq!(&vf16, &buf16);
}
#[test]
#[should_panic]
fn convert_from_f32_slice_len_mismatch_panics() {
let mut slice1 = [f16::ZERO; 3];
let slice2 = [0f32; 4];
slice1.convert_from_f32_slice(&slice2);
}
#[test]
#[should_panic]
fn convert_from_f64_slice_len_mismatch_panics() {
let mut slice1 = [f16::ZERO; 3];
let slice2 = [0f64; 4];
slice1.convert_from_f64_slice(&slice2);
}
#[test]
#[should_panic]
fn convert_to_f32_slice_len_mismatch_panics() {
let slice1 = [f16::ZERO; 3];
let mut slice2 = [0f32; 4];
slice1.convert_to_f32_slice(&mut slice2);
}
#[test]
#[should_panic]
fn convert_to_f64_slice_len_mismatch_panics() {
let slice1 = [f16::ZERO; 3];
let mut slice2 = [0f64; 4];
slice1.convert_to_f64_slice(&mut slice2);
}
}

260
vendor/half/src/vec.rs vendored Normal file
View File

@@ -0,0 +1,260 @@
//! Contains utility functions and traits to convert between vectors of [`u16`] bits and [`struct@f16`] or
//! [`bf16`] vectors.
//!
//! The utility [`HalfBitsVecExt`] sealed extension trait is implemented for [`Vec<u16>`] vectors,
//! while the utility [`HalfFloatVecExt`] sealed extension trait is implemented for both
//! [`Vec<f16>`] and [`Vec<bf16>`] vectors. These traits provide efficient conversions and
//! reinterpret casting of larger buffers of floating point values, and are automatically included
//! in the [`prelude`][crate::prelude] module.
//!
//! This module is only available with the `std` or `alloc` feature.
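//!
//! As a quick, illustrative sketch of how these traits compose (each operation is documented
//! in detail on the traits below), a round trip between f32 values, `Vec<f16>`, and raw
//! `Vec<u16>` bit patterns might look like this:
//!
//! ```rust
//! # use half::prelude::*;
//! // Convert a slice of f32 values into a new Vec<f16>.
//! let halves: Vec<f16> = Vec::from_f32_slice(&[1., 2., 3.]);
//! // Reinterpret the same allocation as raw u16 bit patterns, and back again.
//! let bits: Vec<u16> = halves.reinterpret_into();
//! let halves_again = bits.reinterpret_into::<f16>();
//! assert_eq!(halves_again, vec![f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)]);
//! ```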
use super::{bf16, f16, slice::HalfFloatSliceExt};
#[cfg(feature = "alloc")]
#[allow(unused_imports)]
use alloc::{vec, vec::Vec};
use core::mem;
/// Extensions to [`Vec<f16>`] and [`Vec<bf16>`] to support reinterpret operations.
///
/// This trait is sealed and cannot be implemented outside of this crate.
pub trait HalfFloatVecExt: private::SealedHalfFloatVec {
    /// Reinterprets a vector of [`struct@f16`] or [`bf16`] numbers as a vector of [`u16`] bits.
///
/// This is a zero-copy operation. The reinterpreted vector has the same memory location as
/// `self`.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let float_buffer = vec![f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)];
/// let int_buffer = float_buffer.reinterpret_into();
///
/// assert_eq!(int_buffer, [f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()]);
/// ```
#[must_use]
fn reinterpret_into(self) -> Vec<u16>;
/// Converts all of the elements of a `[f32]` slice into a new [`struct@f16`] or [`bf16`] vector.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation][crate] for more information on hardware conversion
/// support.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// let float_values = [1., 2., 3., 4.];
/// let vec: Vec<f16> = Vec::from_f32_slice(&float_values);
///
/// assert_eq!(vec, vec![f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.), f16::from_f32(4.)]);
/// ```
#[must_use]
fn from_f32_slice(slice: &[f32]) -> Self;
/// Converts all of the elements of a `[f64]` slice into a new [`struct@f16`] or [`bf16`] vector.
///
/// The conversion operation is vectorized over the slice, meaning the conversion may be more
/// efficient than converting individual elements on some hardware that supports SIMD
/// conversions. See [crate documentation][crate] for more information on hardware conversion
/// support.
///
/// # Examples
/// ```rust
/// # use half::prelude::*;
/// let float_values = [1., 2., 3., 4.];
/// let vec: Vec<f16> = Vec::from_f64_slice(&float_values);
///
/// assert_eq!(vec, vec![f16::from_f64(1.), f16::from_f64(2.), f16::from_f64(3.), f16::from_f64(4.)]);
/// ```
#[must_use]
fn from_f64_slice(slice: &[f64]) -> Self;
}
/// Extensions to [`Vec<u16>`] to support reinterpret operations.
///
/// This trait is sealed and cannot be implemented outside of this crate.
pub trait HalfBitsVecExt: private::SealedHalfBitsVec {
/// Reinterprets a vector of [`u16`] bits as a vector of [`struct@f16`] or [`bf16`] numbers.
///
/// `H` is the type to cast to, and must be either the [`struct@f16`] or [`bf16`] type.
///
/// This is a zero-copy operation. The reinterpreted vector has the same memory location as
/// `self`.
///
/// # Examples
///
/// ```rust
/// # use half::prelude::*;
/// let int_buffer = vec![f16::from_f32(1.).to_bits(), f16::from_f32(2.).to_bits(), f16::from_f32(3.).to_bits()];
/// let float_buffer = int_buffer.reinterpret_into::<f16>();
///
/// assert_eq!(float_buffer, [f16::from_f32(1.), f16::from_f32(2.), f16::from_f32(3.)]);
/// ```
#[must_use]
fn reinterpret_into<H>(self) -> Vec<H>
where
H: crate::private::SealedHalf;
}
mod private {
use crate::{bf16, f16};
#[cfg(feature = "alloc")]
#[allow(unused_imports)]
use alloc::vec::Vec;
pub trait SealedHalfFloatVec {}
impl SealedHalfFloatVec for Vec<f16> {}
impl SealedHalfFloatVec for Vec<bf16> {}
pub trait SealedHalfBitsVec {}
impl SealedHalfBitsVec for Vec<u16> {}
}
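// Note on the `reinterpret_into` implementations below: each one forgets the source vector and
// rebuilds it with `Vec::from_raw_parts`, relying on `f16` and `bf16` being single-field
// wrappers around `u16` (identical size and alignment), as the inline comments spell out. With
// that layout guarantee, the length, capacity, and allocation carry over unchanged.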
impl HalfFloatVecExt for Vec<f16> {
#[inline]
fn reinterpret_into(mut self) -> Vec<u16> {
        // An f16 vector has the same length and capacity as a u16 vector
let length = self.len();
let capacity = self.capacity();
// Actually reinterpret the contents of the Vec<f16> as u16,
// knowing that structs are represented as only their members in memory,
// which is the u16 part of `f16(u16)`
let pointer = self.as_mut_ptr() as *mut u16;
        // Prevent running a destructor on the old Vec<f16>, so its allocation won't be freed
mem::forget(self);
        // Finally construct a new Vec<u16> from the raw pointer
        // SAFETY: We are reconstructing the full length and capacity of the original vector,
        // using its original pointer, and the element size is identical.
unsafe { Vec::from_raw_parts(pointer, length, capacity) }
}
#[allow(clippy::uninit_vec)]
fn from_f32_slice(slice: &[f32]) -> Self {
let mut vec = vec![f16::from_bits(0); slice.len()];
vec.convert_from_f32_slice(slice);
vec
}
#[allow(clippy::uninit_vec)]
fn from_f64_slice(slice: &[f64]) -> Self {
let mut vec = vec![f16::from_bits(0); slice.len()];
vec.convert_from_f64_slice(slice);
vec
}
}
impl HalfFloatVecExt for Vec<bf16> {
#[inline]
fn reinterpret_into(mut self) -> Vec<u16> {
        // A bf16 vector has the same length and capacity as a u16 vector
let length = self.len();
let capacity = self.capacity();
        // Actually reinterpret the contents of the Vec<bf16> as u16,
        // knowing that structs are represented as only their members in memory,
        // which is the u16 part of `bf16(u16)`
let pointer = self.as_mut_ptr() as *mut u16;
        // Prevent running a destructor on the old Vec<bf16>, so its allocation won't be freed
mem::forget(self);
        // Finally construct a new Vec<u16> from the raw pointer
        // SAFETY: We are reconstructing the full length and capacity of the original vector,
        // using its original pointer, and the element size is identical.
unsafe { Vec::from_raw_parts(pointer, length, capacity) }
}
#[allow(clippy::uninit_vec)]
fn from_f32_slice(slice: &[f32]) -> Self {
let mut vec = vec![bf16::from_bits(0); slice.len()];
vec.convert_from_f32_slice(slice);
vec
}
#[allow(clippy::uninit_vec)]
fn from_f64_slice(slice: &[f64]) -> Self {
let mut vec = vec![bf16::from_bits(0); slice.len()];
vec.convert_from_f64_slice(slice);
vec
}
}
impl HalfBitsVecExt for Vec<u16> {
// This is safe because all traits are sealed
#[inline]
fn reinterpret_into<H>(mut self) -> Vec<H>
where
H: crate::private::SealedHalf,
{
        // An f16 or bf16 vector has the same length and capacity as a u16 vector
let length = self.len();
let capacity = self.capacity();
        // Actually reinterpret the contents of the Vec<u16> as the target half type,
        // knowing that these structs are represented as only their u16 member in memory,
        // i.e. the u16 part of `f16(u16)` or `bf16(u16)`
let pointer = self.as_mut_ptr() as *mut H;
        // Prevent running a destructor on the old Vec<u16>, so its allocation won't be freed
mem::forget(self);
        // Finally construct a new Vec<H> from the raw pointer
        // SAFETY: We are reconstructing the full length and capacity of the original vector,
        // using its original pointer, and the element size is identical.
unsafe { Vec::from_raw_parts(pointer, length, capacity) }
}
}
#[cfg(test)]
mod test {
use super::{HalfBitsVecExt, HalfFloatVecExt};
use crate::{bf16, f16};
#[cfg(all(feature = "alloc", not(feature = "std")))]
use alloc::vec;
#[test]
fn test_vec_conversions_f16() {
let numbers = vec![f16::E, f16::PI, f16::EPSILON, f16::FRAC_1_SQRT_2];
let bits = vec![
f16::E.to_bits(),
f16::PI.to_bits(),
f16::EPSILON.to_bits(),
f16::FRAC_1_SQRT_2.to_bits(),
];
let bits_cloned = bits.clone();
// Convert from bits to numbers
let from_bits = bits.reinterpret_into::<f16>();
assert_eq!(&from_bits[..], &numbers[..]);
// Convert from numbers back to bits
let to_bits = from_bits.reinterpret_into();
assert_eq!(&to_bits[..], &bits_cloned[..]);
}
#[test]
fn test_vec_conversions_bf16() {
let numbers = vec![bf16::E, bf16::PI, bf16::EPSILON, bf16::FRAC_1_SQRT_2];
let bits = vec![
bf16::E.to_bits(),
bf16::PI.to_bits(),
bf16::EPSILON.to_bits(),
bf16::FRAC_1_SQRT_2.to_bits(),
];
let bits_cloned = bits.clone();
// Convert from bits to numbers
let from_bits = bits.reinterpret_into::<bf16>();
assert_eq!(&from_bits[..], &numbers[..]);
// Convert from numbers back to bits
let to_bits = from_bits.reinterpret_into();
assert_eq!(&to_bits[..], &bits_cloned[..]);
}
}