Files
another-boids-in-rust/vendor/moxcms/src/conversions/neon/rgb_xyz_opt.rs

424 lines
18 KiB
Rust

/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::rgb_xyz::NeonAlignedU16;
use crate::conversions::neon::rgb_xyz_q2_13::{split_by_twos, split_by_twos_mut};
use crate::conversions::rgbxyz::TransformMatrixShaperOptimized;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
/// NEON (aarch64) matrix-shaper transform executor for RGB(A) → RGB(A)
/// conversions using the "optimized" profile representation
/// ([`TransformMatrixShaperOptimized`]).
///
/// Const parameters:
/// * `SRC_LAYOUT` / `DST_LAYOUT` — `Layout` discriminants describing the source
///   and destination pixel layouts (channel count and channel order).
/// * `LINEAR_CAP` — number of entries in the linearization lookup table.
/// * `GAMMA_LUT` — number of entries in the gamma (output-encoding) lookup table.
pub(crate) struct TransformShaperRgbOptNeon<
    T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
    const SRC_LAYOUT: u8,
    const DST_LAYOUT: u8,
    const LINEAR_CAP: usize,
    const GAMMA_LUT: usize,
> {
    // Matrix-shaper profile: linearization LUT (f32 entries — see the
    // `vld1q_dup_f32` loads in `transform`), adaptation matrix, and gamma LUT.
    pub(crate) profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
    // Bit depth of samples `T`; used to derive the maximum code value, which
    // doubles as the opaque alpha when the source has no alpha channel.
    pub(crate) bit_depth: usize,
}
impl<
    T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
    const SRC_LAYOUT: u8,
    const DST_LAYOUT: u8,
    const LINEAR_CAP: usize,
    const GAMMA_LUT: usize,
> TransformExecutor<T>
    for TransformShaperRgbOptNeon<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
    u32: AsPrimitive<T>,
{
    /// Applies the matrix-shaper pipeline to every pixel of `src`, writing the
    /// result into `dst`: linearize (per-channel LUT) → 3×3 matrix multiply
    /// (NEON FMA) → gamma encode (per-channel LUT), with alpha passed through
    /// (or set to the maximum code value when the source has no alpha).
    ///
    /// The bulk of the work runs four pixels per iteration: the input is split
    /// into two halves that are walked in lock-step (two pixels from each half)
    /// to expose more instruction-level parallelism, and the loop is software
    /// pipelined — LUT loads for iteration i+1 are issued before the stores of
    /// iteration i.
    ///
    /// # Errors
    /// * `CmsError::LaneSizeMismatch` — `src` and `dst` hold a different number
    ///   of pixels.
    /// * `CmsError::LaneMultipleOfChannels` — either slice length is not a
    ///   multiple of its layout's channel count.
    fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
        let src_cn = Layout::from(SRC_LAYOUT);
        let dst_cn = Layout::from(DST_LAYOUT);
        let src_channels = src_cn.channels();
        let dst_channels = dst_cn.channels();
        // 16-byte-aligned scratch buffers. Each receives four u32 lanes from a
        // `vst1q_u32`; the low u16 of lane N is then read back at index 2*N
        // (aarch64 is little-endian), which is valid because results were
        // clamped to GAMMA_LUT - 1 before conversion.
        let mut temporary0 = NeonAlignedU16([0; 8]);
        let mut temporary1 = NeonAlignedU16([0; 8]);
        let mut temporary2 = NeonAlignedU16([0; 8]);
        let mut temporary3 = NeonAlignedU16([0; 8]);
        if src.len() / src_channels != dst.len() / dst_channels {
            return Err(CmsError::LaneSizeMismatch);
        }
        if src.len() % src_channels != 0 {
            return Err(CmsError::LaneMultipleOfChannels);
        }
        if dst.len() % dst_channels != 0 {
            return Err(CmsError::LaneMultipleOfChannels);
        }
        // Transposed adaptation matrix: its rows become the per-input-channel
        // coefficient vectors m0/m1/m2 so that XYZ = r*m0 + g*m1 + b*m2.
        let t = self.profile.adaptation_matrix.transpose();
        // Scale from normalized [0, 1] output to gamma-LUT index space.
        let scale = (GAMMA_LUT - 1) as f32;
        // Maximum code value for the configured bit depth (opaque alpha).
        let max_colors: T = ((1 << self.bit_depth) - 1).as_();
        // Split both slices into a body holding an even number of pixels and a
        // scalar remainder that is handled one pixel at a time at the end.
        let (src_chunks, src_remainder) = split_by_twos(src, src_channels);
        let (dst_chunks, dst_remainder) = split_by_twos_mut(dst, dst_channels);
        // SAFETY: NEON intrinsics only; this module is compiled solely for
        // aarch64 (see the `std::arch::aarch64` import), where NEON is
        // mandatory. Vector stores target the aligned scratch buffers above,
        // and all slice accesses are ordinary bounds-checked indexing.
        unsafe {
            // Matrix rows as f32x4 vectors; the 4th lane is 0 and is ignored.
            let m0 = vld1q_f32([t.v[0][0], t.v[0][1], t.v[0][2], 0.].as_ptr());
            let m1 = vld1q_f32([t.v[1][0], t.v[1][1], t.v[1][2], 0.].as_ptr());
            let m2 = vld1q_f32([t.v[2][0], t.v[2][1], t.v[2][2], 0.].as_ptr());
            let v_scale = vdupq_n_f32(scale);
            // +0.5 bias implements round-to-nearest ahead of the truncating
            // f32 → u32 conversion (`vcvtq_u32_f32`).
            let rnd = vdupq_n_f32(0.5);
            if !src_chunks.is_empty() {
                // Process the two halves of the image interleaved (two pixels
                // from each half per iteration) for better ILP.
                let (src0, src1) = src_chunks.split_at(src_chunks.len() / 2);
                let (dst0, dst1) = dst_chunks.split_at_mut(dst_chunks.len() / 2);
                let mut src_iter0 = src0.chunks_exact(src_channels * 2);
                let mut src_iter1 = src1.chunks_exact(src_channels * 2);
                // Pipeline registers: linearized r/g/b (broadcast to all four
                // lanes) and alpha for the four in-flight pixels.
                let (mut r0, mut g0, mut b0, mut a0);
                let (mut r1, mut g1, mut b1, mut a1);
                let (mut r2, mut g2, mut b2, mut a2);
                let (mut r3, mut g3, mut b3, mut a3);
                // Pipeline prologue: pre-load the first four pixels' linearized
                // values so the main loop can overlap compute with the next
                // iteration's LUT loads.
                if let (Some(src0), Some(src1)) = (src_iter0.next(), src_iter1.next()) {
                    let r0p = &self.profile.linear[src0[src_cn.r_i()]._as_usize()];
                    let g0p = &self.profile.linear[src0[src_cn.g_i()]._as_usize()];
                    let b0p = &self.profile.linear[src0[src_cn.b_i()]._as_usize()];
                    let r1p = &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()];
                    let g1p = &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()];
                    let b1p = &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()];
                    let r2p = &self.profile.linear[src1[src_cn.r_i()]._as_usize()];
                    let g2p = &self.profile.linear[src1[src_cn.g_i()]._as_usize()];
                    let b2p = &self.profile.linear[src1[src_cn.b_i()]._as_usize()];
                    let r3p = &self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()];
                    let g3p = &self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()];
                    let b3p = &self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()];
                    // Broadcast each linearized scalar across all four lanes.
                    r0 = vld1q_dup_f32(r0p);
                    g0 = vld1q_dup_f32(g0p);
                    b0 = vld1q_dup_f32(b0p);
                    r1 = vld1q_dup_f32(r1p);
                    g1 = vld1q_dup_f32(g1p);
                    b1 = vld1q_dup_f32(b1p);
                    r2 = vld1q_dup_f32(r2p);
                    g2 = vld1q_dup_f32(g2p);
                    b2 = vld1q_dup_f32(b2p);
                    r3 = vld1q_dup_f32(r3p);
                    g3 = vld1q_dup_f32(g3p);
                    b3 = vld1q_dup_f32(b3p);
                    // Source alpha, or full opacity for 3-channel layouts.
                    a0 = if src_channels == 4 {
                        src0[src_cn.a_i()]
                    } else {
                        max_colors
                    };
                    a1 = if src_channels == 4 {
                        src0[src_cn.a_i() + src_channels]
                    } else {
                        max_colors
                    };
                    a2 = if src_channels == 4 {
                        src1[src_cn.a_i()]
                    } else {
                        max_colors
                    };
                    a3 = if src_channels == 4 {
                        src1[src_cn.a_i() + src_channels]
                    } else {
                        max_colors
                    };
                } else {
                    // No pairs to process; initialize so the (empty) loop and
                    // the epilogue's `if let` see defined values.
                    r0 = vdupq_n_f32(0.);
                    g0 = vdupq_n_f32(0.);
                    b0 = vdupq_n_f32(0.);
                    r1 = vdupq_n_f32(0.);
                    g1 = vdupq_n_f32(0.);
                    b1 = vdupq_n_f32(0.);
                    r2 = vdupq_n_f32(0.);
                    g2 = vdupq_n_f32(0.);
                    b2 = vdupq_n_f32(0.);
                    r3 = vdupq_n_f32(0.);
                    g3 = vdupq_n_f32(0.);
                    b3 = vdupq_n_f32(0.);
                    a0 = max_colors;
                    a1 = max_colors;
                    a2 = max_colors;
                    a3 = max_colors;
                }
                // Main pipelined loop: each iteration (1) transforms the four
                // pixels loaded by the *previous* iteration, (2) loads the next
                // four pixels, (3) stores the transformed results. Note the
                // source iterators are one step ahead of the destination ones.
                for (((src0, src1), dst0), dst1) in src_iter0
                    .zip(src_iter1)
                    .zip(dst0.chunks_exact_mut(dst_channels * 2))
                    .zip(dst1.chunks_exact_mut(dst_channels * 2))
                {
                    // XYZ = r*m0 + g*m1 + b*m2 for each of the four pixels.
                    let v0_0 = vmulq_f32(r0, m0);
                    let v0_1 = vmulq_f32(r1, m0);
                    let v0_2 = vmulq_f32(r2, m0);
                    let v0_3 = vmulq_f32(r3, m0);
                    let v1_0 = vfmaq_f32(v0_0, g0, m1);
                    let v1_1 = vfmaq_f32(v0_1, g1, m1);
                    let v1_2 = vfmaq_f32(v0_2, g2, m1);
                    let v1_3 = vfmaq_f32(v0_3, g3, m1);
                    let mut vr0 = vfmaq_f32(v1_0, b0, m2);
                    let mut vr1 = vfmaq_f32(v1_1, b1, m2);
                    let mut vr2 = vfmaq_f32(v1_2, b2, m2);
                    let mut vr3 = vfmaq_f32(v1_3, b3, m2);
                    // Scale to LUT index space with round-to-nearest bias.
                    vr0 = vfmaq_f32(rnd, vr0, v_scale);
                    vr1 = vfmaq_f32(rnd, vr1, v_scale);
                    vr2 = vfmaq_f32(rnd, vr2, v_scale);
                    vr3 = vfmaq_f32(rnd, vr3, v_scale);
                    // Clamp to the last gamma-LUT entry; the float → u32
                    // conversion below saturates negatives to 0, so indices
                    // stay within [0, GAMMA_LUT - 1].
                    vr0 = vminq_f32(vr0, v_scale);
                    vr1 = vminq_f32(vr1, v_scale);
                    vr2 = vminq_f32(vr2, v_scale);
                    vr3 = vminq_f32(vr3, v_scale);
                    let zx0 = vcvtq_u32_f32(vr0);
                    let zx1 = vcvtq_u32_f32(vr1);
                    let zx2 = vcvtq_u32_f32(vr2);
                    let zx3 = vcvtq_u32_f32(vr3);
                    // Spill lane indices to scratch so they can be read back as
                    // scalars for the gamma-LUT gather below.
                    vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx0);
                    vst1q_u32(temporary1.0.as_mut_ptr() as *mut _, zx1);
                    vst1q_u32(temporary2.0.as_mut_ptr() as *mut _, zx2);
                    vst1q_u32(temporary3.0.as_mut_ptr() as *mut _, zx3);
                    // Prefetch stage: load the NEXT iteration's linearized
                    // values while this iteration's results are still pending.
                    let r0p = &self.profile.linear[src0[src_cn.r_i()]._as_usize()];
                    let g0p = &self.profile.linear[src0[src_cn.g_i()]._as_usize()];
                    let b0p = &self.profile.linear[src0[src_cn.b_i()]._as_usize()];
                    let r1p = &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()];
                    let g1p = &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()];
                    let b1p = &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()];
                    let r2p = &self.profile.linear[src1[src_cn.r_i()]._as_usize()];
                    let g2p = &self.profile.linear[src1[src_cn.g_i()]._as_usize()];
                    let b2p = &self.profile.linear[src1[src_cn.b_i()]._as_usize()];
                    let r3p = &self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()];
                    let g3p = &self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()];
                    let b3p = &self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()];
                    r0 = vld1q_dup_f32(r0p);
                    g0 = vld1q_dup_f32(g0p);
                    b0 = vld1q_dup_f32(b0p);
                    r1 = vld1q_dup_f32(r1p);
                    g1 = vld1q_dup_f32(g1p);
                    b1 = vld1q_dup_f32(b1p);
                    r2 = vld1q_dup_f32(r2p);
                    g2 = vld1q_dup_f32(g2p);
                    b2 = vld1q_dup_f32(b2p);
                    r3 = vld1q_dup_f32(r3p);
                    g3 = vld1q_dup_f32(g3p);
                    b3 = vld1q_dup_f32(b3p);
                    // Gamma-encode and store this iteration's four pixels.
                    // Indices 0/2/4 pick the low u16 of u32 lanes 0/1/2.
                    dst0[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
                    dst0[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
                    dst0[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
                    if dst_channels == 4 {
                        dst0[dst_cn.a_i()] = a0;
                    }
                    dst0[dst_cn.r_i() + dst_channels] =
                        self.profile.gamma[temporary1.0[0] as usize];
                    dst0[dst_cn.g_i() + dst_channels] =
                        self.profile.gamma[temporary1.0[2] as usize];
                    dst0[dst_cn.b_i() + dst_channels] =
                        self.profile.gamma[temporary1.0[4] as usize];
                    if dst_channels == 4 {
                        dst0[dst_cn.a_i() + dst_channels] = a1;
                    }
                    dst1[dst_cn.r_i()] = self.profile.gamma[temporary2.0[0] as usize];
                    dst1[dst_cn.g_i()] = self.profile.gamma[temporary2.0[2] as usize];
                    dst1[dst_cn.b_i()] = self.profile.gamma[temporary2.0[4] as usize];
                    if dst_channels == 4 {
                        dst1[dst_cn.a_i()] = a2;
                    }
                    dst1[dst_cn.r_i() + dst_channels] =
                        self.profile.gamma[temporary3.0[0] as usize];
                    dst1[dst_cn.g_i() + dst_channels] =
                        self.profile.gamma[temporary3.0[2] as usize];
                    dst1[dst_cn.b_i() + dst_channels] =
                        self.profile.gamma[temporary3.0[4] as usize];
                    if dst_channels == 4 {
                        dst1[dst_cn.a_i() + dst_channels] = a3;
                    }
                    // Carry the next iteration's alpha through the pipeline.
                    a0 = if src_channels == 4 {
                        src0[src_cn.a_i()]
                    } else {
                        max_colors
                    };
                    a1 = if src_channels == 4 {
                        src0[src_cn.a_i() + src_channels]
                    } else {
                        max_colors
                    };
                    a2 = if src_channels == 4 {
                        src1[src_cn.a_i()]
                    } else {
                        max_colors
                    };
                    a3 = if src_channels == 4 {
                        src1[src_cn.a_i() + src_channels]
                    } else {
                        max_colors
                    };
                }
                // Pipeline epilogue: the last four loaded pixels have not been
                // transformed yet (the zipped loop ends when the shorter dst
                // iterators are exhausted). Write them into each half's final
                // destination chunk.
                if let (Some(dst0), Some(dst1)) = (
                    dst0.chunks_exact_mut(dst_channels * 2).last(),
                    dst1.chunks_exact_mut(dst_channels * 2).last(),
                ) {
                    let v0_0 = vmulq_f32(r0, m0);
                    let v0_1 = vmulq_f32(r1, m0);
                    let v0_2 = vmulq_f32(r2, m0);
                    let v0_3 = vmulq_f32(r3, m0);
                    let v1_0 = vfmaq_f32(v0_0, g0, m1);
                    let v1_1 = vfmaq_f32(v0_1, g1, m1);
                    let v1_2 = vfmaq_f32(v0_2, g2, m1);
                    let v1_3 = vfmaq_f32(v0_3, g3, m1);
                    let mut vr0 = vfmaq_f32(v1_0, b0, m2);
                    let mut vr1 = vfmaq_f32(v1_1, b1, m2);
                    let mut vr2 = vfmaq_f32(v1_2, b2, m2);
                    let mut vr3 = vfmaq_f32(v1_3, b3, m2);
                    vr0 = vfmaq_f32(rnd, vr0, v_scale);
                    vr1 = vfmaq_f32(rnd, vr1, v_scale);
                    vr2 = vfmaq_f32(rnd, vr2, v_scale);
                    vr3 = vfmaq_f32(rnd, vr3, v_scale);
                    vr0 = vminq_f32(vr0, v_scale);
                    vr1 = vminq_f32(vr1, v_scale);
                    vr2 = vminq_f32(vr2, v_scale);
                    vr3 = vminq_f32(vr3, v_scale);
                    let zx0 = vcvtq_u32_f32(vr0);
                    let zx1 = vcvtq_u32_f32(vr1);
                    let zx2 = vcvtq_u32_f32(vr2);
                    let zx3 = vcvtq_u32_f32(vr3);
                    vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx0);
                    vst1q_u32(temporary1.0.as_mut_ptr() as *mut _, zx1);
                    vst1q_u32(temporary2.0.as_mut_ptr() as *mut _, zx2);
                    vst1q_u32(temporary3.0.as_mut_ptr() as *mut _, zx3);
                    dst0[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
                    dst0[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
                    dst0[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
                    if dst_channels == 4 {
                        dst0[dst_cn.a_i()] = a0;
                    }
                    dst0[dst_cn.r_i() + dst_channels] =
                        self.profile.gamma[temporary1.0[0] as usize];
                    dst0[dst_cn.g_i() + dst_channels] =
                        self.profile.gamma[temporary1.0[2] as usize];
                    dst0[dst_cn.b_i() + dst_channels] =
                        self.profile.gamma[temporary1.0[4] as usize];
                    if dst_channels == 4 {
                        dst0[dst_cn.a_i() + dst_channels] = a1;
                    }
                    dst1[dst_cn.r_i()] = self.profile.gamma[temporary2.0[0] as usize];
                    dst1[dst_cn.g_i()] = self.profile.gamma[temporary2.0[2] as usize];
                    dst1[dst_cn.b_i()] = self.profile.gamma[temporary2.0[4] as usize];
                    if dst_channels == 4 {
                        dst1[dst_cn.a_i()] = a2;
                    }
                    dst1[dst_cn.r_i() + dst_channels] =
                        self.profile.gamma[temporary3.0[0] as usize];
                    dst1[dst_cn.g_i() + dst_channels] =
                        self.profile.gamma[temporary3.0[2] as usize];
                    dst1[dst_cn.b_i() + dst_channels] =
                        self.profile.gamma[temporary3.0[4] as usize];
                    if dst_channels == 4 {
                        dst1[dst_cn.a_i() + dst_channels] = a3;
                    }
                }
            }
            // Scalar tail: remaining pixels (fewer than one pair per half),
            // one pixel at a time with the same linearize/matrix/gamma steps.
            for (src, dst) in src_remainder
                .chunks_exact(src_channels)
                .zip(dst_remainder.chunks_exact_mut(dst_channels))
            {
                let rp = &self.profile.linear[src[src_cn.r_i()]._as_usize()];
                let gp = &self.profile.linear[src[src_cn.g_i()]._as_usize()];
                let bp = &self.profile.linear[src[src_cn.b_i()]._as_usize()];
                let r = vld1q_dup_f32(rp);
                let g = vld1q_dup_f32(gp);
                let b = vld1q_dup_f32(bp);
                let a = if src_channels == 4 {
                    src[src_cn.a_i()]
                } else {
                    max_colors
                };
                let v0 = vmulq_f32(r, m0);
                let v1 = vfmaq_f32(v0, g, m1);
                let mut v = vfmaq_f32(v1, b, m2);
                v = vfmaq_f32(rnd, v, v_scale);
                v = vminq_f32(v, v_scale);
                let zx = vcvtq_u32_f32(v);
                vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx);
                dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
                dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
                dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
                if dst_channels == 4 {
                    dst[dst_cn.a_i()] = a;
                }
            }
        }
        Ok(())
    }
}