Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

1
vendor/moxcms/.cargo-checksum.json vendored Normal file

File diff suppressed because one or more lines are too long

183
vendor/moxcms/Cargo.lock generated vendored Normal file

@@ -0,0 +1,183 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "autocfg"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "bitflags"
version = "2.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d"
[[package]]
name = "cfg-if"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
[[package]]
name = "getrandom"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi",
]
[[package]]
name = "libc"
version = "0.2.175"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543"
[[package]]
name = "moxcms"
version = "0.7.5"
dependencies = [
"num-traits",
"pxfm",
"rand",
]
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "ppv-lite86"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
dependencies = [
"unicode-ident",
]
[[package]]
name = "pxfm"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "069f3b41a7e17d18b8af925e597c8b2430591341415f98c5e1ecb2a245cea7ae"
dependencies = [
"num-traits",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
[[package]]
name = "rand"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
dependencies = [
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom",
]
[[package]]
name = "syn"
version = "2.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "wasi"
version = "0.14.2+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
dependencies = [
"bitflags",
]
[[package]]
name = "zerocopy"
version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

79
vendor/moxcms/Cargo.toml vendored Normal file

@@ -0,0 +1,79 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2024"
rust-version = "1.85.0"
name = "moxcms"
version = "0.7.5"
authors = ["Radzivon Bartoshyk"]
build = false
exclude = [
"*.jpg",
"../../assets/*",
"*.png",
"*.icc",
"./assets/*",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "Simple Color Management in Rust"
homepage = "https://github.com/awxkee/moxcms"
documentation = "https://github.com/awxkee/moxcms"
readme = "README.md"
keywords = [
"icc",
"cms",
"color",
"cmyk",
]
categories = ["multimedia::images"]
license = "BSD-3-Clause OR Apache-2.0"
repository = "https://github.com/awxkee/moxcms.git"
[package.metadata.docs.rs]
all-features = true
rustdoc-args = [
"--cfg",
"docsrs",
]
[features]
avx = []
avx512 = []
default = [
"avx",
"sse",
"neon",
]
neon = []
options = []
sse = []
[lib]
name = "moxcms"
path = "src/lib.rs"
[dependencies.num-traits]
version = "0.2"
[dependencies.pxfm]
version = "^0.1.1"
[dev-dependencies.rand]
version = "0.9"
[profile.profiling]
debug = 2
inherits = "release"

201
vendor/moxcms/LICENSE-APACHE.md vendored Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 Radzivon Bartoshyk
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

26
vendor/moxcms/LICENSE.md vendored Normal file

@@ -0,0 +1,26 @@
Copyright (c) Radzivon Bartoshyk. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

74
vendor/moxcms/README.md vendored Normal file

@@ -0,0 +1,74 @@
# Rust ICC Management
Fast and safe conversion between ICC profiles, in pure Rust.
Supports CMYK⬌RGBX, RGBX⬌RGBX, RGBX⬌GRAY, LAB⬌RGBX, CMYK⬌LAB and GRAY⬌RGB, plus conversion from any 3/4-color profile to RGB and vice versa. Also supports almost any-to-any conversion between Display Class ICC profiles with up to 16 inks.
## Example
```rust
// Assumed imports (the original snippet omits them): `File`/`BufReader` from std,
// `JpegDecoder`/`ImageDecoder` from the `image` crate, the rest from moxcms.
use std::fs::File;
use std::io::BufReader;
use image::ImageDecoder;
use image::codecs::jpeg::JpegDecoder;
use moxcms::{ColorProfile, Layout, TransformOptions};

let f_str = "./assets/dci_p3_profile.jpeg";
let file = File::open(f_str).expect("Failed to open file");
let img = image::ImageReader::open(f_str).unwrap().decode().unwrap();
let rgb = img.to_rgb8();
let mut decoder = JpegDecoder::new(BufReader::new(file)).unwrap();
let icc = decoder.icc_profile().unwrap().unwrap();
let color_profile = ColorProfile::new_from_slice(&icc).unwrap();
let dest_profile = ColorProfile::new_srgb();
let transform = color_profile
.create_transform_8bit(&dest_profile, Layout::Rgb8, TransformOptions::default())
.unwrap();
let mut dst = vec![0u8; rgb.len()];
for (src, dst) in rgb
.chunks_exact(img.width() as usize * 3)
.zip(dst.chunks_exact_mut(img.dimensions().0 as usize * 3))
{
transform
.transform(
&src[..img.dimensions().0 as usize * 3],
&mut dst[..img.dimensions().0 as usize * 3],
)
.unwrap();
}
image::save_buffer(
"v1.jpg",
&dst,
img.dimensions().0,
img.dimensions().1,
image::ExtendedColorType::Rgb8,
)
.unwrap();
```
## Benchmarks
### ICC Transform 8-Bit
Tests were run on a 1997×1331 image.
| Conversion | Time (NEON) | Time (AVX2) |
|--------------------|:----------:|:----------:|
| moxcms RGB⮕RGB | 2.68ms | 4.52ms |
| moxcms LUT RGB⮕RGB | 6.03ms | 12.43ms |
| moxcms RGBA⮕RGBA | 2.96ms | 4.83ms |
| moxcms CMYK⮕RGBA | 9.74ms | 21.65ms |
| lcms2 RGB⮕RGB | 13.1ms | 27.73ms |
| lcms2 LUT RGB⮕RGB | 27.60ms | 58.26ms |
| lcms2 RGBA⮕RGBA | 21.97ms | 35.70ms |
| lcms2 CMYK⮕RGBA | 39.71ms | 79.40ms |
| qcms RGB⮕RGB | 6.47ms | 4.59ms |
| qcms LUT RGB⮕RGB | 26.72ms | 60.80ms |
| qcms RGBA⮕RGBA | 6.83ms | 4.99ms |
| qcms CMYK⮕RGBA | 25.97ms | 61.54ms |
## License
This project is licensed under either of
- BSD-3-Clause License (see [LICENSE](LICENSE.md))
- Apache License, Version 2.0 (see [LICENSE-APACHE](LICENSE-APACHE.md))
at your option.

172
vendor/moxcms/src/chad.rs vendored Normal file

@@ -0,0 +1,172 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::matrix::{Matrix3f, Vector3f, Xyz};
use crate::{Chromaticity, Matrix3d, Vector3d, XyY};
pub(crate) const BRADFORD_D: Matrix3d = Matrix3d {
v: [
[0.8951, 0.2664, -0.1614],
[-0.7502, 1.7135, 0.0367],
[0.0389, -0.0685, 1.0296],
],
};
pub(crate) const BRADFORD_F: Matrix3f = BRADFORD_D.to_f32();
#[inline]
pub(crate) const fn compute_chromatic_adaption(
source_white_point: Xyz,
dest_white_point: Xyz,
chad: Matrix3f,
) -> Matrix3f {
let cone_source_xyz = Vector3f {
v: [
source_white_point.x,
source_white_point.y,
source_white_point.z,
],
};
let cone_source_rgb = chad.mul_vector(cone_source_xyz);
let cone_dest_xyz = Vector3f {
v: [dest_white_point.x, dest_white_point.y, dest_white_point.z],
};
let cone_dest_rgb = chad.mul_vector(cone_dest_xyz);
let cone = Matrix3f {
v: [
[cone_dest_rgb.v[0] / cone_source_rgb.v[0], 0., 0.],
[0., cone_dest_rgb.v[1] / cone_source_rgb.v[1], 0.],
[0., 0., cone_dest_rgb.v[2] / cone_source_rgb.v[2]],
],
};
let chad_inv = chad.inverse();
let p0 = cone.mat_mul_const(chad);
chad_inv.mat_mul_const(p0)
}
#[inline]
pub(crate) const fn compute_chromatic_adaption_d(
source_white_point: Xyz,
dest_white_point: Xyz,
chad: Matrix3d,
) -> Matrix3d {
let cone_source_xyz = Vector3d {
v: [
source_white_point.x as f64,
source_white_point.y as f64,
source_white_point.z as f64,
],
};
let cone_source_rgb = chad.mul_vector(cone_source_xyz);
let cone_dest_xyz = Vector3d {
v: [
dest_white_point.x as f64,
dest_white_point.y as f64,
dest_white_point.z as f64,
],
};
let cone_dest_rgb = chad.mul_vector(cone_dest_xyz);
let cone = Matrix3d {
v: [
[cone_dest_rgb.v[0] / cone_source_rgb.v[0], 0., 0.],
[0., cone_dest_rgb.v[1] / cone_source_rgb.v[1], 0.],
[0., 0., cone_dest_rgb.v[2] / cone_source_rgb.v[2]],
],
};
let chad_inv = chad.inverse();
let p0 = cone.mat_mul_const(chad);
chad_inv.mat_mul_const(p0)
}
pub const fn adaption_matrix(source_illumination: Xyz, target_illumination: Xyz) -> Matrix3f {
compute_chromatic_adaption(source_illumination, target_illumination, BRADFORD_F)
}
pub const fn adaption_matrix_d(source_illumination: Xyz, target_illumination: Xyz) -> Matrix3d {
compute_chromatic_adaption_d(source_illumination, target_illumination, BRADFORD_D)
}
pub const fn adapt_to_d50(r: Matrix3f, source_white_pt: XyY) -> Matrix3f {
adapt_to_illuminant(r, source_white_pt, Chromaticity::D50.to_xyz())
}
pub const fn adapt_to_d50_d(r: Matrix3d, source_white_pt: XyY) -> Matrix3d {
adapt_to_illuminant_d(r, source_white_pt, Chromaticity::D50.to_xyz())
}
pub const fn adapt_to_illuminant(
r: Matrix3f,
source_white_pt: XyY,
illuminant_xyz: Xyz,
) -> Matrix3f {
let bradford = adaption_matrix(source_white_pt.to_xyz(), illuminant_xyz);
bradford.mat_mul_const(r)
}
pub const fn adapt_to_illuminant_d(
r: Matrix3d,
source_white_pt: XyY,
illuminant_xyz: Xyz,
) -> Matrix3d {
let bradford = adaption_matrix_d(source_white_pt.to_xyz(), illuminant_xyz);
bradford.mat_mul_const(r)
}
pub const fn adapt_to_illuminant_xyz(
r: Matrix3f,
source_white_pt: Xyz,
illuminant_xyz: Xyz,
) -> Matrix3f {
if source_white_pt.y == 0.0 {
return r;
}
let bradford = adaption_matrix(source_white_pt, illuminant_xyz);
bradford.mat_mul_const(r)
}
pub const fn adapt_to_illuminant_xyz_d(
r: Matrix3d,
source_white_pt: Xyz,
illuminant_xyz: Xyz,
) -> Matrix3d {
if source_white_pt.y == 0.0 {
return r;
}
let bradford = adaption_matrix_d(source_white_pt, illuminant_xyz);
bradford.mat_mul_const(r)
}
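The functions above implement the standard Bradford chromatic adaptation: project both white points into cone space with `chad`, scale by the ratio of the cone responses, and project back (`chad⁻¹ · diag(dest/src) · chad`). A minimal usage sketch, assuming `adapt_to_d50`, `Chromaticity`, and `Matrix3f` are re-exported at the crate root and that `Matrix3f`'s `v` field is public, as the constructions above suggest:

```rust
use moxcms::{adapt_to_d50, Chromaticity, Matrix3f};

fn main() {
    // Hypothetical D65-referenced RGB -> XYZ matrix (sRGB-like coefficients).
    let rgb_to_xyz_d65 = Matrix3f {
        v: [
            [0.4124, 0.3576, 0.1805],
            [0.2126, 0.7152, 0.0722],
            [0.0193, 0.1192, 0.9505],
        ],
    };
    // Bradford-adapt it to the ICC PCS illuminant, D50.
    let rgb_to_xyz_d50 = adapt_to_d50(rgb_to_xyz_d65, Chromaticity::D65.to_xyyb());
    // White (1, 1, 1) must still map to Y ≈ 1, so the middle row sums to ~1.
    let y_sum: f32 = rgb_to_xyz_d50.v[1].iter().sum();
    assert!((y_sum - 1.0).abs() < 1e-3);
}
```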

143
vendor/moxcms/src/chromaticity.rs vendored Normal file

@@ -0,0 +1,143 @@
/*
* // Copyright (c) Radzivon Bartoshyk 8/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::{CmsError, XyY, XyYRepresentable, Xyz, Xyzd};
#[derive(Clone, Debug, Copy)]
#[repr(C)]
pub struct Chromaticity {
pub x: f32,
pub y: f32,
}
impl Chromaticity {
#[inline]
pub const fn new(x: f32, y: f32) -> Self {
Self { x, y }
}
/// Converts this chromaticity (`x`, `y`) to a tristimulus [`Xyz`] value,
/// normalized such that `y = 1.0`.
#[inline]
pub const fn to_xyz(&self) -> Xyz {
let reciprocal = if self.y != 0. { 1. / self.y } else { 0. };
Xyz {
x: self.x * reciprocal,
y: 1f32,
z: (1f32 - self.x - self.y) * reciprocal,
}
}
/// Get the color representation with component sum `1`.
///
/// In contrast to the XYZ representation defined through setting `Y` to a known
/// value (such as `1` in [`Self::to_xyz`]) this representation can be uniquely
/// derived from the `xy` coordinates with no ambiguities. It is scaled from the
/// original XYZ color by dividing by `X + Y + Z`. Note that, in particular, this
/// method is well-defined even if the original color had pure chromatic
/// information with no luminance (Y = `0`) and will preserve that information,
/// whereas [`Self::to_xyz`] is ill-defined and returns an incorrect value.
#[inline]
pub const fn to_scaled_xyzd(&self) -> Xyzd {
let z = 1.0 - self.x as f64 - self.y as f64;
Xyzd::new(self.x as f64, self.y as f64, z)
}
/// Get the color representation with component sum `1`.
///
/// In contrast to the XYZ representation defined through setting `Y` to a known
/// value (such as `1` in [`Self::to_xyz`]) this representation can be uniquely
/// derived from the `xy` coordinates with no ambiguities. It is scaled from the
/// original XYZ color by dividing by `X + Y + Z`. Note that, in particular, this
/// method is well-defined even if the original color had pure chromatic
/// information with no luminance (Y = `0`) and will preserve that information,
/// whereas [`Self::to_xyz`] is ill-defined and returns an incorrect value.
#[inline]
pub const fn to_scaled_xyz(&self) -> Xyz {
let z = 1.0 - self.x - self.y;
Xyz::new(self.x, self.y, z)
}
#[inline]
pub const fn to_xyzd(&self) -> Xyzd {
let reciprocal = if self.y != 0. { 1. / self.y } else { 0. };
Xyzd {
x: self.x as f64 * reciprocal as f64,
y: 1f64,
z: (1f64 - self.x as f64 - self.y as f64) * reciprocal as f64,
}
}
#[inline]
pub const fn to_xyyb(&self) -> XyY {
XyY {
x: self.x as f64,
y: self.y as f64,
yb: 1.,
}
}
pub const D65: Chromaticity = Chromaticity {
x: 0.31272,
y: 0.32903,
};
pub const D50: Chromaticity = Chromaticity {
x: 0.34567,
y: 0.35850,
};
}
impl XyYRepresentable for Chromaticity {
fn to_xyy(self) -> XyY {
self.to_xyyb()
}
}
impl TryFrom<Xyz> for Chromaticity {
type Error = CmsError;
#[inline]
fn try_from(xyz: Xyz) -> Result<Self, Self::Error> {
let sum = xyz.x + xyz.y + xyz.z;
// Avoid division by zero or invalid XYZ values
if sum == 0.0 {
return Err(CmsError::DivisionByZero);
}
let rec = 1f32 / sum;
let chromaticity_x = xyz.x * rec;
let chromaticity_y = xyz.y * rec;
Ok(Chromaticity {
x: chromaticity_x,
y: chromaticity_y,
})
}
}
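To make the `y = 1` normalization in `to_xyz` concrete, here is a small sketch; it assumes `Chromaticity` is re-exported at the crate root and that `Xyz` has public fields, as the code above suggests:

```rust
use moxcms::Chromaticity;

fn main() {
    // D65 chromaticity: x = 0.31272, y = 0.32903.
    let xyz = Chromaticity::D65.to_xyz();
    // Y is pinned to 1.0; X = x / y and Z = (1 - x - y) / y follow from it.
    assert!((xyz.y - 1.0).abs() < 1e-6);
    assert!((xyz.x - 0.31272 / 0.32903).abs() < 1e-5);
    assert!((xyz.z - (1.0 - 0.31272 - 0.32903) / 0.32903).abs() < 1e-5);
}
```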

642
vendor/moxcms/src/cicp.rs vendored Normal file

@@ -0,0 +1,642 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::gamma::{
bt1361_to_linear, hlg_to_linear, iec61966_to_linear, log100_sqrt10_to_linear, log100_to_linear,
pq_to_linear, smpte240_to_linear, smpte428_to_linear,
};
use crate::{
Chromaticity, ColorProfile, Matrix3d, Matrix3f, XyYRepresentable,
err::CmsError,
trc::{ToneReprCurve, build_trc_table, curve_from_gamma},
};
use std::convert::TryFrom;
/// See [Rec. ITU-T H.273 (12/2016)](https://www.itu.int/rec/T-REC-H.273-201612-I/en) Table 2
/// Values 0, 3, 13–21, 23–255 are all reserved so all map to the same variant
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum CicpColorPrimaries {
/// For future use by ITU-T | ISO/IEC
Reserved,
/// Rec. ITU-R BT.709-6<br />
/// Rec. ITU-R BT.1361-0 conventional colour gamut system and extended colour gamut system (historical)<br />
/// IEC 61966-2-1 sRGB or sYCC, IEC 61966-2-4<br />
/// Society of Motion Picture and Television Engineers (SMPTE) RP 177 (1993) Annex B<br />
Bt709 = 1,
/// Unspecified<br />
/// Image characteristics are unknown or are determined by the application.
Unspecified = 2,
/// Rec. ITU-R BT.470-6 System M (historical)<br />
/// United States National Television System Committee 1953 Recommendation for transmission standards for color television<br />
/// United States Federal Communications Commission (2003) Title 47 Code of Federal Regulations 73.682 (a) (20)<br />
Bt470M = 4,
/// Rec. ITU-R BT.470-6 System B, G (historical) Rec. ITU-R BT.601-7 625<br />
/// Rec. ITU-R BT.1358-0 625 (historical)<br />
/// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM<br />
Bt470Bg = 5,
/// Rec. ITU-R BT.601-7 525<br />
/// Rec. ITU-R BT.1358-1 525 or 625 (historical) Rec. ITU-R BT.1700-0 NTSC<br />
/// SMPTE 170M (2004)<br />
/// (functionally the same as the value 7)<br />
Bt601 = 6,
/// SMPTE 240M (1999) (historical) (functionally the same as the value 6)<br />
Smpte240 = 7,
/// Generic film (colour filters using Illuminant C)<br />
GenericFilm = 8,
/// Rec. ITU-R BT.2020-2<br />
/// Rec. ITU-R BT.2100-0<br />
Bt2020 = 9,
/// SMPTE ST 428-1<br />
/// (CIE 1931 XYZ as in ISO 11664-1)<br />
Xyz = 10,
/// SMPTE RP 431-2 (2011)<br />
Smpte431 = 11,
/// SMPTE EG 432-1 (2010)<br />
Smpte432 = 12,
/// EBU Tech. 3213-E (1975)<br />
Ebu3213 = 22,
}
impl TryFrom<u8> for CicpColorPrimaries {
type Error = CmsError;
#[allow(unreachable_patterns)]
fn try_from(value: u8) -> Result<Self, Self::Error> {
match value {
// Values 0, 3, 13–21, 23–255 are all reserved so all map to the
// same variant.
0 | 3 | 13..=21 | 23..=255 => Ok(Self::Reserved),
1 => Ok(Self::Bt709),
2 => Ok(Self::Unspecified),
4 => Ok(Self::Bt470M),
5 => Ok(Self::Bt470Bg),
6 => Ok(Self::Bt601),
7 => Ok(Self::Smpte240),
8 => Ok(Self::GenericFilm),
9 => Ok(Self::Bt2020),
10 => Ok(Self::Xyz),
11 => Ok(Self::Smpte431),
12 => Ok(Self::Smpte432),
22 => Ok(Self::Ebu3213),
_ => Err(CmsError::InvalidCicp),
}
}
}
#[derive(Clone, Copy, Debug)]
#[repr(C)]
pub struct ColorPrimaries {
pub red: Chromaticity,
pub green: Chromaticity,
pub blue: Chromaticity,
}
/// See [Rec. ITU-T H.273 (12/2016)](https://www.itu.int/rec/T-REC-H.273-201612-I/en) Table 2.
impl ColorPrimaries {
/// [ACEScg](https://en.wikipedia.org/wiki/Academy_Color_Encoding_System#ACEScg).
pub const ACES_CG: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.713, y: 0.293 },
green: Chromaticity { x: 0.165, y: 0.830 },
blue: Chromaticity { x: 0.128, y: 0.044 },
};
/// [ACES2065-1](https://en.wikipedia.org/wiki/Academy_Color_Encoding_System#ACES2065-1).
pub const ACES_2065_1: ColorPrimaries = ColorPrimaries {
red: Chromaticity {
x: 0.7347,
y: 0.2653,
},
green: Chromaticity {
x: 0.0000,
y: 1.0000,
},
blue: Chromaticity {
x: 0.0001,
y: -0.0770,
},
};
/// [Adobe RGB](https://en.wikipedia.org/wiki/Adobe_RGB_color_space) (1998).
pub const ADOBE_RGB: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.64, y: 0.33 },
green: Chromaticity { x: 0.21, y: 0.71 },
blue: Chromaticity { x: 0.15, y: 0.06 },
};
/// [DCI P3](https://en.wikipedia.org/wiki/DCI-P3#DCI_P3).
///
/// This is the same as [`DISPLAY_P3`](Self::DISPLAY_P3),
/// [`SMPTE_431`](Self::SMPTE_431) and [`SMPTE_432`](Self::SMPTE_432).
pub const DCI_P3: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.680, y: 0.320 },
green: Chromaticity { x: 0.265, y: 0.690 },
blue: Chromaticity { x: 0.150, y: 0.060 },
};
/// [Display P3](https://en.wikipedia.org/wiki/DCI-P3#Display_P3).
///
/// This is the same as [`DCI_P3`](Self::DCI_P3),
/// [`SMPTE_431`](Self::SMPTE_431) and [`SMPTE_432`](Self::SMPTE_432).
pub const DISPLAY_P3: ColorPrimaries = Self::DCI_P3;
/// SMPTE RP 431-2 (2011).
///
/// This is the same as [`DCI_P3`](Self::DCI_P3),
/// [`DISPLAY_P3`](Self::DISPLAY_P3) and [`SMPTE_432`](Self::SMPTE_432).
pub const SMPTE_431: ColorPrimaries = Self::DCI_P3;
/// SMPTE EG 432-1 (2010).
///
/// This is the same as [`DCI_P3`](Self::DCI_P3),
/// [`DISPLAY_P3`](Self::DISPLAY_P3) and [`SMPTE_431`](Self::SMPTE_431).
pub const SMPTE_432: ColorPrimaries = Self::DCI_P3;
/// [ProPhoto RGB](https://en.wikipedia.org/wiki/ProPhoto_RGB_color_space).
pub const PRO_PHOTO_RGB: ColorPrimaries = ColorPrimaries {
red: Chromaticity {
x: 0.734699,
y: 0.265301,
},
green: Chromaticity {
x: 0.159597,
y: 0.840403,
},
blue: Chromaticity {
x: 0.036598,
y: 0.000105,
},
};
/// Rec. ITU-R BT.709-6
///
/// Rec. ITU-R BT.1361-0 conventional colour gamut system and extended
/// colour gamut system (historical).
///
/// IEC 61966-2-1 sRGB or sYCC, IEC 61966-2-4.
///
/// Society of Motion Picture and Television Engineers (SMPTE) RP 177 (1993) Annex B.
pub const BT_709: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.64, y: 0.33 },
green: Chromaticity { x: 0.30, y: 0.60 },
blue: Chromaticity { x: 0.15, y: 0.06 },
};
/// Rec. ITU-R BT.470-6 System M (historical).
///
/// United States National Television System Committee 1953 Recommendation
/// for transmission standards for color television.
///
/// United States Federal Communications Commission (2003) Title 47 Code of
/// Federal Regulations 73.682 (a) (20).
pub const BT_470M: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.67, y: 0.33 },
green: Chromaticity { x: 0.21, y: 0.71 },
blue: Chromaticity { x: 0.14, y: 0.08 },
};
/// Rec. ITU-R BT.470-6 System B, G (historical) Rec. ITU-R BT.601-7 625.
///
/// Rec. ITU-R BT.1358-0 625 (historical).
/// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM.
pub const BT_470BG: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.64, y: 0.33 },
green: Chromaticity { x: 0.29, y: 0.60 },
blue: Chromaticity { x: 0.15, y: 0.06 },
};
/// Rec. ITU-R BT.601-7 525.
///
/// Rec. ITU-R BT.1358-1 525 or 625 (historical) Rec. ITU-R BT.1700-0 NTSC.
///
/// SMPTE 170M (2004) (functionally the same as [`SMPTE_240`](Self::SMPTE_240)).
pub const BT_601: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.630, y: 0.340 },
green: Chromaticity { x: 0.310, y: 0.595 },
blue: Chromaticity { x: 0.155, y: 0.070 },
};
/// SMPTE 240M (1999) (historical) (functionally the same as [`BT_601`](Self::BT_601)).
pub const SMPTE_240: ColorPrimaries = Self::BT_601;
/// Generic film (colour filters using Illuminant C).
pub const GENERIC_FILM: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.681, y: 0.319 },
green: Chromaticity { x: 0.243, y: 0.692 },
blue: Chromaticity { x: 0.145, y: 0.049 },
};
/// Rec. ITU-R BT.2020-2.
///
/// Rec. ITU-R BT.2100-0.
pub const BT_2020: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.708, y: 0.292 },
green: Chromaticity { x: 0.170, y: 0.797 },
blue: Chromaticity { x: 0.131, y: 0.046 },
};
/// SMPTE ST 428-1 (CIE 1931 XYZ as in ISO 11664-1).
pub const XYZ: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 1.0, y: 0.0 },
green: Chromaticity { x: 0.0, y: 1.0 },
blue: Chromaticity { x: 0.0, y: 0.0 },
};
/// EBU Tech. 3213-E (1975).
pub const EBU_3213: ColorPrimaries = ColorPrimaries {
red: Chromaticity { x: 0.630, y: 0.340 },
green: Chromaticity { x: 0.295, y: 0.605 },
blue: Chromaticity { x: 0.155, y: 0.077 },
};
}
impl ColorPrimaries {
/// Returns RGB -> XYZ conversion matrix
///
/// # Arguments
///
/// * `white_point`: [Chromaticity] or [crate::XyY] or any item conforming [XyYRepresentable]
///
/// returns: [Matrix3d]
pub fn transform_to_xyz_d(self, white_point: impl XyYRepresentable) -> Matrix3d {
let red_xyz = self.red.to_scaled_xyzd();
let green_xyz = self.green.to_scaled_xyzd();
let blue_xyz = self.blue.to_scaled_xyzd();
let xyz_matrix = Matrix3d {
v: [
[red_xyz.x, green_xyz.x, blue_xyz.x],
[red_xyz.y, green_xyz.y, blue_xyz.y],
[red_xyz.z, green_xyz.z, blue_xyz.z],
],
};
ColorProfile::rgb_to_xyz_d(xyz_matrix, white_point.to_xyy().to_xyzd())
}
/// Returns RGB -> XYZ conversion matrix
///
/// # Arguments
///
/// * `white_point`: [Chromaticity] or [crate::XyY] or any item conforming [XyYRepresentable]
///
/// returns: [Matrix3f]
pub fn transform_to_xyz(self, white_point: impl XyYRepresentable) -> Matrix3f {
let red_xyz = self.red.to_scaled_xyz();
let green_xyz = self.green.to_scaled_xyz();
let blue_xyz = self.blue.to_scaled_xyz();
let xyz_matrix = Matrix3f {
v: [
[red_xyz.x, green_xyz.x, blue_xyz.x],
[red_xyz.y, green_xyz.y, blue_xyz.y],
[red_xyz.z, green_xyz.z, blue_xyz.z],
],
};
ColorProfile::rgb_to_xyz_static(xyz_matrix, white_point.to_xyy().to_xyz())
}
}
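A short sketch of `transform_to_xyz_d` in use, mirroring the tests at the bottom of this file; it assumes `ColorPrimaries` and the `WHITE_POINT_D65` constant used there are exported at the crate root, and that `Matrix3d`'s `v` field is public:

```rust
use moxcms::{ColorPrimaries, WHITE_POINT_D65};

fn main() {
    // BT.709 RGB -> XYZ matrix relative to a D65 white point.
    let m = ColorPrimaries::BT_709.transform_to_xyz_d(WHITE_POINT_D65);
    // The diagonal reproduces the familiar BT.709/sRGB coefficients
    // (reference values taken from the tests below).
    assert!((m.v[0][0] - 0.4121524015214193).abs() < 1e-12);
    assert!((m.v[1][1] - 0.7153537403945436).abs() < 1e-12);
}
```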
/// See [Rec. ITU-T H.273 (12/2016)](https://www.itu.int/rec/T-REC-H.273-201612-I/en) Table 3
/// Values 0, 3, 19–255 are all reserved so all map to the same variant
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum TransferCharacteristics {
/// For future use by ITU-T | ISO/IEC
Reserved,
/// Rec. ITU-R BT.709-6<br />
/// Rec. ITU-R BT.1361-0 conventional colour gamut system (historical)<br />
/// (functionally the same as the values 6, 14 and 15) <br />
Bt709 = 1,
/// Image characteristics are unknown or are determined by the application.<br />
Unspecified = 2,
/// Rec. ITU-R BT.470-6 System M (historical)<br />
/// United States National Television System Committee 1953 Recommendation for transmission standards for color television<br />
/// United States Federal Communications Commission (2003) Title 47 Code of Federal Regulations 73.682 (a) (20)<br />
/// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM<br />
Bt470M = 4,
/// Rec. ITU-R BT.470-6 System B, G (historical)<br />
Bt470Bg = 5,
/// Rec. ITU-R BT.601-7 525 or 625<br />
/// Rec. ITU-R BT.1358-1 525 or 625 (historical)<br />
/// Rec. ITU-R BT.1700-0 NTSC SMPTE 170M (2004)<br />
/// (functionally the same as the values 1, 14 and 15)<br />
Bt601 = 6,
/// SMPTE 240M (1999) (historical)<br />
Smpte240 = 7,
/// Linear transfer characteristics<br />
Linear = 8,
/// Logarithmic transfer characteristic (100:1 range)<br />
Log100 = 9,
/// Logarithmic transfer characteristic (100 * Sqrt( 10 ) : 1 range)<br />
Log100sqrt10 = 10,
/// IEC 61966-2-4<br />
Iec61966 = 11,
/// Rec. ITU-R BT.1361-0 extended colour gamut system (historical)<br />
Bt1361 = 12,
/// IEC 61966-2-1 sRGB or sYCC<br />
Srgb = 13,
/// Rec. ITU-R BT.2020-2 (10-bit system)<br />
/// (functionally the same as the values 1, 6 and 15)<br />
Bt202010bit = 14,
/// Rec. ITU-R BT.2020-2 (12-bit system)<br />
/// (functionally the same as the values 1, 6 and 14)<br />
Bt202012bit = 15,
/// SMPTE ST 2084 for 10-, 12-, 14- and 16-bit systems<br />
/// Rec. ITU-R BT.2100-0 perceptual quantization (PQ) system<br />
Smpte2084 = 16,
/// SMPTE ST 428-1<br />
Smpte428 = 17,
/// ARIB STD-B67<br />
/// Rec. ITU-R BT.2100-0 hybrid log-gamma (HLG) system<br />
Hlg = 18,
}
impl TryFrom<u8> for TransferCharacteristics {
type Error = CmsError;
#[allow(unreachable_patterns)]
fn try_from(value: u8) -> Result<Self, Self::Error> {
match value {
0 | 3 | 19..=255 => Ok(Self::Reserved),
1 => Ok(Self::Bt709),
2 => Ok(Self::Unspecified),
4 => Ok(Self::Bt470M),
5 => Ok(Self::Bt470Bg),
6 => Ok(Self::Bt601),
7 => Ok(Self::Smpte240), // unimplemented
8 => Ok(Self::Linear),
9 => Ok(Self::Log100),
10 => Ok(Self::Log100sqrt10),
11 => Ok(Self::Iec61966), // unimplemented
12 => Ok(Self::Bt1361), // unimplemented
13 => Ok(Self::Srgb),
14 => Ok(Self::Bt202010bit),
15 => Ok(Self::Bt202012bit),
16 => Ok(Self::Smpte2084),
17 => Ok(Self::Smpte428), // unimplemented
18 => Ok(Self::Hlg),
_ => Err(CmsError::InvalidCicp),
}
}
}
impl CicpColorPrimaries {
pub(crate) const fn has_chromaticity(self) -> bool {
self as u8 != Self::Reserved as u8 && self as u8 != Self::Unspecified as u8
}
pub(crate) const fn white_point(self) -> Result<Chromaticity, CmsError> {
Ok(match self {
Self::Reserved => return Err(CmsError::UnsupportedColorPrimaries(self as u8)),
Self::Bt709
| Self::Bt470Bg
| Self::Bt601
| Self::Smpte240
| Self::Bt2020
| Self::Smpte432
| Self::Ebu3213 => Chromaticity::D65,
Self::Unspecified => return Err(CmsError::UnsupportedColorPrimaries(self as u8)),
Self::Bt470M => Chromaticity { x: 0.310, y: 0.316 },
Self::GenericFilm => Chromaticity { x: 0.310, y: 0.316 },
Self::Xyz => Chromaticity {
x: 1. / 3.,
y: 1. / 3.,
},
Self::Smpte431 => Chromaticity { x: 0.314, y: 0.351 },
})
}
}
impl TryFrom<CicpColorPrimaries> for ColorPrimaries {
type Error = CmsError;
fn try_from(value: CicpColorPrimaries) -> Result<Self, Self::Error> {
match value {
CicpColorPrimaries::Reserved => Err(CmsError::UnsupportedColorPrimaries(value as u8)),
CicpColorPrimaries::Bt709 => Ok(ColorPrimaries::BT_709),
CicpColorPrimaries::Unspecified => {
Err(CmsError::UnsupportedColorPrimaries(value as u8))
}
CicpColorPrimaries::Bt470M => Ok(ColorPrimaries::BT_470M),
CicpColorPrimaries::Bt470Bg => Ok(ColorPrimaries::BT_470BG),
CicpColorPrimaries::Bt601 | CicpColorPrimaries::Smpte240 => Ok(ColorPrimaries::BT_601),
CicpColorPrimaries::GenericFilm => Ok(ColorPrimaries::GENERIC_FILM),
CicpColorPrimaries::Bt2020 => Ok(ColorPrimaries::BT_2020),
CicpColorPrimaries::Xyz => Ok(ColorPrimaries::XYZ),
// These two share primaries, but have distinct white points
CicpColorPrimaries::Smpte431 | CicpColorPrimaries::Smpte432 => {
Ok(ColorPrimaries::SMPTE_431)
}
CicpColorPrimaries::Ebu3213 => Ok(ColorPrimaries::EBU_3213),
}
}
}
impl TransferCharacteristics {
pub(crate) fn has_transfer_curve(self) -> bool {
self != Self::Reserved && self != Self::Unspecified
}
}
pub(crate) fn create_rec709_parametric() -> [f32; 5] {
const POW_EXP: f32 = 0.45;
const G: f32 = 1. / POW_EXP;
const B: f32 = (0.09929682680944f64 / 1.09929682680944f64) as f32;
const C: f32 = 1f32 / 4.5f32;
const D: f32 = (4.5f64 * 0.018053968510807f64) as f32;
const A: f32 = (1. / 1.09929682680944f64) as f32;
[G, A, B, C, D]
}
impl TryFrom<TransferCharacteristics> for ToneReprCurve {
type Error = CmsError;
/// See [ICC.1:2010](https://www.color.org/specification/ICC1v43_2010-12.pdf)
/// See [Rec. ITU-R BT.2100-2](https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2100-2-201807-I!!PDF-E.pdf)
fn try_from(value: TransferCharacteristics) -> Result<Self, Self::Error> {
const NUM_TRC_TABLE_ENTRIES: i32 = 1024;
Ok(match value {
TransferCharacteristics::Reserved => {
return Err(CmsError::UnsupportedTrc(value as u8));
}
TransferCharacteristics::Bt709
| TransferCharacteristics::Bt601
| TransferCharacteristics::Bt202010bit
| TransferCharacteristics::Bt202012bit => {
// The opto-electronic transfer characteristic function (OETF)
// as defined in ITU-T H.273 table 3, row 1:
//
// V = (α * Lc^0.45) − (α − 1)  for 1 >= Lc >= β
// V = 4.500 * Lc for β > Lc >= 0
//
// Inverting gives the electro-optical transfer characteristic
// function (EOTF) which can be represented as ICC
// parametricCurveType with 4 parameters (ICC.1:2010 Table 5).
// Converting between the two (Lc ↔︎ Y, V ↔︎ X):
//
// Y = (a * X + b)^g for (X >= d)
// Y = c * X for (X < d)
//
// g, a, b, c, d can then be defined in terms of α and β:
//
// g = 1 / 0.45
// a = 1 / α
// b = (α - 1) / α
// c = 1 / 4.500
// d = 4.500 * β
//
// α and β are determined by solving the piecewise equations to
// ensure continuity of both value and slope at the value β.
// We use the values specified for 10-bit systems in
// https://www.itu.int/rec/R-REC-BT.2020-2-201510-I Table 4
// since this results in values similar to those in available ICC
// profiles after converting to s15Fixed16Number, providing us with
// good test coverage.
ToneReprCurve::Parametric(create_rec709_parametric().to_vec())
}
TransferCharacteristics::Unspecified => {
return Err(CmsError::UnsupportedTrc(value as u8));
}
TransferCharacteristics::Bt470M => curve_from_gamma(2.2),
TransferCharacteristics::Bt470Bg => curve_from_gamma(2.8),
TransferCharacteristics::Smpte240 => {
let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, smpte240_to_linear);
ToneReprCurve::Lut(table)
}
TransferCharacteristics::Linear => curve_from_gamma(1.),
TransferCharacteristics::Log100 => {
let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, log100_to_linear);
ToneReprCurve::Lut(table)
}
TransferCharacteristics::Log100sqrt10 => {
let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, log100_sqrt10_to_linear);
ToneReprCurve::Lut(table)
}
TransferCharacteristics::Iec61966 => {
let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, iec61966_to_linear);
ToneReprCurve::Lut(table)
}
TransferCharacteristics::Bt1361 => {
let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, bt1361_to_linear);
ToneReprCurve::Lut(table)
}
TransferCharacteristics::Srgb => {
ToneReprCurve::Parametric(vec![2.4, 1. / 1.055, 0.055 / 1.055, 1. / 12.92, 0.04045])
}
TransferCharacteristics::Smpte2084 => {
let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, pq_to_linear);
ToneReprCurve::Lut(table)
}
TransferCharacteristics::Smpte428 => {
let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, smpte428_to_linear);
ToneReprCurve::Lut(table)
}
TransferCharacteristics::Hlg => {
let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, hlg_to_linear);
ToneReprCurve::Lut(table)
}
})
}
}
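The continuity claim in the long comment above can be checked numerically. A small self-contained sketch using the high-precision α and β values that `create_rec709_parametric` hard-codes (Rec. ITU-R BT.2020-2 Table 4):

```rust
fn main() {
    // α and β solve the OETF continuity conditions at Lc = β.
    let alpha = 1.09929682680944f64;
    let beta = 0.018053968510807f64;
    // ICC parametricCurveType parameters, derived as in the comment above.
    let g = 1.0 / 0.45;
    let a = 1.0 / alpha;
    let b = (alpha - 1.0) / alpha;
    let c = 1.0 / 4.5;
    let d = 4.5 * beta;
    // Value continuity at X = d: both EOTF branches must yield β.
    let linear = c * d; // linear branch: (1/4.5) * 4.5β = β
    let power = (a * d + b).powf(g); // power branch
    assert!((linear - power).abs() < 1e-9);
}
```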
/// Matrix Coefficients Enum (from ISO/IEC 23091-4 / MPEG CICP)
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
pub enum MatrixCoefficients {
Identity = 0, // RGB (Identity matrix)
Bt709 = 1, // Rec. 709
Unspecified = 2, // Unspecified
Reserved = 3, // Reserved
Fcc = 4, // FCC
Bt470Bg = 5, // BT.470BG / BT.601-625
Smpte170m = 6, // SMPTE 170M / BT.601-525
Smpte240m = 7, // SMPTE 240M
YCgCo = 8, // YCgCo
Bt2020Ncl = 9, // BT.2020 (non-constant luminance)
Bt2020Cl = 10, // BT.2020 (constant luminance)
Smpte2085 = 11, // SMPTE ST 2085
ChromaticityDerivedNCL = 12, // Chromaticity-derived non-constant luminance
ChromaticityDerivedCL = 13, // Chromaticity-derived constant luminance
ICtCp = 14, // ICtCp
}
impl TryFrom<u8> for MatrixCoefficients {
type Error = CmsError;
fn try_from(value: u8) -> Result<Self, CmsError> {
match value {
0 => Ok(MatrixCoefficients::Identity),
1 => Ok(MatrixCoefficients::Bt709),
2 => Ok(MatrixCoefficients::Unspecified),
3 => Ok(MatrixCoefficients::Reserved),
4 => Ok(MatrixCoefficients::Fcc),
5 => Ok(MatrixCoefficients::Bt470Bg),
6 => Ok(MatrixCoefficients::Smpte170m),
7 => Ok(MatrixCoefficients::Smpte240m),
8 => Ok(MatrixCoefficients::YCgCo),
9 => Ok(MatrixCoefficients::Bt2020Ncl),
10 => Ok(MatrixCoefficients::Bt2020Cl),
11 => Ok(MatrixCoefficients::Smpte2085),
12 => Ok(MatrixCoefficients::ChromaticityDerivedNCL),
13 => Ok(MatrixCoefficients::ChromaticityDerivedCL),
14 => Ok(MatrixCoefficients::ICtCp),
_ => Err(CmsError::InvalidCicp),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::WHITE_POINT_D65;
#[test]
fn test_to_xyz_using_absolute_coordinates() {
let conversion_matrix = ColorPrimaries::BT_709.transform_to_xyz_d(WHITE_POINT_D65);
assert!((conversion_matrix.v[0][0] - 0.4121524015214193).abs() < 1e-14);
assert!((conversion_matrix.v[1][1] - 0.7153537403945436).abs() < 1e-14);
assert!((conversion_matrix.v[2][2] - 0.9497138466283235).abs() < 1e-14);
}
#[test]
fn test_to_xyz_using_absolute_coordinates_xyz() {
let conversion_matrix = ColorPrimaries::XYZ.transform_to_xyz_d(WHITE_POINT_D65);
assert!((conversion_matrix.v[0][0] - 0.95015469385536477).abs() < 1e-14);
assert!((conversion_matrix.v[1][1] - 1.0).abs() < 1e-14);
assert!((conversion_matrix.v[2][2] - 1.0882590676722474).abs() < 1e-14);
}
#[test]
fn test_to_xyz_using_absolute_coordinates_f() {
let conversion_matrix = ColorPrimaries::BT_709.transform_to_xyz(WHITE_POINT_D65);
assert!((conversion_matrix.v[0][0] - 0.4121524015214193).abs() < 1e-5);
assert!((conversion_matrix.v[1][1] - 0.7153537403945436).abs() < 1e-5);
assert!((conversion_matrix.v[2][2] - 0.9497138466283235).abs() < 1e-5);
}
}


@@ -0,0 +1,237 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::avx::cube::CubeAvxFma;
use crate::conversions::avx::interpolator::AvxVectorSse;
use crate::{CmsError, DataColorSpace, InPlaceStage, InterpolationMethod};
use std::arch::x86_64::*;
pub(crate) struct ACurves3AvxFma<'a, const DEPTH: usize> {
pub(crate) curve0: Box<[f32; 65536]>,
pub(crate) curve1: Box<[f32; 65536]>,
pub(crate) curve2: Box<[f32; 65536]>,
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 3],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
pub(crate) struct ACurves3OptimizedAvxFma<'a> {
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 3],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
pub(crate) struct ACurves3InverseAvxFma<'a, const DEPTH: usize> {
pub(crate) curve0: Box<[f32; 65536]>,
pub(crate) curve1: Box<[f32; 65536]>,
pub(crate) curve2: Box<[f32; 65536]>,
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 3],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
impl<const DEPTH: usize> ACurves3AvxFma<'_, DEPTH> {
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_impl<Fetch: Fn(f32, f32, f32) -> AvxVectorSse>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
unsafe {
let scale_value = (DEPTH - 1) as f32;
for dst in dst.chunks_exact_mut(3) {
let a0 = (dst[0] * scale_value).round().min(scale_value) as u16;
let a1 = (dst[1] * scale_value).round().min(scale_value) as u16;
let a2 = (dst[2] * scale_value).round().min(scale_value) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
let v = fetch(b0, b1, b2).v;
dst[0] = f32::from_bits(_mm_extract_ps::<0>(v) as u32);
dst[1] = f32::from_bits(_mm_extract_ps::<1>(v) as u32);
dst[2] = f32::from_bits(_mm_extract_ps::<2>(v) as u32);
}
}
Ok(())
}
}
impl ACurves3OptimizedAvxFma<'_> {
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_impl<Fetch: Fn(f32, f32, f32) -> AvxVectorSse>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
unsafe {
for dst in dst.chunks_exact_mut(3) {
let a0 = dst[0];
let a1 = dst[1];
let a2 = dst[2];
let v = fetch(a0, a1, a2).v;
dst[0] = f32::from_bits(_mm_extract_ps::<0>(v) as u32);
dst[1] = f32::from_bits(_mm_extract_ps::<1>(v) as u32);
dst[2] = f32::from_bits(_mm_extract_ps::<2>(v) as u32);
}
}
Ok(())
}
}
impl<const DEPTH: usize> InPlaceStage for ACurves3AvxFma<'_, DEPTH> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = CubeAvxFma::new(self.clut, self.grid_size, 3);
unsafe {
// If PCS is LAB or XYZ then linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
}
Ok(())
}
}
impl InPlaceStage for ACurves3OptimizedAvxFma<'_> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = CubeAvxFma::new(self.clut, self.grid_size, 3);
unsafe {
// If the PCS is Lab or XYZ, linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
}
Ok(())
}
}
impl<const DEPTH: usize> ACurves3InverseAvxFma<'_, DEPTH> {
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_impl<Fetch: Fn(f32, f32, f32) -> AvxVectorSse>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
unsafe {
let v_scale_value = _mm_set1_ps((DEPTH as u32 - 1u32) as f32);
for dst in dst.chunks_exact_mut(3) {
let mut v = fetch(dst[0], dst[1], dst[2]).v;
v = _mm_mul_ps(v, v_scale_value);
v = _mm_min_ps(v, v_scale_value);
v = _mm_max_ps(v, _mm_setzero_ps());
let c = _mm_cvtps_epi32(v);
let a0 = _mm_extract_epi32::<0>(c) as u16;
let a1 = _mm_extract_epi32::<1>(c) as u16;
let a2 = _mm_extract_epi32::<2>(c) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
}
}
Ok(())
}
}
impl<const DEPTH: usize> InPlaceStage for ACurves3InverseAvxFma<'_, DEPTH> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = CubeAvxFma::new(self.clut, self.grid_size, 3);
unsafe {
// If the PCS is Lab or XYZ, linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
}
Ok(())
}
}
@@ -0,0 +1,182 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::avx::hypercube::HypercubeAvx;
use crate::conversions::avx::interpolator::AvxVectorSse;
use crate::{CmsError, DataColorSpace, InterpolationMethod, Stage};
use std::arch::x86_64::*;
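// Same "A curves" idea for four input channels (e.g. CMYK): each channel goes
// through its own 65536-entry 1-D curve, then a 4-D CLUT reduces the result to
// three output channels.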
pub(crate) struct ACurves4x3AvxFma<'a, const DEPTH: usize> {
pub(crate) curve0: Box<[f32; 65536]>,
pub(crate) curve1: Box<[f32; 65536]>,
pub(crate) curve2: Box<[f32; 65536]>,
pub(crate) curve3: Box<[f32; 65536]>,
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 4],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
pub(crate) struct ACurves4x3AvxFmaOptimized<'a> {
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 4],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
impl<const DEPTH: usize> ACurves4x3AvxFma<'_, DEPTH> {
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let scale_value = (DEPTH - 1) as f32;
assert_eq!(src.len() / 4, dst.len() / 3);
unsafe {
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
let a0 = (src[0] * scale_value).round().min(scale_value) as u16;
let a1 = (src[1] * scale_value).round().min(scale_value) as u16;
let a2 = (src[2] * scale_value).round().min(scale_value) as u16;
let a3 = (src[3] * scale_value).round().min(scale_value) as u16;
let c = self.curve0[a0 as usize];
let m = self.curve1[a1 as usize];
let y = self.curve2[a2 as usize];
let k = self.curve3[a3 as usize];
let v = fetch(c, m, y, k).v;
dst[0] = f32::from_bits(_mm_extract_ps::<0>(v) as u32);
dst[1] = f32::from_bits(_mm_extract_ps::<1>(v) as u32);
dst[2] = f32::from_bits(_mm_extract_ps::<2>(v) as u32);
}
}
Ok(())
}
}
impl ACurves4x3AvxFmaOptimized<'_> {
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
assert_eq!(src.len() / 4, dst.len() / 3);
unsafe {
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
let c = src[0];
let m = src[1];
let y = src[2];
let k = src[3];
let v = fetch(c, m, y, k).v;
dst[0] = f32::from_bits(_mm_extract_ps::<0>(v) as u32);
dst[1] = f32::from_bits(_mm_extract_ps::<1>(v) as u32);
dst[2] = f32::from_bits(_mm_extract_ps::<2>(v) as u32);
}
}
Ok(())
}
}
impl<const DEPTH: usize> Stage for ACurves4x3AvxFma<'_, DEPTH> {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let lut = HypercubeAvx::new(self.clut, self.grid_size, 3);
assert!(std::arch::is_x86_feature_detected!("avx2"));
assert!(std::arch::is_x86_feature_detected!("fma"));
unsafe {
// If the PCS is Lab or XYZ, linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
}
}
}
Ok(())
}
}
impl Stage for ACurves4x3AvxFmaOptimized<'_> {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let lut = HypercubeAvx::new(self.clut, self.grid_size, 3);
assert!(std::arch::is_x86_feature_detected!("avx2"));
assert!(std::arch::is_x86_feature_detected!("fma"));
unsafe {
// If the PCS is Lab or XYZ, linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
}
}
}
Ok(())
}
}
@@ -0,0 +1,445 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::avx::interpolator::AvxVectorSse;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};
/// 3D CLUT AVX helper.
///
/// Represents a hexahedron.
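///
/// A lattice sample at `(x, y, z)` starts at `(x * x_stride + y * y_stride + z) * 3`
/// in `array` (three `f32` components per node; see `HexahedronFetch3::fetch`).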
pub(crate) struct CubeAvxFma<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
grid_size: [u8; 3],
}
struct HexahedronFetch3<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
}
trait CubeFetch<T> {
fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
impl CubeFetch<AvxVectorSse> for HexahedronFetch3<'_> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorSse {
let start = (x as u32 * self.x_stride + y as u32 * self.y_stride + z as u32) as usize * 3;
unsafe {
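// Load the first two f32 lanes with one 64-bit load (upper lanes zeroed),
// then splice the third float's bits into lane 2; lane 3 remains zero.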
let k = self.array.get_unchecked(start..);
let lo = _mm_loadu_si64(k.as_ptr() as *const _);
let hi = _mm_insert_epi32::<2>(
lo,
k.get_unchecked(2..).as_ptr().read_unaligned().to_bits() as i32,
);
AvxVectorSse {
v: _mm_castsi128_ps(hi),
}
}
}
}
impl<'a> CubeAvxFma<'a> {
pub(crate) fn new(arr: &'a [f32], grid: [u8; 3], components: usize) -> Self {
// Safety precondition: the array length must equal the full grid volume times
// the number of components. Callers must uphold this wherever the LUT is built.
assert_eq!(
grid[0] as usize * grid[1] as usize * grid[2] as usize * components,
arr.len()
);
let y_stride = grid[1] as u32;
let x_stride = y_stride * grid[0] as u32;
CubeAvxFma {
array: arr,
x_stride,
y_stride,
grid_size: grid,
}
}
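// Trilinear interpolation over the eight lattice corners. Assuming
// `a.neg_mla(a, t)` computes `a - a * t`, each `a.neg_mla(a, t).mla(b, t)`
// chain below evaluates the fused blend `a * (1 - t) + b * t`, applied
// successively along x, then y, then z.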
#[inline(always)]
fn trilinear<
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyNegAdd<T>
+ FusedMultiplyAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
fetch: impl CubeFetch<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let x_d = T::from(lin_x * scale_x - x as f32);
let y_d = T::from(lin_y * scale_y - y as f32);
let z_d = T::from(lin_z * scale_z - z as f32);
let c000 = fetch.fetch(x, y, z);
let c100 = fetch.fetch(x_n, y, z);
let c010 = fetch.fetch(x, y_n, z);
let c110 = fetch.fetch(x_n, y_n, z);
let c001 = fetch.fetch(x, y, z_n);
let c101 = fetch.fetch(x_n, y, z_n);
let c011 = fetch.fetch(x, y_n, z_n);
let c111 = fetch.fetch(x_n, y_n, z_n);
let c00 = c000.neg_mla(c000, x_d).mla(c100, x_d);
let c10 = c010.neg_mla(c010, x_d).mla(c110, x_d);
let c01 = c001.neg_mla(c001, x_d).mla(c101, x_d);
let c11 = c011.neg_mla(c011, x_d).mla(c111, x_d);
let c0 = c00.neg_mla(c00, y_d).mla(c10, y_d);
let c1 = c01.neg_mla(c01, y_d).mla(c11, y_d);
c0.neg_mla(c0, z_d).mla(c1, z_d)
}
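// Pyramidal interpolation: the cell is split into three pyramids chosen by
// comparing the fractional offsets (dr, dg, db); the result is c0 plus three
// per-axis difference terms and one mixed (bilinear) term.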
#[cfg(feature = "options")]
#[inline]
fn pyramid<
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
fetch: impl CubeFetch<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let dr = lin_x * scale_x - x as f32;
let dg = lin_y * scale_y - y as f32;
let db = lin_z * scale_z - z as f32;
let c0 = fetch.fetch(x, y, z);
if dr > db && dg > db {
let x0 = fetch.fetch(x_n, y_n, z_n);
let x1 = fetch.fetch(x_n, y_n, z);
let x2 = fetch.fetch(x_n, y, z);
let x3 = fetch.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = fetch.fetch(x, y, z_n);
let x1 = fetch.fetch(x_n, y_n, z_n);
let x2 = fetch.fetch(x, y_n, z_n);
let x3 = fetch.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dg * db))
} else {
let x0 = fetch.fetch(x, y, z_n);
let x1 = fetch.fetch(x_n, y, z);
let x2 = fetch.fetch(x_n, y, z_n);
let x3 = fetch.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(db * dr))
}
}
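// Tetrahedral interpolation: the cell is split into six tetrahedra selected by
// the ordering of (rx, ry, rz); c1, c2, c3 are forward differences along the
// chosen path, so the result is c0 + c1 * rx + c2 * ry + c3 * rz.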
#[cfg(feature = "options")]
#[inline]
fn tetra<
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
fetch: impl CubeFetch<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let rx = lin_x * scale_x - x as f32;
let ry = lin_y * scale_y - y as f32;
let rz = lin_z * scale_z - z as f32;
let c0 = fetch.fetch(x, y, z);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = fetch.fetch(x_n, y, z) - c0;
c2 = fetch.fetch(x_n, y_n, z) - fetch.fetch(x_n, y, z);
c3 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x_n, y_n, z);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = fetch.fetch(x_n, y, z) - c0;
c2 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x_n, y, z_n);
c3 = fetch.fetch(x_n, y, z_n) - fetch.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = fetch.fetch(x_n, y, z_n) - fetch.fetch(x, y, z_n);
c2 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x_n, y, z_n);
c3 = fetch.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = fetch.fetch(x_n, y_n, z) - fetch.fetch(x, y_n, z);
c2 = fetch.fetch(x, y_n, z) - c0;
c3 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x, y_n, z_n);
c2 = fetch.fetch(x, y_n, z) - c0;
c3 = fetch.fetch(x, y_n, z_n) - fetch.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x, y_n, z_n);
c2 = fetch.fetch(x, y_n, z_n) - fetch.fetch(x, y, z_n);
c3 = fetch.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, T::from(rx));
let s1 = s0.mla(c2, T::from(ry));
s1.mla(c3, T::from(rz))
}
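// Prismatic interpolation: the cell is split into two triangular prisms by the
// `db >= dr` test; five difference vectors and two bilinear product terms
// reconstruct the value inside the chosen prism.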
#[cfg(feature = "options")]
#[inline]
fn prism<
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
fetch: impl CubeFetch<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let dr = lin_x * scale_x - x as f32;
let dg = lin_y * scale_y - y as f32;
let db = lin_z * scale_z - z as f32;
let c0 = fetch.fetch(x, y, z);
if db >= dr {
let x0 = fetch.fetch(x, y, z_n);
let x1 = fetch.fetch(x_n, y, z_n);
let x2 = fetch.fetch(x, y_n, z);
let x3 = fetch.fetch(x, y_n, z_n);
let x4 = fetch.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
} else {
let x0 = fetch.fetch(x_n, y, z);
let x1 = fetch.fetch(x_n, y, z_n);
let x2 = fetch.fetch(x, y_n, z);
let x3 = fetch.fetch(x_n, y_n, z);
let x4 = fetch.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
}
}
#[inline]
pub(crate) fn trilinear_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32) -> AvxVectorSse {
self.trilinear(
lin_x,
lin_y,
lin_z,
HexahedronFetch3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
},
)
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn prism_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32) -> AvxVectorSse {
self.prism(
lin_x,
lin_y,
lin_z,
HexahedronFetch3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
},
)
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn pyramid_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32) -> AvxVectorSse {
self.pyramid(
lin_x,
lin_y,
lin_z,
HexahedronFetch3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
},
)
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn tetra_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32) -> AvxVectorSse {
self.tetra(
lin_x,
lin_y,
lin_z,
HexahedronFetch3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
},
)
}
}
@@ -0,0 +1,644 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::avx::interpolator::AvxVectorSse;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use crate::nd_array::lerp;
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};
/// 4D CLUT helper.
///
/// Represents a hypercube.
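///
/// A lattice sample at `(x, y, z, w)` starts at
/// `(x * x_stride + y * y_stride + z * z_stride + w) * 3` in `array`
/// (three `f32` components per node; see `Fetch4Vec3::fetch`).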
pub(crate) struct HypercubeAvx<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
z_stride: u32,
grid_size: [u8; 4],
}
trait Fetcher4<T> {
fn fetch(&self, x: i32, y: i32, z: i32, w: i32) -> T;
}
struct Fetch4Vec3<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
z_stride: u32,
}
impl Fetcher4<AvxVectorSse> for Fetch4Vec3<'_> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32, w: i32) -> AvxVectorSse {
let start = (x as u32 * self.x_stride
+ y as u32 * self.y_stride
+ z as u32 * self.z_stride
+ w as u32) as usize
* 3;
unsafe {
let k = self.array.get_unchecked(start..);
let lo = _mm_loadu_si64(k.as_ptr() as *const _);
let hi = _mm_insert_epi32::<2>(
lo,
k.get_unchecked(2..).as_ptr().read_unaligned().to_bits() as i32,
);
AvxVectorSse {
v: _mm_castsi128_ps(hi),
}
}
}
}
impl<'a> HypercubeAvx<'a> {
pub(crate) fn new(arr: &'a [f32], grid: [u8; 4], components: usize) -> Self {
// Safety precondition: the array length must equal the full grid volume times
// the number of components. Callers must uphold this wherever the LUT is built.
assert_eq!(
grid[0] as usize * grid[1] as usize * grid[2] as usize * grid[3] as usize * components,
arr.len()
);
let z_stride = grid[2] as u32;
let y_stride = z_stride * grid[1] as u32;
let x_stride = y_stride * grid[0] as u32;
HypercubeAvx {
array: arr,
x_stride,
y_stride,
z_stride,
grid_size: grid,
}
}
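// Quadrilinear interpolation: trilinear interpolation is performed on the two
// 3-D slices at w and w_n, and the two results are lerped along the w axis.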
#[inline(always)]
fn quadlinear<
T: From<f32>
+ Add<T, Output = T>
+ Mul<T, Output = T>
+ FusedMultiplyAdd<T>
+ Sub<T, Output = T>
+ Copy
+ FusedMultiplyNegAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
r: impl Fetcher4<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let lin_w = lin_w.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let scale_w = (self.grid_size[3] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let w = (lin_w * scale_w).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let w_n = (lin_w * scale_w).ceil() as i32;
let x_d = T::from(lin_x * scale_x - x as f32);
let y_d = T::from(lin_y * scale_y - y as f32);
let z_d = T::from(lin_z * scale_z - z as f32);
let w_d = T::from(lin_w * scale_w - w as f32);
let r_x1 = lerp(r.fetch(x, y, z, w), r.fetch(x_n, y, z, w), x_d);
let r_x2 = lerp(r.fetch(x, y_n, z, w), r.fetch(x_n, y_n, z, w), x_d);
let r_y1 = lerp(r_x1, r_x2, y_d);
let r_x3 = lerp(r.fetch(x, y, z_n, w), r.fetch(x_n, y, z_n, w), x_d);
let r_x4 = lerp(r.fetch(x, y_n, z_n, w), r.fetch(x_n, y_n, z_n, w), x_d);
let r_y2 = lerp(r_x3, r_x4, y_d);
let r_z1 = lerp(r_y1, r_y2, z_d);
let r_x1 = lerp(r.fetch(x, y, z, w_n), r.fetch(x_n, y, z, w_n), x_d);
let r_x2 = lerp(r.fetch(x, y_n, z, w_n), r.fetch(x_n, y_n, z, w_n), x_d);
let r_y1 = lerp(r_x1, r_x2, y_d);
let r_x3 = lerp(r.fetch(x, y, z_n, w_n), r.fetch(x_n, y, z_n, w_n), x_d);
let r_x4 = lerp(r.fetch(x, y_n, z_n, w_n), r.fetch(x_n, y_n, z_n, w_n), x_d);
let r_y2 = lerp(r_x3, r_x4, y_d);
let r_z2 = lerp(r_y1, r_y2, z_d);
lerp(r_z1, r_z2, w_d)
}
#[inline(always)]
pub(crate) fn quadlinear_vec3(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
) -> AvxVectorSse {
self.quadlinear(
lin_x,
lin_y,
lin_z,
lin_w,
Fetch4Vec3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
z_stride: self.z_stride,
},
)
}
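// The pyramid, prism and tetra variants below follow the same pattern: apply
// the corresponding 3-D rule on the w and w_n slices, then blend the two
// results along dw with a fused `neg_mla`/`mla` lerp.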
#[cfg(feature = "options")]
#[inline(always)]
fn pyramid<
T: From<f32>
+ Add<T, Output = T>
+ Mul<T, Output = T>
+ FusedMultiplyAdd<T>
+ Sub<T, Output = T>
+ Copy
+ FusedMultiplyNegAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
r: impl Fetcher4<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let lin_w = lin_w.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let scale_w = (self.grid_size[3] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let w = (lin_w * scale_w).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let w_n = (lin_w * scale_w).ceil() as i32;
let dr = lin_x * scale_x - x as f32;
let dg = lin_y * scale_y - y as f32;
let db = lin_z * scale_z - z as f32;
let dw = lin_w * scale_w - w as f32;
let c0 = r.fetch(x, y, z, w);
let w0 = if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n, w);
let x1 = r.fetch(x_n, y_n, z, w);
let x2 = r.fetch(x_n, y, z, w);
let x3 = r.fetch(x, y_n, z, w);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n, w);
let x1 = r.fetch(x_n, y_n, z_n, w);
let x2 = r.fetch(x, y_n, z_n, w);
let x3 = r.fetch(x, y_n, z, w);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dg * db))
} else {
let x0 = r.fetch(x, y, z_n, w);
let x1 = r.fetch(x_n, y, z, w);
let x2 = r.fetch(x_n, y, z_n, w);
let x3 = r.fetch(x_n, y_n, z_n, w);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(db * dr))
};
let c0 = r.fetch(x, y, z, w_n);
let w1 = if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n, w_n);
let x1 = r.fetch(x_n, y_n, z, w_n);
let x2 = r.fetch(x_n, y, z, w_n);
let x3 = r.fetch(x, y_n, z, w_n);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n, w_n);
let x1 = r.fetch(x_n, y_n, z_n, w_n);
let x2 = r.fetch(x, y_n, z_n, w_n);
let x3 = r.fetch(x, y_n, z, w_n);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dg * db))
} else {
let x0 = r.fetch(x, y, z_n, w_n);
let x1 = r.fetch(x_n, y, z, w_n);
let x2 = r.fetch(x_n, y, z_n, w_n);
let x3 = r.fetch(x_n, y_n, z_n, w_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(db * dr))
};
w0.neg_mla(w0, T::from(dw)).mla(w1, T::from(dw))
}
#[cfg(feature = "options")]
#[inline(always)]
pub(crate) fn pyramid_vec3(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
) -> AvxVectorSse {
self.pyramid(
lin_x,
lin_y,
lin_z,
lin_w,
Fetch4Vec3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
z_stride: self.z_stride,
},
)
}
#[cfg(feature = "options")]
#[inline(always)]
fn prism<
T: From<f32>
+ Add<T, Output = T>
+ Mul<T, Output = T>
+ FusedMultiplyAdd<T>
+ Sub<T, Output = T>
+ Copy
+ FusedMultiplyNegAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
r: impl Fetcher4<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let lin_w = lin_w.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let scale_w = (self.grid_size[3] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let w = (lin_w * scale_w).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let w_n = (lin_w * scale_w).ceil() as i32;
let dr = lin_x * scale_x - x as f32;
let dg = lin_y * scale_y - y as f32;
let db = lin_z * scale_z - z as f32;
let dw = lin_w * scale_w - w as f32;
let c0 = r.fetch(x, y, z, w);
let w0 = if db >= dr {
let x0 = r.fetch(x, y, z_n, w);
let x1 = r.fetch(x_n, y, z_n, w);
let x2 = r.fetch(x, y_n, z, w);
let x3 = r.fetch(x, y_n, z_n, w);
let x4 = r.fetch(x_n, y_n, z_n, w);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
} else {
let x0 = r.fetch(x_n, y, z, w);
let x1 = r.fetch(x_n, y, z_n, w);
let x2 = r.fetch(x, y_n, z, w);
let x3 = r.fetch(x_n, y_n, z, w);
let x4 = r.fetch(x_n, y_n, z_n, w);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
};
let c0 = r.fetch(x, y, z, w_n);
let w1 = if db >= dr {
let x0 = r.fetch(x, y, z_n, w_n);
let x1 = r.fetch(x_n, y, z_n, w_n);
let x2 = r.fetch(x, y_n, z, w_n);
let x3 = r.fetch(x, y_n, z_n, w_n);
let x4 = r.fetch(x_n, y_n, z_n, w_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
} else {
let x0 = r.fetch(x_n, y, z, w_n);
let x1 = r.fetch(x_n, y, z_n, w_n);
let x2 = r.fetch(x, y_n, z, w_n);
let x3 = r.fetch(x_n, y_n, z, w_n);
let x4 = r.fetch(x_n, y_n, z_n, w_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
};
w0.neg_mla(w0, T::from(dw)).mla(w1, T::from(dw))
}
#[cfg(feature = "options")]
#[inline(always)]
pub(crate) fn prism_vec3(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
) -> AvxVectorSse {
self.prism(
lin_x,
lin_y,
lin_z,
lin_w,
Fetch4Vec3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
z_stride: self.z_stride,
},
)
}
#[cfg(feature = "options")]
#[inline(always)]
fn tetra<
T: From<f32>
+ Add<T, Output = T>
+ Mul<T, Output = T>
+ FusedMultiplyAdd<T>
+ Sub<T, Output = T>
+ Copy
+ FusedMultiplyNegAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
r: impl Fetcher4<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let lin_w = lin_w.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let scale_w = (self.grid_size[3] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let w = (lin_w * scale_w).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let w_n = (lin_w * scale_w).ceil() as i32;
let rx = lin_x * scale_x - x as f32;
let ry = lin_y * scale_y - y as f32;
let rz = lin_z * scale_z - z as f32;
let rw = lin_w * scale_w - w as f32;
let c0 = r.fetch(x, y, z, w);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z, w) - c0;
c2 = r.fetch(x_n, y_n, z, w) - r.fetch(x_n, y, z, w);
c3 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y_n, z, w);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z, w) - c0;
c2 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y, z_n, w);
c3 = r.fetch(x_n, y, z_n, w) - r.fetch(x_n, y, z, w);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n, w) - r.fetch(x, y, z_n, w);
c2 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y, z_n, w);
c3 = r.fetch(x, y, z_n, w) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z, w) - r.fetch(x, y_n, z, w);
c2 = r.fetch(x, y_n, z, w) - c0;
c3 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y_n, z, w);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x, y_n, z_n, w);
c2 = r.fetch(x, y_n, z, w) - c0;
c3 = r.fetch(x, y_n, z_n, w) - r.fetch(x, y_n, z, w);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x, y_n, z_n, w);
c2 = r.fetch(x, y_n, z_n, w) - r.fetch(x, y, z_n, w);
c3 = r.fetch(x, y, z_n, w) - c0;
}
let s0 = c0.mla(c1, T::from(rx));
let s1 = s0.mla(c2, T::from(ry));
let w0 = s1.mla(c3, T::from(rz));
let c0 = r.fetch(x, y, z, w_n);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z, w_n) - c0;
c2 = r.fetch(x_n, y_n, z, w_n) - r.fetch(x_n, y, z, w_n);
c3 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y_n, z, w_n);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z, w_n) - c0;
c2 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y, z_n, w_n);
c3 = r.fetch(x_n, y, z_n, w_n) - r.fetch(x_n, y, z, w_n);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n, w_n) - r.fetch(x, y, z_n, w_n);
c2 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y, z_n, w_n);
c3 = r.fetch(x, y, z_n, w_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z, w_n) - r.fetch(x, y_n, z, w_n);
c2 = r.fetch(x, y_n, z, w_n) - c0;
c3 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y_n, z, w_n);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x, y_n, z_n, w_n);
c2 = r.fetch(x, y_n, z, w_n) - c0;
c3 = r.fetch(x, y_n, z_n, w_n) - r.fetch(x, y_n, z, w_n);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x, y_n, z_n, w_n);
c2 = r.fetch(x, y_n, z_n, w_n) - r.fetch(x, y, z_n, w_n);
c3 = r.fetch(x, y, z_n, w_n) - c0;
}
let s0 = c0.mla(c1, T::from(rx));
let s1 = s0.mla(c2, T::from(ry));
let w1 = s1.mla(c3, T::from(rz));
w0.neg_mla(w0, T::from(rw)).mla(w1, T::from(rw))
}
#[cfg(feature = "options")]
#[inline(always)]
pub(crate) fn tetra_vec3(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
) -> AvxVectorSse {
self.tetra(
lin_x,
lin_y,
lin_z,
lin_w,
Fetch4Vec3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
z_stride: self.z_stride,
},
)
}
}
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,327 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::avx::interpolator::*;
use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
use crate::conversions::avx::lut4_to_3_q0_15::TransformLut4To3AvxQ0_15;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::lut_transforms::Lut4x3Factory;
use crate::transform::PointeeSizeExpressible;
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
use std::marker::PhantomData;
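// 4-to-3 channel LUT transform (e.g. CMYK -> RGB). Each input is reduced to a
// bin index; the fourth (K) axis selects two adjacent 3-D slices of the 4-D
// table via precomputed barycentric weights, both slices are interpolated in
// 3-D, and the results are blended by the K weight `t`.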
struct TransformLut4To3Avx<
T,
U,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
lut: Vec<SseAlignedF32>,
_phantom: PhantomData<T>,
_phantom1: PhantomData<U>,
interpolation_method: InterpolationMethod,
weights: Box<[BarycentricWeight<f32>; BINS]>,
color_space: DataColorSpace,
is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut4To3Avx<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_chunk<'b, Interpolator: AvxMdInterpolationDouble<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
let grid_size = GRID_SIZE as i32;
let grid_size3 = grid_size * grid_size * grid_size;
let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[0],
);
let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[1],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[2],
);
let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[3],
);
let k_weights = self.weights[k.as_()];
let w: i32 = k_weights.x;
let w_n: i32 = k_weights.x_n;
let t: f32 = k_weights.w;
let table1 = &self.lut[(w * grid_size3) as usize..];
let table2 = &self.lut[(w_n * grid_size3) as usize..];
let interpolator = Interpolator::new(table1, table2);
let v = interpolator.inter3_sse(c, m, y, &self.weights);
let (a0, b0) = (v.0.v, v.1.v);
if T::FINITE {
unsafe {
let t0 = _mm_set1_ps(t);
let hp = _mm_fnmadd_ps(a0, t0, a0);
let mut v = _mm_fmadd_ps(b0, t0, hp);
v = _mm_max_ps(v, _mm_setzero_ps());
v = _mm_mul_ps(v, value_scale);
v = _mm_min_ps(v, value_scale);
let jvz = _mm_cvtps_epi32(v);
let x = _mm_extract_epi32::<0>(jvz);
let y = _mm_extract_epi32::<1>(jvz);
let z = _mm_extract_epi32::<2>(jvz);
dst[cn.r_i()] = (x as u32).as_();
dst[cn.g_i()] = (y as u32).as_();
dst[cn.b_i()] = (z as u32).as_();
}
} else {
unsafe {
let t0 = _mm_set1_ps(t);
let hp = _mm_fnmadd_ps(a0, t0, a0);
let v = _mm_fmadd_ps(b0, t0, hp);
dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
}
}
if channels == 4 {
dst[cn.a_i()] = max_value;
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut4To3Avx<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
if src.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / 4;
let dst_chunks = dst.len() / channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearAvxFmaDouble<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralAvxFmaDouble<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidAvxFmaDouble<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticAvxFmaDouble<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearAvxFmaDouble<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}
pub(crate) struct AvxLut4x3Factory {}
impl Lut4x3Factory for AvxLut4x3Factory {
fn make_transform_4x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
if options.prefer_fixed_point && BIT_DEPTH < 16 {
let q: f32 = if T::FINITE {
((1i32 << BIT_DEPTH as i32) - 1) as f32
} else {
((1i32 << 14i32) - 1) as f32
};
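// Quantize the f32 CLUT to fixed-point i16 samples at scale `q`, padding each
// RGB triple with a zero so every node occupies four aligned 16-bit lanes.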
let lut = lut
.chunks_exact(3)
.map(|x| {
AvxAlignedI16([
(x[0] * q).round() as i16,
(x[1] * q).round() as i16,
(x[2] * q).round() as i16,
0,
])
})
.collect::<Vec<_>>();
return match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut4To3AvxQ0_15::<
T,
u8,
LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
_phantom: PhantomData,
_phantom1: PhantomData,
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut4To3AvxQ0_15::<
T,
u16,
LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
_phantom: PhantomData,
_phantom1: PhantomData,
color_space,
is_linear,
}),
};
}
assert!(
std::arch::is_x86_feature_detected!("fma"),
"Internal configuration error: this path must not be reached without `fma` support"
);
let lut = lut
.chunks_exact(3)
.map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
.collect::<Vec<_>>();
match options.barycentric_weight_scale {
BarycentricWeightScale::Low => {
Box::new(
TransformLut4To3Avx::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
lut,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
_phantom: PhantomData,
_phantom1: PhantomData,
color_space,
is_linear,
},
)
}
#[cfg(feature = "options")]
BarycentricWeightScale::High => {
Box::new(
TransformLut4To3Avx::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
lut,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
_phantom: PhantomData,
_phantom1: PhantomData,
color_space,
is_linear,
},
)
}
}
}
}
@@ -0,0 +1,207 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::avx::interpolator_q0_15::*;
use crate::conversions::interpolator::BarycentricWeight;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
use std::marker::PhantomData;
pub(crate) struct TransformLut4To3AvxQ0_15<
T,
U,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
pub(crate) lut: Vec<AvxAlignedI16>,
pub(crate) _phantom: PhantomData<T>,
pub(crate) _phantom1: PhantomData<U>,
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
pub(crate) color_space: DataColorSpace,
pub(crate) is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut4To3AvxQ0_15<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2")]
unsafe fn transform_chunk<'b, Interpolator: AvxMdInterpolationQ0_15Double<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
unsafe {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
let grid_size = GRID_SIZE as i32;
let grid_size3 = grid_size * grid_size * grid_size;
let f_value_scale = _mm_set1_ps(1. / ((1 << 14i32) - 1) as f32);
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
let v_max_scale = if T::FINITE {
_mm_set1_epi16(((1i32 << BIT_DEPTH) - 1) as i16)
} else {
_mm_set1_epi16(((1i32 << 14i32) - 1) as i16)
};
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[0],
);
let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[1],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[2],
);
let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[3],
);
let k_weights = self.weights[k.as_()];
let w: i32 = k_weights.x;
let w_n: i32 = k_weights.x_n;
const Q: i16 = ((1i32 << 15) - 1) as i16;
let t: i16 = k_weights.w;
let t_n: i16 = Q - t;
let table1 = &self.lut[(w * grid_size3) as usize..];
let table2 = &self.lut[(w_n * grid_size3) as usize..];
let interpolator = Interpolator::new(table1, table2);
let v = interpolator.inter3_sse(c, m, y, &self.weights);
let (a0, b0) = (v.0.v, v.1.v);
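// Blend the two K slices in fixed point: `_mm_mulhrs_epi16(a, b)` computes
// `(a * b + (1 << 14)) >> 15`, so `v` is approximately
// `(a0 * (Q - t) + b0 * t) >> 15`, a rounded Q0.15 lerp.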
let hp = _mm_mulhrs_epi16(_mm_set1_epi16(t_n), a0);
let v = _mm_add_epi16(hp, _mm_mulhrs_epi16(b0, _mm_set1_epi16(t)));
if T::FINITE {
let mut o = _mm_max_epi16(v, _mm_setzero_si128());
o = _mm_min_epi16(o, v_max_scale);
let x = _mm_extract_epi16::<0>(o);
let y = _mm_extract_epi16::<1>(o);
let z = _mm_extract_epi16::<2>(o);
dst[cn.r_i()] = (x as u32).as_();
dst[cn.g_i()] = (y as u32).as_();
dst[cn.b_i()] = (z as u32).as_();
} else {
let mut r = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(v));
r = _mm_mul_ps(r, f_value_scale);
dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(r) as u32).as_();
dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(r) as u32).as_();
dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(r) as u32).as_();
}
if channels == 4 {
dst[cn.a_i()] = max_value;
}
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut4To3AvxQ0_15<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
if src.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / 4;
let dst_chunks = dst.len() / channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearAvxQ0_15Double<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralAvxQ0_15Double<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidAvxFmaQ0_15Double<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticAvxQ0_15Double<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearAvxQ0_15Double<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}
@@ -0,0 +1,53 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
mod a_curves3;
mod a_curves4x3;
mod cube;
mod hypercube;
mod interpolator;
mod interpolator_q0_15;
mod lut4_to_3;
mod lut4_to_3_q0_15;
mod preheat_lut4x3;
mod rgb_xyz;
mod rgb_xyz_opt;
mod rgb_xyz_q2_13;
mod rgb_xyz_q2_13_opt;
mod t_lut3_to_3;
mod t_lut3_to_3_q0_15;
pub(crate) use a_curves3::{ACurves3AvxFma, ACurves3InverseAvxFma, ACurves3OptimizedAvxFma};
pub(crate) use a_curves4x3::{ACurves4x3AvxFma, ACurves4x3AvxFmaOptimized};
pub(crate) use lut4_to_3::AvxLut4x3Factory;
pub(crate) use preheat_lut4x3::Lut4x3AvxFma;
pub(crate) use rgb_xyz::TransformShaperRgbAvx;
pub(crate) use rgb_xyz_opt::TransformShaperRgbOptAvx;
pub(crate) use rgb_xyz_q2_13::TransformShaperRgbQ2_13Avx;
pub(crate) use rgb_xyz_q2_13_opt::TransformShaperRgbQ2_13OptAvx;
pub(crate) use t_lut3_to_3::AvxLut3x3Factory;
@@ -0,0 +1,135 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::avx::hypercube::HypercubeAvx;
use crate::conversions::avx::interpolator::AvxVectorSse;
use crate::trc::{lut_interp_linear_float, lut_interp_linear_float_clamped};
use crate::{CmsError, DataColorSpace, InterpolationMethod, Stage};
use std::arch::x86_64::*;
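/// 4-channel to 3-channel LUT pipeline evaluated with AVX2 + FMA: four 1D
/// linearization curves, a 4D CLUT sampled through the `fetch` closure, and
/// three 1D output curves mapping the result into the PCS.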
#[derive(Default)]
pub(crate) struct Lut4x3AvxFma {
pub(crate) linearization: [Vec<f32>; 4],
pub(crate) clut: Vec<f32>,
pub(crate) grid_size: u8,
pub(crate) output: [Vec<f32>; 3],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
impl Lut4x3AvxFma {
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let linearization_0 = &self.linearization[0];
let linearization_1 = &self.linearization[1];
let linearization_2 = &self.linearization[2];
let linearization_3 = &self.linearization[3];
unsafe {
let ones = _mm_set1_ps(1.);
for (dest, src) in dst.chunks_exact_mut(3).zip(src.chunks_exact(4)) {
debug_assert!(self.grid_size as i32 >= 1);
let linear_x = lut_interp_linear_float(src[0], linearization_0);
let linear_y = lut_interp_linear_float(src[1], linearization_1);
let linear_z = lut_interp_linear_float(src[2], linearization_2);
let linear_w = lut_interp_linear_float(src[3], linearization_3);
let mut v = fetch(linear_x, linear_y, linear_z, linear_w).v;
v = _mm_max_ps(v, _mm_setzero_ps());
v = _mm_min_ps(v, ones);
let pcs_x = lut_interp_linear_float_clamped(
f32::from_bits(_mm_extract_ps::<0>(v) as u32),
&self.output[0],
);
let pcs_y = lut_interp_linear_float_clamped(
f32::from_bits(_mm_extract_ps::<1>(v) as u32),
&self.output[1],
);
let pcs_z = lut_interp_linear_float_clamped(
f32::from_bits(_mm_extract_ps::<2>(v) as u32),
&self.output[2],
);
dest[0] = pcs_x;
dest[1] = pcs_y;
dest[2] = pcs_z;
}
}
Ok(())
}
}
impl Stage for Lut4x3AvxFma {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
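// Rebuilt on every call; `HypercubeAvx::new` is presumably a cheap view
// over the CLUT rather than a copy.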
let l_tbl = HypercubeAvx::new(
&self.clut,
[
self.grid_size,
self.grid_size,
self.grid_size,
self.grid_size,
],
3,
);
assert!(std::arch::is_x86_feature_detected!("avx2"));
assert!(std::arch::is_x86_feature_detected!("fma"));
unsafe {
// If the source PCS is Lab, trilinear interpolation should be used
if self.pcs == DataColorSpace::Lab {
return self
.transform_impl(src, dst, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.prism_vec3(x, y, z, w))?
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w))?
}
}
}
Ok(())
}
}

View File

@@ -0,0 +1,325 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::TransformMatrixShaper;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
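/// 32-byte aligned scratch buffer so `_mm256_store_si256` can use an aligned
/// store; individual 16-bit lanes are then read back as gamma-LUT indices.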
#[repr(align(32), C)]
#[derive(Debug)]
pub(crate) struct AvxAlignedU16(pub(crate) [u16; 16]);
pub(crate) struct TransformShaperRgbAvx<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> {
pub(crate) profile: TransformMatrixShaper<T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformShaperRgbAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
#[inline(always)]
unsafe fn transform_impl<const FMA: bool>(
&self,
src: &[T],
dst: &mut [T],
) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary0 = AvxAlignedU16([0; 16]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
unsafe {
let m0 = _mm256_setr_ps(
t.v[0][0], t.v[0][1], t.v[0][2], 0., t.v[0][0], t.v[0][1], t.v[0][2], 0.,
);
let m1 = _mm256_setr_ps(
t.v[1][0], t.v[1][1], t.v[1][2], 0., t.v[1][0], t.v[1][1], t.v[1][2], 0.,
);
let m2 = _mm256_setr_ps(
t.v[2][0], t.v[2][1], t.v[2][2], 0., t.v[2][0], t.v[2][1], t.v[2][2], 0.,
);
let zeros = _mm_setzero_ps();
let v_scale = _mm256_set1_ps(scale);
let mut src = src;
let mut dst = dst;
let mut src_iter = src.chunks_exact(src_channels * 2);
let dst_iter = dst.chunks_exact_mut(dst_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
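// The main loop is software-pipelined: the first pixel pair is loaded here,
// and each iteration gathers the next pair's linearized values while the
// current pair's matrix product is stored.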
if let Some(src) = src_iter.next() {
r0 = _mm_broadcast_ss(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
g0 = _mm_broadcast_ss(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
b0 = _mm_broadcast_ss(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
r1 = _mm_broadcast_ss(
&self.profile.r_linear[src[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _mm_broadcast_ss(
&self.profile.g_linear[src[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _mm_broadcast_ss(
&self.profile.b_linear[src[src_cn.b_i() + src_channels]._as_usize()],
);
a0 = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = _mm_setzero_ps();
g0 = _mm_setzero_ps();
b0 = _mm_setzero_ps();
a0 = max_colors;
r1 = _mm_setzero_ps();
g1 = _mm_setzero_ps();
b1 = _mm_setzero_ps();
a1 = max_colors;
}
for (src, dst) in src_iter.zip(dst_iter) {
let r = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
let g = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
let b = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
let mut v = if FMA {
let v0 = _mm256_mul_ps(r, m0);
let v1 = _mm256_fmadd_ps(g, m1, v0);
_mm256_fmadd_ps(b, m2, v1)
} else {
let v0 = _mm256_mul_ps(r, m0);
let v1 = _mm256_mul_ps(g, m1);
let v2 = _mm256_mul_ps(b, m2);
_mm256_add_ps(_mm256_add_ps(v0, v1), v2)
};
v = _mm256_max_ps(v, _mm256_setzero_ps());
v = _mm256_mul_ps(v, v_scale);
v = _mm256_min_ps(v, v_scale);
let zx = _mm256_cvtps_epi32(v);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx);
r0 = _mm_broadcast_ss(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
g0 = _mm_broadcast_ss(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
b0 = _mm_broadcast_ss(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
r1 = _mm_broadcast_ss(
&self.profile.r_linear[src[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _mm_broadcast_ss(
&self.profile.g_linear[src[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _mm_broadcast_ss(
&self.profile.b_linear[src[src_cn.b_i() + src_channels]._as_usize()],
);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a0;
}
dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i() + dst_channels] = a1;
}
a0 = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
let r = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
let g = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
let b = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
let mut v = if FMA {
let v0 = _mm256_mul_ps(r, m0);
let v1 = _mm256_fmadd_ps(g, m1, v0);
_mm256_fmadd_ps(b, m2, v1)
} else {
let v0 = _mm256_mul_ps(r, m0);
let v1 = _mm256_mul_ps(g, m1);
let v2 = _mm256_mul_ps(b, m2);
_mm256_add_ps(_mm256_add_ps(v0, v1), v2)
};
v = _mm256_max_ps(v, _mm256_setzero_ps());
v = _mm256_mul_ps(v, v_scale);
v = _mm256_min_ps(v, v_scale);
let zx = _mm256_cvtps_epi32(v);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a0;
}
dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i() + dst_channels] = a1;
}
}
src = src.chunks_exact(src_channels * 2).remainder();
dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = _mm_broadcast_ss(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
let g = _mm_broadcast_ss(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
let b = _mm_broadcast_ss(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let mut v = if FMA {
let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
let v1 = _mm_fmadd_ps(g, _mm256_castps256_ps128(m1), v0);
_mm_fmadd_ps(b, _mm256_castps256_ps128(m2), v1)
} else {
let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
let v1 = _mm_mul_ps(g, _mm256_castps256_ps128(m1));
let v2 = _mm_mul_ps(b, _mm256_castps256_ps128(m2));
_mm_add_ps(_mm_add_ps(v0, v1), v2)
};
v = _mm_max_ps(v, zeros);
v = _mm_mul_ps(v, _mm256_castps256_ps128(v_scale));
v = _mm_min_ps(v, _mm256_castps256_ps128(v_scale));
let zx = _mm_cvtps_epi32(v);
_mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_fma(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl::<true>(src, dst) }
}
#[target_feature(enable = "avx2")]
unsafe fn transform_avx(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl::<false>(src, dst) }
}
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformExecutor<T> for TransformShaperRgbAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
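// Only `fma` is re-checked here to pick the fused path; AVX2 availability
// is presumably guaranteed by whoever constructed this executor.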
unsafe {
if std::arch::is_x86_feature_detected!("fma") {
self.transform_fma(src, dst)
} else {
self.transform_avx(src, dst)
}
}
}
}

View File

@@ -0,0 +1,323 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::TransformMatrixShaperOptimized;
use crate::conversions::avx::rgb_xyz::AvxAlignedU16;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
pub(crate) struct TransformShaperRgbOptAvx<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> {
pub(crate) profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformShaperRgbOptAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
#[inline(always)]
unsafe fn transform_impl<const FMA: bool>(
&self,
src: &[T],
dst: &mut [T],
) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary0 = AvxAlignedU16([0; 16]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
unsafe {
let m0 = _mm256_setr_ps(
t.v[0][0], t.v[0][1], t.v[0][2], 0., t.v[0][0], t.v[0][1], t.v[0][2], 0.,
);
let m1 = _mm256_setr_ps(
t.v[1][0], t.v[1][1], t.v[1][2], 0., t.v[1][0], t.v[1][1], t.v[1][2], 0.,
);
let m2 = _mm256_setr_ps(
t.v[2][0], t.v[2][1], t.v[2][2], 0., t.v[2][0], t.v[2][1], t.v[2][2], 0.,
);
let zeros = _mm_setzero_ps();
let v_scale = _mm256_set1_ps(scale);
let mut src = src;
let mut dst = dst;
let mut src_iter = src.chunks_exact(src_channels * 2);
let dst_iter = dst.chunks_exact_mut(dst_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
if let Some(src) = src_iter.next() {
r0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
g0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
b0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
r1 = _mm_broadcast_ss(
&self.profile.linear[src[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _mm_broadcast_ss(
&self.profile.linear[src[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _mm_broadcast_ss(
&self.profile.linear[src[src_cn.b_i() + src_channels]._as_usize()],
);
a0 = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = _mm_setzero_ps();
g0 = _mm_setzero_ps();
b0 = _mm_setzero_ps();
a0 = max_colors;
r1 = _mm_setzero_ps();
g1 = _mm_setzero_ps();
b1 = _mm_setzero_ps();
a1 = max_colors;
}
for (src, dst) in src_iter.zip(dst_iter) {
let r = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
let g = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
let b = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
let mut v = if FMA {
let v0 = _mm256_mul_ps(r, m0);
let v1 = _mm256_fmadd_ps(g, m1, v0);
_mm256_fmadd_ps(b, m2, v1)
} else {
let v0 = _mm256_mul_ps(r, m0);
let v1 = _mm256_mul_ps(g, m1);
let v2 = _mm256_mul_ps(b, m2);
_mm256_add_ps(_mm256_add_ps(v0, v1), v2)
};
v = _mm256_max_ps(v, _mm256_setzero_ps());
v = _mm256_mul_ps(v, v_scale);
v = _mm256_min_ps(v, v_scale);
let zx = _mm256_cvtps_epi32(v);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx);
r0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
g0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
b0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
r1 = _mm_broadcast_ss(
&self.profile.linear[src[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _mm_broadcast_ss(
&self.profile.linear[src[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _mm_broadcast_ss(
&self.profile.linear[src[src_cn.b_i() + src_channels]._as_usize()],
);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a0;
}
dst[dst_cn.r_i() + dst_channels] = self.profile.gamma[temporary0.0[8] as usize];
dst[dst_cn.g_i() + dst_channels] = self.profile.gamma[temporary0.0[10] as usize];
dst[dst_cn.b_i() + dst_channels] = self.profile.gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i() + dst_channels] = a1;
}
a0 = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
let r = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
let g = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
let b = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
let mut v = if FMA {
let v0 = _mm256_mul_ps(r, m0);
let v1 = _mm256_fmadd_ps(g, m1, v0);
_mm256_fmadd_ps(b, m2, v1)
} else {
let v0 = _mm256_mul_ps(r, m0);
let v1 = _mm256_mul_ps(g, m1);
let v2 = _mm256_mul_ps(b, m2);
_mm256_add_ps(_mm256_add_ps(v0, v1), v2)
};
v = _mm256_max_ps(v, _mm256_setzero_ps());
v = _mm256_mul_ps(v, v_scale);
v = _mm256_min_ps(v, v_scale);
let zx = _mm256_cvtps_epi32(v);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a0;
}
dst[dst_cn.r_i() + dst_channels] = self.profile.gamma[temporary0.0[8] as usize];
dst[dst_cn.g_i() + dst_channels] = self.profile.gamma[temporary0.0[10] as usize];
dst[dst_cn.b_i() + dst_channels] = self.profile.gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i() + dst_channels] = a1;
}
}
src = src.chunks_exact(src_channels * 2).remainder();
dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = _mm_broadcast_ss(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
let g = _mm_broadcast_ss(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
let b = _mm_broadcast_ss(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let mut v = if FMA {
let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
let v1 = _mm_fmadd_ps(g, _mm256_castps256_ps128(m1), v0);
_mm_fmadd_ps(b, _mm256_castps256_ps128(m2), v1)
} else {
let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
let v1 = _mm_mul_ps(g, _mm256_castps256_ps128(m1));
let v2 = _mm_mul_ps(b, _mm256_castps256_ps128(m2));
_mm_add_ps(_mm_add_ps(v0, v1), v2)
};
v = _mm_max_ps(v, zeros);
v = _mm_mul_ps(v, _mm256_castps256_ps128(v_scale));
v = _mm_min_ps(v, _mm256_castps256_ps128(v_scale));
let zx = _mm_cvtps_epi32(v);
_mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_fma(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl::<true>(src, dst) }
}
#[target_feature(enable = "avx2")]
unsafe fn transform_avx(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl::<false>(src, dst) }
}
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformExecutor<T>
for TransformShaperRgbOptAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe {
if std::arch::is_x86_feature_detected!("fma") {
self.transform_fma(src, dst)
} else {
self.transform_avx(src, dst)
}
}
}
}

View File

@@ -0,0 +1,304 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::avx::rgb_xyz::AvxAlignedU16;
use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPoint;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
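/// Matrix-shaper transform in Q2.13 fixed point: the linear tables hold i32
/// entries that appear to carry a 16-bit sample in the low half, and the 3x3
/// matrix is applied with 16-bit multiply-add instructions.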
pub(crate) struct TransformShaperRgbQ2_13Avx<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPoint<i32, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
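/// Broadcasts one `i32` table entry to all four lanes by viewing it as `f32`
/// and issuing `vbroadcastss`; only the load is reinterpreted and the bit
/// pattern is preserved, so this is sound for integer data.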
#[inline(always)]
pub(crate) unsafe fn _xmm_broadcast_epi32(f: &i32) -> __m128i {
let float_ref: &f32 = unsafe { &*(f as *const i32 as *const f32) };
unsafe { _mm_castps_si128(_mm_broadcast_ss(float_ref)) }
}
impl<
T: Copy + PointeeSizeExpressible + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformShaperRgbQ2_13Avx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "avx2")]
unsafe fn transform_avx2(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary0 = AvxAlignedU16([0; 16]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let max_colors = ((1 << self.bit_depth) - 1).as_();
unsafe {
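// Fixed-point matrix product via `madd_epi16`: r and g are packed into the
// low/high halves of each 32-bit lane and multiplied against interleaved
// matrix rows in `m0`, while `m2` pairs b with a constant 1 so that the
// rounding term held in the high half of `rnd` is added by the same madd.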
let m0 = _mm256_setr_epi16(
t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0, t.v[0][0],
t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
);
let m2 = _mm256_setr_epi16(
t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0, t.v[2][0], 1, t.v[2][1], 1,
t.v[2][2], 1, 0, 0,
);
let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
let rnd = _mm256_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
let zeros = _mm256_setzero_si256();
let v_max_value = _mm256_set1_epi32(GAMMA_LUT as i32 - 1);
let mut src = src;
let mut dst = dst;
let mut src_iter = src.chunks_exact(src_channels * 2);
let dst_iter = dst.chunks_exact_mut(dst_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
if let Some(src) = src_iter.next() {
r0 = _xmm_broadcast_epi32(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
g0 = _xmm_broadcast_epi32(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
b0 = _xmm_broadcast_epi32(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
r1 = _xmm_broadcast_epi32(
&self.profile.r_linear[src[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _xmm_broadcast_epi32(
&self.profile.g_linear[src[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _xmm_broadcast_epi32(
&self.profile.b_linear[src[src_cn.b_i() + src_channels]._as_usize()],
);
a0 = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = _mm_setzero_si128();
g0 = _mm_setzero_si128();
b0 = _mm_setzero_si128();
a0 = max_colors;
r1 = _mm_setzero_si128();
g1 = _mm_setzero_si128();
b1 = _mm_setzero_si128();
a1 = max_colors;
}
for (src, dst) in src_iter.zip(dst_iter) {
let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
zg0 = _mm256_slli_epi32::<16>(zg0);
let zrg0 = _mm256_or_si256(zr0, zg0);
let zbz0 = _mm256_or_si256(zb0, rnd);
let va0 = _mm256_madd_epi16(zrg0, m0);
let va1 = _mm256_madd_epi16(zbz0, m2);
let mut v0 = _mm256_add_epi32(va0, va1);
v0 = _mm256_srai_epi32::<PRECISION>(v0);
v0 = _mm256_max_epi32(v0, zeros);
v0 = _mm256_min_epi32(v0, v_max_value);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
r0 = _xmm_broadcast_epi32(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
g0 = _xmm_broadcast_epi32(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
b0 = _xmm_broadcast_epi32(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
r1 = _xmm_broadcast_epi32(
&self.profile.r_linear[src[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _xmm_broadcast_epi32(
&self.profile.g_linear[src[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _xmm_broadcast_epi32(
&self.profile.b_linear[src[src_cn.b_i() + src_channels]._as_usize()],
);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a0;
}
dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i() + dst_channels] = a1;
}
a0 = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
zg0 = _mm256_slli_epi32::<16>(zg0);
let zrg0 = _mm256_or_si256(zr0, zg0);
let zbz0 = _mm256_or_si256(zb0, rnd);
let va0 = _mm256_madd_epi16(zrg0, m0);
let va1 = _mm256_madd_epi16(zbz0, m2);
let mut v0 = _mm256_add_epi32(va0, va1);
v0 = _mm256_srai_epi32::<PRECISION>(v0);
v0 = _mm256_max_epi32(v0, zeros);
v0 = _mm256_min_epi32(v0, v_max_value);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a0;
}
dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i() + dst_channels] = a1;
}
}
src = src.chunks_exact(src_channels * 2).remainder();
dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = _xmm_broadcast_epi32(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
let mut g =
_xmm_broadcast_epi32(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
let b = _xmm_broadcast_epi32(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
g = _mm_slli_epi32::<16>(g);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let zrg0 = _mm_or_si128(r, g);
let zbz0 = _mm_or_si128(b, _mm256_castsi256_si128(rnd));
let v0 = _mm_madd_epi16(zrg0, _mm256_castsi256_si128(m0));
let v1 = _mm_madd_epi16(zbz0, _mm256_castsi256_si128(m2));
let mut v = _mm_add_epi32(v0, v1);
v = _mm_srai_epi32::<PRECISION>(v);
v = _mm_max_epi32(v, _mm_setzero_si128());
v = _mm_min_epi32(v, _mm256_castsi256_si128(v_max_value));
_mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, v);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformShaperRgbQ2_13Avx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_avx2(src, dst) }
}
}

View File

@@ -0,0 +1,298 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::avx::rgb_xyz::AvxAlignedU16;
use crate::conversions::avx::rgb_xyz_q2_13::_xmm_broadcast_epi32;
use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPointOpt;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
pub(crate) struct TransformShaperRgbQ2_13OptAvx<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPointOpt<i32, i16, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Copy + PointeeSizeExpressible + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformShaperRgbQ2_13OptAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "avx2")]
unsafe fn transform_avx2(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary0 = AvxAlignedU16([0; 16]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let max_colors = ((1 << self.bit_depth) - 1).as_();
unsafe {
let m0 = _mm256_setr_epi16(
t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0, t.v[0][0],
t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
);
let m2 = _mm256_setr_epi16(
t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0, t.v[2][0], 1, t.v[2][1], 1,
t.v[2][2], 1, 0, 0,
);
let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
let rnd = _mm256_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
let zeros = _mm256_setzero_si256();
let v_max_value = _mm256_set1_epi32(GAMMA_LUT as i32 - 1);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
let mut src_iter = src.chunks_exact(src_channels * 2);
if let Some(src0) = src_iter.next() {
r0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.r_i()]._as_usize()]);
g0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.g_i()]._as_usize()]);
b0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.b_i()]._as_usize()]);
r1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()],
);
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = _mm_setzero_si128();
g0 = _mm_setzero_si128();
b0 = _mm_setzero_si128();
a0 = max_colors;
r1 = _mm_setzero_si128();
g1 = _mm_setzero_si128();
b1 = _mm_setzero_si128();
a1 = max_colors;
}
for (src, dst) in src_iter.zip(dst.chunks_exact_mut(dst_channels * 2)) {
let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
zg0 = _mm256_slli_epi32::<16>(zg0);
let zrg0 = _mm256_or_si256(zr0, zg0);
let zbz0 = _mm256_or_si256(zb0, rnd);
let va0 = _mm256_madd_epi16(zrg0, m0);
let va1 = _mm256_madd_epi16(zbz0, m2);
let mut v0 = _mm256_add_epi32(va0, va1);
v0 = _mm256_srai_epi32::<PRECISION>(v0);
v0 = _mm256_max_epi32(v0, zeros);
v0 = _mm256_min_epi32(v0, v_max_value);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
r0 = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
g0 = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
b0 = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
r1 = _xmm_broadcast_epi32(
&self.profile.linear[src[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _xmm_broadcast_epi32(
&self.profile.linear[src[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _xmm_broadcast_epi32(
&self.profile.linear[src[src_cn.b_i() + src_channels]._as_usize()],
);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a0;
}
dst[dst_cn.r_i() + dst_channels] = self.profile.gamma[temporary0.0[8] as usize];
dst[dst_cn.g_i() + dst_channels] = self.profile.gamma[temporary0.0[10] as usize];
dst[dst_cn.b_i() + dst_channels] = self.profile.gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i() + dst_channels] = a1;
}
a0 = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
zg0 = _mm256_slli_epi32::<16>(zg0);
let zrg0 = _mm256_or_si256(zr0, zg0);
let zbz0 = _mm256_or_si256(zb0, rnd);
let va0 = _mm256_madd_epi16(zrg0, m0);
let va1 = _mm256_madd_epi16(zbz0, m2);
let mut v0 = _mm256_add_epi32(va0, va1);
v0 = _mm256_srai_epi32::<PRECISION>(v0);
v0 = _mm256_max_epi32(v0, zeros);
v0 = _mm256_min_epi32(v0, v_max_value);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a0;
}
dst[dst_cn.r_i() + dst_channels] = self.profile.gamma[temporary0.0[8] as usize];
dst[dst_cn.g_i() + dst_channels] = self.profile.gamma[temporary0.0[10] as usize];
dst[dst_cn.b_i() + dst_channels] = self.profile.gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i() + dst_channels] = a1;
}
}
let src = src.chunks_exact(src_channels * 2).remainder();
let dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
let mut g =
_xmm_broadcast_epi32(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
let b = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
g = _mm_slli_epi32::<16>(g);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let zrg0 = _mm_or_si128(r, g);
let zbz0 = _mm_or_si128(b, _mm256_castsi256_si128(rnd));
let v0 = _mm_madd_epi16(zrg0, _mm256_castsi256_si128(m0));
let v1 = _mm_madd_epi16(zbz0, _mm256_castsi256_si128(m2));
let mut v = _mm_add_epi32(v0, v1);
v = _mm_srai_epi32::<PRECISION>(v);
v = _mm_max_epi32(v, _mm_setzero_si128());
v = _mm_min_epi32(v, _mm256_castsi256_si128(v_max_value));
_mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, v);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformShaperRgbQ2_13OptAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_avx2(src, dst) }
}
}

View File

@@ -0,0 +1,344 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::avx::interpolator::*;
use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
use crate::conversions::avx::t_lut3_to_3_q0_15::TransformLut3x3AvxQ0_15;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::lut_transforms::Lut3x3Factory;
use crate::transform::PointeeSizeExpressible;
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
use std::marker::PhantomData;
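/// 3-channel to 3-channel CLUT transform evaluated with AVX2 + FMA. Inputs
/// are first reduced to `BARYCENTRIC_BINS` levels and then looked up in a
/// table of `BINS` precomputed barycentric weights before interpolation.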
struct TransformLut3x3AvxFma<
T,
U,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
lut: Vec<SseAlignedF32>,
_phantom: PhantomData<T>,
_phantom2: PhantomData<U>,
interpolation_method: InterpolationMethod,
weights: Box<[BarycentricWeight<f32>; BINS]>,
color_space: DataColorSpace,
is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut3x3AvxFma<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn transform_chunk<'b, Interpolator: AvxMdInterpolation<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.r_i()],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.g_i()],
);
let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.b_i()],
);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
let tetrahedral = Interpolator::new(&self.lut);
let v = tetrahedral.inter3_sse(x, y, z, &self.weights);
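// Finite (integer) sample types are rescaled to the target bit depth and
// rounded via `_mm_cvtps_epi32`; float samples take the interpolated lanes
// through bit-for-bit with `_mm_extract_ps`.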
if T::FINITE {
unsafe {
let mut r = _mm_mul_ps(v.v, value_scale);
r = _mm_max_ps(r, _mm_setzero_ps());
r = _mm_min_ps(r, value_scale);
let jvz = _mm_cvtps_epi32(r);
let x = _mm_extract_epi32::<0>(jvz);
let y = _mm_extract_epi32::<1>(jvz);
let z = _mm_extract_epi32::<2>(jvz);
dst[dst_cn.r_i()] = (x as u32).as_();
dst[dst_cn.g_i()] = (y as u32).as_();
dst[dst_cn.b_i()] = (z as u32).as_();
}
} else {
unsafe {
dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
}
}
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut3x3AvxFma<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / src_channels;
let dst_chunks = dst.len() / dst_channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearAvxFma<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralAvxFma<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalAvxFma<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticAvxFma<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearAvxFma<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}
pub(crate) struct AvxLut3x3Factory {}
impl Lut3x3Factory for AvxLut3x3Factory {
fn make_transform_3x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
if options.prefer_fixed_point && BIT_DEPTH < 16 {
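// Quantize the f32 table to i16: finite integer sample types use their full
// bit-depth range, while float samples are scaled to 2^14 - 1, which appears
// to leave headroom for the fixed-point interpolation arithmetic.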
let q: f32 = if T::FINITE {
((1i32 << BIT_DEPTH as i32) - 1) as f32
} else {
((1i32 << 14i32) - 1) as f32
};
let lut = lut
.chunks_exact(3)
.map(|x| {
AvxAlignedI16([
(x[0] * q).round() as i16,
(x[1] * q).round() as i16,
(x[2] * q).round() as i16,
0,
])
})
.collect::<Vec<_>>();
return match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x3AvxQ0_15::<
T,
u8,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom2: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x3AvxQ0_15::<
T,
u16,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom2: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
};
}
assert!(
std::arch::is_x86_feature_detected!("fma"),
"Internal configuration error, this might not be called without `fma` feature"
);
let lut = lut
.chunks_exact(3)
.map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
.collect::<Vec<_>>();
match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x3AvxFma::<
T,
u8,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom2: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x3AvxFma::<
T,
u16,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom2: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
}
}
}

View File

@@ -0,0 +1,222 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::avx::interpolator_q0_15::*;
use crate::conversions::interpolator::BarycentricWeight;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
use std::marker::PhantomData;
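/// Q0.15 fixed-point variant of the 3x3 CLUT transform: the table holds i16
/// samples and interpolation stays in 16-bit integer SIMD throughout.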
pub(crate) struct TransformLut3x3AvxQ0_15<
T,
U,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
pub(crate) lut: Vec<AvxAlignedI16>,
pub(crate) _phantom: PhantomData<T>,
pub(crate) _phantom2: PhantomData<U>,
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
pub(crate) color_space: DataColorSpace,
pub(crate) is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
>
TransformLut3x3AvxQ0_15<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[allow(unused_unsafe)]
#[target_feature(enable = "avx2")]
unsafe fn transform_chunk<'b, Interpolator: AvxMdInterpolationQ0_15<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
unsafe {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
let f_value_scale = _mm_set1_ps(1. / ((1 << 14i32) - 1) as f32);
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
let v_max_scale = if T::FINITE {
_mm_set1_epi16(((1i32 << BIT_DEPTH) - 1) as i16)
} else {
_mm_set1_epi16(((1i32 << 14i32) - 1) as i16)
};
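// Finite integer outputs are clamped to [0, 2^BIT_DEPTH - 1]; float-backed
// outputs are instead normalized back to [0, 1] by 1 / (2^14 - 1), the
// fixed-point scale the interpolated values are produced on.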
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.r_i()],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.g_i()],
);
let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.b_i()],
);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
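// The 3x3 LUT only transforms RGB: alpha is carried through unchanged, and a
// missing alpha channel is synthesized as the maximum representable value.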
let interpolator = Interpolator::new(&self.lut);
let v = interpolator.inter3_sse(x, y, z, &self.weights);
if T::FINITE {
let mut o = _mm_max_epi16(v.v, _mm_setzero_si128());
o = _mm_min_epi16(o, v_max_scale);
let x = _mm_extract_epi16::<0>(o);
let y = _mm_extract_epi16::<1>(o);
let z = _mm_extract_epi16::<2>(o);
dst[dst_cn.r_i()] = (x as u32).as_();
dst[dst_cn.g_i()] = (y as u32).as_();
dst[dst_cn.b_i()] = (z as u32).as_();
} else {
let mut r = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(v.v));
r = _mm_mul_ps(r, f_value_scale);
dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(r) as u32).as_();
dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(r) as u32).as_();
dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(r) as u32).as_();
}
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut3x3AvxQ0_15<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / src_channels;
let dst_chunks = dst.len() / dst_channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
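// Lab, XYZ and linearized RGB tables are always sampled with trilinear
// interpolation; the user-selected interpolation method only applies to the
// remaining color spaces.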
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearAvxQ0_15<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralAvxQ0_15<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalAvxQ0_15<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticAvxQ0_15<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearAvxQ0_15<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}

@@ -0,0 +1,33 @@
/*
* // Copyright (c) Radzivon Bartoshyk 5/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
mod rgb_xyz_opt;
mod rgb_xyz_q2_13_opt;
pub(crate) use rgb_xyz_opt::TransformShaperRgbOptAvx512;
pub(crate) use rgb_xyz_q2_13_opt::TransformShaperRgbQ2_13OptAvx512;

@@ -0,0 +1,420 @@
/*
* // Copyright (c) Radzivon Bartoshyk 5/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::TransformMatrixShaperOptimized;
use crate::conversions::avx512::rgb_xyz_q2_13_opt::{
AvxAlignedU16, split_by_twos, split_by_twos_mut,
};
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
pub(crate) struct TransformShaperRgbOptAvx512<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> {
pub(crate) profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformShaperRgbOptAvx512<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "avx512bw", enable = "avx512vl", enable = "fma")]
unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
let (src_chunks, src_remainder) = split_by_twos(src, src_channels);
let (dst_chunks, dst_remainder) = split_by_twos_mut(dst, dst_channels);
let mut temporary0 = AvxAlignedU16([0; 16]);
let mut temporary1 = AvxAlignedU16([0; 16]);
unsafe {
let m0 = _mm256_setr_ps(
t.v[0][0], t.v[0][1], t.v[0][2], 0f32, t.v[0][0], t.v[0][1], t.v[0][2], 0f32,
);
let m1 = _mm256_setr_ps(
t.v[1][0], t.v[1][1], t.v[1][2], 0f32, t.v[1][0], t.v[1][1], t.v[1][2], 0f32,
);
let m2 = _mm256_setr_ps(
t.v[2][0], t.v[2][1], t.v[2][2], 0f32, t.v[2][0], t.v[2][1], t.v[2][2], 0f32,
);
let zeros = _mm_setzero_ps();
let v_scale = _mm256_set1_ps(scale);
if !src_chunks.is_empty() {
let (src0, src1) = src_chunks.split_at(src_chunks.len() / 2);
let (dst0, dst1) = dst_chunks.split_at_mut(dst_chunks.len() / 2);
let mut src_iter0 = src0.chunks_exact(src_channels * 2);
let mut src_iter1 = src1.chunks_exact(src_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
let (mut r2, mut g2, mut b2, mut a2);
let (mut r3, mut g3, mut b3, mut a3);
if let (Some(src0), Some(src1)) = (src_iter0.next(), src_iter1.next()) {
r0 = _mm_broadcast_ss(&self.profile.linear[src0[src_cn.r_i()]._as_usize()]);
g0 = _mm_broadcast_ss(&self.profile.linear[src0[src_cn.g_i()]._as_usize()]);
b0 = _mm_broadcast_ss(&self.profile.linear[src0[src_cn.b_i()]._as_usize()]);
r1 = _mm_broadcast_ss(
&self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _mm_broadcast_ss(
&self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _mm_broadcast_ss(
&self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()],
);
r2 = _mm_broadcast_ss(&self.profile.linear[src1[src_cn.r_i()]._as_usize()]);
g2 = _mm_broadcast_ss(&self.profile.linear[src1[src_cn.g_i()]._as_usize()]);
b2 = _mm_broadcast_ss(&self.profile.linear[src1[src_cn.b_i()]._as_usize()]);
r3 = _mm_broadcast_ss(
&self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()],
);
g3 = _mm_broadcast_ss(
&self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()],
);
b3 = _mm_broadcast_ss(
&self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()],
);
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = _mm_setzero_ps();
g0 = _mm_setzero_ps();
b0 = _mm_setzero_ps();
a0 = max_colors;
r1 = _mm_setzero_ps();
g1 = _mm_setzero_ps();
b1 = _mm_setzero_ps();
a1 = max_colors;
r2 = _mm_setzero_ps();
g2 = _mm_setzero_ps();
b2 = _mm_setzero_ps();
a2 = max_colors;
r3 = _mm_setzero_ps();
g3 = _mm_setzero_ps();
b3 = _mm_setzero_ps();
a3 = max_colors;
}
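// The loop below is software-pipelined: each iteration finishes the pixels
// whose LUT fetches were issued on the previous iteration while issuing the
// fetches for the current chunk, hiding gather latency behind the FMA work;
// the `if let ... last()` block after the loop flushes the final in-flight
// pixels.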
for (((src0, src1), dst0), dst1) in src_iter0
.zip(src_iter1)
.zip(dst0.chunks_exact_mut(dst_channels * 2))
.zip(dst1.chunks_exact_mut(dst_channels * 2))
{
let rz0 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
let gz0 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
let bz0 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
let rz1 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r2), r3);
let gz1 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g2), g3);
let bz1 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b2), b3);
let v0 = _mm256_mul_ps(rz0, m0);
let v1 = _mm256_fmadd_ps(gz0, m1, v0);
let mut vz0 = _mm256_fmadd_ps(bz0, m2, v1);
let v2 = _mm256_mul_ps(rz1, m0);
let v3 = _mm256_fmadd_ps(gz1, m1, v2);
let mut vz1 = _mm256_fmadd_ps(bz1, m2, v3);
vz0 = _mm256_max_ps(vz0, _mm256_setzero_ps());
vz0 = _mm256_mul_ps(vz0, v_scale);
vz0 = _mm256_min_ps(vz0, v_scale);
vz1 = _mm256_max_ps(vz1, _mm256_setzero_ps());
vz1 = _mm256_mul_ps(vz1, v_scale);
vz1 = _mm256_min_ps(vz1, v_scale);
let zx0 = _mm256_cvtps_epi32(vz0);
let zx1 = _mm256_cvtps_epi32(vz1);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx0);
_mm256_store_si256(temporary1.0.as_mut_ptr() as *mut _, zx1);
r0 = _mm_broadcast_ss(&self.profile.linear[src0[src_cn.r_i()]._as_usize()]);
g0 = _mm_broadcast_ss(&self.profile.linear[src0[src_cn.g_i()]._as_usize()]);
b0 = _mm_broadcast_ss(&self.profile.linear[src0[src_cn.b_i()]._as_usize()]);
r1 = _mm_broadcast_ss(
&self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _mm_broadcast_ss(
&self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _mm_broadcast_ss(
&self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()],
);
r2 = _mm_broadcast_ss(&self.profile.linear[src1[src_cn.r_i()]._as_usize()]);
g2 = _mm_broadcast_ss(&self.profile.linear[src1[src_cn.g_i()]._as_usize()]);
b2 = _mm_broadcast_ss(&self.profile.linear[src1[src_cn.b_i()]._as_usize()]);
r3 = _mm_broadcast_ss(
&self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()],
);
g3 = _mm_broadcast_ss(
&self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()],
);
b3 = _mm_broadcast_ss(
&self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()],
);
dst0[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary0.0[8] as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary0.0[10] as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[temporary1.0[0] as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[temporary1.0[2] as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[temporary1.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary1.0[8] as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary1.0[10] as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary1.0[12] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let (Some(dst0), Some(dst1)) = (
dst0.chunks_exact_mut(dst_channels * 2).last(),
dst1.chunks_exact_mut(dst_channels * 2).last(),
) {
let rz0 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
let gz0 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
let bz0 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
let rz1 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r2), r3);
let gz1 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g2), g3);
let bz1 = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b2), b3);
let v0 = _mm256_mul_ps(rz0, m0);
let v1 = _mm256_fmadd_ps(gz0, m1, v0);
let mut vz0 = _mm256_fmadd_ps(bz0, m2, v1);
let v2 = _mm256_mul_ps(rz1, m0);
let v3 = _mm256_fmadd_ps(gz1, m1, v2);
let mut vz1 = _mm256_fmadd_ps(bz1, m2, v3);
vz0 = _mm256_max_ps(vz0, _mm256_setzero_ps());
vz0 = _mm256_mul_ps(vz0, v_scale);
vz0 = _mm256_min_ps(vz0, v_scale);
vz1 = _mm256_max_ps(vz1, _mm256_setzero_ps());
vz1 = _mm256_mul_ps(vz1, v_scale);
vz1 = _mm256_min_ps(vz1, v_scale);
let zx0 = _mm256_cvtps_epi32(vz0);
let zx1 = _mm256_cvtps_epi32(vz1);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx0);
_mm256_store_si256(temporary1.0.as_mut_ptr() as *mut _, zx1);
dst0[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary0.0[8] as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary0.0[10] as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[temporary1.0[0] as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[temporary1.0[2] as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[temporary1.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary1.0[8] as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary1.0[10] as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary1.0[12] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
}
}
for (src, dst) in src_remainder
.chunks_exact(src_channels)
.zip(dst_remainder.chunks_exact_mut(dst_channels))
{
let r = _mm_broadcast_ss(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
let g = _mm_broadcast_ss(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
let b = _mm_broadcast_ss(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
let v1 = _mm_fmadd_ps(g, _mm256_castps256_ps128(m1), v0);
let mut v = _mm_fmadd_ps(b, _mm256_castps256_ps128(m2), v1);
v = _mm_max_ps(v, zeros);
v = _mm_mul_ps(v, _mm256_castps256_ps128(v_scale));
v = _mm_min_ps(v, _mm256_castps256_ps128(v_scale));
let zx = _mm_cvtps_epi32(v);
_mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformExecutor<T>
for TransformShaperRgbOptAvx512<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl(src, dst) }
}
}

@@ -0,0 +1,476 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPointOpt;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
pub(crate) struct TransformShaperRgbQ2_13OptAvx512<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPointOpt<i32, i16, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
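// Broadcasts a single `i32` to every lane by reusing the float broadcast
// (`vbroadcastss`); the reference is reinterpreted bit-for-bit, so the integer
// payload is preserved exactly and the value can be broadcast straight from
// memory.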
#[inline(always)]
pub(crate) unsafe fn _xmm_broadcast_epi32(f: &i32) -> __m128i {
let float_ref: &f32 = unsafe { &*(f as *const i32 as *const f32) };
unsafe { _mm_castps_si128(_mm_broadcast_ss(float_ref)) }
}
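// 32-byte-aligned scratch buffer: results are spilled with
// `_mm256_store_si256`, which requires an aligned address.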
#[repr(align(32), C)]
#[derive(Debug)]
pub(crate) struct AvxAlignedU16(pub(crate) [u16; 16]);
#[inline]
pub(crate) fn split_by_twos<T: Copy>(data: &[T], channels: usize) -> (&[T], &[T]) {
let len = data.len() / (channels * 4);
let split_point = len * 4;
data.split_at(split_point * channels)
}
#[inline]
pub(crate) fn split_by_twos_mut<T: Copy>(data: &mut [T], channels: usize) -> (&mut [T], &mut [T]) {
let len = data.len() / (channels * 4);
let split_point = len * 4;
data.split_at_mut(split_point * channels)
}
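// A minimal sketch of the splitting invariant (hypothetical values, not part
// of the crate's tests): with 3-channel RGB data and 9 pixels, the head keeps
// the first 8 pixels (a multiple of 4, consumed by the vector loop) and the
// tail keeps the last one (handled scalar):
//
//     let data = [0u8; 9 * 3];
//     let (head, tail) = split_by_twos(&data, 3);
//     assert_eq!(head.len(), 8 * 3);
//     assert_eq!(tail.len(), 3);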
impl<
T: Copy + PointeeSizeExpressible + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformShaperRgbQ2_13OptAvx512<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "avx512bw", enable = "avx512vl")]
unsafe fn transform_avx512(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let max_colors = ((1 << self.bit_depth) - 1).as_();
// If the precision is changed anywhere else, it must be changed here as well.
assert_eq!(PRECISION, 13);
let (src_chunks, src_remainder) = split_by_twos(src, src_channels);
let (dst_chunks, dst_remainder) = split_by_twos_mut(dst, dst_channels);
let mut temporary0 = AvxAlignedU16([0; 16]);
let mut temporary1 = AvxAlignedU16([0; 16]);
unsafe {
let m0 = _mm256_set_epi16(
0, 0, t.v[1][2], t.v[0][2], t.v[1][1], t.v[0][1], t.v[1][0], t.v[0][0], 0, 0,
t.v[1][2], t.v[0][2], t.v[1][1], t.v[0][1], t.v[1][0], t.v[0][0],
);
let m2 = _mm256_set_epi16(
0, 0, 1, t.v[2][2], 1, t.v[2][1], 1, t.v[2][0], 0, 0, 1, t.v[2][2], 1, t.v[2][1],
1, t.v[2][0],
);
let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
let rnd = _mm256_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
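// Fixed-point trick: each 32-bit lane holds a packed (r, g) or (b, round) i16
// pair, and `m0`/`m2` hold matching coefficient pairs, so `_mm256_madd_epi16`
// accumulates the red+green and blue+rounding contributions of the 3x3 matrix
// in just two instructions per vector. The odd i16 lanes of `m2` are 1 and
// `rnd` carries 2^(PRECISION - 1) in the high half of each lane, which folds
// the rounding add into the same madd before the arithmetic shift by PRECISION.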
let zeros = _mm256_setzero_si256();
let v_max_value = _mm256_set1_epi32(GAMMA_LUT as i32 - 1);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
let (mut r2, mut g2, mut b2, mut a2);
let (mut r3, mut g3, mut b3, mut a3);
if !src_chunks.is_empty() {
let (src0, src1) = src_chunks.split_at(src_chunks.len() / 2);
let (dst0, dst1) = dst_chunks.split_at_mut(dst_chunks.len() / 2);
let mut src_iter0 = src0.chunks_exact(src_channels * 2);
let mut src_iter1 = src1.chunks_exact(src_channels * 2);
if let (Some(src0), Some(src1)) = (src_iter0.next(), src_iter1.next()) {
r0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.r_i()]._as_usize()]);
g0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.g_i()]._as_usize()]);
b0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.b_i()]._as_usize()]);
r1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()],
);
r2 = _xmm_broadcast_epi32(&self.profile.linear[src1[src_cn.r_i()]._as_usize()]);
g2 = _xmm_broadcast_epi32(&self.profile.linear[src1[src_cn.g_i()]._as_usize()]);
b2 = _xmm_broadcast_epi32(&self.profile.linear[src1[src_cn.b_i()]._as_usize()]);
r3 = _xmm_broadcast_epi32(
&self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()],
);
g3 = _xmm_broadcast_epi32(
&self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()],
);
b3 = _xmm_broadcast_epi32(
&self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()],
);
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = _mm_setzero_si128();
g0 = _mm_setzero_si128();
b0 = _mm_setzero_si128();
a0 = max_colors;
r1 = _mm_setzero_si128();
g1 = _mm_setzero_si128();
b1 = _mm_setzero_si128();
a1 = max_colors;
r2 = _mm_setzero_si128();
g2 = _mm_setzero_si128();
b2 = _mm_setzero_si128();
a2 = max_colors;
r3 = _mm_setzero_si128();
g3 = _mm_setzero_si128();
b3 = _mm_setzero_si128();
a3 = max_colors;
}
for (((src0, src1), dst0), dst1) in src_iter0
.zip(src_iter1)
.zip(dst0.chunks_exact_mut(dst_channels * 2))
.zip(dst1.chunks_exact_mut(dst_channels * 2))
{
let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
zg0 = _mm256_slli_epi32::<16>(zg0);
let zr1 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r2), r3);
let mut zg1 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g2), g3);
let zb1 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b2), b3);
zg1 = _mm256_slli_epi32::<16>(zg1);
let zrg0 = _mm256_or_si256(zr0, zg0);
let zbz0 = _mm256_or_si256(zb0, rnd);
let zrg1 = _mm256_or_si256(zr1, zg1);
let zbz1 = _mm256_or_si256(zb1, rnd);
let va0 = _mm256_madd_epi16(zrg0, m0);
let va1 = _mm256_madd_epi16(zbz0, m2);
let va2 = _mm256_madd_epi16(zrg1, m0);
let va3 = _mm256_madd_epi16(zbz1, m2);
let mut v0 = _mm256_add_epi32(va0, va1);
let mut v1 = _mm256_add_epi32(va2, va3);
v0 = _mm256_srai_epi32::<PRECISION>(v0);
v0 = _mm256_max_epi32(v0, zeros);
v0 = _mm256_min_epi32(v0, v_max_value);
v1 = _mm256_srai_epi32::<PRECISION>(v1);
v1 = _mm256_max_epi32(v1, zeros);
v1 = _mm256_min_epi32(v1, v_max_value);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
_mm256_store_si256(temporary1.0.as_mut_ptr() as *mut _, v1);
r0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.r_i()]._as_usize()]);
g0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.g_i()]._as_usize()]);
b0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.b_i()]._as_usize()]);
r1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()],
);
g1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()],
);
b1 = _xmm_broadcast_epi32(
&self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()],
);
r2 = _xmm_broadcast_epi32(&self.profile.linear[src1[src_cn.r_i()]._as_usize()]);
g2 = _xmm_broadcast_epi32(&self.profile.linear[src1[src_cn.g_i()]._as_usize()]);
b2 = _xmm_broadcast_epi32(&self.profile.linear[src1[src_cn.b_i()]._as_usize()]);
r3 = _xmm_broadcast_epi32(
&self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()],
);
g3 = _xmm_broadcast_epi32(
&self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()],
);
b3 = _xmm_broadcast_epi32(
&self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()],
);
dst0[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary0.0[8] as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary0.0[10] as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[temporary1.0[0] as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[temporary1.0[2] as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[temporary1.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary1.0[8] as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary1.0[10] as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary1.0[12] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let (Some(dst0), Some(dst1)) = (
dst0.chunks_exact_mut(dst_channels * 2).last(),
dst1.chunks_exact_mut(dst_channels * 2).last(),
) {
let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
zg0 = _mm256_slli_epi32::<16>(zg0);
let zr1 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r2), r3);
let mut zg1 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g2), g3);
let zb1 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b2), b3);
zg1 = _mm256_slli_epi32::<16>(zg1);
let zrg0 = _mm256_or_si256(zr0, zg0);
let zbz0 = _mm256_or_si256(zb0, rnd);
let zrg1 = _mm256_or_si256(zr1, zg1);
let zbz1 = _mm256_or_si256(zb1, rnd);
let va0 = _mm256_madd_epi16(zrg0, m0);
let va1 = _mm256_madd_epi16(zbz0, m2);
let va2 = _mm256_madd_epi16(zrg1, m0);
let va3 = _mm256_madd_epi16(zbz1, m2);
let mut v0 = _mm256_add_epi32(va0, va1);
let mut v1 = _mm256_add_epi32(va2, va3);
v0 = _mm256_srai_epi32::<PRECISION>(v0);
v0 = _mm256_max_epi32(v0, zeros);
v0 = _mm256_min_epi32(v0, v_max_value);
v1 = _mm256_srai_epi32::<PRECISION>(v1);
v1 = _mm256_max_epi32(v1, zeros);
v1 = _mm256_min_epi32(v1, v_max_value);
_mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
_mm256_store_si256(temporary1.0.as_mut_ptr() as *mut _, v1);
dst0[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary0.0[8] as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary0.0[10] as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary0.0[12] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[temporary1.0[0] as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[temporary1.0[2] as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[temporary1.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary1.0[8] as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary1.0[10] as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary1.0[12] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
}
}
for (src, dst) in src_remainder
.chunks_exact(src_channels)
.zip(dst_remainder.chunks_exact_mut(dst_channels))
{
let r = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
let mut g =
_xmm_broadcast_epi32(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
let b = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
g = _mm_slli_epi32::<16>(g);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let zrg0 = _mm_or_si128(r, g);
let zbz0 = _mm_or_si128(b, _mm256_castsi256_si128(rnd));
let v0 = _mm_madd_epi16(zrg0, _mm256_castsi256_si128(m0));
let v1 = _mm_madd_epi16(zbz0, _mm256_castsi256_si128(m2));
let mut v = _mm_add_epi32(v0, v1);
v = _mm_srai_epi32::<PRECISION>(v);
v = _mm_max_epi32(v, _mm_setzero_si128());
v = _mm_min_epi32(v, _mm256_castsi256_si128(v_max_value));
_mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, v);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformShaperRgbQ2_13OptAvx512<
T,
SRC_LAYOUT,
DST_LAYOUT,
LINEAR_CAP,
GAMMA_LUT,
PRECISION,
>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_avx512(src, dst) }
}
}

121
vendor/moxcms/src/conversions/bpc.rs vendored Normal file
@@ -0,0 +1,121 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
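// Black point compensation: the implementation below is currently disabled
// (commented out).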
//
// use crate::conversions::interpolator::{MultidimensionalInterpolation, Tetrahedral};
// use crate::conversions::transform_lut4_to_4::{NonFiniteVector3fLerp, Vector3fCmykLerp};
// use crate::mlaf::mlaf;
// use crate::{Chromaticity, ColorProfile, DataColorSpace, Lab, Xyz};
//
// impl ColorProfile {
// #[inline]
// pub(crate) fn detect_black_point<const GRID_SIZE: usize>(&self, lut: &[f32]) -> Option<Xyz> {
// if self.color_space == DataColorSpace::Cmyk {
// // if let Some(mut bp) = self.black_point {
// // if let Some(wp) = self.media_white_point.map(|x| x.normalize()) {
// // if wp != Chromaticity::D50.to_xyz() {
// // let ad = adaption_matrix(wp, Chromaticity::D50.to_xyz());
// // let v = ad.mul_vector(bp.to_vector());
// // bp = Xyz {
// // x: v.v[0],
// // y: v.v[1],
// // z: v.v[2],
// // };
// // }
// // }
// // let mut lab = Lab::from_xyz(bp);
// // lab.a = 0.;
// // lab.b = 0.;
// // if lab.l > 50. {
// // lab.l = 50.;
// // }
// // bp = lab.to_xyz();
// // return Some(bp);
// // }
// let c = 65535;
// let m = 65535;
// let y = 65535;
// let k = 65535;
//
// let linear_k: f32 = k as f32 * (1. / 65535.);
// let w: i32 = k * (GRID_SIZE as i32 - 1) / 65535;
// let w_n: i32 = (w + 1).min(GRID_SIZE as i32 - 1);
// let t: f32 = linear_k * (GRID_SIZE as i32 - 1) as f32 - w as f32;
//
// let grid_size = GRID_SIZE as i32;
// let grid_size3 = grid_size * grid_size * grid_size;
//
// let table1 = &lut[(w * grid_size3 * 3) as usize..];
// let table2 = &lut[(w_n * grid_size3 * 3) as usize..];
//
// let tetrahedral1 = Tetrahedral::<GRID_SIZE>::new(table1);
// let tetrahedral2 = Tetrahedral::<GRID_SIZE>::new(table2);
// let r1 = tetrahedral1.inter3(c, m, y);
// let r2 = tetrahedral2.inter3(c, m, y);
// let r = NonFiniteVector3fLerp::interpolate(r1, r2, t, 1.0);
//
// let mut lab = Lab::from_xyz(Xyz {
// x: r.v[0],
// y: r.v[1],
// z: r.v[2],
// });
// lab.a = 0.;
// lab.b = 0.;
// if lab.l > 50. {
// lab.l = 50.;
// }
// let bp = lab.to_xyz();
//
// return Some(bp);
// }
// if self.color_space == DataColorSpace::Rgb {
// return Some(Xyz::new(0.0, 0.0, 0.0));
// }
// None
// }
// }
//
// pub(crate) fn compensate_bpc_in_lut(lut_xyz: &mut [f32], src_bp: Xyz, dst_bp: Xyz) {
// const WP_50: Xyz = Chromaticity::D50.to_xyz();
// let tx = src_bp.x - WP_50.x;
// let ty = src_bp.y - WP_50.y;
// let tz = src_bp.z - WP_50.z;
// let ax = (dst_bp.x - WP_50.x) / tx;
// let ay = (dst_bp.y - WP_50.y) / ty;
// let az = (dst_bp.z - WP_50.z) / tz;
//
// let bx = -WP_50.x * (dst_bp.x - src_bp.x) / tx;
// let by = -WP_50.y * (dst_bp.y - src_bp.y) / ty;
// let bz = -WP_50.z * (dst_bp.z - src_bp.z) / tz;
//
// for dst in lut_xyz.chunks_exact_mut(3) {
// dst[0] = mlaf(bx, dst[0], ax);
// dst[1] = mlaf(by, dst[1], ay);
// dst[2] = mlaf(bz, dst[2], az);
// }
// }

@@ -0,0 +1,416 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
#[derive(Clone)]
struct TransformGray2RgbFusedExecutor<
T,
const SRC_LAYOUT: u8,
const DEST_LAYOUT: u8,
const BUCKET: usize,
> {
fused_gamma: Box<[T; BUCKET]>,
bit_depth: usize,
}
pub(crate) fn make_gray_to_x<
T: Copy + Default + PointeeSizeExpressible + 'static + Send + Sync,
const BUCKET: usize,
>(
src_layout: Layout,
dst_layout: Layout,
gray_linear: &[f32; BUCKET],
gray_gamma: &[T; 65536],
bit_depth: usize,
gamma_lut: usize,
) -> Result<Box<dyn TransformExecutor<T> + Sync + Send>, CmsError>
where
u32: AsPrimitive<T>,
{
if src_layout != Layout::Gray && src_layout != Layout::GrayAlpha {
return Err(CmsError::UnsupportedProfileConnection);
}
let mut fused_gamma = Box::new([T::default(); BUCKET]);
let max_lut_size = (gamma_lut - 1) as f32;
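// Compose the linear (decoding) curve with the destination gamma table once,
// so the per-pixel work in the executor is a single fused lookup.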
for (&src, dst) in gray_linear.iter().zip(fused_gamma.iter_mut()) {
let possible_value = ((src * max_lut_size).round() as u32).min(max_lut_size as u32) as u16;
*dst = gray_gamma[possible_value as usize];
}
match src_layout {
Layout::Gray => match dst_layout {
Layout::Rgb => Ok(Box::new(TransformGray2RgbFusedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgb as u8 },
BUCKET,
> {
fused_gamma,
bit_depth,
})),
Layout::Rgba => Ok(Box::new(TransformGray2RgbFusedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgba as u8 },
BUCKET,
> {
fused_gamma,
bit_depth,
})),
Layout::Gray => Ok(Box::new(TransformGray2RgbFusedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Gray as u8 },
BUCKET,
> {
fused_gamma,
bit_depth,
})),
Layout::GrayAlpha => Ok(Box::new(TransformGray2RgbFusedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::GrayAlpha as u8 },
BUCKET,
> {
fused_gamma,
bit_depth,
})),
_ => unreachable!(),
},
Layout::GrayAlpha => match dst_layout {
Layout::Rgb => Ok(Box::new(TransformGray2RgbFusedExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Rgb as u8 },
BUCKET,
> {
fused_gamma,
bit_depth,
})),
Layout::Rgba => Ok(Box::new(TransformGray2RgbFusedExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Rgba as u8 },
BUCKET,
> {
fused_gamma,
bit_depth,
})),
Layout::Gray => Ok(Box::new(TransformGray2RgbFusedExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Gray as u8 },
BUCKET,
> {
fused_gamma,
bit_depth,
})),
Layout::GrayAlpha => Ok(Box::new(TransformGray2RgbFusedExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::GrayAlpha as u8 },
BUCKET,
> {
fused_gamma,
bit_depth,
})),
_ => unreachable!(),
},
_ => Err(CmsError::UnsupportedProfileConnection),
}
}
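// A minimal usage sketch (hypothetical 8-bit tables; names and values are
// illustrative, not from the crate):
//
//     let gray_linear: [f32; 256] = core::array::from_fn(|i| i as f32 / 255.0);
//     let gray_gamma: Box<[u8; 65536]> = Box::new([0u8; 65536]);
//     let exec = make_gray_to_x::<u8, 256>(
//         Layout::Gray, Layout::Rgb, &gray_linear, &gray_gamma, 8, 65536,
//     )?;
//     // exec.transform(&gray_pixels, &mut rgb_pixels)?;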
impl<
T: Copy + Default + PointeeSizeExpressible + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const BUCKET: usize,
> TransformExecutor<T> for TransformGray2RgbFusedExecutor<T, SRC_LAYOUT, DST_LAYOUT, BUCKET>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let is_gray_alpha = src_cn == Layout::GrayAlpha;
let max_value: T = ((1u32 << self.bit_depth as u32) - 1u32).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let g = self.fused_gamma[src[0]._as_usize()];
let a = if is_gray_alpha { src[1] } else { max_value };
dst[0] = g;
if dst_cn == Layout::GrayAlpha {
dst[1] = a;
} else if dst_cn == Layout::Rgb {
dst[1] = g;
dst[2] = g;
} else if dst_cn == Layout::Rgba {
dst[1] = g;
dst[2] = g;
dst[3] = a;
}
}
Ok(())
}
}
#[derive(Clone)]
struct TransformGrayToRgbExecutor<
T,
const SRC_LAYOUT: u8,
const DEST_LAYOUT: u8,
const BUCKET: usize,
> {
gray_linear: Box<[f32; BUCKET]>,
red_gamma: Box<[T; 65536]>,
green_gamma: Box<[T; 65536]>,
blue_gamma: Box<[T; 65536]>,
bit_depth: usize,
gamma_lut: usize,
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn make_gray_to_unfused<
T: Copy + Default + PointeeSizeExpressible + 'static + Send + Sync,
const BUCKET: usize,
>(
src_layout: Layout,
dst_layout: Layout,
gray_linear: Box<[f32; BUCKET]>,
red_gamma: Box<[T; 65536]>,
green_gamma: Box<[T; 65536]>,
blue_gamma: Box<[T; 65536]>,
bit_depth: usize,
gamma_lut: usize,
) -> Result<Box<dyn TransformExecutor<T> + Sync + Send>, CmsError>
where
u32: AsPrimitive<T>,
{
if src_layout != Layout::Gray && src_layout != Layout::GrayAlpha {
return Err(CmsError::UnsupportedProfileConnection);
}
if dst_layout != Layout::Rgb && dst_layout != Layout::Rgba {
return Err(CmsError::UnsupportedProfileConnection);
}
match src_layout {
Layout::Gray => match dst_layout {
Layout::Rgb => Ok(Box::new(TransformGrayToRgbExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgb as u8 },
BUCKET,
> {
gray_linear,
red_gamma,
green_gamma,
blue_gamma,
bit_depth,
gamma_lut,
})),
Layout::Rgba => Ok(Box::new(TransformGrayToRgbExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgba as u8 },
BUCKET,
> {
gray_linear,
red_gamma,
green_gamma,
blue_gamma,
bit_depth,
gamma_lut,
})),
Layout::Gray => Ok(Box::new(TransformGrayToRgbExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Gray as u8 },
BUCKET,
> {
gray_linear,
red_gamma,
green_gamma,
blue_gamma,
bit_depth,
gamma_lut,
})),
Layout::GrayAlpha => Ok(Box::new(TransformGrayToRgbExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::GrayAlpha as u8 },
BUCKET,
> {
gray_linear,
red_gamma,
green_gamma,
blue_gamma,
bit_depth,
gamma_lut,
})),
_ => Err(CmsError::UnsupportedProfileConnection),
},
Layout::GrayAlpha => match dst_layout {
Layout::Rgb => Ok(Box::new(TransformGrayToRgbExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Rgb as u8 },
BUCKET,
> {
gray_linear,
red_gamma,
green_gamma,
blue_gamma,
bit_depth,
gamma_lut,
})),
Layout::Rgba => Ok(Box::new(TransformGrayToRgbExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Rgba as u8 },
BUCKET,
> {
gray_linear,
red_gamma,
green_gamma,
blue_gamma,
bit_depth,
gamma_lut,
})),
Layout::Gray => Ok(Box::new(TransformGrayToRgbExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Gray as u8 },
BUCKET,
> {
gray_linear,
red_gamma,
green_gamma,
blue_gamma,
bit_depth,
gamma_lut,
})),
Layout::GrayAlpha => Ok(Box::new(TransformGrayToRgbExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::GrayAlpha as u8 },
BUCKET,
> {
gray_linear,
red_gamma,
green_gamma,
blue_gamma,
bit_depth,
gamma_lut,
})),
_ => Err(CmsError::UnsupportedProfileConnection),
},
_ => Err(CmsError::UnsupportedProfileConnection),
}
}
impl<
T: Copy + Default + PointeeSizeExpressible + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const BUCKET: usize,
> TransformExecutor<T> for TransformGrayToRgbExecutor<T, SRC_LAYOUT, DST_LAYOUT, BUCKET>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let is_gray_alpha = src_cn == Layout::GrayAlpha;
let max_value: T = ((1u32 << self.bit_depth as u32) - 1u32).as_();
let max_lut_size = (self.gamma_lut - 1) as f32;
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let g = self.gray_linear[src[0]._as_usize()];
let a = if is_gray_alpha { src[1] } else { max_value };
let possible_value = (((g * max_lut_size).round() as u32).min(max_lut_size as u32)) as usize;
let red_value = self.red_gamma[possible_value];
let green_value = self.green_gamma[possible_value];
let blue_value = self.blue_gamma[possible_value];
if dst_cn == Layout::Rgb {
dst[0] = red_value;
dst[1] = green_value;
dst[2] = blue_value;
} else if dst_cn == Layout::Rgba {
dst[0] = red_value;
dst[1] = green_value;
dst[2] = blue_value;
dst[3] = a;
} else {
return Err(CmsError::UnsupportedProfileConnection);
}
}
Ok(())
}
}

@@ -0,0 +1,383 @@
/*
* // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::transform::PointeeSizeExpressible;
use crate::trc::ToneCurveEvaluator;
use crate::{CmsError, Layout, Rgb, TransformExecutor};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
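// Unlike the LUT-based gray executors, the executors below evaluate the tone
// curves per pixel through `ToneCurveEvaluator` trait objects: more work per
// sample, but no fixed-size table, which is presumably what the "extended"
// naming refers to (extended-range, e.g. float, inputs).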
struct TransformGrayOneToOneExecutor<T, const SRC_LAYOUT: u8, const DEST_LAYOUT: u8> {
linear_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
gamma_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
_phantom: PhantomData<T>,
bit_depth: usize,
}
pub(crate) fn make_gray_to_one_trc_extended<
T: Copy + Default + PointeeSizeExpressible + 'static + Send + Sync + AsPrimitive<f32>,
>(
src_layout: Layout,
dst_layout: Layout,
linear_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
gamma_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
bit_depth: usize,
) -> Result<Box<dyn TransformExecutor<T> + Sync + Send>, CmsError>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
if src_layout != Layout::Gray && src_layout != Layout::GrayAlpha {
return Err(CmsError::UnsupportedProfileConnection);
}
match src_layout {
Layout::Gray => match dst_layout {
Layout::Rgb => Ok(Box::new(TransformGrayOneToOneExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgb as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::Rgba => Ok(Box::new(TransformGrayOneToOneExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgba as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::Gray => Ok(Box::new(TransformGrayOneToOneExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Gray as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::GrayAlpha => Ok(Box::new(TransformGrayOneToOneExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::GrayAlpha as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
_ => unreachable!(),
},
Layout::GrayAlpha => match dst_layout {
Layout::Rgb => Ok(Box::new(TransformGrayOneToOneExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Rgb as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::Rgba => Ok(Box::new(TransformGrayOneToOneExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Rgba as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::Gray => Ok(Box::new(TransformGrayOneToOneExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Gray as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::GrayAlpha => Ok(Box::new(TransformGrayOneToOneExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::GrayAlpha as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
_ => unreachable!(),
},
_ => Err(CmsError::UnsupportedProfileConnection),
}
}
impl<
T: Copy + Default + PointeeSizeExpressible + 'static + AsPrimitive<f32>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
> TransformExecutor<T> for TransformGrayOneToOneExecutor<T, SRC_LAYOUT, DST_LAYOUT>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let is_gray_alpha = src_cn == Layout::GrayAlpha;
let max_value: T = ((1u32 << self.bit_depth as u32) - 1u32).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let linear_value = self.linear_eval.evaluate_value(src[0].as_());
let g = self.gamma_eval.evaluate_value(linear_value).as_();
let a = if is_gray_alpha { src[1] } else { max_value };
dst[0] = g;
if dst_cn == Layout::GrayAlpha {
dst[1] = a;
} else if dst_cn == Layout::Rgb {
dst[1] = g;
dst[2] = g;
} else if dst_cn == Layout::Rgba {
dst[1] = g;
dst[2] = g;
dst[3] = a;
}
}
Ok(())
}
}
struct TransformGrayToRgbExtendedExecutor<T, const SRC_LAYOUT: u8, const DEST_LAYOUT: u8> {
linear_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
gamma_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
_phantom: PhantomData<T>,
bit_depth: usize,
}
pub(crate) fn make_gray_to_rgb_extended<
T: Copy + Default + PointeeSizeExpressible + 'static + Send + Sync + AsPrimitive<f32>,
>(
src_layout: Layout,
dst_layout: Layout,
linear_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
gamma_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
bit_depth: usize,
) -> Result<Box<dyn TransformExecutor<T> + Sync + Send>, CmsError>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
if src_layout != Layout::Gray && src_layout != Layout::GrayAlpha {
return Err(CmsError::UnsupportedProfileConnection);
}
if dst_layout != Layout::Rgb && dst_layout != Layout::Rgba {
return Err(CmsError::UnsupportedProfileConnection);
}
match src_layout {
Layout::Gray => match dst_layout {
Layout::Rgb => Ok(Box::new(TransformGrayToRgbExtendedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgb as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::Rgba => Ok(Box::new(TransformGrayToRgbExtendedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgba as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::Gray => Ok(Box::new(TransformGrayToRgbExtendedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Gray as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::GrayAlpha => Ok(Box::new(TransformGrayToRgbExtendedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::GrayAlpha as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
_ => Err(CmsError::UnsupportedProfileConnection),
},
Layout::GrayAlpha => match dst_layout {
Layout::Rgb => Ok(Box::new(TransformGrayToRgbExtendedExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::Rgb as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::Rgba => Ok(Box::new(TransformGrayToRgbExtendedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::Rgba as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::Gray => Ok(Box::new(TransformGrayToRgbExtendedExecutor::<
T,
{ Layout::Gray as u8 },
{ Layout::GrayAlpha as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
Layout::GrayAlpha => Ok(Box::new(TransformGrayToRgbExtendedExecutor::<
T,
{ Layout::GrayAlpha as u8 },
{ Layout::GrayAlpha as u8 },
> {
linear_eval,
gamma_eval,
_phantom: PhantomData,
bit_depth,
})),
_ => Err(CmsError::UnsupportedProfileConnection),
},
_ => Err(CmsError::UnsupportedProfileConnection),
}
}
impl<
T: Copy + Default + PointeeSizeExpressible + 'static + AsPrimitive<f32>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
> TransformExecutor<T> for TransformGrayToRgbExtendedExecutor<T, SRC_LAYOUT, DST_LAYOUT>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let is_gray_alpha = src_cn == Layout::GrayAlpha;
let max_value: T = ((1u32 << self.bit_depth as u32) - 1u32).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let linear_value = self.linear_eval.evaluate_value(src[0].as_());
let a = if is_gray_alpha { src[1] } else { max_value };
let tristimulus = self.gamma_eval.evaluate_tristimulus(Rgb::new(
linear_value,
linear_value,
linear_value,
));
let red_value = tristimulus.r.as_();
let green_value = tristimulus.g.as_();
let blue_value = tristimulus.b.as_();
if dst_cn == Layout::Rgb {
dst[0] = red_value;
dst[1] = green_value;
dst[2] = blue_value;
} else if dst_cn == Layout::Rgba {
dst[0] = red_value;
dst[1] = green_value;
dst[2] = blue_value;
dst[3] = a;
} else {
return Err(CmsError::UnsupportedProfileConnection);
}
}
Ok(())
}
}

View File

@@ -0,0 +1,645 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#![allow(dead_code)]
use crate::conversions::lut_transforms::LUT_SAMPLING;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use crate::{Vector3f, Vector4f};
use num_traits::AsPrimitive;
use std::ops::{Add, Mul, Sub};
#[cfg(feature = "options")]
pub(crate) struct Tetrahedral<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [f32],
}
#[cfg(feature = "options")]
pub(crate) struct Pyramidal<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [f32],
}
#[cfg(feature = "options")]
pub(crate) struct Prismatic<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [f32],
}
pub(crate) struct Trilinear<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [f32],
}
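/// Precomputed lattice lookup for one input channel: `x` is the lower grid
/// index, `x_n` the clamped upper neighbour, and `w` the fractional weight
/// between them (`f32`, or Q0.15 fixed point for the `i16` variant).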
#[derive(Debug, Copy, Clone, Default)]
pub(crate) struct BarycentricWeight<V> {
pub x: i32,
pub x_n: i32,
pub w: V,
}
impl BarycentricWeight<f32> {
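/// Builds the 256-entry table for 8-bit inputs: entry `i` maps code value
/// `i` onto a `GRID_SIZE`-point lattice (assuming `LUT_SAMPLING` is the
/// maximum input code value).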
pub(crate) fn create_ranged_256<const GRID_SIZE: usize>() -> Box<[BarycentricWeight<f32>; 256]>
{
let mut weights = Box::new([BarycentricWeight::default(); 256]);
for (index, weight) in weights.iter_mut().enumerate() {
const SCALE: f32 = 1.0 / LUT_SAMPLING as f32;
let x: i32 = index as i32 * (GRID_SIZE as i32 - 1) / LUT_SAMPLING as i32;
let x_n: i32 = (x + 1).min(GRID_SIZE as i32 - 1);
let scale = (GRID_SIZE as i32 - 1) as f32 * SCALE;
let dr = index as f32 * scale - x as f32;
*weight = BarycentricWeight { x, x_n, w: dr };
}
weights
}
#[cfg(feature = "options")]
pub(crate) fn create_binned<const GRID_SIZE: usize, const BINS: usize>()
-> Box<[BarycentricWeight<f32>; 65536]> {
let mut weights = Box::new([BarycentricWeight::<f32>::default(); 65536]);
let b_scale: f32 = 1.0 / (BINS - 1) as f32;
for (index, weight) in weights.iter_mut().enumerate().take(BINS) {
let x: i32 = (index as f32 * (GRID_SIZE as i32 - 1) as f32 * b_scale).floor() as i32;
let x_n: i32 = (x + 1).min(GRID_SIZE as i32 - 1);
let scale = (GRID_SIZE as i32 - 1) as f32 * b_scale;
let dr = index as f32 * scale - x as f32;
*weight = BarycentricWeight { x, x_n, w: dr };
}
weights
}
}
#[allow(dead_code)]
impl BarycentricWeight<i16> {
pub(crate) fn create_ranged_256<const GRID_SIZE: usize>() -> Box<[BarycentricWeight<i16>; 256]>
{
let mut weights = Box::new([BarycentricWeight::default(); 256]);
for (index, weight) in weights.iter_mut().enumerate() {
const SCALE: f32 = 1.0 / LUT_SAMPLING as f32;
let x: i32 = index as i32 * (GRID_SIZE as i32 - 1) / LUT_SAMPLING as i32;
let x_n: i32 = (x + 1).min(GRID_SIZE as i32 - 1);
let scale = (GRID_SIZE as i32 - 1) as f32 * SCALE;
const Q: f32 = ((1i32 << 15) - 1) as f32;
let dr = ((index as f32 * scale - x as f32) * Q)
.round()
.min(i16::MAX as f32)
.max(-i16::MAX as f32) as i16;
*weight = BarycentricWeight { x, x_n, w: dr };
}
weights
}
#[cfg(feature = "options")]
pub(crate) fn create_binned<const GRID_SIZE: usize, const BINS: usize>()
-> Box<[BarycentricWeight<i16>; 65536]> {
let mut weights = Box::new([BarycentricWeight::<i16>::default(); 65536]);
let b_scale: f32 = 1.0 / (BINS - 1) as f32;
for (index, weight) in weights.iter_mut().enumerate().take(BINS) {
let x: i32 = (index as f32 * (GRID_SIZE as i32 - 1) as f32 * b_scale).floor() as i32;
let x_n: i32 = (x + 1).min(GRID_SIZE as i32 - 1);
let scale = (GRID_SIZE as i32 - 1) as f32 * b_scale;
const Q: f32 = ((1i32 << 15) - 1) as f32;
let dr = ((index as f32 * scale - x as f32) * Q)
.round()
.min(i16::MAX as f32)
.max(-i16::MAX as f32) as i16;
*weight = BarycentricWeight { x, x_n, w: dr };
}
weights
}
}
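/// Reads a single CLUT lattice node at integer grid coordinates (x, y, z).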
trait Fetcher<T> {
fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
struct TetrahedralFetchVector3f<'a, const GRID_SIZE: usize> {
cube: &'a [f32],
}
pub(crate) trait MultidimensionalInterpolation<'a, const GRID_SIZE: usize> {
fn new(table: &'a [f32]) -> Self;
fn inter3<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> Vector3f;
fn inter4<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> Vector4f;
}
impl<const GRID_SIZE: usize> Fetcher<Vector3f> for TetrahedralFetchVector3f<'_, GRID_SIZE> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32) -> Vector3f {
let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
+ y as u32 * GRID_SIZE as u32
+ z as u32) as usize
* 3;
let jx = &self.cube[offset..offset + 3];
Vector3f {
v: [jx[0], jx[1], jx[2]],
}
}
}
struct TetrahedralFetchVector4f<'a, const GRID_SIZE: usize> {
cube: &'a [f32],
}
impl<const GRID_SIZE: usize> Fetcher<Vector4f> for TetrahedralFetchVector4f<'_, GRID_SIZE> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32) -> Vector4f {
let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
+ y as u32 * GRID_SIZE as u32
+ z as u32) as usize
* 4;
let jx = &self.cube[offset..offset + 4];
Vector4f {
v: [jx[0], jx[1], jx[2], jx[3]],
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> Tetrahedral<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<
T: Copy
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Mul<f32, Output = T>
+ Add<T, Output = T>
+ From<f32>
+ FusedMultiplyAdd<T>,
U: AsPrimitive<usize>,
const BINS: usize,
>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<T>,
) -> T {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
let c0 = r.fetch(x, y, z);
let c2;
let c1;
let c3;
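// The cube is cut into six tetrahedra by the ordering of the fractional
// parts (rx, ry, rz); each branch selects the simplex containing the
// sample and forms the edge differences c1..c3 for barycentric blending.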
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, T::from(rx));
let s1 = s0.mla(c2, T::from(ry));
s1.mla(c3, T::from(rz))
}
}
macro_rules! define_md_inter {
($interpolator: ident) => {
impl<'a, const GRID_SIZE: usize> MultidimensionalInterpolation<'a, GRID_SIZE>
for $interpolator<'a, GRID_SIZE>
{
fn new(table: &'a [f32]) -> Self {
Self { cube: table }
}
fn inter3<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> Vector3f {
self.interpolate::<Vector3f, U, BINS>(
in_r,
in_g,
in_b,
lut,
TetrahedralFetchVector3f::<GRID_SIZE> { cube: self.cube },
)
}
fn inter4<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> Vector4f {
self.interpolate::<Vector4f, U, BINS>(
in_r,
in_g,
in_b,
lut,
TetrahedralFetchVector4f::<GRID_SIZE> { cube: self.cube },
)
}
}
};
}
#[cfg(feature = "options")]
define_md_inter!(Tetrahedral);
#[cfg(feature = "options")]
define_md_inter!(Pyramidal);
#[cfg(feature = "options")]
define_md_inter!(Prismatic);
define_md_inter!(Trilinear);
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> Pyramidal<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<
T: Copy
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Mul<f32, Output = T>
+ Add<T, Output = T>
+ From<f32>
+ FusedMultiplyAdd<T>,
U: AsPrimitive<usize>,
const BINS: usize,
>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<T>,
) -> T {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
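// Pyramid interpolation: the cube is cut into three square pyramids that
// share the main diagonal; the branch is chosen by the smallest of the
// fractional offsets (dr, dg, db).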
if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n);
let x1 = r.fetch(x_n, y_n, z);
let x2 = r.fetch(x_n, y, z);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y_n, z_n);
let x2 = r.fetch(x, y_n, z_n);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dg * db))
} else {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z);
let x2 = r.fetch(x_n, y, z_n);
let x3 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(db * dr))
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> Prismatic<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<
T: Copy
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Mul<f32, Output = T>
+ Add<T, Output = T>
+ From<f32>
+ FusedMultiplyAdd<T>,
U: AsPrimitive<usize>,
const BINS: usize,
>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<T>,
) -> T {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
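// Prism interpolation: the cube is cut into two triangular prisms by the
// diagonal plane db == dr; each branch blends c0 with five neighbouring
// lattice nodes.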
if db >= dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x, y_n, z_n);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
} else {
let x0 = r.fetch(x_n, y, z);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x_n, y_n, z);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
}
}
}
impl<const GRID_SIZE: usize> Trilinear<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<
T: Copy
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Mul<f32, Output = T>
+ Add<T, Output = T>
+ From<f32>
+ FusedMultiplyAdd<T>
+ FusedMultiplyNegAdd<T>,
U: AsPrimitive<usize>,
const BINS: usize,
>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<T>,
) -> T {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let w0 = T::from(dr);
let w1 = T::from(dg);
let w2 = T::from(db);
let c000 = r.fetch(x, y, z);
let c100 = r.fetch(x_n, y, z);
let c010 = r.fetch(x, y_n, z);
let c110 = r.fetch(x_n, y_n, z);
let c001 = r.fetch(x, y, z_n);
let c101 = r.fetch(x_n, y, z_n);
let c011 = r.fetch(x, y_n, z_n);
let c111 = r.fetch(x_n, y_n, z_n);
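// `neg_mla(a, b)` is assumed to compute `self - a * b`, so
// `c.neg_mla(c, t).mla(d, t)` evaluates the lerp c * (1 - t) + d * t.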
let c00 = c000.neg_mla(c000, w0).mla(c100, w0);
let c10 = c010.neg_mla(c010, w0).mla(c110, w0);
let c01 = c001.neg_mla(c001, w0).mla(c101, w0);
let c11 = c011.neg_mla(c011, w0).mla(c111, w0);
let c0 = c00.neg_mla(c00, w1).mla(c10, w1);
let c1 = c01.neg_mla(c01, w1).mla(c11, w1);
c0.neg_mla(c0, w2).mla(c1, w2)
}
}
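/// Maps a source sample of type `T` (an integer with `SRC_BP` significant
/// bits, or a normalized float) onto the `BINS`-wide index domain consumed
/// by the barycentric weight tables above.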
pub(crate) trait LutBarycentricReduction<T, U> {
fn reduce<const SRC_BP: usize, const BINS: usize>(v: T) -> U;
}
impl LutBarycentricReduction<u8, u8> for () {
#[inline(always)]
fn reduce<const SRC_BP: usize, const BINS: usize>(v: u8) -> u8 {
v
}
}
impl LutBarycentricReduction<u8, u16> for () {
#[inline(always)]
fn reduce<const SRC_BP: usize, const BINS: usize>(v: u8) -> u16 {
if BINS == 65536 {
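// Both bytes are equal, so this is endian-independent and equals v * 257,
// widening full-range 8-bit to full-range 16-bit.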
return u16::from_ne_bytes([v, v]);
}
if BINS == 16384 {
return u16::from_ne_bytes([v, v]) >> 2;
}
unimplemented!()
}
}
impl LutBarycentricReduction<f32, u8> for () {
#[inline(always)]
fn reduce<const SRC_BP: usize, const BINS: usize>(v: f32) -> u8 {
(v * 255.).round().min(255.).max(0.) as u8
}
}
impl LutBarycentricReduction<f32, u16> for () {
#[inline(always)]
fn reduce<const SRC_BP: usize, const BINS: usize>(v: f32) -> u16 {
let scale = (BINS - 1) as f32;
(v * scale).round().min(scale).max(0.) as u16
}
}
impl LutBarycentricReduction<f64, u8> for () {
#[inline(always)]
fn reduce<const SRC_BP: usize, const BINS: usize>(v: f64) -> u8 {
(v * 255.).round().min(255.).max(0.) as u8
}
}
impl LutBarycentricReduction<f64, u16> for () {
#[inline(always)]
fn reduce<const SRC_BP: usize, const BINS: usize>(v: f64) -> u16 {
let scale = (BINS - 1) as f64;
(v * scale).round().min(scale).max(0.) as u16
}
}
impl LutBarycentricReduction<u16, u16> for () {
#[inline(always)]
fn reduce<const SRC_BP: usize, const BINS: usize>(v: u16) -> u16 {
let src_scale = 1. / ((1 << SRC_BP) - 1) as f32;
let scale = src_scale * (BINS - 1) as f32;
// Clamp to the top bin index; clamping to `scale` would collapse the range.
(v as f32 * scale).round().min((BINS - 1) as f32).max(0.) as u16
}
}
impl LutBarycentricReduction<u16, u8> for () {
#[inline(always)]
fn reduce<const SRC_BP: usize, const BINS: usize>(v: u16) -> u8 {
let shift = SRC_BP as u16 - 8;
if SRC_BP == 16 {
(v >> 8) as u8
} else {
(v >> shift).min(255) as u8
}
}
}

View File

@@ -0,0 +1,118 @@
/*
* // Copyright (c) Radzivon Bartoshyk 8/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::KatanaPostFinalizationStage;
use crate::{CmsError, DataColorSpace, Layout, PointeeSizeExpressible};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
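/// Post-finalization stage that writes an opaque alpha value into the
/// destination when the output layout carries an alpha channel the pipeline
/// itself did not produce.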
pub(crate) struct InjectAlphaStage<I> {
pub(crate) dst_layout: Layout,
pub(crate) target_color_space: DataColorSpace,
pub(crate) _phantom: PhantomData<I>,
pub(crate) bit_depth: usize,
}
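/// Post-finalization stage that forwards the source alpha channel to the
/// destination when both layouts carry alpha.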
pub(crate) struct CopyAlphaStage<I> {
pub(crate) src_layout: Layout,
pub(crate) dst_layout: Layout,
pub(crate) target_color_space: DataColorSpace,
pub(crate) _phantom: PhantomData<I>,
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
KatanaPostFinalizationStage<T> for InjectAlphaStage<T>
where
f32: AsPrimitive<T>,
{
fn finalize(&self, _: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let norm_value: T = (if T::FINITE {
((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
})
.as_();
if self.dst_layout == Layout::Rgba && self.target_color_space == DataColorSpace::Rgb {
for dst in dst.chunks_exact_mut(self.dst_layout.channels()) {
dst[3] = norm_value;
}
} else if self.dst_layout == Layout::GrayAlpha
&& self.target_color_space == DataColorSpace::Gray
{
for dst in dst.chunks_exact_mut(self.dst_layout.channels()) {
dst[1] = norm_value;
}
}
Ok(())
}
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
KatanaPostFinalizationStage<T> for CopyAlphaStage<T>
where
f32: AsPrimitive<T>,
{
fn finalize(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
if self.dst_layout == Layout::Rgba && self.target_color_space == DataColorSpace::Rgb {
if self.src_layout == Layout::Rgba {
for (src, dst) in src
.chunks_exact(self.src_layout.channels())
.zip(dst.chunks_exact_mut(self.dst_layout.channels()))
{
dst[3] = src[3];
}
} else if self.src_layout == Layout::GrayAlpha {
for (src, dst) in src
.chunks_exact(self.src_layout.channels())
.zip(dst.chunks_exact_mut(self.dst_layout.channels()))
{
dst[3] = src[1];
}
}
} else if self.dst_layout == Layout::GrayAlpha
&& self.target_color_space == DataColorSpace::Gray
{
if self.src_layout == Layout::Rgba {
for (src, dst) in src
.chunks_exact(self.src_layout.channels())
.zip(dst.chunks_exact_mut(self.dst_layout.channels()))
{
dst[1] = src[3];
}
} else if self.src_layout == Layout::GrayAlpha {
for (src, dst) in src
.chunks_exact(self.src_layout.channels())
.zip(dst.chunks_exact_mut(self.dst_layout.channels()))
{
dst[1] = src[1];
}
}
}
Ok(())
}
}

View File

@@ -0,0 +1,483 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::{KatanaFinalStage, KatanaInitialStage};
use crate::mlaf::mlaf;
use crate::safe_math::SafeMul;
use crate::trc::lut_interp_linear_float;
use crate::{
CmsError, Cube, DataColorSpace, InterpolationMethod, LutMultidimensionalType, MalformedSize,
Matrix3d, Matrix3f, PointeeSizeExpressible, TransformOptions, Vector3d, Vector3f,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)]
pub(crate) enum MultidimensionalDirection {
DeviceToPcs,
PcsToDevice,
}
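/// Katana stage for a 3-in/3-out multidimensional (A/M/B curves + CLUT)
/// element. Device-to-PCS order: A-curves -> CLUT -> M-curves -> matrix ->
/// B-curves; the PCS-to-device direction runs the stages in reverse.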
struct Multidimensional3x3<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
> {
a_curves: Option<Box<[Vec<f32>; 3]>>,
m_curves: Option<Box<[Vec<f32>; 3]>>,
b_curves: Option<Box<[Vec<f32>; 3]>>,
clut: Option<Vec<f32>>,
matrix: Matrix3f,
bias: Vector3f,
direction: MultidimensionalDirection,
options: TransformOptions,
pcs: DataColorSpace,
grid_size: [u8; 3],
_phantom: PhantomData<T>,
bit_depth: usize,
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
Multidimensional3x3<T>
{
fn execute_matrix_stage(&self, dst: &mut [f32]) {
let m = self.matrix;
let b = self.bias;
if !m.test_equality(Matrix3f::IDENTITY) || !b.eq(&Vector3f::default()) {
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(mlaf(b.v[0], x, m.v[0][0]), y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(mlaf(b.v[1], x, m.v[1][0]), y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(mlaf(b.v[2], x, m.v[2][0]), y, m.v[2][1]), z, m.v[2][2]);
}
}
}
fn execute_simple_curves(&self, dst: &mut [f32], curves: &[Vec<f32>; 3]) {
let curve0 = &curves[0];
let curve1 = &curves[1];
let curve2 = &curves[2];
for dst in dst.chunks_exact_mut(3) {
let a0 = dst[0];
let a1 = dst[1];
let a2 = dst[2];
let b0 = lut_interp_linear_float(a0, curve0);
let b1 = lut_interp_linear_float(a1, curve1);
let b2 = lut_interp_linear_float(a2, curve2);
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
}
}
fn to_pcs_impl<Fetch: Fn(f32, f32, f32) -> Vector3f>(
&self,
input: &[T],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let norm_value = if T::FINITE {
1.0 / ((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
assert_eq!(
self.direction,
MultidimensionalDirection::DeviceToPcs,
"PCS to device cannot be used on `to pcs` stage"
);
// A-curves stage of the A -> B (or B -> A) pipeline, followed by the CLUT lookup.
if let (Some(a_curves), Some(clut)) = (self.a_curves.as_ref(), self.clut.as_ref()) {
if !clut.is_empty() {
let curve0 = &a_curves[0];
let curve1 = &a_curves[1];
let curve2 = &a_curves[2];
for (src, dst) in input.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
let b0 = lut_interp_linear_float(src[0].as_() * norm_value, curve0);
let b1 = lut_interp_linear_float(src[1].as_() * norm_value, curve1);
let b2 = lut_interp_linear_float(src[2].as_() * norm_value, curve2);
let interpolated = fetch(b0, b1, b2);
dst[0] = interpolated.v[0];
dst[1] = interpolated.v[1];
dst[2] = interpolated.v[2];
}
} else {
for (src, dst) in input.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
dst[0] = src[0].as_() * norm_value;
dst[1] = src[1].as_() * norm_value;
dst[2] = src[2].as_() * norm_value;
}
}
} else {
for (src, dst) in input.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
dst[0] = src[0].as_() * norm_value;
dst[1] = src[1].as_() * norm_value;
dst[2] = src[2].as_() * norm_value;
}
}
// Matrix stage
if let Some(m_curves) = self.m_curves.as_ref() {
self.execute_simple_curves(dst, m_curves);
self.execute_matrix_stage(dst);
}
// B-curves are mandatory
if let Some(b_curves) = &self.b_curves.as_ref() {
self.execute_simple_curves(dst, b_curves);
}
Ok(())
}
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
KatanaInitialStage<f32, T> for Multidimensional3x3<T>
{
fn to_pcs(&self, input: &[T]) -> Result<Vec<f32>, CmsError> {
if input.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let fixed_new_clut = Vec::new();
let new_clut = self.clut.as_ref().unwrap_or(&fixed_new_clut);
let lut = Cube::new_cube(new_clut, self.grid_size);
let mut new_dst = vec![0f32; input.len()];
// For Lab or XYZ PCS, trilinear interpolation is always used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
self.to_pcs_impl(input, &mut new_dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
return Ok(new_dst);
}
match self.options.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.to_pcs_impl(input, &mut new_dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.to_pcs_impl(input, &mut new_dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.to_pcs_impl(input, &mut new_dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.to_pcs_impl(input, &mut new_dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
Ok(new_dst)
}
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
Multidimensional3x3<T>
where
f32: AsPrimitive<T>,
{
fn to_output_impl<Fetch: Fn(f32, f32, f32) -> Vector3f>(
&self,
src: &mut [f32],
dst: &mut [T],
fetch: Fetch,
) -> Result<(), CmsError> {
let norm_value = if T::FINITE {
((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
assert_eq!(
self.direction,
MultidimensionalDirection::PcsToDevice,
"Device to PCS cannot be used on `to output` stage"
);
if let Some(b_curves) = &self.b_curves.as_ref() {
self.execute_simple_curves(src, b_curves);
}
// Matrix stage
if let Some(m_curves) = self.m_curves.as_ref() {
self.execute_matrix_stage(src);
self.execute_simple_curves(src, m_curves);
}
if let (Some(a_curves), Some(clut)) = (self.a_curves.as_ref(), self.clut.as_ref()) {
if !clut.is_empty() {
let curve0 = &a_curves[0];
let curve1 = &a_curves[1];
let curve2 = &a_curves[2];
for (src, dst) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
let b0 = lut_interp_linear_float(src[0], curve0);
let b1 = lut_interp_linear_float(src[1], curve1);
let b2 = lut_interp_linear_float(src[2], curve2);
let interpolated = fetch(b0, b1, b2);
if T::FINITE {
dst[0] = (interpolated.v[0] * norm_value)
.round()
.max(0.0)
.min(norm_value)
.as_();
dst[1] = (interpolated.v[1] * norm_value)
.round()
.max(0.0)
.min(norm_value)
.as_();
dst[2] = (interpolated.v[2] * norm_value)
.round()
.max(0.0)
.min(norm_value)
.as_();
} else {
dst[0] = interpolated.v[0].as_();
dst[1] = interpolated.v[1].as_();
dst[2] = interpolated.v[2].as_();
}
}
} else {
for (src, dst) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
if T::FINITE {
dst[0] = (src[0] * norm_value).round().max(0.0).min(norm_value).as_();
dst[1] = (src[1] * norm_value).round().max(0.0).min(norm_value).as_();
dst[2] = (src[2] * norm_value).round().max(0.0).min(norm_value).as_();
} else {
dst[0] = src[0].as_();
dst[1] = src[1].as_();
dst[2] = src[2].as_();
}
}
}
} else {
for (src, dst) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
if T::FINITE {
dst[0] = (src[0] * norm_value).round().max(0.0).min(norm_value).as_();
dst[1] = (src[1] * norm_value).round().max(0.0).min(norm_value).as_();
dst[2] = (src[2] * norm_value).round().max(0.0).min(norm_value).as_();
} else {
dst[0] = src[0].as_();
dst[1] = src[1].as_();
dst[2] = src[2].as_();
}
}
}
Ok(())
}
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
KatanaFinalStage<f32, T> for Multidimensional3x3<T>
where
f32: AsPrimitive<T>,
{
fn to_output(&self, src: &mut [f32], dst: &mut [T]) -> Result<(), CmsError> {
if src.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if src.len() != dst.len() {
return Err(CmsError::LaneSizeMismatch);
}
let fixed_new_clut = Vec::new();
let new_clut = self.clut.as_ref().unwrap_or(&fixed_new_clut);
let lut = Cube::new_cube(new_clut, self.grid_size);
// For Lab or XYZ PCS, trilinear interpolation is always used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.to_output_impl(src, dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.options.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.to_output_impl(src, dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.to_output_impl(src, dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.to_output_impl(src, dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.to_output_impl(src, dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
Ok(())
}
}
fn make_multidimensional_3x3<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
mab: &LutMultidimensionalType,
options: TransformOptions,
pcs: DataColorSpace,
direction: MultidimensionalDirection,
bit_depth: usize,
) -> Result<Multidimensional3x3<T>, CmsError> {
if mab.num_input_channels != 3 || mab.num_output_channels != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
if mab.b_curves.is_empty() || mab.b_curves.len() != 3 {
return Err(CmsError::InvalidAtoBLut);
}
let grid_size = [mab.grid_points[0], mab.grid_points[1], mab.grid_points[2]];
let clut: Option<Vec<f32>> = if mab.a_curves.len() == 3 && mab.clut.is_some() {
let clut = mab.clut.as_ref().map(|x| x.to_clut_f32()).unwrap();
let lut_grid = (mab.grid_points[0] as usize)
.safe_mul(mab.grid_points[1] as usize)?
.safe_mul(mab.grid_points[2] as usize)?
.safe_mul(mab.num_output_channels as usize)?;
if clut.len() != lut_grid {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: clut.len(),
expected: lut_grid,
}));
}
Some(clut)
} else {
None
};
let a_curves: Option<Box<[Vec<f32>; 3]>> = if mab.a_curves.len() == 3 && mab.clut.is_some() {
let mut arr = Box::<[Vec<f32>; 3]>::default();
for (a_curve, dst) in mab.a_curves.iter().zip(arr.iter_mut()) {
*dst = a_curve.to_clut()?;
}
Some(arr)
} else {
None
};
let b_curves: Option<Box<[Vec<f32>; 3]>> = if mab.b_curves.len() == 3 {
let mut arr = Box::<[Vec<f32>; 3]>::default();
let all_curves_linear = mab.b_curves.iter().all(|curve| curve.is_linear());
if all_curves_linear {
None
} else {
for (c_curve, dst) in mab.b_curves.iter().zip(arr.iter_mut()) {
*dst = c_curve.to_clut()?;
}
Some(arr)
}
} else {
return Err(CmsError::InvalidAtoBLut);
};
let matrix = mab.matrix.to_f32();
let m_curves: Option<Box<[Vec<f32>; 3]>> = if mab.m_curves.len() == 3 {
let all_curves_linear = mab.m_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear
|| !mab.matrix.test_equality(Matrix3d::IDENTITY)
|| mab.bias.ne(&Vector3d::default())
{
let mut arr = Box::<[Vec<f32>; 3]>::default();
for (curve, dst) in mab.m_curves.iter().zip(arr.iter_mut()) {
*dst = curve.to_clut()?;
}
Some(arr)
} else {
None
}
} else {
None
};
let bias = mab.bias.cast();
let transform = Multidimensional3x3::<T> {
a_curves,
b_curves,
m_curves,
matrix,
direction,
options,
clut,
pcs,
grid_size,
bias,
_phantom: PhantomData,
bit_depth,
};
Ok(transform)
}
pub(crate) fn multi_dimensional_3x3_to_pcs<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
mab: &LutMultidimensionalType,
options: TransformOptions,
pcs: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaInitialStage<f32, T> + Send + Sync>, CmsError> {
let transform = make_multidimensional_3x3::<T>(
mab,
options,
pcs,
MultidimensionalDirection::DeviceToPcs,
bit_depth,
)?;
Ok(Box::new(transform))
}
pub(crate) fn multi_dimensional_3x3_to_device<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
mab: &LutMultidimensionalType,
options: TransformOptions,
pcs: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaFinalStage<f32, T> + Send + Sync>, CmsError>
where
f32: AsPrimitive<T>,
{
let transform = make_multidimensional_3x3::<T>(
mab,
options,
pcs,
MultidimensionalDirection::PcsToDevice,
bit_depth,
)?;
Ok(Box::new(transform))
}

View File

@@ -0,0 +1,321 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::KatanaInitialStage;
use crate::conversions::katana::md3x3::MultidimensionalDirection;
use crate::mlaf::mlaf;
use crate::safe_math::SafeMul;
use crate::trc::lut_interp_linear_float;
use crate::{
CmsError, DataColorSpace, Hypercube, InterpolationMethod, LutMultidimensionalType,
MalformedSize, Matrix3d, Matrix3f, PointeeSizeExpressible, TransformOptions, Vector3d,
Vector3f,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
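/// Applies three per-channel tone curves in place to interleaved 3-channel
/// samples.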
pub(crate) fn execute_simple_curves3(dst: &mut [f32], curves: &[Vec<f32>; 3]) {
let curve0 = &curves[0];
let curve1 = &curves[1];
let curve2 = &curves[2];
for dst in dst.chunks_exact_mut(3) {
let a0 = dst[0];
let a1 = dst[1];
let a2 = dst[2];
let b0 = lut_interp_linear_float(a0, curve0);
let b1 = lut_interp_linear_float(a1, curve1);
let b2 = lut_interp_linear_float(a2, curve2);
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
}
}
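/// Applies the 3x3 matrix and bias vector in place to interleaved 3-channel
/// samples, skipping the pass entirely for an identity transform.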
pub(crate) fn execute_matrix_stage3(matrix: Matrix3f, bias: Vector3f, dst: &mut [f32]) {
let m = matrix;
let b = bias;
if !m.test_equality(Matrix3f::IDENTITY) || !b.eq(&Vector3f::default()) {
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(mlaf(b.v[0], x, m.v[0][0]), y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(mlaf(b.v[1], x, m.v[1][0]), y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(mlaf(b.v[2], x, m.v[2][0]), y, m.v[2][1]), z, m.v[2][2]);
}
}
}
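/// Katana initial stage for a 4-in/3-out (e.g. CMYK -> PCS) multidimensional
/// element; the CLUT is sampled over a 4D hypercube rather than a cube.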
struct Multidimensional4x3<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
> {
a_curves: Option<Box<[Vec<f32>; 4]>>,
m_curves: Option<Box<[Vec<f32>; 3]>>,
b_curves: Option<Box<[Vec<f32>; 3]>>,
clut: Option<Vec<f32>>,
matrix: Matrix3f,
bias: Vector3f,
direction: MultidimensionalDirection,
options: TransformOptions,
pcs: DataColorSpace,
grid_size: [u8; 4],
_phantom: PhantomData<T>,
bit_depth: usize,
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
Multidimensional4x3<T>
{
fn to_pcs_impl<Fetch: Fn(f32, f32, f32, f32) -> Vector3f>(
&self,
input: &[T],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let norm_value = if T::FINITE {
1.0 / ((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
assert_eq!(
self.direction,
MultidimensionalDirection::DeviceToPcs,
"PCS to device cannot be used on `to pcs` stage"
);
// A-curves stage of the A -> B (or B -> A) pipeline, followed by the CLUT lookup.
if let (Some(a_curves), Some(clut)) = (self.a_curves.as_ref(), self.clut.as_ref()) {
if !clut.is_empty() {
let curve0 = &a_curves[0];
let curve1 = &a_curves[1];
let curve2 = &a_curves[2];
let curve3 = &a_curves[3];
for (src, dst) in input.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
let b0 = lut_interp_linear_float(src[0].as_() * norm_value, curve0);
let b1 = lut_interp_linear_float(src[1].as_() * norm_value, curve1);
let b2 = lut_interp_linear_float(src[2].as_() * norm_value, curve2);
let b3 = lut_interp_linear_float(src[3].as_() * norm_value, curve3);
let interpolated = fetch(b0, b1, b2, b3);
dst[0] = interpolated.v[0];
dst[1] = interpolated.v[1];
dst[2] = interpolated.v[2];
}
} else {
// An empty CLUT cannot map 4 inputs to 3 outputs; fail instead of
// silently leaving `dst` zeroed.
return Err(CmsError::InvalidAtoBLut);
}
} else {
return Err(CmsError::InvalidAtoBLut);
}
// Matrix stage
if let Some(m_curves) = self.m_curves.as_ref() {
execute_simple_curves3(dst, m_curves);
execute_matrix_stage3(self.matrix, self.bias, dst);
}
// B-curves are mandatory
if let Some(b_curves) = &self.b_curves.as_ref() {
execute_simple_curves3(dst, b_curves);
}
Ok(())
}
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
KatanaInitialStage<f32, T> for Multidimensional4x3<T>
{
fn to_pcs(&self, input: &[T]) -> Result<Vec<f32>, CmsError> {
if input.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let fixed_new_clut = Vec::new();
let new_clut = self.clut.as_ref().unwrap_or(&fixed_new_clut);
let lut = Hypercube::new_hypercube(new_clut, self.grid_size);
let mut new_dst = vec![0f32; (input.len() / 4) * 3];
// For Lab or XYZ PCS, trilinear interpolation is always used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
self.to_pcs_impl(input, &mut new_dst, |x, y, z, w| {
lut.quadlinear_vec3(x, y, z, w)
})?;
return Ok(new_dst);
}
match self.options.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.to_pcs_impl(input, &mut new_dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.to_pcs_impl(input, &mut new_dst, |x, y, z, w| {
lut.pyramid_vec3(x, y, z, w)
})?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.to_pcs_impl(input, &mut new_dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
}
InterpolationMethod::Linear => {
self.to_pcs_impl(input, &mut new_dst, |x, y, z, w| {
lut.quadlinear_vec3(x, y, z, w)
})?;
}
}
Ok(new_dst)
}
}
fn make_multidimensional_4x3<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
mab: &LutMultidimensionalType,
options: TransformOptions,
pcs: DataColorSpace,
direction: MultidimensionalDirection,
bit_depth: usize,
) -> Result<Multidimensional4x3<T>, CmsError> {
if mab.num_input_channels != 4 || mab.num_output_channels != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
if mab.b_curves.is_empty() || mab.b_curves.len() != 3 {
return Err(CmsError::InvalidAtoBLut);
}
let grid_size = [
mab.grid_points[0],
mab.grid_points[1],
mab.grid_points[2],
mab.grid_points[3],
];
let clut: Option<Vec<f32>> = if mab.a_curves.len() == 4 && mab.clut.is_some() {
let clut = mab.clut.as_ref().map(|x| x.to_clut_f32()).unwrap();
let lut_grid = (mab.grid_points[0] as usize)
.safe_mul(mab.grid_points[1] as usize)?
.safe_mul(mab.grid_points[2] as usize)?
.safe_mul(mab.grid_points[3] as usize)?
.safe_mul(mab.num_output_channels as usize)?;
if clut.len() != lut_grid {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: clut.len(),
expected: lut_grid,
}));
}
Some(clut)
} else {
return Err(CmsError::InvalidAtoBLut);
};
let a_curves: Option<Box<[Vec<f32>; 4]>> = if mab.a_curves.len() == 4 && mab.clut.is_some() {
let mut arr = Box::<[Vec<f32>; 4]>::default();
for (a_curve, dst) in mab.a_curves.iter().zip(arr.iter_mut()) {
*dst = a_curve.to_clut()?;
}
Some(arr)
} else {
None
};
let b_curves: Option<Box<[Vec<f32>; 3]>> = if mab.b_curves.len() == 3 {
let mut arr = Box::<[Vec<f32>; 3]>::default();
let all_curves_linear = mab.b_curves.iter().all(|curve| curve.is_linear());
if all_curves_linear {
None
} else {
for (c_curve, dst) in mab.b_curves.iter().zip(arr.iter_mut()) {
*dst = c_curve.to_clut()?;
}
Some(arr)
}
} else {
return Err(CmsError::InvalidAtoBLut);
};
let matrix = mab.matrix.to_f32();
let m_curves: Option<Box<[Vec<f32>; 3]>> = if mab.m_curves.len() == 3 {
let all_curves_linear = mab.m_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear
|| !mab.matrix.test_equality(Matrix3d::IDENTITY)
|| mab.bias.ne(&Vector3d::default())
{
let mut arr = Box::<[Vec<f32>; 3]>::default();
for (curve, dst) in mab.m_curves.iter().zip(arr.iter_mut()) {
*dst = curve.to_clut()?;
}
Some(arr)
} else {
None
}
} else {
None
};
let bias = mab.bias.cast();
let transform = Multidimensional4x3::<T> {
a_curves,
b_curves,
m_curves,
matrix,
direction,
options,
clut,
pcs,
grid_size,
bias,
_phantom: PhantomData,
bit_depth,
};
Ok(transform)
}
pub(crate) fn multi_dimensional_4x3_to_pcs<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
mab: &LutMultidimensionalType,
options: TransformOptions,
pcs: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaInitialStage<f32, T> + Send + Sync>, CmsError> {
let transform = make_multidimensional_4x3::<T>(
mab,
options,
pcs,
MultidimensionalDirection::DeviceToPcs,
bit_depth,
)?;
Ok(Box::new(transform))
}

View File

@@ -0,0 +1,284 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::KatanaFinalStage;
use crate::conversions::katana::md3x3::MultidimensionalDirection;
use crate::conversions::katana::md4x3::{execute_matrix_stage3, execute_simple_curves3};
use crate::conversions::md_lut::{MultidimensionalLut, tetra_3i_to_any_vec};
use crate::safe_math::SafeMul;
use crate::trc::lut_interp_linear_float;
use crate::{
CmsError, DataColorSpace, Layout, LutMultidimensionalType, MalformedSize, Matrix3d, Matrix3f,
PointeeSizeExpressible, TransformOptions, Vector3d, Vector3f,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
struct Multidimensional3xN<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
> {
a_curves: Option<Vec<Vec<f32>>>,
m_curves: Option<Box<[Vec<f32>; 3]>>,
b_curves: Option<Box<[Vec<f32>; 3]>>,
clut: Option<Vec<f32>>,
matrix: Matrix3f,
bias: Vector3f,
direction: MultidimensionalDirection,
grid_size: [u8; 16],
output_inks: usize,
_phantom: PhantomData<T>,
dst_layout: Layout,
bit_depth: usize,
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
Multidimensional3xN<T>
where
f32: AsPrimitive<T>,
{
fn to_output_impl(&self, src: &mut [f32], dst: &mut [T]) -> Result<(), CmsError> {
let norm_value = if T::FINITE {
((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
assert_eq!(
self.direction,
MultidimensionalDirection::PcsToDevice,
"Device to PCS cannot be used on `to output` stage"
);
// B-curves are mandatory
if let Some(b_curves) = &self.b_curves.as_ref() {
execute_simple_curves3(src, b_curves);
}
// Matrix stage
if let Some(m_curves) = self.m_curves.as_ref() {
execute_matrix_stage3(self.matrix, self.bias, src);
execute_simple_curves3(src, m_curves);
}
if let (Some(a_curves), Some(clut)) = (self.a_curves.as_ref(), self.clut.as_ref()) {
let mut inks = vec![0.; self.output_inks];
if clut.is_empty() {
return Err(CmsError::InvalidAtoBLut);
}
let md_lut = MultidimensionalLut::new(self.grid_size, 3, self.output_inks);
for (src, dst) in src
.chunks_exact(3)
.zip(dst.chunks_exact_mut(self.dst_layout.channels()))
{
tetra_3i_to_any_vec(
&md_lut,
clut,
src[0],
src[1],
src[2],
&mut inks,
self.output_inks,
);
for (ink, curve) in inks.iter_mut().zip(a_curves.iter()) {
*ink = lut_interp_linear_float(*ink, curve);
}
if T::FINITE {
for (dst, ink) in dst.iter_mut().zip(inks.iter()) {
*dst = (*ink * norm_value).round().max(0.).min(norm_value).as_();
}
} else {
for (dst, ink) in dst.iter_mut().zip(inks.iter()) {
*dst = (*ink * norm_value).as_();
}
}
}
} else {
return Err(CmsError::InvalidAtoBLut);
}
Ok(())
}
}
impl<T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync>
KatanaFinalStage<f32, T> for Multidimensional3xN<T>
where
f32: AsPrimitive<T>,
{
fn to_output(&self, src: &mut [f32], dst: &mut [T]) -> Result<(), CmsError> {
if src.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % self.dst_layout.channels() != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
self.to_output_impl(src, dst)?;
Ok(())
}
}
fn make_multidimensional_3xn<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
dst_layout: Layout,
mab: &LutMultidimensionalType,
_: TransformOptions,
pcs: DataColorSpace,
direction: MultidimensionalDirection,
bit_depth: usize,
) -> Result<Multidimensional3xN<T>, CmsError> {
let real_inks = if pcs == DataColorSpace::Rgb {
3
} else {
dst_layout.channels()
};
if mab.num_output_channels != real_inks as u8 {
return Err(CmsError::UnsupportedProfileConnection);
}
if mab.b_curves.is_empty() || mab.b_curves.len() != 3 {
return Err(CmsError::InvalidAtoBLut);
}
let clut: Option<Vec<f32>> =
if mab.a_curves.len() == mab.num_output_channels as usize && mab.clut.is_some() {
let clut = mab.clut.as_ref().map(|x| x.to_clut_f32()).unwrap();
let mut lut_grid = 1usize;
for grid in mab.grid_points.iter().take(mab.num_input_channels as usize) {
lut_grid = lut_grid.safe_mul(*grid as usize)?;
}
let lut_grid = lut_grid.safe_mul(mab.num_output_channels as usize)?;
if clut.len() != lut_grid {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: clut.len(),
expected: lut_grid,
}));
}
Some(clut)
} else {
return Err(CmsError::InvalidAtoBLut);
};
let a_curves: Option<Vec<Vec<f32>>> =
if mab.a_curves.len() == mab.num_output_channels as usize && mab.clut.is_some() {
let mut arr = Vec::new();
for a_curve in mab.a_curves.iter() {
arr.push(a_curve.to_clut()?);
}
Some(arr)
} else {
None
};
let b_curves: Option<Box<[Vec<f32>; 3]>> = if mab.b_curves.len() == 3 {
let mut arr = Box::<[Vec<f32>; 3]>::default();
let all_curves_linear = mab.b_curves.iter().all(|curve| curve.is_linear());
if all_curves_linear {
None
} else {
for (c_curve, dst) in mab.b_curves.iter().zip(arr.iter_mut()) {
*dst = c_curve.to_clut()?;
}
Some(arr)
}
} else {
return Err(CmsError::InvalidAtoBLut);
};
let matrix = mab.matrix.to_f32();
let m_curves: Option<Box<[Vec<f32>; 3]>> = if mab.m_curves.len() == 3 {
let all_curves_linear = mab.m_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear
|| !mab.matrix.test_equality(Matrix3d::IDENTITY)
|| mab.bias.ne(&Vector3d::default())
{
let mut arr = Box::<[Vec<f32>; 3]>::default();
for (curve, dst) in mab.m_curves.iter().zip(arr.iter_mut()) {
*dst = curve.to_clut()?;
}
Some(arr)
} else {
None
}
} else {
None
};
let bias = mab.bias.cast();
let transform = Multidimensional3xN::<T> {
a_curves,
b_curves,
m_curves,
matrix,
direction,
clut,
grid_size: mab.grid_points,
bias,
dst_layout,
output_inks: real_inks,
_phantom: PhantomData,
bit_depth,
};
Ok(transform)
}
pub(crate) fn katana_multi_dimensional_3xn_to_device<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
dst_layout: Layout,
mab: &LutMultidimensionalType,
options: TransformOptions,
pcs: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaFinalStage<f32, T> + Send + Sync>, CmsError>
where
f32: AsPrimitive<T>,
{
if mab.num_input_channels == 0 {
return Err(CmsError::UnsupportedProfileConnection);
}
let transform = make_multidimensional_3xn::<T>(
dst_layout,
mab,
options,
pcs,
MultidimensionalDirection::PcsToDevice,
bit_depth,
)?;
Ok(Box::new(transform))
}

View File

@@ -0,0 +1,296 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::KatanaInitialStage;
use crate::conversions::katana::md3x3::MultidimensionalDirection;
use crate::conversions::katana::md4x3::{execute_matrix_stage3, execute_simple_curves3};
use crate::conversions::md_lut::{
MultidimensionalLut, NVector, linear_1i_vec3f, linear_2i_vec3f_direct, linear_3i_vec3f_direct,
linear_4i_vec3f, linear_5i_vec3f, linear_6i_vec3f, linear_7i_vec3f, linear_8i_vec3f,
linear_9i_vec3f, linear_10i_vec3f, linear_11i_vec3f, linear_12i_vec3f, linear_13i_vec3f,
linear_14i_vec3f, linear_15i_vec3f,
};
use crate::safe_math::SafeMul;
use crate::trc::lut_interp_linear_float;
use crate::{
CmsError, DataColorSpace, Layout, LutMultidimensionalType, MalformedSize, Matrix3d, Matrix3f,
PointeeSizeExpressible, TransformOptions, Vector3d, Vector3f,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
struct MultidimensionalNx3<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
const BIT_DEPTH: usize,
> {
a_curves: Option<Vec<Vec<f32>>>,
m_curves: Option<Box<[Vec<f32>; 3]>>,
b_curves: Option<Box<[Vec<f32>; 3]>>,
clut: Option<Vec<f32>>,
matrix: Matrix3f,
bias: Vector3f,
direction: MultidimensionalDirection,
grid_size: [u8; 16],
input_inks: usize,
_phantom: PhantomData<T>,
}
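/// Selects the N-input multilinear interpolation kernel matching the ink
/// count of `layout`; every kernel yields a 3-component PCS result.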
#[inline(never)]
pub(crate) fn interpolate_out_function(
layout: Layout,
) -> fn(lut: &MultidimensionalLut, arr: &[f32], inputs: &[f32]) -> NVector<f32, 3> {
const OUT: usize = 3;
match layout {
Layout::Rgb => linear_3i_vec3f_direct::<OUT>,
Layout::Rgba => linear_4i_vec3f::<OUT>,
Layout::Gray => linear_1i_vec3f::<OUT>,
Layout::GrayAlpha => linear_2i_vec3f_direct::<OUT>,
Layout::Inks5 => linear_5i_vec3f::<OUT>,
Layout::Inks6 => linear_6i_vec3f::<OUT>,
Layout::Inks7 => linear_7i_vec3f::<OUT>,
Layout::Inks8 => linear_8i_vec3f::<OUT>,
Layout::Inks9 => linear_9i_vec3f::<OUT>,
Layout::Inks10 => linear_10i_vec3f::<OUT>,
Layout::Inks11 => linear_11i_vec3f::<OUT>,
Layout::Inks12 => linear_12i_vec3f::<OUT>,
Layout::Inks13 => linear_13i_vec3f::<OUT>,
Layout::Inks14 => linear_14i_vec3f::<OUT>,
Layout::Inks15 => linear_15i_vec3f::<OUT>,
}
}
impl<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
const BIT_DEPTH: usize,
> MultidimensionalNx3<T, BIT_DEPTH>
{
fn to_pcs_impl(&self, input: &[T], dst: &mut [f32]) -> Result<(), CmsError> {
let norm_value = if T::FINITE {
1.0 / ((1u32 << BIT_DEPTH) - 1) as f32
} else {
1.0
};
assert_eq!(
self.direction,
MultidimensionalDirection::DeviceToPcs,
"PCS to device cannot be used on `to pcs` stage"
);
// A-curves stage of the A -> B (or B -> A) pipeline, followed by the CLUT lookup.
if let (Some(a_curves), Some(clut)) = (self.a_curves.as_ref(), self.clut.as_ref()) {
let layout = Layout::from_inks(self.input_inks);
let mut inks = vec![0.; self.input_inks];
if clut.is_empty() {
return Err(CmsError::InvalidAtoBLut);
}
let fetcher = interpolate_out_function(layout);
let md_lut = MultidimensionalLut::new(self.grid_size, self.input_inks, 3);
for (src, dst) in input
.chunks_exact(layout.channels())
.zip(dst.chunks_exact_mut(3))
{
for ((ink, src_ink), curve) in inks.iter_mut().zip(src).zip(a_curves.iter()) {
*ink = lut_interp_linear_float(src_ink.as_() * norm_value, curve);
}
let interpolated = fetcher(&md_lut, clut, &inks);
dst[0] = interpolated.v[0];
dst[1] = interpolated.v[1];
dst[2] = interpolated.v[2];
}
} else {
return Err(CmsError::InvalidAtoBLut);
}
// Matrix stage
if let Some(m_curves) = self.m_curves.as_ref() {
execute_simple_curves3(dst, m_curves);
execute_matrix_stage3(self.matrix, self.bias, dst);
}
        // B-curves are mandatory, but were elided at build time when linear
        if let Some(b_curves) = self.b_curves.as_ref() {
execute_simple_curves3(dst, b_curves);
}
Ok(())
}
}
impl<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
const BIT_DEPTH: usize,
> KatanaInitialStage<f32, T> for MultidimensionalNx3<T, BIT_DEPTH>
{
fn to_pcs(&self, input: &[T]) -> Result<Vec<f32>, CmsError> {
if input.len() % self.input_inks != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let mut new_dst = vec![0f32; (input.len() / self.input_inks) * 3];
self.to_pcs_impl(input, &mut new_dst)?;
Ok(new_dst)
}
}
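/// Validates a multidimensional (A-to-B style) LUT tag and builds the
/// N-input, 3-output transform; all-linear M- and B-curve sets are dropped so
/// the hot path can skip them.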
fn make_multidimensional_nx3<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
const BIT_DEPTH: usize,
>(
mab: &LutMultidimensionalType,
_: TransformOptions,
_: DataColorSpace,
direction: MultidimensionalDirection,
) -> Result<MultidimensionalNx3<T, BIT_DEPTH>, CmsError> {
if mab.num_output_channels != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
    if mab.b_curves.len() != 3 {
return Err(CmsError::InvalidAtoBLut);
}
let clut: Option<Vec<f32>> =
if mab.a_curves.len() == mab.num_input_channels as usize && mab.clut.is_some() {
let clut = mab.clut.as_ref().map(|x| x.to_clut_f32()).unwrap();
let mut lut_grid = 1usize;
for grid in mab.grid_points.iter().take(mab.num_input_channels as usize) {
lut_grid = lut_grid.safe_mul(*grid as usize)?;
}
let lut_grid = lut_grid.safe_mul(mab.num_output_channels as usize)?;
if clut.len() != lut_grid {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: clut.len(),
expected: lut_grid,
}));
}
Some(clut)
} else {
return Err(CmsError::InvalidAtoBLut);
};
let a_curves: Option<Vec<Vec<f32>>> =
if mab.a_curves.len() == mab.num_input_channels as usize && mab.clut.is_some() {
let mut arr = Vec::new();
for a_curve in mab.a_curves.iter() {
arr.push(a_curve.to_clut()?);
}
Some(arr)
} else {
None
};
let b_curves: Option<Box<[Vec<f32>; 3]>> = if mab.b_curves.len() == 3 {
let mut arr = Box::<[Vec<f32>; 3]>::default();
let all_curves_linear = mab.b_curves.iter().all(|curve| curve.is_linear());
if all_curves_linear {
None
} else {
for (c_curve, dst) in mab.b_curves.iter().zip(arr.iter_mut()) {
*dst = c_curve.to_clut()?;
}
Some(arr)
}
} else {
return Err(CmsError::InvalidAtoBLut);
};
let matrix = mab.matrix.to_f32();
let m_curves: Option<Box<[Vec<f32>; 3]>> = if mab.m_curves.len() == 3 {
let all_curves_linear = mab.m_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear
|| !mab.matrix.test_equality(Matrix3d::IDENTITY)
|| mab.bias.ne(&Vector3d::default())
{
let mut arr = Box::<[Vec<f32>; 3]>::default();
for (curve, dst) in mab.m_curves.iter().zip(arr.iter_mut()) {
*dst = curve.to_clut()?;
}
Some(arr)
} else {
None
}
} else {
None
};
let bias = mab.bias.cast();
let transform = MultidimensionalNx3::<T, BIT_DEPTH> {
a_curves,
b_curves,
m_curves,
matrix,
direction,
clut,
grid_size: mab.grid_points,
bias,
input_inks: mab.num_input_channels as usize,
_phantom: PhantomData,
};
Ok(transform)
}
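/// Device -> PCS entry point: checks the source layout against the tag's input
/// channel count before boxing the prepared transform.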
pub(crate) fn katana_multi_dimensional_nx3_to_pcs<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
mab: &LutMultidimensionalType,
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Box<dyn KatanaInitialStage<f32, T> + Send + Sync>, CmsError> {
if pcs == DataColorSpace::Rgb {
if mab.num_input_channels != 3 {
return Err(CmsError::InvalidAtoBLut);
}
if src_layout != Layout::Rgba && src_layout != Layout::Rgb {
return Err(CmsError::InvalidInksCountForProfile);
}
} else if mab.num_input_channels != src_layout.channels() as u8 {
return Err(CmsError::InvalidInksCountForProfile);
}
let transform = make_multidimensional_nx3::<T, BIT_DEPTH>(
mab,
options,
pcs,
MultidimensionalDirection::DeviceToPcs,
)?;
Ok(Box::new(transform))
}

View File

@@ -0,0 +1,393 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::md_nx3::interpolate_out_function;
use crate::conversions::katana::{KatanaFinalStage, KatanaInitialStage};
use crate::conversions::md_lut::{MultidimensionalLut, tetra_3i_to_any_vec};
use crate::profile::LutDataType;
use crate::safe_math::{SafeMul, SafePowi};
use crate::trc::lut_interp_linear_float;
use crate::{
CmsError, DataColorSpace, Layout, MalformedSize, PointeeSizeExpressible, TransformOptions,
};
use num_traits::AsPrimitive;
use std::array::from_fn;
use std::marker::PhantomData;
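/// Legacy N-input, 3-output LUT stage: one linearization curve per ink, a CLUT
/// on a uniform grid, and three output (gamma) curves.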
#[derive(Default)]
struct KatanaLutNx3<T> {
linearization: Vec<Vec<f32>>,
clut: Vec<f32>,
grid_size: u8,
input_inks: usize,
output: [Vec<f32>; 3],
_phantom: PhantomData<T>,
bit_depth: usize,
}
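/// Legacy 3-input, N-output LUT stage for the PCS -> device direction; for RGB
/// targets with an RGBA destination layout the alpha channel is filled with the
/// maximum value.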
struct KatanaLut3xN<T> {
linearization: [Vec<f32>; 3],
clut: Vec<f32>,
grid_size: u8,
output_inks: usize,
output: Vec<Vec<f32>>,
dst_layout: Layout,
target_color_space: DataColorSpace,
_phantom: PhantomData<T>,
bit_depth: usize,
}
impl<T: Copy + PointeeSizeExpressible + AsPrimitive<f32>> KatanaLutNx3<T> {
fn to_pcs_impl(&self, input: &[T]) -> Result<Vec<f32>, CmsError> {
if input.len() % self.input_inks != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let norm_value = if T::FINITE {
1.0 / ((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
let grid_sizes: [u8; 16] = from_fn(|i| {
if i < self.input_inks {
self.grid_size
} else {
0
}
});
let md_lut = MultidimensionalLut::new(grid_sizes, self.input_inks, 3);
let layout = Layout::from_inks(self.input_inks);
let mut inks = vec![0.; self.input_inks];
let mut dst = vec![0.; (input.len() / layout.channels()) * 3];
let fetcher = interpolate_out_function(layout);
for (dest, src) in dst
.chunks_exact_mut(3)
.zip(input.chunks_exact(layout.channels()))
{
for ((ink, src_ink), curve) in inks.iter_mut().zip(src).zip(self.linearization.iter()) {
*ink = lut_interp_linear_float(src_ink.as_() * norm_value, curve);
}
let clut = fetcher(&md_lut, &self.clut, &inks);
let pcs_x = lut_interp_linear_float(clut.v[0], &self.output[0]);
let pcs_y = lut_interp_linear_float(clut.v[1], &self.output[1]);
let pcs_z = lut_interp_linear_float(clut.v[2], &self.output[2]);
dest[0] = pcs_x;
dest[1] = pcs_y;
dest[2] = pcs_z;
}
Ok(dst)
}
}
impl<T: Copy + PointeeSizeExpressible + AsPrimitive<f32>> KatanaInitialStage<f32, T>
for KatanaLutNx3<T>
{
fn to_pcs(&self, input: &[T]) -> Result<Vec<f32>, CmsError> {
if input.len() % self.input_inks != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
self.to_pcs_impl(input)
}
}
impl<T: Copy + PointeeSizeExpressible + AsPrimitive<f32>> KatanaFinalStage<f32, T>
for KatanaLut3xN<T>
where
f32: AsPrimitive<T>,
{
fn to_output(&self, src: &mut [f32], dst: &mut [T]) -> Result<(), CmsError> {
if src.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let grid_sizes: [u8; 16] = from_fn(|i| {
if i < self.output_inks {
self.grid_size
} else {
0
}
});
let md_lut = MultidimensionalLut::new(grid_sizes, 3, self.output_inks);
let scale_value = if T::FINITE {
((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
let mut working = vec![0.; self.output_inks];
for (dest, src) in dst
.chunks_exact_mut(self.dst_layout.channels())
.zip(src.chunks_exact(3))
{
let x = lut_interp_linear_float(src[0], &self.linearization[0]);
let y = lut_interp_linear_float(src[1], &self.linearization[1]);
let z = lut_interp_linear_float(src[2], &self.linearization[2]);
tetra_3i_to_any_vec(&md_lut, &self.clut, x, y, z, &mut working, self.output_inks);
for (ink, curve) in working.iter_mut().zip(self.output.iter()) {
*ink = lut_interp_linear_float(*ink, curve);
}
if T::FINITE {
for (dst, ink) in dest.iter_mut().zip(working.iter()) {
*dst = (*ink * scale_value).round().max(0.).min(scale_value).as_();
}
} else {
for (dst, ink) in dest.iter_mut().zip(working.iter()) {
*dst = (*ink * scale_value).as_();
}
}
}
if self.dst_layout == Layout::Rgba && self.target_color_space == DataColorSpace::Rgb {
for dst in dst.chunks_exact_mut(self.dst_layout.channels()) {
dst[3] = scale_value.as_();
}
}
Ok(())
}
}
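/// Builds the N -> 3 stage from a legacy LUT tag, validating the CLUT length
/// (grid_points^inputs * outputs) and slicing the packed input/output tables
/// into per-channel curves.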
fn katana_make_lut_nx3<T: Copy + PointeeSizeExpressible + AsPrimitive<f32>>(
inks: usize,
lut: &LutDataType,
_: TransformOptions,
_: DataColorSpace,
bit_depth: usize,
) -> Result<KatanaLutNx3<T>, CmsError> {
if inks != lut.num_input_channels as usize {
return Err(CmsError::UnsupportedProfileConnection);
}
if lut.num_output_channels != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
let clut_length: usize = (lut.num_clut_grid_points as usize)
.safe_powi(lut.num_input_channels as u32)?
.safe_mul(lut.num_output_channels as usize)?;
let clut_table = lut.clut_table.to_clut_f32();
if clut_table.len() != clut_length {
return Err(CmsError::MalformedClut(MalformedSize {
size: clut_table.len(),
expected: clut_length,
}));
}
let linearization_table = lut.input_table.to_clut_f32();
if linearization_table.len() < lut.num_input_table_entries as usize * inks {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: linearization_table.len(),
expected: lut.num_input_table_entries as usize * inks,
}));
}
let linearization = (0..inks)
.map(|x| {
linearization_table[x * lut.num_input_table_entries as usize
..(x + 1) * lut.num_input_table_entries as usize]
.to_vec()
})
        .collect();
let gamma_table = lut.output_table.to_clut_f32();
if gamma_table.len() < lut.num_output_table_entries as usize * 3 {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: gamma_table.len(),
expected: lut.num_output_table_entries as usize * 3,
}));
}
let gamma_curve0 = gamma_table[..lut.num_output_table_entries as usize].to_vec();
let gamma_curve1 = gamma_table
[lut.num_output_table_entries as usize..lut.num_output_table_entries as usize * 2]
.to_vec();
let gamma_curve2 = gamma_table
[lut.num_output_table_entries as usize * 2..lut.num_output_table_entries as usize * 3]
.to_vec();
let transform = KatanaLutNx3::<T> {
linearization,
clut: clut_table,
grid_size: lut.num_clut_grid_points,
output: [gamma_curve0, gamma_curve1, gamma_curve2],
input_inks: inks,
_phantom: PhantomData,
bit_depth,
};
Ok(transform)
}
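/// Builds the 3 -> N output stage: three linearization curves, a tetrahedrally
/// interpolated CLUT, and one gamma curve per output ink.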
fn katana_make_lut_3xn<T: Copy + PointeeSizeExpressible + AsPrimitive<f32>>(
inks: usize,
dst_layout: Layout,
lut: &LutDataType,
_: TransformOptions,
target_color_space: DataColorSpace,
bit_depth: usize,
) -> Result<KatanaLut3xN<T>, CmsError> {
if lut.num_input_channels as usize != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
if target_color_space == DataColorSpace::Rgb {
        if lut.num_output_channels != 3 && lut.num_output_channels != 4 {
return Err(CmsError::InvalidInksCountForProfile);
}
        if dst_layout != Layout::Rgb && dst_layout != Layout::Rgba {
return Err(CmsError::InvalidInksCountForProfile);
}
} else if lut.num_output_channels as usize != dst_layout.channels() {
return Err(CmsError::InvalidInksCountForProfile);
}
let clut_length: usize = (lut.num_clut_grid_points as usize)
.safe_powi(lut.num_input_channels as u32)?
.safe_mul(lut.num_output_channels as usize)?;
let clut_table = lut.clut_table.to_clut_f32();
if clut_table.len() != clut_length {
return Err(CmsError::MalformedClut(MalformedSize {
size: clut_table.len(),
expected: clut_length,
}));
}
let linearization_table = lut.input_table.to_clut_f32();
if linearization_table.len() < lut.num_input_table_entries as usize * 3 {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: linearization_table.len(),
expected: lut.num_input_table_entries as usize * 3,
}));
}
let linear_curve0 = linearization_table[..lut.num_input_table_entries as usize].to_vec();
let linear_curve1 = linearization_table
[lut.num_input_table_entries as usize..lut.num_input_table_entries as usize * 2]
.to_vec();
let linear_curve2 = linearization_table
[lut.num_input_table_entries as usize * 2..lut.num_input_table_entries as usize * 3]
.to_vec();
let gamma_table = lut.output_table.to_clut_f32();
if gamma_table.len() < lut.num_output_table_entries as usize * inks {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: gamma_table.len(),
expected: lut.num_output_table_entries as usize * inks,
}));
}
let gamma = (0..inks)
.map(|x| {
gamma_table[x * lut.num_output_table_entries as usize
..(x + 1) * lut.num_output_table_entries as usize]
.to_vec()
})
        .collect();
let transform = KatanaLut3xN::<T> {
linearization: [linear_curve0, linear_curve1, linear_curve2],
clut: clut_table,
grid_size: lut.num_clut_grid_points,
output: gamma,
output_inks: inks,
_phantom: PhantomData,
target_color_space,
dst_layout,
bit_depth,
};
Ok(transform)
}
pub(crate) fn katana_input_make_lut_nx3<
T: Copy + PointeeSizeExpressible + AsPrimitive<f32> + Send + Sync,
>(
src_layout: Layout,
inks: usize,
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaInitialStage<f32, T> + Send + Sync>, CmsError> {
if pcs == DataColorSpace::Rgb {
if lut.num_input_channels != 3 {
return Err(CmsError::InvalidAtoBLut);
}
if src_layout != Layout::Rgba && src_layout != Layout::Rgb {
return Err(CmsError::InvalidInksCountForProfile);
}
} else if lut.num_input_channels != src_layout.channels() as u8 {
return Err(CmsError::InvalidInksCountForProfile);
}
let z0 = katana_make_lut_nx3::<T>(inks, lut, options, pcs, bit_depth)?;
Ok(Box::new(z0))
}
pub(crate) fn katana_output_make_lut_3xn<
T: Copy + PointeeSizeExpressible + AsPrimitive<f32> + Send + Sync,
>(
dst_layout: Layout,
lut: &LutDataType,
options: TransformOptions,
target_color_space: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaFinalStage<f32, T> + Send + Sync>, CmsError>
where
f32: AsPrimitive<T>,
{
let real_inks = if target_color_space == DataColorSpace::Rgb {
3
} else {
dst_layout.channels()
};
let z0 = katana_make_lut_3xn::<T>(
real_inks,
dst_layout,
lut,
options,
target_color_space,
bit_depth,
)?;
Ok(Box::new(z0))
}

View File

@@ -0,0 +1,56 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
mod finalizers;
mod md3x3;
mod md4x3;
mod md_3xn;
mod md_nx3;
mod md_pipeline;
mod pcs_stages;
mod rgb_xyz;
mod stages;
mod xyz_lab;
mod xyz_rgb;
pub(crate) use finalizers::{CopyAlphaStage, InjectAlphaStage};
pub(crate) use md_3xn::katana_multi_dimensional_3xn_to_device;
pub(crate) use md_nx3::katana_multi_dimensional_nx3_to_pcs;
pub(crate) use md_pipeline::{katana_input_make_lut_nx3, katana_output_make_lut_3xn};
pub(crate) use md3x3::{multi_dimensional_3x3_to_device, multi_dimensional_3x3_to_pcs};
pub(crate) use md4x3::multi_dimensional_4x3_to_pcs;
pub(crate) use pcs_stages::{
KatanaDefaultIntermediate, katana_pcs_lab_v2_to_v4, katana_pcs_lab_v4_to_v2,
};
pub(crate) use rgb_xyz::katana_create_rgb_lin_lut;
pub(crate) use stages::{
Katana, KatanaFinalStage, KatanaInitialStage, KatanaIntermediateStage,
KatanaPostFinalizationStage,
};
pub(crate) use xyz_lab::{KatanaStageLabToXyz, KatanaStageXyzToLab};
pub(crate) use xyz_rgb::katana_prepare_inverse_lut_rgb_xyz;

View File

@@ -0,0 +1,100 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::KatanaIntermediateStage;
use crate::conversions::katana::stages::BlackholeIntermediateStage;
use crate::mlaf::mlaf;
use crate::{CmsError, ColorProfile, DataColorSpace, Matrix3f, ProfileVersion};
use std::marker::PhantomData;
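/// Applies one or more 3x3 matrices in sequence to every PCS triple using fused
/// multiply-adds.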
pub(crate) struct KatanaMatrixStage {
pub(crate) matrices: Vec<Matrix3f>,
}
impl KatanaMatrixStage {
pub(crate) fn new(matrix: Matrix3f) -> Self {
Self {
matrices: vec![matrix],
}
}
}
pub(crate) type KatanaDefaultIntermediate = dyn KatanaIntermediateStage<f32> + Send + Sync;
impl KatanaIntermediateStage<f32> for KatanaMatrixStage {
fn stage(&self, input: &mut Vec<f32>) -> Result<Vec<f32>, CmsError> {
if input.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
for m in self.matrices.iter() {
for dst in input.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(x * m.v[0][0], y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(x * m.v[1][0], y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(x * m.v[2][0], y, m.v[2][1]), z, m.v[2][2]);
}
}
Ok(std::mem::take(input))
}
}
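/// ICC v2 encodes 16-bit Lab with full scale at 0xFF00 while v4 uses 0xFFFF, so
/// converting between the two PCS encodings is a uniform 65280/65535 rescale.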
pub(crate) fn katana_pcs_lab_v4_to_v2(profile: &ColorProfile) -> Box<KatanaDefaultIntermediate> {
if profile.pcs == DataColorSpace::Lab && profile.version_internal <= ProfileVersion::V4_0 {
let v_mat = vec![Matrix3f {
v: [
[65280.0 / 65535.0, 0., 0.],
[0., 65280.0 / 65535.0, 0.],
[0., 0., 65280.0 / 65535.0],
],
}];
return Box::new(KatanaMatrixStage { matrices: v_mat });
}
Box::new(BlackholeIntermediateStage {
_phantom: PhantomData,
})
}
pub(crate) fn katana_pcs_lab_v2_to_v4(profile: &ColorProfile) -> Box<KatanaDefaultIntermediate> {
if profile.pcs == DataColorSpace::Lab && profile.version_internal <= ProfileVersion::V4_0 {
let v_mat = vec![Matrix3f {
v: [
[65535.0 / 65280.0, 0., 0.],
[0., 65535.0 / 65280.0, 0.],
[0., 0., 65535.0 / 65280.0],
],
}];
return Box::new(KatanaMatrixStage { matrices: v_mat });
}
Box::new(BlackholeIntermediateStage {
_phantom: PhantomData,
})
}

View File

@@ -0,0 +1,161 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::pcs_stages::KatanaMatrixStage;
use crate::conversions::katana::{KatanaInitialStage, KatanaIntermediateStage};
use crate::{CmsError, ColorProfile, Layout, Matrix3f, PointeeSizeExpressible, TransformOptions};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
struct KatanaRgbLinearizationStage<T: Clone, const LAYOUT: u8, const LINEAR_CAP: usize> {
r_lin: Box<[f32; LINEAR_CAP]>,
g_lin: Box<[f32; LINEAR_CAP]>,
b_lin: Box<[f32; LINEAR_CAP]>,
linear_cap: usize,
bit_depth: usize,
_phantom: PhantomData<T>,
}
impl<
T: Clone + AsPrimitive<f32> + PointeeSizeExpressible,
const LAYOUT: u8,
const LINEAR_CAP: usize,
> KatanaInitialStage<f32, T> for KatanaRgbLinearizationStage<T, LAYOUT, LINEAR_CAP>
{
fn to_pcs(&self, input: &[T]) -> Result<Vec<f32>, CmsError> {
let src_layout = Layout::from(LAYOUT);
if input.len() % src_layout.channels() != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let mut dst = vec![0.; input.len() / src_layout.channels() * 3];
let scale = if T::FINITE {
(self.linear_cap as f32 - 1.) / ((1 << self.bit_depth) - 1) as f32
} else {
(T::NOT_FINITE_LINEAR_TABLE_SIZE - 1) as f32
};
let cap_value = if T::FINITE {
((1 << self.bit_depth) - 1) as f32
} else {
(T::NOT_FINITE_LINEAR_TABLE_SIZE - 1) as f32
};
for (src, dst) in input
.chunks_exact(src_layout.channels())
.zip(dst.chunks_exact_mut(3))
{
let j_r = src[0].as_() * scale;
let j_g = src[1].as_() * scale;
let j_b = src[2].as_() * scale;
dst[0] = self.r_lin[(j_r.round().min(cap_value).max(0.) as u16) as usize];
dst[1] = self.g_lin[(j_g.round().min(cap_value).max(0.) as u16) as usize];
dst[2] = self.b_lin[(j_b.round().min(cap_value).max(0.) as u16) as usize];
}
Ok(dst)
}
}
pub(crate) struct KatanaRgbLinearizationState<T> {
pub(crate) stages: Vec<Box<dyn KatanaIntermediateStage<f32> + Send + Sync>>,
pub(crate) initial_stage: Box<dyn KatanaInitialStage<f32, T> + Send + Sync>,
}
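/// Builds the initial stage for matrix-shaper RGB sources: per-channel
/// linearization LUTs followed by the RGB -> XYZ matrix combined with a
/// 32768/65535 rescale into the LUT PCS encoding (1.0 maps to 0x8000).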
pub(crate) fn katana_create_rgb_lin_lut<
T: Copy + Default + AsPrimitive<f32> + Send + Sync + AsPrimitive<usize> + PointeeSizeExpressible,
const BIT_DEPTH: usize,
const LINEAR_CAP: usize,
>(
layout: Layout,
source: &ColorProfile,
opts: TransformOptions,
) -> Result<KatanaRgbLinearizationState<T>, CmsError>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
let lin_r =
source.build_r_linearize_table::<T, LINEAR_CAP, BIT_DEPTH>(opts.allow_use_cicp_transfer)?;
let lin_g =
source.build_g_linearize_table::<T, LINEAR_CAP, BIT_DEPTH>(opts.allow_use_cicp_transfer)?;
let lin_b =
source.build_b_linearize_table::<T, LINEAR_CAP, BIT_DEPTH>(opts.allow_use_cicp_transfer)?;
let lin_stage: Box<dyn KatanaInitialStage<f32, T> + Send + Sync> = match layout {
Layout::Rgb => {
Box::new(
KatanaRgbLinearizationStage::<T, { Layout::Rgb as u8 }, LINEAR_CAP> {
r_lin: lin_r,
g_lin: lin_g,
b_lin: lin_b,
bit_depth: BIT_DEPTH,
linear_cap: LINEAR_CAP,
_phantom: PhantomData,
},
)
}
Layout::Rgba => {
Box::new(
KatanaRgbLinearizationStage::<T, { Layout::Rgba as u8 }, LINEAR_CAP> {
r_lin: lin_r,
g_lin: lin_g,
b_lin: lin_b,
bit_depth: BIT_DEPTH,
linear_cap: LINEAR_CAP,
_phantom: PhantomData,
},
)
}
Layout::Gray => unimplemented!("Gray should not be called on Rgb/Rgba execution path"),
Layout::GrayAlpha => {
unimplemented!("GrayAlpha should not be called on Rgb/Rgba execution path")
}
_ => unreachable!(),
};
let xyz_to_rgb = source.rgb_to_xyz_matrix();
let matrices: Vec<Box<dyn KatanaIntermediateStage<f32> + Send + Sync>> =
vec![Box::new(KatanaMatrixStage {
matrices: vec![
xyz_to_rgb.to_f32(),
Matrix3f {
v: [
[32768.0 / 65535.0, 0.0, 0.0],
[0.0, 32768.0 / 65535.0, 0.0],
[0.0, 0.0, 32768.0 / 65535.0],
],
},
],
})];
Ok(KatanaRgbLinearizationState {
stages: matrices,
initial_stage: lin_stage,
})
}

View File

@@ -0,0 +1,85 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::{CmsError, TransformExecutor};
use std::marker::PhantomData;
/// `W`: working storage data type.
/// `I`: input/output data type.
pub(crate) trait KatanaInitialStage<W, I> {
fn to_pcs(&self, input: &[I]) -> Result<Vec<W>, CmsError>;
}
/// `W`: working storage data type.
/// `I`: input/output data type.
pub(crate) trait KatanaFinalStage<W, I> {
fn to_output(&self, src: &mut [W], dst: &mut [I]) -> Result<(), CmsError>;
}
/// `W`: working storage data type.
pub(crate) trait KatanaIntermediateStage<W> {
fn stage(&self, input: &mut Vec<W>) -> Result<Vec<W>, CmsError>;
}
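/// No-op stage that hands the working buffer through unchanged.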
pub(crate) struct BlackholeIntermediateStage<W> {
pub(crate) _phantom: PhantomData<W>,
}
impl<W> KatanaIntermediateStage<W> for BlackholeIntermediateStage<W> {
fn stage(&self, input: &mut Vec<W>) -> Result<Vec<W>, CmsError> {
Ok(std::mem::take(input))
}
}
/// `I`: input/output data type.
pub(crate) trait KatanaPostFinalizationStage<I> {
fn finalize(&self, src: &[I], dst: &mut [I]) -> Result<(), CmsError>;
}
/// `W`: working storage data type.
/// `I`: input/output data type.
pub(crate) struct Katana<W, I> {
pub(crate) initial_stage: Box<dyn KatanaInitialStage<W, I> + Send + Sync>,
pub(crate) final_stage: Box<dyn KatanaFinalStage<W, I> + Sync + Send>,
pub(crate) stages: Vec<Box<dyn KatanaIntermediateStage<W> + Send + Sync>>,
pub(crate) post_finalization: Vec<Box<dyn KatanaPostFinalizationStage<I> + Send + Sync>>,
}
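/// Runs the full pipeline: initial stage, intermediate stages, final stage,
/// then post-finalization stages (e.g. alpha copy or injection).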
impl<W, I: Copy + Default> TransformExecutor<I> for Katana<W, I> {
fn transform(&self, src: &[I], dst: &mut [I]) -> Result<(), CmsError> {
let mut working_vec = self.initial_stage.to_pcs(src)?;
for stage in self.stages.iter() {
working_vec = stage.stage(&mut working_vec)?;
}
self.final_stage.to_output(&mut working_vec, dst)?;
for finalization in self.post_finalization.iter() {
finalization.finalize(src, dst)?;
}
Ok(())
}
}

View File

@@ -0,0 +1,62 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::KatanaIntermediateStage;
use crate::{CmsError, Lab, Xyz};
#[derive(Default)]
pub(crate) struct KatanaStageLabToXyz {}
impl KatanaIntermediateStage<f32> for KatanaStageLabToXyz {
fn stage(&self, input: &mut Vec<f32>) -> Result<Vec<f32>, CmsError> {
for dst in input.chunks_exact_mut(3) {
let lab = Lab::new(dst[0], dst[1], dst[2]);
let xyz = lab.to_pcs_xyz();
dst[0] = xyz.x;
dst[1] = xyz.y;
dst[2] = xyz.z;
}
Ok(std::mem::take(input))
}
}
#[derive(Default)]
pub(crate) struct KatanaStageXyzToLab {}
impl KatanaIntermediateStage<f32> for KatanaStageXyzToLab {
fn stage(&self, input: &mut Vec<f32>) -> Result<Vec<f32>, CmsError> {
for dst in input.chunks_exact_mut(3) {
let xyz = Xyz::new(dst[0], dst[1], dst[2]);
let lab = Lab::from_pcs_xyz(xyz);
dst[0] = lab.l;
dst[1] = lab.a;
dst[2] = lab.b;
}
Ok(std::mem::take(input))
}
}

View File

@@ -0,0 +1,223 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::pcs_stages::KatanaMatrixStage;
use crate::conversions::katana::{
KatanaDefaultIntermediate, KatanaFinalStage, KatanaIntermediateStage,
};
use crate::mlaf::mlaf;
use crate::{
CmsError, ColorProfile, GammaLutInterpolate, Layout, Matrix3f, PointeeSizeExpressible,
RenderingIntent, Rgb, TransformOptions, filmlike_clip,
};
use num_traits::AsPrimitive;
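/// Final stage mapping PCS-derived linear RGB to the destination encoding via
/// 65536-entry per-channel gamma LUTs, with film-like gamut clipping for all
/// intents except absolute colorimetric.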
pub(crate) struct KatanaXyzToRgbStage<T: Clone, const LAYOUT: u8> {
pub(crate) r_gamma: Box<[T; 65536]>,
pub(crate) g_gamma: Box<[T; 65536]>,
pub(crate) b_gamma: Box<[T; 65536]>,
pub(crate) intent: RenderingIntent,
pub(crate) bit_depth: usize,
pub(crate) gamma_lut: usize,
}
impl<T: Clone + AsPrimitive<f32> + PointeeSizeExpressible, const LAYOUT: u8>
KatanaFinalStage<f32, T> for KatanaXyzToRgbStage<T, LAYOUT>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
fn to_output(&self, src: &mut [f32], dst: &mut [T]) -> Result<(), CmsError> {
let dst_cn = Layout::from(LAYOUT);
let dst_channels = dst_cn.channels();
if src.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / 3;
let dst_chunks = dst.len() / dst_channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
let max_colors: T = (if T::FINITE {
((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
})
.as_();
let lut_cap = (self.gamma_lut - 1) as f32;
if self.intent != RenderingIntent::AbsoluteColorimetric {
for (src, dst) in src.chunks_exact(3).zip(dst.chunks_exact_mut(dst_channels)) {
let mut rgb = Rgb::new(src[0], src[1], src[2]);
if rgb.is_out_of_gamut() {
rgb = filmlike_clip(rgb);
}
let r = mlaf(0.5, rgb.r, lut_cap).min(lut_cap).max(0.) as u16;
let g = mlaf(0.5, rgb.g, lut_cap).min(lut_cap).max(0.) as u16;
let b = mlaf(0.5, rgb.b, lut_cap).min(lut_cap).max(0.) as u16;
dst[0] = self.r_gamma[r as usize];
dst[1] = self.g_gamma[g as usize];
dst[2] = self.b_gamma[b as usize];
if dst_cn == Layout::Rgba {
dst[3] = max_colors;
}
}
} else {
for (src, dst) in src.chunks_exact(3).zip(dst.chunks_exact_mut(dst_channels)) {
let rgb = Rgb::new(src[0], src[1], src[2]);
let r = mlaf(0.5, rgb.r, lut_cap).min(lut_cap).max(0.) as u16;
let g = mlaf(0.5, rgb.g, lut_cap).min(lut_cap).max(0.) as u16;
let b = mlaf(0.5, rgb.b, lut_cap).min(lut_cap).max(0.) as u16;
dst[0] = self.r_gamma[r as usize];
dst[1] = self.g_gamma[g as usize];
dst[2] = self.b_gamma[b as usize];
if dst_cn == Layout::Rgba {
dst[3] = max_colors;
}
}
}
Ok(())
}
}
pub(crate) struct KatanaXyzRgbState<T> {
pub(crate) stages: Vec<Box<dyn KatanaIntermediateStage<f32> + Send + Sync>>,
pub(crate) final_stage: Box<dyn KatanaFinalStage<f32, T> + Send + Sync>,
}
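/// Builds the final stage for matrix-shaper RGB destinations: undoes the
/// 32768/65535 PCS scaling, applies the XYZ -> RGB matrix, then encodes through
/// the per-channel gamma LUTs.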
pub(crate) fn katana_prepare_inverse_lut_rgb_xyz<
T: Copy
+ Default
+ AsPrimitive<f32>
+ Send
+ Sync
+ AsPrimitive<usize>
+ PointeeSizeExpressible
+ GammaLutInterpolate,
const BIT_DEPTH: usize,
const GAMMA_LUT: usize,
>(
dest: &ColorProfile,
dest_layout: Layout,
options: TransformOptions,
) -> Result<KatanaXyzRgbState<T>, CmsError>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
{
// if !T::FINITE {
// if let Some(extended_gamma) = dest.try_extended_gamma_evaluator() {
// let xyz_to_rgb = dest.rgb_to_xyz_matrix().inverse();
//
// let mut matrices = vec![Matrix3f {
// v: [
// [65535.0 / 32768.0, 0.0, 0.0],
// [0.0, 65535.0 / 32768.0, 0.0],
// [0.0, 0.0, 65535.0 / 32768.0],
// ],
// }];
//
// matrices.push(xyz_to_rgb.to_f32());
// let xyz_to_rgb_stage = XyzToRgbStageExtended::<T> {
// gamma_evaluator: extended_gamma,
// matrices,
// phantom_data: PhantomData,
// };
// xyz_to_rgb_stage.transform(lut)?;
// return Ok(());
// }
// }
let gamma_map_r = dest.build_gamma_table::<T, 65536, GAMMA_LUT, BIT_DEPTH>(
&dest.red_trc,
options.allow_use_cicp_transfer,
)?;
let gamma_map_g = dest.build_gamma_table::<T, 65536, GAMMA_LUT, BIT_DEPTH>(
&dest.green_trc,
options.allow_use_cicp_transfer,
)?;
let gamma_map_b = dest.build_gamma_table::<T, 65536, GAMMA_LUT, BIT_DEPTH>(
&dest.blue_trc,
options.allow_use_cicp_transfer,
)?;
let xyz_to_rgb = dest.rgb_to_xyz_matrix().inverse();
let mut matrices: Vec<Box<KatanaDefaultIntermediate>> =
vec![Box::new(KatanaMatrixStage::new(Matrix3f {
v: [
[65535.0 / 32768.0, 0.0, 0.0],
[0.0, 65535.0 / 32768.0, 0.0],
[0.0, 0.0, 65535.0 / 32768.0],
],
}))];
matrices.push(Box::new(KatanaMatrixStage::new(xyz_to_rgb.to_f32())));
match dest_layout {
Layout::Rgb => {
let xyz_to_rgb_stage = KatanaXyzToRgbStage::<T, { Layout::Rgb as u8 }> {
r_gamma: gamma_map_r,
g_gamma: gamma_map_g,
b_gamma: gamma_map_b,
intent: options.rendering_intent,
bit_depth: BIT_DEPTH,
gamma_lut: GAMMA_LUT,
};
Ok(KatanaXyzRgbState {
stages: matrices,
final_stage: Box::new(xyz_to_rgb_stage),
})
}
Layout::Rgba => {
let xyz_to_rgb_stage = KatanaXyzToRgbStage::<T, { Layout::Rgba as u8 }> {
r_gamma: gamma_map_r,
g_gamma: gamma_map_g,
b_gamma: gamma_map_b,
intent: options.rendering_intent,
bit_depth: BIT_DEPTH,
gamma_lut: GAMMA_LUT,
};
Ok(KatanaXyzRgbState {
stages: matrices,
final_stage: Box::new(xyz_to_rgb_stage),
})
}
Layout::Gray => unreachable!("Gray layout must not be called on Rgb/Rgba path"),
        Layout::GrayAlpha => unreachable!("GrayAlpha layout must not be called on Rgb/Rgba path"),
_ => unreachable!(
"layout {:?} should not be called on xyz->rgb path",
dest_layout
),
}
}

428
vendor/moxcms/src/conversions/lut3x3.rs vendored Normal file
View File

@@ -0,0 +1,428 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::{KatanaFinalStage, KatanaInitialStage};
use crate::err::MalformedSize;
use crate::profile::LutDataType;
use crate::safe_math::{SafeMul, SafePowi};
use crate::trc::lut_interp_linear_float;
use crate::{
CmsError, Cube, DataColorSpace, InterpolationMethod, PointeeSizeExpressible, Stage,
TransformOptions, Vector3f,
};
use num_traits::AsPrimitive;
#[derive(Default)]
struct Lut3x3 {
input: [Vec<f32>; 3],
clut: Vec<f32>,
grid_size: u8,
gamma: [Vec<f32>; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
#[derive(Default)]
struct KatanaLut3x3<T: Copy + Default> {
input: [Vec<f32>; 3],
clut: Vec<f32>,
grid_size: u8,
gamma: [Vec<f32>; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
_phantom: std::marker::PhantomData<T>,
bit_depth: usize,
}
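/// Parses a legacy 3 -> 3 LUT tag into linearization curves, the CLUT, and
/// gamma curves, validating every table length first.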
fn make_lut_3x3(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Lut3x3, CmsError> {
let clut_length: usize = (lut.num_clut_grid_points as usize)
.safe_powi(lut.num_input_channels as u32)?
.safe_mul(lut.num_output_channels as usize)?;
let lin_table = lut.input_table.to_clut_f32();
if lin_table.len() < lut.num_input_table_entries as usize * 3 {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: lin_table.len(),
expected: lut.num_input_table_entries as usize * 3,
}));
}
let lin_curve0 = lin_table[..lut.num_input_table_entries as usize].to_vec();
let lin_curve1 = lin_table
[lut.num_input_table_entries as usize..lut.num_input_table_entries as usize * 2]
.to_vec();
let lin_curve2 = lin_table
[lut.num_input_table_entries as usize * 2..lut.num_input_table_entries as usize * 3]
.to_vec();
let clut_table = lut.clut_table.to_clut_f32();
if clut_table.len() != clut_length {
return Err(CmsError::MalformedClut(MalformedSize {
size: clut_table.len(),
expected: clut_length,
}));
}
let gamma_curves = lut.output_table.to_clut_f32();
if gamma_curves.len() < lut.num_output_table_entries as usize * 3 {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: gamma_curves.len(),
expected: lut.num_output_table_entries as usize * 3,
}));
}
let gamma_curve0 = gamma_curves[..lut.num_output_table_entries as usize].to_vec();
let gamma_curve1 = gamma_curves
[lut.num_output_table_entries as usize..lut.num_output_table_entries as usize * 2]
.to_vec();
let gamma_curve2 = gamma_curves
[lut.num_output_table_entries as usize * 2..lut.num_output_table_entries as usize * 3]
.to_vec();
let transform = Lut3x3 {
input: [lin_curve0, lin_curve1, lin_curve2],
gamma: [gamma_curve0, gamma_curve1, gamma_curve2],
interpolation_method: options.interpolation_method,
clut: clut_table,
grid_size: lut.num_clut_grid_points,
pcs,
};
Ok(transform)
}
fn stage_lut_3x3(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Box<dyn Stage>, CmsError> {
let lut = make_lut_3x3(lut, options, pcs)?;
let transform = Lut3x3 {
input: lut.input,
gamma: lut.gamma,
interpolation_method: lut.interpolation_method,
clut: lut.clut,
grid_size: lut.grid_size,
pcs: lut.pcs,
};
Ok(Box::new(transform))
}
pub(crate) fn katana_input_stage_lut_3x3<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaInitialStage<f32, T> + Send + Sync>, CmsError>
where
f32: AsPrimitive<T>,
{
let lut = make_lut_3x3(lut, options, pcs)?;
let transform = KatanaLut3x3::<T> {
input: lut.input,
gamma: lut.gamma,
interpolation_method: lut.interpolation_method,
clut: lut.clut,
grid_size: lut.grid_size,
pcs: lut.pcs,
_phantom: std::marker::PhantomData,
bit_depth,
};
Ok(Box::new(transform))
}
pub(crate) fn katana_output_stage_lut_3x3<
T: Copy + Default + AsPrimitive<f32> + PointeeSizeExpressible + Send + Sync,
>(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaFinalStage<f32, T> + Send + Sync>, CmsError>
where
f32: AsPrimitive<T>,
{
let lut = make_lut_3x3(lut, options, pcs)?;
let transform = KatanaLut3x3::<T> {
input: lut.input,
gamma: lut.gamma,
interpolation_method: lut.interpolation_method,
clut: lut.clut,
grid_size: lut.grid_size,
pcs: lut.pcs,
_phantom: std::marker::PhantomData,
bit_depth,
};
Ok(Box::new(transform))
}
impl Lut3x3 {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> Vector3f>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let linearization_0 = &self.input[0];
let linearization_1 = &self.input[1];
let linearization_2 = &self.input[2];
for (dest, src) in dst.chunks_exact_mut(3).zip(src.chunks_exact(3)) {
debug_assert!(self.grid_size as i32 >= 1);
let linear_x = lut_interp_linear_float(src[0], linearization_0);
let linear_y = lut_interp_linear_float(src[1], linearization_1);
let linear_z = lut_interp_linear_float(src[2], linearization_2);
let clut = fetch(linear_x, linear_y, linear_z);
let pcs_x = lut_interp_linear_float(clut.v[0], &self.gamma[0]);
let pcs_y = lut_interp_linear_float(clut.v[1], &self.gamma[1]);
let pcs_z = lut_interp_linear_float(clut.v[2], &self.gamma[2]);
dest[0] = pcs_x;
dest[1] = pcs_y;
dest[2] = pcs_z;
}
Ok(())
}
}
impl Stage for Lut3x3 {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let l_tbl = Cube::new(&self.clut, self.grid_size as usize);
        // If the PCS is Lab or XYZ, trilinear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z| l_tbl.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z| l_tbl.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z| l_tbl.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z| l_tbl.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z| l_tbl.trilinear_vec3(x, y, z))?;
}
}
Ok(())
}
}
impl<T: Copy + Default + PointeeSizeExpressible + AsPrimitive<f32>> KatanaLut3x3<T>
where
f32: AsPrimitive<T>,
{
fn to_pcs_impl<Fetch: Fn(f32, f32, f32) -> Vector3f>(
&self,
input: &[T],
fetch: Fetch,
) -> Result<Vec<f32>, CmsError> {
if input.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let normalizing_value = if T::FINITE {
1.0 / ((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
let mut dst = vec![0.; input.len()];
let linearization_0 = &self.input[0];
let linearization_1 = &self.input[1];
let linearization_2 = &self.input[2];
for (dest, src) in dst.chunks_exact_mut(3).zip(input.chunks_exact(3)) {
let linear_x =
lut_interp_linear_float(src[0].as_() * normalizing_value, linearization_0);
let linear_y =
lut_interp_linear_float(src[1].as_() * normalizing_value, linearization_1);
let linear_z =
lut_interp_linear_float(src[2].as_() * normalizing_value, linearization_2);
let clut = fetch(linear_x, linear_y, linear_z);
let pcs_x = lut_interp_linear_float(clut.v[0], &self.gamma[0]);
let pcs_y = lut_interp_linear_float(clut.v[1], &self.gamma[1]);
let pcs_z = lut_interp_linear_float(clut.v[2], &self.gamma[2]);
dest[0] = pcs_x;
dest[1] = pcs_y;
dest[2] = pcs_z;
}
Ok(dst)
}
fn to_output<Fetch: Fn(f32, f32, f32) -> Vector3f>(
&self,
src: &[f32],
dst: &mut [T],
fetch: Fetch,
) -> Result<(), CmsError> {
if src.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() != src.len() {
return Err(CmsError::LaneSizeMismatch);
}
let norm_value = if T::FINITE {
((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
let linearization_0 = &self.input[0];
let linearization_1 = &self.input[1];
let linearization_2 = &self.input[2];
for (dest, src) in dst.chunks_exact_mut(3).zip(src.chunks_exact(3)) {
let linear_x = lut_interp_linear_float(src[0], linearization_0);
let linear_y = lut_interp_linear_float(src[1], linearization_1);
let linear_z = lut_interp_linear_float(src[2], linearization_2);
let clut = fetch(linear_x, linear_y, linear_z);
let pcs_x = lut_interp_linear_float(clut.v[0], &self.gamma[0]);
let pcs_y = lut_interp_linear_float(clut.v[1], &self.gamma[1]);
let pcs_z = lut_interp_linear_float(clut.v[2], &self.gamma[2]);
if T::FINITE {
dest[0] = (pcs_x * norm_value).round().max(0.0).min(norm_value).as_();
dest[1] = (pcs_y * norm_value).round().max(0.0).min(norm_value).as_();
dest[2] = (pcs_z * norm_value).round().max(0.0).min(norm_value).as_();
} else {
dest[0] = pcs_x.as_();
dest[1] = pcs_y.as_();
dest[2] = pcs_z.as_();
}
}
Ok(())
}
}
impl<T: Copy + Default + PointeeSizeExpressible + AsPrimitive<f32>> KatanaInitialStage<f32, T>
for KatanaLut3x3<T>
where
f32: AsPrimitive<T>,
{
fn to_pcs(&self, input: &[T]) -> Result<Vec<f32>, CmsError> {
let l_tbl = Cube::new(&self.clut, self.grid_size as usize);
        // If the PCS is Lab or XYZ, trilinear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.to_pcs_impl(input, |x, y, z| l_tbl.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.to_pcs_impl(input, |x, y, z| l_tbl.tetra_vec3(x, y, z))
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.to_pcs_impl(input, |x, y, z| l_tbl.pyramid_vec3(x, y, z))
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.to_pcs_impl(input, |x, y, z| l_tbl.prism_vec3(x, y, z))
}
InterpolationMethod::Linear => {
self.to_pcs_impl(input, |x, y, z| l_tbl.trilinear_vec3(x, y, z))
}
}
}
}
impl<T: Copy + Default + PointeeSizeExpressible + AsPrimitive<f32>> KatanaFinalStage<f32, T>
for KatanaLut3x3<T>
where
f32: AsPrimitive<T>,
{
fn to_output(&self, src: &mut [f32], dst: &mut [T]) -> Result<(), CmsError> {
let l_tbl = Cube::new(&self.clut, self.grid_size as usize);
        // If the PCS is Lab or XYZ, trilinear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.to_output(src, dst, |x, y, z| l_tbl.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.to_output(src, dst, |x, y, z| l_tbl.tetra_vec3(x, y, z))
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.to_output(src, dst, |x, y, z| l_tbl.pyramid_vec3(x, y, z))
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.to_output(src, dst, |x, y, z| l_tbl.prism_vec3(x, y, z))
}
InterpolationMethod::Linear => {
self.to_output(src, dst, |x, y, z| l_tbl.trilinear_vec3(x, y, z))
}
}
}
}
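/// Runs a 3 -> 3 LUT tag over a flat `f32` lane and returns the transformed
/// buffer.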
pub(crate) fn create_lut3x3(
lut: &LutDataType,
src: &[f32],
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Vec<f32>, CmsError> {
if lut.num_input_channels != 3 || lut.num_output_channels != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
let mut dest = vec![0.; src.len()];
let lut_stage = stage_lut_3x3(lut, options, pcs)?;
lut_stage.transform(src, &mut dest)?;
Ok(dest)
}

248
vendor/moxcms/src/conversions/lut3x4.rs vendored Normal file
View File

@@ -0,0 +1,248 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::profile::LutDataType;
use crate::safe_math::{SafeMul, SafePowi};
use crate::trc::lut_interp_linear_float;
use crate::{
CmsError, Cube, DataColorSpace, InterpolationMethod, MalformedSize, Stage, TransformOptions,
Vector4f,
};
use num_traits::AsPrimitive;
#[derive(Default)]
struct Lut3x4 {
input: [Vec<f32>; 3],
clut: Vec<f32>,
grid_size: u8,
gamma: [Vec<f32>; 4],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
fn make_lut_3x4(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Lut3x4, CmsError> {
let clut_length: usize = (lut.num_clut_grid_points as usize)
.safe_powi(lut.num_input_channels as u32)?
.safe_mul(lut.num_output_channels as usize)?;
let clut_table = lut.clut_table.to_clut_f32();
if clut_table.len() != clut_length {
return Err(CmsError::MalformedClut(MalformedSize {
size: clut_table.len(),
expected: clut_length,
}));
}
let linearization_table = lut.input_table.to_clut_f32();
if linearization_table.len() < lut.num_input_table_entries as usize * 3 {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: linearization_table.len(),
expected: lut.num_input_table_entries as usize * 3,
}));
}
let linear_curve0 = linearization_table[..lut.num_input_table_entries as usize].to_vec();
let linear_curve1 = linearization_table
[lut.num_input_table_entries as usize..lut.num_input_table_entries as usize * 2]
.to_vec();
let linear_curve2 = linearization_table
[lut.num_input_table_entries as usize * 2..lut.num_input_table_entries as usize * 3]
.to_vec();
let gamma_table = lut.output_table.to_clut_f32();
if gamma_table.len() < lut.num_output_table_entries as usize * 4 {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: gamma_table.len(),
expected: lut.num_output_table_entries as usize * 4,
}));
}
let gamma_curve0 = gamma_table[..lut.num_output_table_entries as usize].to_vec();
let gamma_curve1 = gamma_table
[lut.num_output_table_entries as usize..lut.num_output_table_entries as usize * 2]
.to_vec();
let gamma_curve2 = gamma_table
[lut.num_output_table_entries as usize * 2..lut.num_output_table_entries as usize * 3]
.to_vec();
let gamma_curve3 = gamma_table
[lut.num_output_table_entries as usize * 3..lut.num_output_table_entries as usize * 4]
.to_vec();
let transform = Lut3x4 {
input: [linear_curve0, linear_curve1, linear_curve2],
interpolation_method: options.interpolation_method,
clut: clut_table,
grid_size: lut.num_clut_grid_points,
pcs,
gamma: [gamma_curve0, gamma_curve1, gamma_curve2, gamma_curve3],
};
Ok(transform)
}
fn stage_lut_3x4(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Box<dyn Stage>, CmsError> {
// `make_lut_3x4` already yields a fully initialized stage; box it directly
// instead of copying every field into an identical struct.
let lut = make_lut_3x4(lut, options, pcs)?;
Ok(Box::new(lut))
}
impl Lut3x4 {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> Vector4f>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let linearization_0 = &self.input[0];
let linearization_1 = &self.input[1];
let linearization_2 = &self.input[2];
for (dest, src) in dst.chunks_exact_mut(4).zip(src.chunks_exact(3)) {
debug_assert!(self.grid_size as i32 >= 1);
let linear_x = lut_interp_linear_float(src[0], linearization_0);
let linear_y = lut_interp_linear_float(src[1], linearization_1);
let linear_z = lut_interp_linear_float(src[2], linearization_2);
let clut = fetch(linear_x, linear_y, linear_z);
let pcs_x = lut_interp_linear_float(clut.v[0], &self.gamma[0]);
let pcs_y = lut_interp_linear_float(clut.v[1], &self.gamma[1]);
let pcs_z = lut_interp_linear_float(clut.v[2], &self.gamma[2]);
let pcs_w = lut_interp_linear_float(clut.v[3], &self.gamma[3]);
dest[0] = pcs_x;
dest[1] = pcs_y;
dest[2] = pcs_z;
dest[3] = pcs_w;
}
Ok(())
}
}
impl Stage for Lut3x4 {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let l_tbl = Cube::new(&self.clut, self.grid_size as usize);
// If the PCS is Lab or XYZ, trilinear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z| l_tbl.trilinear_vec4(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z| l_tbl.tetra_vec4(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z| l_tbl.pyramid_vec4(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z| l_tbl.prism_vec4(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z| l_tbl.trilinear_vec4(x, y, z))?;
}
}
Ok(())
}
}
pub(crate) fn create_lut3_samples<T: Copy + 'static, const SAMPLES: usize>() -> Vec<T>
where
u32: AsPrimitive<T>,
{
let lut_size: u32 = (3 * SAMPLES * SAMPLES * SAMPLES) as u32;
assert!(SAMPLES >= 1);
let mut src = Vec::with_capacity(lut_size as usize);
for x in 0..SAMPLES as u32 {
for y in 0..SAMPLES as u32 {
for z in 0..SAMPLES as u32 {
src.push(x.as_());
src.push(y.as_());
src.push(z.as_());
}
}
}
src
}
pub(crate) fn create_lut3_samples_norm<const SAMPLES: usize>() -> Vec<f32> {
let lut_size: u32 = (3 * SAMPLES * SAMPLES * SAMPLES) as u32;
// At least two samples are required so the normalization scale below is finite.
assert!(SAMPLES >= 2);
let scale = 1. / (SAMPLES as f32 - 1.0);
let mut src = Vec::with_capacity(lut_size as usize);
for x in 0..SAMPLES as u32 {
for y in 0..SAMPLES as u32 {
for z in 0..SAMPLES as u32 {
src.push(x as f32 * scale);
src.push(y as f32 * scale);
src.push(z as f32 * scale);
}
}
}
src
}
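// A minimal test sketch (editorial addition, not part of the vendored crate):
// for SAMPLES = 2 the normalized grid should enumerate the unit cube from
// (0, 0, 0) to (1, 1, 1) in x-major order.
#[cfg(test)]
mod lut3_sample_tests {
use super::*;
#[test]
fn norm_samples_span_unit_cube() {
let s = create_lut3_samples_norm::<2>();
assert_eq!(s.len(), 3 * 2 * 2 * 2);
assert_eq!(&s[..3], &[0.0, 0.0, 0.0]);
assert_eq!(&s[s.len() - 3..], &[1.0, 1.0, 1.0]);
}
}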
pub(crate) fn create_lut3x4(
lut: &LutDataType,
src: &[f32],
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Vec<f32>, CmsError> {
if lut.num_input_channels != 3 || lut.num_output_channels != 4 {
return Err(CmsError::UnsupportedProfileConnection);
}
let mut dest = vec![0.; (src.len() / 3) * 4];
let lut_stage = stage_lut_3x4(lut, options, pcs)?;
lut_stage.transform(src, &mut dest)?;
Ok(dest)
}

392
vendor/moxcms/src/conversions/lut4.rs vendored Normal file
View File

@@ -0,0 +1,392 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::katana::KatanaInitialStage;
use crate::profile::LutDataType;
use crate::safe_math::{SafeMul, SafePowi};
use crate::trc::lut_interp_linear_float;
use crate::{
CmsError, DataColorSpace, Hypercube, InterpolationMethod, MalformedSize,
PointeeSizeExpressible, Stage, TransformOptions, Vector3f,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
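/// A 4-input/3-output LUT pipeline (e.g. CMYK -> Lab/XYZ): four input
/// linearization curves, a 4D CLUT, then three output curves.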
#[allow(unused)]
#[derive(Default)]
struct Lut4x3 {
linearization: [Vec<f32>; 4],
clut: Vec<f32>,
grid_size: u8,
output: [Vec<f32>; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
#[allow(unused)]
#[derive(Default)]
struct KatanaLut4x3<T: Copy + PointeeSizeExpressible + AsPrimitive<f32>> {
linearization: [Vec<f32>; 4],
clut: Vec<f32>,
grid_size: u8,
output: [Vec<f32>; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
_phantom: PhantomData<T>,
bit_depth: usize,
}
#[allow(unused)]
impl Lut4x3 {
fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> Vector3f>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let linearization_0 = &self.linearization[0];
let linearization_1 = &self.linearization[1];
let linearization_2 = &self.linearization[2];
let linearization_3 = &self.linearization[3];
for (dest, src) in dst.chunks_exact_mut(3).zip(src.chunks_exact(4)) {
debug_assert!(self.grid_size as i32 >= 1);
let linear_x = lut_interp_linear_float(src[0], linearization_0);
let linear_y = lut_interp_linear_float(src[1], linearization_1);
let linear_z = lut_interp_linear_float(src[2], linearization_2);
let linear_w = lut_interp_linear_float(src[3], linearization_3);
let clut = fetch(linear_x, linear_y, linear_z, linear_w);
let pcs_x = lut_interp_linear_float(clut.v[0], &self.output[0]);
let pcs_y = lut_interp_linear_float(clut.v[1], &self.output[1]);
let pcs_z = lut_interp_linear_float(clut.v[2], &self.output[2]);
dest[0] = pcs_x;
dest[1] = pcs_y;
dest[2] = pcs_z;
}
Ok(())
}
}
macro_rules! define_lut4_dispatch {
($dispatcher: ident) => {
impl Stage for $dispatcher {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let l_tbl = Hypercube::new(&self.clut, self.grid_size as usize);
// If the source PCS is Lab or XYZ, quadlinear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self
.transform_impl(src, dst, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.prism_vec3(x, y, z, w))?
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| {
l_tbl.quadlinear_vec3(x, y, z, w)
})?
}
}
Ok(())
}
}
};
}
impl<T: Copy + PointeeSizeExpressible + AsPrimitive<f32>> KatanaLut4x3<T> {
fn to_pcs_impl<Fetch: Fn(f32, f32, f32, f32) -> Vector3f>(
&self,
input: &[T],
fetch: Fetch,
) -> Result<Vec<f32>, CmsError> {
if input.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
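// Integer-backed sample types (T::FINITE) are rescaled from
// [0, 2^bit_depth - 1] into [0, 1] (e.g. 1/255 for 8-bit input); float
// samples are assumed to already be normalized.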
let norm_value = if T::FINITE {
1.0 / ((1u32 << self.bit_depth) - 1) as f32
} else {
1.0
};
let mut dst = vec![0.; (input.len() / 4) * 3];
let linearization_0 = &self.linearization[0];
let linearization_1 = &self.linearization[1];
let linearization_2 = &self.linearization[2];
let linearization_3 = &self.linearization[3];
for (dest, src) in dst.chunks_exact_mut(3).zip(input.chunks_exact(4)) {
let linear_x = lut_interp_linear_float(src[0].as_() * norm_value, linearization_0);
let linear_y = lut_interp_linear_float(src[1].as_() * norm_value, linearization_1);
let linear_z = lut_interp_linear_float(src[2].as_() * norm_value, linearization_2);
let linear_w = lut_interp_linear_float(src[3].as_() * norm_value, linearization_3);
let clut = fetch(linear_x, linear_y, linear_z, linear_w);
let pcs_x = lut_interp_linear_float(clut.v[0], &self.output[0]);
let pcs_y = lut_interp_linear_float(clut.v[1], &self.output[1]);
let pcs_z = lut_interp_linear_float(clut.v[2], &self.output[2]);
dest[0] = pcs_x;
dest[1] = pcs_y;
dest[2] = pcs_z;
}
Ok(dst)
}
}
impl<T: Copy + PointeeSizeExpressible + AsPrimitive<f32>> KatanaInitialStage<f32, T>
for KatanaLut4x3<T>
{
fn to_pcs(&self, input: &[T]) -> Result<Vec<f32>, CmsError> {
if input.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let l_tbl = Hypercube::new(&self.clut, self.grid_size as usize);
// If the source PCS is Lab or XYZ, quadlinear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.to_pcs_impl(input, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.to_pcs_impl(input, |x, y, z, w| l_tbl.tetra_vec3(x, y, z, w))
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.to_pcs_impl(input, |x, y, z, w| l_tbl.pyramid_vec3(x, y, z, w))
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.to_pcs_impl(input, |x, y, z, w| l_tbl.prism_vec3(x, y, z, w))
}
InterpolationMethod::Linear => {
self.to_pcs_impl(input, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w))
}
}
}
}
define_lut4_dispatch!(Lut4x3);
fn make_lut_4x3(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Lut4x3, CmsError> {
// There are four possible cases:
// - All curves are non-linear
// - Linearization curves are non-linear, but gamma is linear
// - Gamma curves are non-linear, but linearization is linear
// - All curves linear
let clut_length: usize = (lut.num_clut_grid_points as usize)
.safe_powi(lut.num_input_channels as u32)?
.safe_mul(lut.num_output_channels as usize)?;
let clut_table = lut.clut_table.to_clut_f32();
if clut_table.len() != clut_length {
return Err(CmsError::MalformedClut(MalformedSize {
size: clut_table.len(),
expected: clut_length,
}));
}
let linearization_table = lut.input_table.to_clut_f32();
if linearization_table.len() < lut.num_input_table_entries as usize * 4 {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: linearization_table.len(),
expected: lut.num_input_table_entries as usize * 4,
}));
}
let lin_curve0 = linearization_table[0..lut.num_input_table_entries as usize].to_vec();
let lin_curve1 = linearization_table
[lut.num_input_table_entries as usize..lut.num_input_table_entries as usize * 2]
.to_vec();
let lin_curve2 = linearization_table
[lut.num_input_table_entries as usize * 2..lut.num_input_table_entries as usize * 3]
.to_vec();
let lin_curve3 = linearization_table
[lut.num_input_table_entries as usize * 3..lut.num_input_table_entries as usize * 4]
.to_vec();
let gamma_table = lut.output_table.to_clut_f32();
if gamma_table.len() < lut.num_output_table_entries as usize * 3 {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: gamma_table.len(),
expected: lut.num_output_table_entries as usize * 3,
}));
}
let gamma_curve0 = gamma_table[..lut.num_output_table_entries as usize].to_vec();
let gamma_curve1 = gamma_table
[lut.num_output_table_entries as usize..lut.num_output_table_entries as usize * 2]
.to_vec();
let gamma_curve2 = gamma_table
[lut.num_output_table_entries as usize * 2..lut.num_output_table_entries as usize * 3]
.to_vec();
let transform = Lut4x3 {
linearization: [lin_curve0, lin_curve1, lin_curve2, lin_curve3],
interpolation_method: options.interpolation_method,
pcs,
clut: clut_table,
grid_size: lut.num_clut_grid_points,
output: [gamma_curve0, gamma_curve1, gamma_curve2],
};
Ok(transform)
}
fn stage_lut_4x3(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Box<dyn Stage>, CmsError> {
let lut = make_lut_4x3(lut, options, pcs)?;
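// Dispatch order: a compile-time NEON build wins outright; otherwise AVX2+FMA
// is tried via runtime detection on x86_64, with the scalar Lut4x3 as the
// final fallback.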
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
{
use crate::conversions::neon::Lut4x3Neon;
let transform = Lut4x3Neon {
linearization: lut.linearization,
interpolation_method: lut.interpolation_method,
pcs: lut.pcs,
clut: lut.clut,
grid_size: lut.grid_size,
output: lut.output,
};
Ok(Box::new(transform))
}
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
{
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::avx::Lut4x3AvxFma;
if std::arch::is_x86_feature_detected!("avx2")
&& std::arch::is_x86_feature_detected!("fma")
{
let transform = Lut4x3AvxFma {
linearization: lut.linearization,
interpolation_method: lut.interpolation_method,
pcs: lut.pcs,
clut: lut.clut,
grid_size: lut.grid_size,
output: lut.output,
};
return Ok(Box::new(transform));
}
}
// On the scalar fallback path the prepared `Lut4x3` can be boxed directly.
Ok(Box::new(lut))
}
}
pub(crate) fn katana_input_stage_lut_4x3<
T: Copy + PointeeSizeExpressible + AsPrimitive<f32> + Send + Sync,
>(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
bit_depth: usize,
) -> Result<Box<dyn KatanaInitialStage<f32, T> + Send + Sync>, CmsError> {
// There is 4 possible cases:
// - All curves are non-linear
// - Linearization curves are non-linear, but gamma is linear
// - Gamma curves are non-linear, but linearization is linear
// - All curves linear
let lut = make_lut_4x3(lut, options, pcs)?;
let transform = KatanaLut4x3::<T> {
linearization: lut.linearization,
interpolation_method: lut.interpolation_method,
pcs: lut.pcs,
clut: lut.clut,
grid_size: lut.grid_size,
output: lut.output,
_phantom: PhantomData,
bit_depth,
};
Ok(Box::new(transform))
}
pub(crate) fn create_lut4_norm_samples<const SAMPLES: usize>() -> Vec<f32> {
let lut_size: u32 = (4 * SAMPLES * SAMPLES * SAMPLES * SAMPLES) as u32;
let mut src = Vec::with_capacity(lut_size as usize);
let recpeq = 1f32 / (SAMPLES - 1) as f32;
for k in 0..SAMPLES {
for c in 0..SAMPLES {
for m in 0..SAMPLES {
for y in 0..SAMPLES {
src.push(c as f32 * recpeq);
src.push(m as f32 * recpeq);
src.push(y as f32 * recpeq);
src.push(k as f32 * recpeq);
}
}
}
}
src
}
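// A minimal test sketch (editorial addition, not part of the vendored crate):
// although `k` is the outermost loop, each emitted quad is stored in
// C, M, Y, K channel order, spanning the unit hypercube.
#[cfg(test)]
mod lut4_sample_tests {
use super::*;
#[test]
fn norm_samples_span_unit_hypercube() {
let s = create_lut4_norm_samples::<2>();
assert_eq!(s.len(), 4 * 2 * 2 * 2 * 2);
assert_eq!(&s[..4], &[0.0, 0.0, 0.0, 0.0]);
assert_eq!(&s[s.len() - 4..], &[1.0, 1.0, 1.0, 1.0]);
}
}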
pub(crate) fn create_lut4<const SAMPLES: usize>(
lut: &LutDataType,
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Vec<f32>, CmsError> {
if lut.num_input_channels != 4 {
return Err(CmsError::UnsupportedProfileConnection);
}
let lut_size: u32 = (4 * SAMPLES * SAMPLES * SAMPLES * SAMPLES) as u32;
let src = create_lut4_norm_samples::<SAMPLES>();
let mut dest = vec![0.; (lut_size as usize) / 4 * 3];
let lut_stage = stage_lut_4x3(lut, options, pcs)?;
lut_stage.transform(&src, &mut dest)?;
Ok(dest)
}

View File

@@ -0,0 +1,802 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::lut3x3::{
create_lut3x3, katana_input_stage_lut_3x3, katana_output_stage_lut_3x3,
};
use crate::conversions::lut3x4::{create_lut3_samples_norm, create_lut3x4};
use crate::conversions::lut4::{create_lut4, create_lut4_norm_samples, katana_input_stage_lut_4x3};
use crate::conversions::mab::{prepare_mab_3x3, prepare_mba_3x3};
use crate::conversions::transform_lut3_to_4::make_transform_3x4;
use crate::mlaf::mlaf;
use crate::{
CmsError, ColorProfile, DataColorSpace, InPlaceStage, Layout, LutWarehouse, Matrix3f,
ProfileVersion, TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
pub(crate) struct MatrixStage {
pub(crate) matrices: Vec<Matrix3f>,
}
impl InPlaceStage for MatrixStage {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
if !self.matrices.is_empty() {
let m = self.matrices[0];
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(x * m.v[0][0], y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(x * m.v[1][0], y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(x * m.v[2][0], y, m.v[2][1]), z, m.v[2][2]);
}
}
for m in self.matrices.iter().skip(1) {
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(x * m.v[0][0], y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(x * m.v[1][0], y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(x * m.v[2][0], y, m.v[2][1]), z, m.v[2][2]);
}
}
Ok(())
}
}
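// A minimal test sketch (editorial addition, not part of the vendored crate):
// the identity matrix must be a no-op, since every term is multiplied by
// exactly 1.0 or 0.0 and the fused multiply-add then preserves values exactly.
#[cfg(test)]
mod matrix_stage_tests {
use super::*;
#[test]
fn identity_matrix_is_a_no_op() {
let stage = MatrixStage {
matrices: vec![Matrix3f::IDENTITY],
};
let mut buf = vec![0.25f32, 0.5, 0.75];
stage.transform(&mut buf).unwrap();
assert_eq!(buf, vec![0.25f32, 0.5, 0.75]);
}
}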
pub(crate) const LUT_SAMPLING: u16 = 255;
pub(crate) trait Lut3x3Factory {
fn make_transform_3x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>;
}
pub(crate) trait Lut4x3Factory {
fn make_transform_4x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Sync + Send>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>;
}
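// ICC v2 encodes 16-bit Lab with a full-scale value of 0xFF00 (65280), while
// v4 uses the full 0xFFFF (65535). The two helpers below rescale PCS values
// between those encodings: 65280/65535 toward v2, 65535/65280 back to v4.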
fn pcs_lab_v4_to_v2(profile: &ColorProfile, lut: &mut [f32]) {
if profile.pcs == DataColorSpace::Lab
&& profile.version_internal <= ProfileVersion::V4_0
&& lut.len() % 3 == 0
{
assert_eq!(
lut.len() % 3,
0,
"Lut {:?} is not a multiple of 3, this should not happen for lab",
lut.len()
);
let v_mat = vec![Matrix3f {
v: [
[65280.0 / 65535.0, 0f32, 0f32],
[0f32, 65280.0 / 65535.0, 0f32],
[0f32, 0f32, 65280.0 / 65535.0f32],
],
}];
let stage = MatrixStage { matrices: v_mat };
stage.transform(lut).unwrap();
}
}
fn pcs_lab_v2_to_v4(profile: &ColorProfile, lut: &mut [f32]) {
if profile.pcs == DataColorSpace::Lab
&& profile.version_internal <= ProfileVersion::V4_0
&& lut.len() % 3 == 0
{
assert_eq!(
lut.len() % 3,
0,
"Lut {:?} is not a multiple of 3, this should not happen for lab",
lut.len()
);
let v_mat = vec![Matrix3f {
v: [
[65535.0 / 65280.0f32, 0f32, 0f32],
[0f32, 65535.0f32 / 65280.0f32, 0f32],
[0f32, 0f32, 65535.0f32 / 65280.0f32],
],
}];
let stage = MatrixStage { matrices: v_mat };
stage.transform(lut).unwrap();
}
}
macro_rules! make_transform_3x3_fn {
($method_name: ident, $exec_impl: ident) => {
fn $method_name<
T: Copy
+ Default
+ AsPrimitive<f32>
+ Send
+ Sync
+ AsPrimitive<usize>
+ PointeeSizeExpressible,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
match src_layout {
Layout::Rgb => match dst_layout {
Layout::Rgb => $exec_impl::make_transform_3x3::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgb as u8 },
GRID_SIZE,
BIT_DEPTH,
>(lut, options, color_space, is_linear),
Layout::Rgba => $exec_impl::make_transform_3x3::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgba as u8 },
GRID_SIZE,
BIT_DEPTH,
>(lut, options, color_space, is_linear),
_ => unimplemented!(),
},
Layout::Rgba => match dst_layout {
Layout::Rgb => $exec_impl::make_transform_3x3::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgb as u8 },
GRID_SIZE,
BIT_DEPTH,
>(lut, options, color_space, is_linear),
Layout::Rgba => $exec_impl::make_transform_3x3::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgba as u8 },
GRID_SIZE,
BIT_DEPTH,
>(lut, options, color_space, is_linear),
_ => unimplemented!(),
},
_ => unimplemented!(),
}
}
};
}
macro_rules! make_transform_4x3_fn {
($method_name: ident, $exec_name: ident) => {
fn $method_name<
T: Copy
+ Default
+ AsPrimitive<f32>
+ Send
+ Sync
+ AsPrimitive<usize>
+ PointeeSizeExpressible,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
dst_layout: Layout,
lut: Vec<f32>,
options: TransformOptions,
data_color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
match dst_layout {
Layout::Rgb => $exec_name::make_transform_4x3::<
T,
{ Layout::Rgb as u8 },
GRID_SIZE,
BIT_DEPTH,
>(lut, options, data_color_space, is_linear),
Layout::Rgba => $exec_name::make_transform_4x3::<
T,
{ Layout::Rgba as u8 },
GRID_SIZE,
BIT_DEPTH,
>(lut, options, data_color_space, is_linear),
_ => unimplemented!(),
}
}
};
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
use crate::conversions::neon::NeonLut3x3Factory;
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
make_transform_3x3_fn!(make_transformer_3x3, NeonLut3x3Factory);
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
use crate::conversions::transform_lut3_to_3::DefaultLut3x3Factory;
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
make_transform_3x3_fn!(make_transformer_3x3, DefaultLut3x3Factory);
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
use crate::conversions::avx::AvxLut3x3Factory;
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
make_transform_3x3_fn!(make_transformer_3x3_avx_fma, AvxLut3x3Factory);
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
use crate::conversions::sse::SseLut3x3Factory;
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
make_transform_3x3_fn!(make_transformer_3x3_sse41, SseLut3x3Factory);
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
use crate::conversions::avx::AvxLut4x3Factory;
use crate::conversions::interpolator::LutBarycentricReduction;
use crate::conversions::katana::{
Katana, KatanaDefaultIntermediate, KatanaInitialStage, KatanaPostFinalizationStage,
KatanaStageLabToXyz, KatanaStageXyzToLab, katana_create_rgb_lin_lut, katana_pcs_lab_v2_to_v4,
katana_pcs_lab_v4_to_v2, katana_prepare_inverse_lut_rgb_xyz, multi_dimensional_3x3_to_device,
multi_dimensional_3x3_to_pcs, multi_dimensional_4x3_to_pcs,
};
use crate::conversions::mab4x3::prepare_mab_4x3;
use crate::conversions::mba3x4::prepare_mba_3x4;
use crate::conversions::md_luts_factory::{do_any_to_any, prepare_alpha_finalizer};
// use crate::conversions::bpc::compensate_bpc_in_lut;
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
make_transform_4x3_fn!(make_transformer_4x3_avx_fma, AvxLut4x3Factory);
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
use crate::conversions::sse::SseLut4x3Factory;
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
make_transform_4x3_fn!(make_transformer_4x3_sse41, SseLut4x3Factory);
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
use crate::conversions::transform_lut4_to_3::DefaultLut4x3Factory;
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
make_transform_4x3_fn!(make_transformer_4x3, DefaultLut4x3Factory);
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
use crate::conversions::neon::NeonLut4x3Factory;
use crate::conversions::prelude_lut_xyz_rgb::{create_rgb_lin_lut, prepare_inverse_lut_rgb_xyz};
use crate::conversions::xyz_lab::{StageLabToXyz, StageXyzToLab};
use crate::transform::PointeeSizeExpressible;
use crate::trc::GammaLutInterpolate;
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
make_transform_4x3_fn!(make_transformer_4x3, NeonLut4x3Factory);
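// `make_lut_transform` below picks one of four paths: 4-channel -> 3-channel
// (CMYK/Color4 to RGB/Lab), 3-channel -> 4-channel, 3-channel -> 3-channel,
// or the generic any-to-any fallback. The "katana" variants are the slow,
// high-accuracy routes taken when curve analysis finds stages the sampled
// LUT fast paths cannot represent.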
#[inline(never)]
#[cold]
pub(crate) fn make_lut_transform<
T: Copy
+ Default
+ AsPrimitive<f32>
+ Send
+ Sync
+ AsPrimitive<usize>
+ PointeeSizeExpressible
+ GammaLutInterpolate,
const BIT_DEPTH: usize,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
>(
src_layout: Layout,
source: &ColorProfile,
dst_layout: Layout,
dest: &ColorProfile,
options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
if (source.color_space == DataColorSpace::Cmyk || source.color_space == DataColorSpace::Color4)
&& (dest.color_space == DataColorSpace::Rgb || dest.color_space == DataColorSpace::Lab)
{
source.color_space.check_layout(src_layout)?;
dest.color_space.check_layout(dst_layout)?;
if source.pcs != DataColorSpace::Xyz && source.pcs != DataColorSpace::Lab {
return Err(CmsError::UnsupportedProfileConnection);
}
if dest.pcs != DataColorSpace::Lab && dest.pcs != DataColorSpace::Xyz {
return Err(CmsError::UnsupportedProfileConnection);
}
const GRID_SIZE: usize = 17;
let is_katana_required_for_source = source
.get_device_to_pcs(options.rendering_intent)
.ok_or(CmsError::UnsupportedLutRenderingIntent(
source.rendering_intent,
))
.map(|x| x.is_katana_required())?;
let is_katana_required_for_destination =
if dest.is_matrix_shaper() || dest.pcs == DataColorSpace::Xyz {
false
} else if dest.pcs == DataColorSpace::Lab {
dest.get_pcs_to_device(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)
.map(|x| x.is_katana_required())?
} else {
return Err(CmsError::UnsupportedProfileConnection);
};
if is_katana_required_for_source || is_katana_required_for_destination {
let initial_stage: Box<dyn KatanaInitialStage<f32, T> + Send + Sync> =
match source.get_device_to_pcs(options.rendering_intent).ok_or(
CmsError::UnsupportedLutRenderingIntent(source.rendering_intent),
)? {
LutWarehouse::Lut(lut) => {
katana_input_stage_lut_4x3::<T>(lut, options, source.pcs, BIT_DEPTH)?
}
LutWarehouse::Multidimensional(mab) => {
multi_dimensional_4x3_to_pcs::<T>(mab, options, source.pcs, BIT_DEPTH)?
}
};
let mut stages = Vec::new();
stages.push(katana_pcs_lab_v2_to_v4(source));
if source.pcs == DataColorSpace::Lab {
stages.push(Box::new(KatanaStageLabToXyz::default()));
}
if dest.pcs == DataColorSpace::Lab {
stages.push(Box::new(KatanaStageXyzToLab::default()));
}
stages.push(katana_pcs_lab_v4_to_v2(dest));
let final_stage = if dest.has_pcs_to_device_lut() {
let pcs_to_device = dest
.get_pcs_to_device(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)?;
match pcs_to_device {
LutWarehouse::Lut(lut) => {
katana_output_stage_lut_3x3::<T>(lut, options, dest.pcs, BIT_DEPTH)?
}
LutWarehouse::Multidimensional(mab) => {
multi_dimensional_3x3_to_device::<T>(mab, options, dest.pcs, BIT_DEPTH)?
}
}
} else if dest.is_matrix_shaper() {
let state = katana_prepare_inverse_lut_rgb_xyz::<T, BIT_DEPTH, GAMMA_LUT>(
dest, dst_layout, options,
)?;
stages.extend(state.stages);
state.final_stage
} else {
return Err(CmsError::UnsupportedProfileConnection);
};
let mut post_finalization: Vec<Box<dyn KatanaPostFinalizationStage<T> + Send + Sync>> =
Vec::new();
if let Some(stage) =
prepare_alpha_finalizer::<T>(src_layout, source, dst_layout, dest, BIT_DEPTH)
{
post_finalization.push(stage);
}
return Ok(Box::new(Katana::<f32, T> {
initial_stage,
final_stage,
stages,
post_finalization,
}));
}
let mut lut = match source.get_device_to_pcs(options.rendering_intent).ok_or(
CmsError::UnsupportedLutRenderingIntent(source.rendering_intent),
)? {
LutWarehouse::Lut(lut) => create_lut4::<GRID_SIZE>(lut, options, source.pcs)?,
LutWarehouse::Multidimensional(m_curves) => {
let mut samples = create_lut4_norm_samples::<GRID_SIZE>();
prepare_mab_4x3(m_curves, &mut samples, options, source.pcs)?
}
};
pcs_lab_v2_to_v4(source, &mut lut);
if source.pcs == DataColorSpace::Lab {
let lab_to_xyz_stage = StageLabToXyz::default();
lab_to_xyz_stage.transform(&mut lut)?;
}
// if source.color_space == DataColorSpace::Cmyk
// && (options.rendering_intent == RenderingIntent::Perceptual
// || options.rendering_intent == RenderingIntent::RelativeColorimetric)
// && options.black_point_compensation
// {
// if let (Some(src_bp), Some(dst_bp)) = (
// source.detect_black_point::<GRID_SIZE>(&lut),
// dest.detect_black_point::<GRID_SIZE>(&lut),
// ) {
// compensate_bpc_in_lut(&mut lut, src_bp, dst_bp);
// }
// }
if dest.pcs == DataColorSpace::Lab {
let lab_to_xyz_stage = StageXyzToLab::default();
lab_to_xyz_stage.transform(&mut lut)?;
}
pcs_lab_v4_to_v2(dest, &mut lut);
if dest.pcs == DataColorSpace::Xyz {
if dest.is_matrix_shaper() {
prepare_inverse_lut_rgb_xyz::<T, BIT_DEPTH, GAMMA_LUT>(dest, &mut lut, options)?;
} else {
return Err(CmsError::UnsupportedProfileConnection);
}
} else if dest.pcs == DataColorSpace::Lab {
let pcs_to_device = dest
.get_pcs_to_device(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)?;
match pcs_to_device {
LutWarehouse::Lut(lut_data_type) => {
lut = create_lut3x3(lut_data_type, &lut, options, dest.pcs)?
}
LutWarehouse::Multidimensional(mab) => {
prepare_mba_3x3(mab, &mut lut, options, dest.pcs)?
}
}
}
let is_dest_linear_profile = dest.color_space == DataColorSpace::Rgb
&& dest.is_matrix_shaper()
&& dest.is_linear_matrix_shaper();
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("fma")
{
return Ok(make_transformer_4x3_avx_fma::<T, GRID_SIZE, BIT_DEPTH>(
dst_layout,
lut,
options,
dest.color_space,
is_dest_linear_profile,
));
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
if std::arch::is_x86_feature_detected!("sse4.1") {
return Ok(make_transformer_4x3_sse41::<T, GRID_SIZE, BIT_DEPTH>(
dst_layout,
lut,
options,
dest.color_space,
is_dest_linear_profile,
));
}
Ok(make_transformer_4x3::<T, GRID_SIZE, BIT_DEPTH>(
dst_layout,
lut,
options,
dest.color_space,
is_dest_linear_profile,
))
} else if (source.color_space == DataColorSpace::Rgb
|| source.color_space == DataColorSpace::Lab)
&& (dest.color_space == DataColorSpace::Cmyk || dest.color_space == DataColorSpace::Color4)
{
source.color_space.check_layout(src_layout)?;
dest.color_space.check_layout(dst_layout)?;
if source.pcs != DataColorSpace::Xyz && source.pcs != DataColorSpace::Lab {
return Err(CmsError::UnsupportedProfileConnection);
}
const GRID_SIZE: usize = 33;
let mut lut: Vec<f32>;
if source.has_device_to_pcs_lut() {
let device_to_pcs = source
.get_device_to_pcs(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)?;
lut = create_lut3_samples_norm::<GRID_SIZE>();
match device_to_pcs {
LutWarehouse::Lut(lut_data_type) => {
lut = create_lut3x3(lut_data_type, &lut, options, source.pcs)?;
}
LutWarehouse::Multidimensional(mab) => {
prepare_mab_3x3(mab, &mut lut, options, source.pcs)?
}
}
} else if source.is_matrix_shaper() {
lut = create_rgb_lin_lut::<T, BIT_DEPTH, LINEAR_CAP, GRID_SIZE>(source, options)?;
} else {
return Err(CmsError::UnsupportedProfileConnection);
}
pcs_lab_v2_to_v4(source, &mut lut);
if source.pcs == DataColorSpace::Xyz && dest.pcs == DataColorSpace::Lab {
let xyz_to_lab = StageXyzToLab::default();
xyz_to_lab.transform(&mut lut)?;
} else if source.pcs == DataColorSpace::Lab && dest.pcs == DataColorSpace::Xyz {
let lab_to_xyz_stage = StageLabToXyz::default();
lab_to_xyz_stage.transform(&mut lut)?;
}
pcs_lab_v4_to_v2(dest, &mut lut);
let lut = match dest
.get_pcs_to_device(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)?
{
LutWarehouse::Lut(lut_type) => create_lut3x4(lut_type, &lut, options, dest.pcs)?,
LutWarehouse::Multidimensional(m_curves) => {
prepare_mba_3x4(m_curves, &mut lut, options, dest.pcs)?
}
};
let is_dest_linear_profile = dest.color_space == DataColorSpace::Rgb
&& dest.is_matrix_shaper()
&& dest.is_linear_matrix_shaper();
Ok(make_transform_3x4::<T, GRID_SIZE, BIT_DEPTH>(
src_layout,
lut,
options,
dest.color_space,
is_dest_linear_profile,
))
} else if (source.color_space.is_three_channels()) && (dest.color_space.is_three_channels()) {
source.color_space.check_layout(src_layout)?;
dest.color_space.check_layout(dst_layout)?;
const GRID_SIZE: usize = 33;
let is_katana_required_for_source = if source.is_matrix_shaper() {
false
} else {
source
.get_device_to_pcs(options.rendering_intent)
.ok_or(CmsError::UnsupportedLutRenderingIntent(
source.rendering_intent,
))
.map(|x| x.is_katana_required())?
};
let is_katana_required_for_destination =
if dest.is_matrix_shaper() || dest.pcs == DataColorSpace::Xyz {
false
} else if dest.pcs == DataColorSpace::Lab {
dest.get_pcs_to_device(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)
.map(|x| x.is_katana_required())?
} else {
return Err(CmsError::UnsupportedProfileConnection);
};
let mut stages: Vec<Box<KatanaDefaultIntermediate>> = Vec::new();
// Slow but accurate fallback, used when curve analysis detects anything the fast paths cannot represent
if is_katana_required_for_source || is_katana_required_for_destination {
let source_stage: Box<dyn KatanaInitialStage<f32, T> + Send + Sync> =
if source.is_matrix_shaper() {
let state = katana_create_rgb_lin_lut::<T, BIT_DEPTH, LINEAR_CAP>(
src_layout, source, options,
)?;
stages.extend(state.stages);
state.initial_stage
} else {
match source.get_device_to_pcs(options.rendering_intent).ok_or(
CmsError::UnsupportedLutRenderingIntent(source.rendering_intent),
)? {
LutWarehouse::Lut(lut) => {
katana_input_stage_lut_3x3::<T>(lut, options, source.pcs, BIT_DEPTH)?
}
LutWarehouse::Multidimensional(mab) => {
multi_dimensional_3x3_to_pcs::<T>(mab, options, source.pcs, BIT_DEPTH)?
}
}
};
stages.push(katana_pcs_lab_v2_to_v4(source));
if source.pcs == DataColorSpace::Lab {
stages.push(Box::new(KatanaStageLabToXyz::default()));
}
if dest.pcs == DataColorSpace::Lab {
stages.push(Box::new(KatanaStageXyzToLab::default()));
}
stages.push(katana_pcs_lab_v4_to_v2(dest));
let final_stage = if dest.has_pcs_to_device_lut() {
let pcs_to_device = dest
.get_pcs_to_device(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)?;
match pcs_to_device {
LutWarehouse::Lut(lut) => {
katana_output_stage_lut_3x3::<T>(lut, options, dest.pcs, BIT_DEPTH)?
}
LutWarehouse::Multidimensional(mab) => {
multi_dimensional_3x3_to_device::<T>(mab, options, dest.pcs, BIT_DEPTH)?
}
}
} else if dest.is_matrix_shaper() {
let state = katana_prepare_inverse_lut_rgb_xyz::<T, BIT_DEPTH, GAMMA_LUT>(
dest, dst_layout, options,
)?;
stages.extend(state.stages);
state.final_stage
} else {
return Err(CmsError::UnsupportedProfileConnection);
};
let mut post_finalization: Vec<Box<dyn KatanaPostFinalizationStage<T> + Send + Sync>> =
Vec::new();
if let Some(stage) =
prepare_alpha_finalizer::<T>(src_layout, source, dst_layout, dest, BIT_DEPTH)
{
post_finalization.push(stage);
}
return Ok(Box::new(Katana::<f32, T> {
initial_stage: source_stage,
final_stage,
stages,
post_finalization,
}));
}
let mut lut: Vec<f32>;
if source.has_device_to_pcs_lut() {
let device_to_pcs = source
.get_device_to_pcs(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)?;
lut = create_lut3_samples_norm::<GRID_SIZE>();
match device_to_pcs {
LutWarehouse::Lut(lut_data_type) => {
lut = create_lut3x3(lut_data_type, &lut, options, source.pcs)?;
}
LutWarehouse::Multidimensional(mab) => {
prepare_mab_3x3(mab, &mut lut, options, source.pcs)?
}
}
} else if source.is_matrix_shaper() {
lut = create_rgb_lin_lut::<T, BIT_DEPTH, LINEAR_CAP, GRID_SIZE>(source, options)?;
} else {
return Err(CmsError::UnsupportedProfileConnection);
}
pcs_lab_v2_to_v4(source, &mut lut);
if source.pcs == DataColorSpace::Xyz && dest.pcs == DataColorSpace::Lab {
let xyz_to_lab = StageXyzToLab::default();
xyz_to_lab.transform(&mut lut)?;
} else if source.pcs == DataColorSpace::Lab && dest.pcs == DataColorSpace::Xyz {
let lab_to_xyz_stage = StageLabToXyz::default();
lab_to_xyz_stage.transform(&mut lut)?;
}
pcs_lab_v4_to_v2(dest, &mut lut);
if dest.has_pcs_to_device_lut() {
let pcs_to_device = dest
.get_pcs_to_device(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)?;
match pcs_to_device {
LutWarehouse::Lut(lut_data_type) => {
lut = create_lut3x3(lut_data_type, &lut, options, dest.pcs)?;
}
LutWarehouse::Multidimensional(mab) => {
prepare_mba_3x3(mab, &mut lut, options, dest.pcs)?
}
}
} else if dest.is_matrix_shaper() {
prepare_inverse_lut_rgb_xyz::<T, BIT_DEPTH, GAMMA_LUT>(dest, &mut lut, options)?;
} else {
return Err(CmsError::UnsupportedProfileConnection);
}
let is_dest_linear_profile = dest.color_space == DataColorSpace::Rgb
&& dest.is_matrix_shaper()
&& dest.is_linear_matrix_shaper();
#[cfg(all(feature = "avx", target_arch = "x86_64"))]
if std::arch::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
return Ok(make_transformer_3x3_avx_fma::<T, GRID_SIZE, BIT_DEPTH>(
src_layout,
dst_layout,
lut,
options,
dest.color_space,
is_dest_linear_profile,
));
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
if std::arch::is_x86_feature_detected!("sse4.1") {
return Ok(make_transformer_3x3_sse41::<T, GRID_SIZE, BIT_DEPTH>(
src_layout,
dst_layout,
lut,
options,
dest.color_space,
is_dest_linear_profile,
));
}
Ok(make_transformer_3x3::<T, GRID_SIZE, BIT_DEPTH>(
src_layout,
dst_layout,
lut,
options,
dest.color_space,
is_dest_linear_profile,
))
} else {
do_any_to_any::<T, BIT_DEPTH, LINEAR_CAP, GAMMA_LUT>(
src_layout, source, dst_layout, dest, options,
)
}
}

730
vendor/moxcms/src/conversions/mab.rs vendored Normal file
View File

@@ -0,0 +1,730 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::mlaf::mlaf;
use crate::safe_math::SafeMul;
use crate::{
CmsError, Cube, DataColorSpace, InPlaceStage, InterpolationMethod, LutMultidimensionalType,
MalformedSize, Matrix3d, Matrix3f, TransformOptions, Vector3d, Vector3f,
};
#[allow(unused)]
struct ACurves3<'a, const DEPTH: usize> {
curve0: Box<[f32; 65536]>,
curve1: Box<[f32; 65536]>,
curve2: Box<[f32; 65536]>,
clut: &'a [f32],
grid_size: [u8; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
#[allow(unused)]
struct ACurves3Optimized<'a> {
clut: &'a [f32],
grid_size: [u8; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
#[allow(unused)]
impl<const DEPTH: usize> ACurves3<'_, DEPTH> {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> Vector3f>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let scale_value = (DEPTH - 1) as f32;
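// `scale_value` maps normalized inputs in [0, 1] onto curve indices
// [0, DEPTH - 1]; round-to-nearest plus the `min` clamp below guards against
// values slightly above 1.0.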
for dst in dst.chunks_exact_mut(3) {
let a0 = (dst[0] * scale_value).round().min(scale_value) as u16;
let a1 = (dst[1] * scale_value).round().min(scale_value) as u16;
let a2 = (dst[2] * scale_value).round().min(scale_value) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
let interpolated = fetch(b0, b1, b2);
dst[0] = interpolated.v[0];
dst[1] = interpolated.v[1];
dst[2] = interpolated.v[2];
}
Ok(())
}
}
#[allow(unused)]
impl ACurves3Optimized<'_> {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> Vector3f>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
for dst in dst.chunks_exact_mut(3) {
let a0 = dst[0];
let a1 = dst[1];
let a2 = dst[2];
let interpolated = fetch(a0, a1, a2);
dst[0] = interpolated.v[0];
dst[1] = interpolated.v[1];
dst[2] = interpolated.v[2];
}
Ok(())
}
}
impl<const DEPTH: usize> InPlaceStage for ACurves3<'_, DEPTH> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = Cube::new_cube(self.clut, self.grid_size);
// If the PCS is Lab or XYZ, trilinear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
Ok(())
}
}
impl InPlaceStage for ACurves3Optimized<'_> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = Cube::new_cube(self.clut, self.grid_size);
// If the PCS is Lab, trilinear interpolation should be used (unlike ACurves3, XYZ is not special-cased here)
if self.pcs == DataColorSpace::Lab {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
Ok(())
}
}
#[allow(unused)]
struct ACurves3Inverse<'a, const DEPTH: usize> {
curve0: Box<[f32; 65536]>,
curve1: Box<[f32; 65536]>,
curve2: Box<[f32; 65536]>,
clut: &'a [f32],
grid_size: [u8; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
#[allow(unused)]
impl<const DEPTH: usize> ACurves3Inverse<'_, DEPTH> {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> Vector3f>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let scale_value = (DEPTH as u32 - 1u32) as f32;
for dst in dst.chunks_exact_mut(3) {
let interpolated = fetch(dst[0], dst[1], dst[2]);
let a0 = (interpolated.v[0] * scale_value).round().min(scale_value) as u16;
let a1 = (interpolated.v[1] * scale_value).round().min(scale_value) as u16;
let a2 = (interpolated.v[2] * scale_value).round().min(scale_value) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
}
Ok(())
}
}
impl<const DEPTH: usize> InPlaceStage for ACurves3Inverse<'_, DEPTH> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = Cube::new_cube(self.clut, self.grid_size);
// If the PCS is Lab or XYZ, trilinear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
Ok(())
}
}
pub(crate) struct MCurves3<const DEPTH: usize> {
pub(crate) curve0: Box<[f32; 65536]>,
pub(crate) curve1: Box<[f32; 65536]>,
pub(crate) curve2: Box<[f32; 65536]>,
pub(crate) matrix: Matrix3f,
pub(crate) bias: Vector3f,
pub(crate) inverse: bool,
}
impl<const DEPTH: usize> MCurves3<DEPTH> {
fn execute_matrix_stage(&self, dst: &mut [f32]) {
let m = self.matrix;
let b = self.bias;
if !m.test_equality(Matrix3f::IDENTITY) || !b.eq(&Vector3f::default()) {
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(mlaf(b.v[0], x, m.v[0][0]), y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(mlaf(b.v[1], x, m.v[1][0]), y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(mlaf(b.v[2], x, m.v[2][0]), y, m.v[2][1]), z, m.v[2][2]);
}
}
}
}
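// Ordering note: in the forward (device-to-PCS, lutAtoB) direction the
// M-curves run before the matrix/bias stage; with `inverse` set (lutBtoA)
// the matrix/bias stage runs first. The flag below selects between the two.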
impl<const DEPTH: usize> InPlaceStage for MCurves3<DEPTH> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let scale_value = (DEPTH - 1) as f32;
if self.inverse {
self.execute_matrix_stage(dst);
}
for dst in dst.chunks_exact_mut(3) {
let a0 = (dst[0] * scale_value).round().min(scale_value) as u16;
let a1 = (dst[1] * scale_value).round().min(scale_value) as u16;
let a2 = (dst[2] * scale_value).round().min(scale_value) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
}
if !self.inverse {
self.execute_matrix_stage(dst);
}
Ok(())
}
}
pub(crate) struct BCurves3<const DEPTH: usize> {
pub(crate) curve0: Box<[f32; 65536]>,
pub(crate) curve1: Box<[f32; 65536]>,
pub(crate) curve2: Box<[f32; 65536]>,
}
impl<const DEPTH: usize> InPlaceStage for BCurves3<DEPTH> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let scale_value = (DEPTH - 1) as f32;
for dst in dst.chunks_exact_mut(3) {
let a0 = (dst[0] * scale_value).round().min(scale_value) as u16;
let a1 = (dst[1] * scale_value).round().min(scale_value) as u16;
let a2 = (dst[2] * scale_value).round().min(scale_value) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
}
Ok(())
}
}
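// lutAtoB stage order per the ICC spec: A-curves -> CLUT -> M-curves ->
// matrix -> B-curves. `prepare_mab_3x3` applies those stages to `lut` in that
// sequence, skipping any stage that is provably linear or identity.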
pub(crate) fn prepare_mab_3x3(
mab: &LutMultidimensionalType,
lut: &mut [f32],
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<(), CmsError> {
const LERP_DEPTH: usize = 65536;
const BP: usize = 13;
const DEPTH: usize = 8192;
if mab.num_input_channels != 3 || mab.num_output_channels != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
if mab.a_curves.len() == 3 && mab.clut.is_some() {
let clut = &mab.clut.as_ref().map(|x| x.to_clut_f32()).unwrap();
let lut_grid = (mab.grid_points[0] as usize)
.safe_mul(mab.grid_points[1] as usize)?
.safe_mul(mab.grid_points[2] as usize)?
.safe_mul(mab.num_output_channels as usize)?;
if clut.len() != lut_grid {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: clut.len(),
expected: lut_grid,
}));
}
let all_curves_linear = mab.a_curves.iter().all(|curve| curve.is_linear());
let grid_size = [mab.grid_points[0], mab.grid_points[1], mab.grid_points[2]];
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
if all_curves_linear {
use crate::conversions::neon::ACurves3OptimizedNeon;
let a_curves = ACurves3OptimizedNeon {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
};
a_curves.transform(lut)?;
} else {
use crate::conversions::neon::ACurves3Neon;
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let a_curves = ACurves3Neon::<DEPTH> {
curve0,
curve1,
curve2,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
};
a_curves.transform(lut)?;
}
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
{
let mut execution_box: Option<Box<dyn InPlaceStage>> = None;
if all_curves_linear {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::avx::ACurves3OptimizedAvxFma;
if std::arch::is_x86_feature_detected!("avx2")
&& std::arch::is_x86_feature_detected!("fma")
{
execution_box = Some(Box::new(ACurves3OptimizedAvxFma {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
if execution_box.is_none() {
execution_box = Some(Box::new(ACurves3Optimized {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
} else {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::avx::ACurves3AvxFma;
if std::arch::is_x86_feature_detected!("avx2")
&& std::arch::is_x86_feature_detected!("fma")
{
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
execution_box = Some(Box::new(ACurves3AvxFma::<DEPTH> {
curve0,
curve1,
curve2,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
if execution_box.is_none() {
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
execution_box = Some(Box::new(ACurves3::<DEPTH> {
curve0,
curve1,
curve2,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
execution_box
.expect("LUT Sampler on Multidimensional 3x3 must be set")
.transform(lut)?;
}
}
if mab.m_curves.len() == 3 {
let all_curves_linear = mab.m_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear
|| !mab.matrix.test_equality(Matrix3d::IDENTITY)
|| mab.bias.ne(&Vector3d::default())
{
let curves: Result<Vec<_>, _> = mab
.m_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let matrix = mab.matrix.to_f32();
let bias: Vector3f = mab.bias.cast();
let m_curves = MCurves3::<DEPTH> {
curve0,
curve1,
curve2,
matrix,
bias,
inverse: false,
};
m_curves.transform(lut)?;
}
}
if mab.b_curves.len() == 3 {
let all_curves_linear = mab.b_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear {
let curves: Result<Vec<_>, _> = mab
.b_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let b_curves = BCurves3::<DEPTH> {
curve0,
curve1,
curve2,
};
b_curves.transform(lut)?;
}
} else {
return Err(CmsError::InvalidAtoBLut);
}
Ok(())
}
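// lutBtoA mirrors the order: B-curves -> matrix -> M-curves -> CLUT ->
// A-curves, which is what `prepare_mba_3x3` below implements.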
pub(crate) fn prepare_mba_3x3(
mab: &LutMultidimensionalType,
lut: &mut [f32],
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<(), CmsError> {
if mab.num_input_channels != 3 || mab.num_output_channels != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
const LERP_DEPTH: usize = 65536;
const BP: usize = 13;
const DEPTH: usize = 8192;
if mab.b_curves.len() == 3 {
let all_curves_linear = mab.b_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear {
let curves: Result<Vec<_>, _> = mab
.b_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let b_curves = BCurves3::<DEPTH> {
curve0,
curve1,
curve2,
};
b_curves.transform(lut)?;
}
} else {
return Err(CmsError::InvalidAtoBLut);
}
if mab.m_curves.len() == 3 {
let all_curves_linear = mab.m_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear
|| !mab.matrix.test_equality(Matrix3d::IDENTITY)
|| mab.bias.ne(&Vector3d::default())
{
let curves: Result<Vec<_>, _> = mab
.m_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let matrix = mab.matrix.to_f32();
let bias: Vector3f = mab.bias.cast();
let m_curves = MCurves3::<DEPTH> {
curve0,
curve1,
curve2,
matrix,
bias,
inverse: true,
};
m_curves.transform(lut)?;
}
}
if mab.a_curves.len() == 3 && mab.clut.is_some() {
let clut = &mab.clut.as_ref().map(|x| x.to_clut_f32()).unwrap();
let lut_grid = (mab.grid_points[0] as usize)
.safe_mul(mab.grid_points[1] as usize)?
.safe_mul(mab.grid_points[2] as usize)?
.safe_mul(mab.num_output_channels as usize)?;
if clut.len() != lut_grid {
return Err(CmsError::MalformedCurveLutTable(MalformedSize {
size: clut.len(),
expected: lut_grid,
}));
}
let all_curves_linear = mab.a_curves.iter().all(|curve| curve.is_linear());
let grid_size = [mab.grid_points[0], mab.grid_points[1], mab.grid_points[2]];
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
if all_curves_linear {
use crate::conversions::neon::ACurves3OptimizedNeon;
let a_curves = ACurves3OptimizedNeon {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
};
a_curves.transform(lut)?;
} else {
use crate::conversions::neon::ACurves3InverseNeon;
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let a_curves = ACurves3InverseNeon::<DEPTH> {
curve0,
curve1,
curve2,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
};
a_curves.transform(lut)?;
}
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
{
let mut execution_box: Option<Box<dyn InPlaceStage>> = None;
if all_curves_linear {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::avx::ACurves3OptimizedAvxFma;
if std::arch::is_x86_feature_detected!("avx2")
&& std::arch::is_x86_feature_detected!("fma")
{
execution_box = Some(Box::new(ACurves3OptimizedAvxFma {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
if execution_box.is_none() {
execution_box = Some(Box::new(ACurves3Optimized {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
} else {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::avx::ACurves3InverseAvxFma;
if std::arch::is_x86_feature_detected!("avx2")
&& std::arch::is_x86_feature_detected!("fma")
{
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
execution_box = Some(Box::new(ACurves3InverseAvxFma::<DEPTH> {
curve0,
curve1,
curve2,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
if execution_box.is_none() {
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
execution_box = Some(Box::new(ACurves3Inverse::<DEPTH> {
curve0,
curve1,
curve2,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
execution_box
.expect("LUT Sampler on Multidimensional Inverse 3x3 must be set")
.transform(lut)?;
}
}
Ok(())
}

394
vendor/moxcms/src/conversions/mab4x3.rs vendored Normal file

@@ -0,0 +1,394 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::mab::{BCurves3, MCurves3};
use crate::safe_math::SafeMul;
use crate::{
CmsError, DataColorSpace, Hypercube, InPlaceStage, InterpolationMethod,
LutMultidimensionalType, MalformedSize, Matrix3d, Stage, TransformOptions, Vector3d, Vector3f,
};
#[allow(dead_code)]
struct ACurves4x3<'a, const DEPTH: usize> {
curve0: Box<[f32; 65536]>,
curve1: Box<[f32; 65536]>,
curve2: Box<[f32; 65536]>,
curve3: Box<[f32; 65536]>,
clut: &'a [f32],
grid_size: [u8; 4],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
#[allow(dead_code)]
struct ACurves4x3Optimized<'a> {
clut: &'a [f32],
grid_size: [u8; 4],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
#[allow(dead_code)]
impl<const DEPTH: usize> ACurves4x3<'_, DEPTH> {
fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> Vector3f>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let scale_value = (DEPTH - 1) as f32;
assert_eq!(src.len() / 4, dst.len() / 3);
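// Quantize each input channel to a table index in [0, DEPTH - 1]
// (round-to-nearest, clamped above), apply the per-channel A curves, then
// fetch the 3-channel result from the 4-D CLUT.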
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
let a0 = (src[0] * scale_value).round().min(scale_value) as u16;
let a1 = (src[1] * scale_value).round().min(scale_value) as u16;
let a2 = (src[2] * scale_value).round().min(scale_value) as u16;
let a3 = (src[3] * scale_value).round().min(scale_value) as u16;
let c = self.curve0[a0 as usize];
let m = self.curve1[a1 as usize];
let y = self.curve2[a2 as usize];
let k = self.curve3[a3 as usize];
let r = fetch(c, m, y, k);
dst[0] = r.v[0];
dst[1] = r.v[1];
dst[2] = r.v[2];
}
Ok(())
}
}
#[allow(dead_code)]
impl ACurves4x3Optimized<'_> {
fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> Vector3f>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
assert_eq!(src.len() / 4, dst.len() / 3);
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
let c = src[0];
let m = src[1];
let y = src[2];
let k = src[3];
let r = fetch(c, m, y, k);
dst[0] = r.v[0];
dst[1] = r.v[1];
dst[2] = r.v[2];
}
Ok(())
}
}
impl<const DEPTH: usize> Stage for ACurves4x3<'_, DEPTH> {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let lut = Hypercube::new_hypercube(self.clut, self.grid_size);
// If PCS is LAB or XYZ then linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
}
}
Ok(())
}
}
impl Stage for ACurves4x3Optimized<'_> {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let lut = Hypercube::new_hypercube(self.clut, self.grid_size);
// If PCS is LAB or XYZ then linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
}
}
Ok(())
}
}
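// Evaluates a device-to-PCS (mAB) pipeline for a 4-in/3-out profile, in
// mAB order: A curves with the 4-D CLUT (producing a new 3-channel table),
// then matrix/bias with M curves, then B curves.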
pub(crate) fn prepare_mab_4x3(
mab: &LutMultidimensionalType,
lut: &mut [f32],
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Vec<f32>, CmsError> {
const LERP_DEPTH: usize = 65536;
const BP: usize = 13;
const DEPTH: usize = 8192;
// A 4x3 mAB pipeline requires four input and three output channels.
if mab.num_input_channels != 4 || mab.num_output_channels != 3 {
return Err(CmsError::UnsupportedProfileConnection);
}
let mut new_lut = vec![0f32; (lut.len() / 4) * 3];
if mab.a_curves.len() == 4 && mab.clut.is_some() {
let clut = &mab.clut.as_ref().map(|x| x.to_clut_f32()).unwrap();
let lut_grid = (mab.grid_points[0] as usize)
.safe_mul(mab.grid_points[1] as usize)?
.safe_mul(mab.grid_points[2] as usize)?
.safe_mul(mab.grid_points[3] as usize)?
.safe_mul(mab.num_output_channels as usize)?;
if clut.len() != lut_grid {
return Err(CmsError::MalformedClut(MalformedSize {
size: clut.len(),
expected: lut_grid,
}));
}
let all_curves_linear = mab.a_curves.iter().all(|curve| curve.is_linear());
let grid_size = [
mab.grid_points[0],
mab.grid_points[1],
mab.grid_points[2],
mab.grid_points[3],
];
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
if all_curves_linear {
use crate::conversions::neon::ACurves4x3NeonOptimizedNeon;
let a_curves = ACurves4x3NeonOptimizedNeon {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
};
a_curves.transform(lut, &mut new_lut)?;
} else {
use crate::conversions::neon::ACurves4x3Neon;
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2, curve3] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let a_curves = ACurves4x3Neon::<DEPTH> {
curve0,
curve1,
curve2,
curve3,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
};
a_curves.transform(lut, &mut new_lut)?;
}
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
{
let mut execution_box: Option<Box<dyn Stage>> = None;
if all_curves_linear {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::avx::ACurves4x3AvxFmaOptimized;
if std::arch::is_x86_feature_detected!("avx2")
&& std::arch::is_x86_feature_detected!("fma")
{
execution_box = Some(Box::new(ACurves4x3AvxFmaOptimized {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
if execution_box.is_none() {
execution_box = Some(Box::new(ACurves4x3Optimized {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
} else {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::avx::ACurves4x3AvxFma;
if std::arch::is_x86_feature_detected!("avx2")
&& std::arch::is_x86_feature_detected!("fma")
{
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2, curve3] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
execution_box = Some(Box::new(ACurves4x3AvxFma::<DEPTH> {
curve0,
curve1,
curve2,
curve3,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
if execution_box.is_none() {
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2, curve3] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
execution_box = Some(Box::new(ACurves4x3::<DEPTH> {
curve0,
curve1,
curve2,
curve3,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
}));
}
}
execution_box
.expect("Sampler for Multidimensional 4x3 must be set")
.transform(lut, &mut new_lut)?;
}
} else {
// Not supported
return Err(CmsError::UnsupportedProfileConnection);
}
if mab.m_curves.len() == 3 {
let all_curves_linear = mab.m_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear
|| !mab.matrix.test_equality(Matrix3d::IDENTITY)
|| mab.bias.ne(&Vector3d::default())
{
let curves: Result<Vec<_>, _> = mab
.m_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let matrix = mab.matrix.to_f32();
let bias: Vector3f = mab.bias.cast();
let m_curves = MCurves3::<DEPTH> {
curve0,
curve1,
curve2,
matrix,
bias,
inverse: false,
};
m_curves.transform(&mut new_lut)?;
}
}
if mab.b_curves.len() == 3 {
let all_curves_linear = mab.b_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear {
let curves: Result<Vec<_>, _> = mab
.b_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let b_curves = BCurves3::<DEPTH> {
curve0,
curve1,
curve2,
};
b_curves.transform(&mut new_lut)?;
}
} else {
return Err(CmsError::InvalidAtoBLut);
}
Ok(new_lut)
}

298
vendor/moxcms/src/conversions/mba3x4.rs vendored Normal file

@@ -0,0 +1,298 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::mab::{BCurves3, MCurves3};
use crate::safe_math::SafeMul;
use crate::{
CmsError, Cube, DataColorSpace, InPlaceStage, InterpolationMethod, LutMultidimensionalType,
MalformedSize, Matrix3d, Stage, TransformOptions, Vector3d, Vector4f,
};
struct ACurves3x4Inverse<'a, const DEPTH: usize> {
curve0: Box<[f32; 65536]>,
curve1: Box<[f32; 65536]>,
curve2: Box<[f32; 65536]>,
curve3: Box<[f32; 65536]>,
clut: &'a [f32],
grid_size: [u8; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
struct ACurves3x4InverseOptimized<'a> {
clut: &'a [f32],
grid_size: [u8; 3],
interpolation_method: InterpolationMethod,
pcs: DataColorSpace,
}
impl<const DEPTH: usize> ACurves3x4Inverse<'_, DEPTH> {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> Vector4f>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let scale_value = (DEPTH as u32 - 1u32) as f32;
assert_eq!(src.len() / 3, dst.len() / 4);
for (src, dst) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
let interpolated = fetch(src[0], src[1], src[2]);
let a0 = (interpolated.v[0] * scale_value).round().min(scale_value) as u16;
let a1 = (interpolated.v[1] * scale_value).round().min(scale_value) as u16;
let a2 = (interpolated.v[2] * scale_value).round().min(scale_value) as u16;
let a3 = (interpolated.v[3] * scale_value).round().min(scale_value) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
let b3 = self.curve3[a3 as usize];
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
dst[3] = b3;
}
Ok(())
}
}
impl ACurves3x4InverseOptimized<'_> {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> Vector4f>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
assert_eq!(src.len() / 3, dst.len() / 4);
for (src, dst) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
let interpolated = fetch(src[0], src[1], src[2]);
let b0 = interpolated.v[0];
let b1 = interpolated.v[1];
let b2 = interpolated.v[2];
let b3 = interpolated.v[3];
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
dst[3] = b3;
}
Ok(())
}
}
impl<const DEPTH: usize> Stage for ACurves3x4Inverse<'_, DEPTH> {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let lut = Cube::new_cube(self.clut, self.grid_size);
// If PCS is LAB or XYZ then linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z| lut.trilinear_vec4(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z| lut.tetra_vec4(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z| lut.pyramid_vec4(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z| lut.prism_vec4(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z| lut.trilinear_vec4(x, y, z))?;
}
}
Ok(())
}
}
impl Stage for ACurves3x4InverseOptimized<'_> {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let lut = Cube::new_cube(self.clut, self.grid_size);
// If PCS is LAB or XYZ then linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z| lut.trilinear_vec4(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z| lut.tetra_vec4(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z| lut.pyramid_vec4(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z| lut.prism_vec4(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z| lut.trilinear_vec4(x, y, z))?;
}
}
Ok(())
}
}
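// Evaluates a PCS-to-device (mBA) pipeline for a 3-in/4-out profile:
// B curves and M curves run in place on the 3-channel `lut`, then the
// 3-D CLUT with inverse A curves expands it into a new 4-channel table.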
pub(crate) fn prepare_mba_3x4(
mab: &LutMultidimensionalType,
lut: &mut [f32],
options: TransformOptions,
pcs: DataColorSpace,
) -> Result<Vec<f32>, CmsError> {
// A 3x4 mBA pipeline requires three input and four output channels.
if mab.num_input_channels != 3 || mab.num_output_channels != 4 {
return Err(CmsError::UnsupportedProfileConnection);
}
const LERP_DEPTH: usize = 65536;
const BP: usize = 13;
const DEPTH: usize = 8192;
if mab.b_curves.len() == 3 {
let all_curves_linear = mab.b_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear {
let curves: Result<Vec<_>, _> = mab
.b_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let b_curves = BCurves3::<DEPTH> {
curve0,
curve1,
curve2,
};
b_curves.transform(lut)?;
}
} else {
return Err(CmsError::InvalidAtoBLut);
}
if mab.m_curves.len() == 3 {
let all_curves_linear = mab.m_curves.iter().all(|curve| curve.is_linear());
if !all_curves_linear
|| !mab.matrix.test_equality(Matrix3d::IDENTITY)
|| mab.bias.ne(&Vector3d::default())
{
let curves: Result<Vec<_>, _> = mab
.m_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let matrix = mab.matrix.to_f32();
let bias = mab.bias.cast();
let m_curves = MCurves3::<DEPTH> {
curve0,
curve1,
curve2,
matrix,
bias,
inverse: true,
};
m_curves.transform(lut)?;
}
}
let mut new_lut = vec![0f32; (lut.len() / 3) * 4];
if mab.a_curves.len() == 4 && mab.clut.is_some() {
let clut = &mab.clut.as_ref().map(|x| x.to_clut_f32()).unwrap();
let lut_grid = (mab.grid_points[0] as usize)
.safe_mul(mab.grid_points[1] as usize)?
.safe_mul(mab.grid_points[2] as usize)?
.safe_mul(mab.num_output_channels as usize)?;
if clut.len() != lut_grid {
return Err(CmsError::MalformedClut(MalformedSize {
size: clut.len(),
expected: lut_grid,
}));
}
let grid_size = [mab.grid_points[0], mab.grid_points[1], mab.grid_points[2]];
let all_curves_linear = mab.a_curves.iter().all(|curve| curve.is_linear());
if all_curves_linear {
let a_curves = ACurves3x4InverseOptimized {
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
};
a_curves.transform(lut, &mut new_lut)?;
} else {
let curves: Result<Vec<_>, _> = mab
.a_curves
.iter()
.map(|c| {
c.build_linearize_table::<u16, LERP_DEPTH, BP>()
.ok_or(CmsError::InvalidTrcCurve)
})
.collect();
let [curve0, curve1, curve2, curve3] =
curves?.try_into().map_err(|_| CmsError::InvalidTrcCurve)?;
let a_curves = ACurves3x4Inverse::<DEPTH> {
curve0,
curve1,
curve2,
curve3,
clut,
grid_size,
interpolation_method: options.interpolation_method,
pcs,
};
a_curves.transform(lut, &mut new_lut)?;
}
} else {
return Err(CmsError::UnsupportedProfileConnection);
}
Ok(new_lut)
}

728
vendor/moxcms/src/conversions/md_lut.rs vendored Normal file

@@ -0,0 +1,728 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use crate::mlaf::{mlaf, neg_mlaf};
use crate::nd_array::{ArrayFetch, lerp};
use crate::{Vector3f, Vector3i};
use num_traits::MulAdd;
use std::array::from_fn;
use std::marker::PhantomData;
use std::ops::{Add, Mul, Neg, Sub};
pub(crate) struct MultidimensionalLut {
pub(crate) grid_strides: [u32; 16],
pub(crate) grid_filling_size: [u32; 16],
pub(crate) grid_scale: [f32; 16],
pub(crate) output_inks: usize,
}
struct FastCube<T, F: ArrayFetch<T>> {
fetch: F,
_phantom: PhantomData<T>,
}
struct ArrayFetchVectorN<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
z_stride: u32,
output_inks: usize,
}
#[repr(transparent)]
#[derive(Copy, Clone, Debug)]
pub(crate) struct NVector<T, const N: usize> {
pub(crate) v: [T; N],
}
impl<T: Copy, const N: usize> NVector<T, N> {
pub(crate) fn from_slice(v: &[T; N]) -> Self {
Self { v: *v }
}
}
impl<T: Copy, const N: usize> From<T> for NVector<T, N> {
#[inline]
fn from(value: T) -> Self {
Self { v: [value; N] }
}
}
impl<T: Copy + Add<T, Output = T> + Mul<T, Output = T> + MulAdd<T, Output = T>, const N: usize>
FusedMultiplyAdd<NVector<T, N>> for NVector<T, N>
{
#[inline]
fn mla(&self, b: NVector<T, N>, c: NVector<T, N>) -> NVector<T, N> {
Self {
v: from_fn(|i| mlaf(self.v[i], b.v[i], c.v[i])),
}
}
}
impl<
T: Copy + Add<T, Output = T> + Mul<T, Output = T> + MulAdd<T, Output = T> + Neg<Output = T>,
const N: usize,
> FusedMultiplyNegAdd<NVector<T, N>> for NVector<T, N>
{
#[inline]
fn neg_mla(&self, b: NVector<T, N>, c: NVector<T, N>) -> NVector<T, N> {
Self {
v: from_fn(|i| neg_mlaf(self.v[i], b.v[i], c.v[i])),
}
}
}
impl<T: Sub<Output = T> + Default + Copy, const N: usize> Sub<NVector<T, N>> for NVector<T, N> {
type Output = Self;
#[inline]
fn sub(self, rhs: NVector<T, N>) -> Self::Output {
Self {
v: from_fn(|i| self.v[i] - rhs.v[i]),
}
}
}
impl<T: Add<Output = T> + Default + Copy, const N: usize> Add<NVector<T, N>> for NVector<T, N> {
type Output = Self;
#[inline]
fn add(self, rhs: NVector<T, N>) -> Self::Output {
Self {
v: from_fn(|i| self.v[i] + rhs.v[i]),
}
}
}
impl<T: Mul<Output = T> + Default + Copy, const N: usize> Mul<NVector<T, N>> for NVector<T, N> {
type Output = Self;
#[inline]
fn mul(self, rhs: NVector<T, N>) -> Self::Output {
Self {
v: from_fn(|i| self.v[i] * rhs.v[i]),
}
}
}
impl<const N: usize> ArrayFetch<NVector<f32, N>> for ArrayFetchVectorN<'_> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32) -> NVector<f32, N> {
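// Flatten (x, y, z) into the CLUT and read the next N floats as one sample.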
let start = (x as u32 * self.x_stride + y as u32 * self.y_stride + z as u32 * self.z_stride)
as usize
* self.output_inks;
let k = &self.array[start..start + N];
NVector::<f32, N>::from_slice(k.try_into().unwrap())
}
}
impl<T, F: ArrayFetch<T>> FastCube<T, F>
where
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyNegAdd<T>
+ FusedMultiplyAdd<T>,
{
#[inline(always)]
fn tetra(&self, src: Vector3i, src_next: Vector3i, w: Vector3f) -> T {
let x = src.v[0];
let y = src.v[1];
let z = src.v[2];
let x_n = src_next.v[0];
let y_n = src_next.v[1];
let z_n = src_next.v[2];
let rx = w.v[0];
let ry = w.v[1];
let rz = w.v[2];
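// The ordering of the fractional weights (rx, ry, rz) selects one of the
// six tetrahedra that tile the unit cube; the deltas c1, c2, c3 are then
// accumulated as c0 + c1*rx + c2*ry + c3*rz via fused multiply-adds.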
let c0 = self.fetch.fetch(x, y, z);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = self.fetch.fetch(x_n, y, z) - c0;
c2 = self.fetch.fetch(x_n, y_n, z) - self.fetch.fetch(x_n, y, z);
c3 = self.fetch.fetch(x_n, y_n, z_n) - self.fetch.fetch(x_n, y_n, z);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = self.fetch.fetch(x_n, y, z) - c0;
c2 = self.fetch.fetch(x_n, y_n, z_n) - self.fetch.fetch(x_n, y, z_n);
c3 = self.fetch.fetch(x_n, y, z_n) - self.fetch.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = self.fetch.fetch(x_n, y, z_n) - self.fetch.fetch(x, y, z_n);
c2 = self.fetch.fetch(x_n, y_n, z_n) - self.fetch.fetch(x_n, y, z_n);
c3 = self.fetch.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = self.fetch.fetch(x_n, y_n, z) - self.fetch.fetch(x, y_n, z);
c2 = self.fetch.fetch(x, y_n, z) - c0;
c3 = self.fetch.fetch(x_n, y_n, z_n) - self.fetch.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = self.fetch.fetch(x_n, y_n, z_n) - self.fetch.fetch(x, y_n, z_n);
c2 = self.fetch.fetch(x, y_n, z) - c0;
c3 = self.fetch.fetch(x, y_n, z_n) - self.fetch.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = self.fetch.fetch(x_n, y_n, z_n) - self.fetch.fetch(x, y_n, z_n);
c2 = self.fetch.fetch(x, y_n, z_n) - self.fetch.fetch(x, y, z_n);
c3 = self.fetch.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, T::from(rx));
let s1 = s0.mla(c2, T::from(ry));
s1.mla(c3, T::from(rz))
}
}
impl MultidimensionalLut {
pub(crate) fn new(grid_size: [u8; 16], input_inks: usize, output_inks: usize) -> Self {
// At least one input ink is required (the stride loop below subtracts 1).
assert!(input_inks >= 1 && input_inks <= 16);
let mut grid_strides = [1u32; 16];
let mut grid_filling_size = [1u32; 16];
for (ink, dst_stride) in grid_strides.iter_mut().take(input_inks - 1).enumerate() {
let mut stride = 1u32;
let how_many = input_inks.saturating_sub(ink).saturating_sub(1);
for &grid_stride in grid_size.iter().take(how_many) {
stride *= grid_stride as u32;
}
*dst_stride = stride;
}
for (ink, dst_stride) in grid_filling_size.iter_mut().take(input_inks).enumerate() {
let mut stride = output_inks as u32;
let how_many = input_inks.saturating_sub(ink).saturating_sub(1);
for &grid_stride in grid_size.iter().take(how_many) {
stride *= grid_stride as u32;
}
*dst_stride = stride;
}
let mut grid_strides_f = [0f32; 16];
for (dst, src) in grid_strides_f
.iter_mut()
.zip(grid_size.iter())
.take(input_inks)
{
*dst = (*src - 1) as f32;
}
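// e.g. with a uniform grid (the prefix products above index consistently
// when every axis has the same point count): 4 input inks, 9 points per
// axis and 3 output inks give grid_strides = [729, 81, 9, 1, ...],
// grid_filling_size = [2187, 243, 27, 3, ...] and grid_scale = 8.0 per axis.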
Self {
grid_strides,
grid_scale: grid_strides_f,
grid_filling_size,
output_inks,
}
}
}
pub(crate) fn linear_4i_vec3f_direct<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
lx: f32,
ly: f32,
lz: f32,
lw: f32,
) -> NVector<f32, N> {
let lin_x = lx.max(0.0).min(1.0);
let lin_y = ly.max(0.0).min(1.0);
let lin_z = lz.max(0.0).min(1.0);
let lin_w = lw.max(0.0).min(1.0);
let scale_x = lut.grid_scale[0];
let scale_y = lut.grid_scale[1];
let scale_z = lut.grid_scale[2];
let scale_w = lut.grid_scale[3];
let lx = lin_x * scale_x;
let ly = lin_y * scale_y;
let lz = lin_z * scale_z;
let lw = lin_w * scale_w;
let x = lx.floor() as i32;
let y = ly.floor() as i32;
let z = lz.floor() as i32;
let w = lw.floor() as i32;
let src_x = Vector3i { v: [x, y, z] };
let x_n = lx.ceil() as i32;
let y_n = ly.ceil() as i32;
let z_n = lz.ceil() as i32;
let w_n = lw.ceil() as i32;
let src_next = Vector3i { v: [x_n, y_n, z_n] };
let x_w = lx - x as f32;
let y_w = ly - y as f32;
let z_w = lz - z as f32;
let w_w = lw - w as f32;
let weights = Vector3f { v: [x_w, y_w, z_w] };
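// Split the 4-D lookup into two tetrahedral 3-D lookups on the adjacent
// w-slices (floor and ceil); the results are blended by the fractional w.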
let cube0 = &arr[(w as usize * lut.grid_filling_size[3] as usize)..];
let cube1 = &arr[(w_n as usize * lut.grid_filling_size[3] as usize)..];
let fast_cube0 = FastCube {
fetch: ArrayFetchVectorN {
array: cube0,
x_stride: lut.grid_strides[0],
y_stride: lut.grid_strides[1],
z_stride: lut.grid_strides[2],
output_inks: lut.output_inks,
},
_phantom: PhantomData,
};
let fast_cube1 = FastCube {
fetch: ArrayFetchVectorN {
array: cube1,
x_stride: lut.grid_strides[0],
y_stride: lut.grid_strides[1],
z_stride: lut.grid_strides[2],
output_inks: lut.output_inks,
},
_phantom: PhantomData,
};
let w0 = fast_cube0.tetra(src_x, src_next, weights);
let w1 = fast_cube1.tetra(src_x, src_next, weights);
lerp(w0, w1, NVector::<f32, N>::from(w_w))
}
pub(crate) fn linear_3i_vec3f_direct<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
linear_3i_vec3f(lut, arr, inputs[0], inputs[1], inputs[2])
}
fn linear_3i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
x: f32,
y: f32,
z: f32,
) -> NVector<f32, N> {
let lin_x = x.max(0.0).min(1.0);
let lin_y = y.max(0.0).min(1.0);
let lin_z = z.max(0.0).min(1.0);
let scale_x = lut.grid_scale[0];
let scale_y = lut.grid_scale[1];
let scale_z = lut.grid_scale[2];
let lx = lin_x * scale_x;
let ly = lin_y * scale_y;
let lz = lin_z * scale_z;
let x = lx.floor() as i32;
let y = ly.floor() as i32;
let z = lz.floor() as i32;
let src_x = Vector3i { v: [x, y, z] };
let x_n = lx.ceil() as i32;
let y_n = ly.ceil() as i32;
let z_n = lz.ceil() as i32;
let src_next = Vector3i { v: [x_n, y_n, z_n] };
let x_w = lx - x as f32;
let y_w = ly - y as f32;
let z_w = lz - z as f32;
let weights = Vector3f { v: [x_w, y_w, z_w] };
let fast_cube = FastCube {
fetch: ArrayFetchVectorN {
array: arr,
x_stride: lut.grid_strides[0],
y_stride: lut.grid_strides[1],
z_stride: lut.grid_strides[2],
output_inks: lut.output_inks,
},
_phantom: PhantomData,
};
fast_cube.tetra(src_x, src_next, weights)
}
pub(crate) fn linear_1i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let lin_x = inputs[0].max(0.0).min(1.0);
let scale_x = lut.grid_scale[0];
let lx = lin_x * scale_x;
let x = lx.floor() as i32;
let x_n = lx.ceil() as i32;
let x_w = lx - x as f32;
let x_stride = lut.grid_strides[0];
let offset = |xi: i32| -> usize { (xi as u32 * x_stride) as usize * lut.output_inks };
// Sample 2 corners
let a = NVector::<f32, N>::from_slice(&arr[offset(x)..][..N].try_into().unwrap());
let b = NVector::<f32, N>::from_slice(&arr[offset(x_n)..][..N].try_into().unwrap());
a * NVector::<f32, N>::from(1.0 - x_w) + b * NVector::<f32, N>::from(x_w)
}
pub(crate) fn linear_2i_vec3f_direct<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
linear_2i_vec3f(lut, arr, inputs[0], inputs[1])
}
fn linear_2i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
x: f32,
y: f32,
) -> NVector<f32, N> {
let lin_x = x.max(0.0).min(1.0);
let lin_y = y.max(0.0).min(1.0);
let scale_x = lut.grid_scale[0];
let scale_y = lut.grid_scale[1];
let lx = lin_x * scale_x;
let ly = lin_y * scale_y;
let x = lx.floor() as i32;
let y = ly.floor() as i32;
let x_n = lx.ceil() as i32;
let y_n = ly.ceil() as i32;
let x_w = lx - x as f32;
let y_w = ly - y as f32;
let x_stride = lut.grid_strides[0];
let y_stride = lut.grid_strides[1];
let offset = |xi: i32, yi: i32| -> usize {
(xi as u32 * x_stride + yi as u32 * y_stride) as usize * lut.output_inks
};
// Sample 4 corners
let a = NVector::<f32, N>::from_slice(&arr[offset(x, y)..][..N].try_into().unwrap());
let b = NVector::<f32, N>::from_slice(&arr[offset(x_n, y)..][..N].try_into().unwrap());
let c = NVector::<f32, N>::from_slice(&arr[offset(x, y_n)..][..N].try_into().unwrap());
let d = NVector::<f32, N>::from_slice(&arr[offset(x_n, y_n)..][..N].try_into().unwrap());
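// Lerp along x within each row (ab, cd), then along y between the rows.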
let ab = a * NVector::<f32, N>::from(1.0 - x_w) + b * NVector::<f32, N>::from(x_w);
let cd = c * NVector::<f32, N>::from(1.0 - x_w) + d * NVector::<f32, N>::from(x_w);
ab * NVector::<f32, N>::from(1.0 - y_w) + cd * NVector::<f32, N>::from(y_w)
}
pub(crate) fn linear_4i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
linear_4i_vec3f_direct(lut, arr, inputs[0], inputs[1], inputs[2], inputs[3])
}
type FHandle<const N: usize> = fn(&MultidimensionalLut, &[f32], &[f32]) -> NVector<f32, N>;
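// Recursive dimension reduction: axis I is split at its floor/ceil grid
// planes, `handle` resolves each lower-dimensional sub-lookup, and the two
// results are lerped, so an N-input lookup costs 2^(N - 4) 4-D lookups.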
#[inline(never)]
pub(crate) fn linear_n_i_vec3f<
const N: usize,
const I: usize,
Handle: Fn(&MultidimensionalLut, &[f32], &[f32]) -> NVector<f32, N>,
>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
handle: Handle,
) -> NVector<f32, N> {
let lin_w = inputs[I];
let w_c = lin_w.max(0.).min(1.);
let scale_p = lut.grid_scale[I];
let wf = w_c * scale_p;
let w0 = wf.min(scale_p) as usize;
let w1 = (wf + 1.).min(scale_p) as usize;
let w = wf - w0 as f32;
let cube0 = &arr[(w0 * lut.grid_filling_size[I] as usize)..];
let cube1 = &arr[(w1 * lut.grid_filling_size[I] as usize)..];
let inputs_sliced = &inputs[0..I];
let w0 = handle(lut, cube0, inputs_sliced);
let w1 = handle(lut, cube1, inputs_sliced);
lerp(w0, w1, NVector::<f32, N>::from(w))
}
#[inline(never)]
pub(crate) fn linear_5i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let lin_w = inputs[4];
let w_c = lin_w.max(0.).min(1.);
let scale_p = lut.grid_scale[4];
let wf = w_c * scale_p;
let w0 = wf.min(scale_p) as usize;
let w1 = (wf + 1.).min(scale_p) as usize;
let w = wf - w0 as f32;
let cube0 = &arr[(w0 * lut.grid_filling_size[4] as usize)..];
let cube1 = &arr[(w1 * lut.grid_filling_size[4] as usize)..];
let w0 = linear_4i_vec3f_direct(lut, cube0, inputs[0], inputs[1], inputs[2], inputs[3]);
let w1 = linear_4i_vec3f_direct(lut, cube1, inputs[0], inputs[1], inputs[2], inputs[3]);
lerp(w0, w1, NVector::<f32, N>::from(w))
}
#[inline(never)]
pub(crate) fn linear_6i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_5i_vec3f::<N>;
linear_n_i_vec3f::<N, 5, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_7i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_6i_vec3f::<N>;
linear_n_i_vec3f::<N, 6, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_8i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_7i_vec3f::<N>;
linear_n_i_vec3f::<N, 7, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_9i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_8i_vec3f::<N>;
linear_n_i_vec3f::<N, 8, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_10i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_9i_vec3f::<N>;
linear_n_i_vec3f::<N, 9, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_11i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_10i_vec3f::<N>;
linear_n_i_vec3f::<N, 10, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_12i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_11i_vec3f::<N>;
linear_n_i_vec3f::<N, 11, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_13i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_12i_vec3f::<N>;
linear_n_i_vec3f::<N, 12, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_14i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_13i_vec3f::<N>;
linear_n_i_vec3f::<N, 13, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn linear_15i_vec3f<const N: usize>(
lut: &MultidimensionalLut,
arr: &[f32],
inputs: &[f32],
) -> NVector<f32, N> {
let f = linear_14i_vec3f::<N>;
linear_n_i_vec3f::<N, 14, FHandle<N>>(lut, arr, inputs, f)
}
#[inline(never)]
pub(crate) fn tetra_3i_to_any_vec(
lut: &MultidimensionalLut,
arr: &[f32],
x: f32,
y: f32,
z: f32,
dst: &mut [f32],
inks: usize,
) {
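// `NVector<f32, N>` needs N at compile time, so the runtime ink count is
// dispatched to a monomorphized instance per output channel count.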
macro_rules! spill_inks {
($($n:literal),+) => {
match inks {
$($n => {
let vec = linear_3i_vec3f::<$n>(lut, arr, x, y, z);
for (dst, src) in dst.iter_mut().zip(vec.v.iter()) {
*dst = *src;
}
})+
_ => unreachable!(),
}
};
}
spill_inks!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}


@@ -0,0 +1,190 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::katana::{
CopyAlphaStage, InjectAlphaStage, Katana, KatanaInitialStage, KatanaIntermediateStage,
KatanaPostFinalizationStage, KatanaStageLabToXyz, KatanaStageXyzToLab,
katana_create_rgb_lin_lut, katana_input_make_lut_nx3, katana_multi_dimensional_3xn_to_device,
katana_multi_dimensional_nx3_to_pcs, katana_output_make_lut_3xn, katana_pcs_lab_v2_to_v4,
katana_pcs_lab_v4_to_v2, katana_prepare_inverse_lut_rgb_xyz,
};
use crate::{
CmsError, ColorProfile, DataColorSpace, GammaLutInterpolate, Layout, LutWarehouse,
PointeeSizeExpressible, TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
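// Assembles the full pipeline: an initial device-to-PCS stage
// (matrix-shaper fast path, LUT, or multidimensional tag), optional
// Lab v2<->v4 and Lab<->XYZ bridge stages, a PCS-to-device final stage,
// and alpha post-finalization.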
pub(crate) fn do_any_to_any<
T: Copy
+ Default
+ AsPrimitive<f32>
+ Send
+ Sync
+ AsPrimitive<usize>
+ PointeeSizeExpressible
+ GammaLutInterpolate,
const BIT_DEPTH: usize,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
>(
src_layout: Layout,
source: &ColorProfile,
dst_layout: Layout,
dest: &ColorProfile,
options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
let mut stages: Vec<Box<dyn KatanaIntermediateStage<f32> + Send + Sync>> = Vec::new();
let initial_stage: Box<dyn KatanaInitialStage<f32, T> + Send + Sync> = match source
.is_matrix_shaper()
{
true => {
let state =
katana_create_rgb_lin_lut::<T, BIT_DEPTH, LINEAR_CAP>(src_layout, source, options)?;
stages.extend(state.stages);
state.initial_stage
}
false => match source.get_device_to_pcs(options.rendering_intent).ok_or(
CmsError::UnsupportedLutRenderingIntent(source.rendering_intent),
)? {
LutWarehouse::Lut(lut) => katana_input_make_lut_nx3::<T>(
src_layout,
src_layout.channels(),
lut,
options,
source.pcs,
BIT_DEPTH,
)?,
LutWarehouse::Multidimensional(mab) => {
katana_multi_dimensional_nx3_to_pcs::<T, BIT_DEPTH>(
src_layout, mab, options, source.pcs,
)?
}
},
};
stages.push(katana_pcs_lab_v2_to_v4(source));
if source.pcs == DataColorSpace::Lab {
stages.push(Box::new(KatanaStageLabToXyz::default()));
}
if dest.pcs == DataColorSpace::Lab {
stages.push(Box::new(KatanaStageXyzToLab::default()));
}
stages.push(katana_pcs_lab_v4_to_v2(dest));
let final_stage = if dest.has_pcs_to_device_lut() {
let pcs_to_device = dest
.get_pcs_to_device(options.rendering_intent)
.ok_or(CmsError::UnsupportedProfileConnection)?;
match pcs_to_device {
LutWarehouse::Lut(lut) => katana_output_make_lut_3xn::<T>(
dst_layout,
lut,
options,
dest.color_space,
BIT_DEPTH,
)?,
LutWarehouse::Multidimensional(mab) => katana_multi_dimensional_3xn_to_device::<T>(
dst_layout, mab, options, dest.pcs, BIT_DEPTH,
)?,
}
} else if dest.is_matrix_shaper() {
let state = katana_prepare_inverse_lut_rgb_xyz::<T, BIT_DEPTH, GAMMA_LUT>(
dest, dst_layout, options,
)?;
stages.extend(state.stages);
state.final_stage
} else {
return Err(CmsError::UnsupportedProfileConnection);
};
let mut post_finalization: Vec<Box<dyn KatanaPostFinalizationStage<T> + Send + Sync>> =
Vec::new();
if let Some(stage) =
prepare_alpha_finalizer::<T>(src_layout, source, dst_layout, dest, BIT_DEPTH)
{
post_finalization.push(stage);
}
Ok(Box::new(Katana::<f32, T> {
initial_stage,
final_stage,
stages,
post_finalization,
}))
}
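// Chooses the alpha post-stage: when both source and destination carry an
// alpha channel, CopyAlphaStage copies it through; otherwise
// InjectAlphaStage synthesizes one for the destination layout.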
pub(crate) fn prepare_alpha_finalizer<
T: Copy
+ Default
+ AsPrimitive<f32>
+ Send
+ Sync
+ AsPrimitive<usize>
+ PointeeSizeExpressible
+ GammaLutInterpolate,
>(
src_layout: Layout,
source: &ColorProfile,
dst_layout: Layout,
dest: &ColorProfile,
bit_depth: usize,
) -> Option<Box<dyn KatanaPostFinalizationStage<T> + Send + Sync>>
where
f32: AsPrimitive<T>,
{
if (dst_layout == Layout::GrayAlpha && dest.color_space == DataColorSpace::Gray)
|| (dst_layout == Layout::Rgba && dest.color_space == DataColorSpace::Rgb)
{
return if (src_layout == Layout::GrayAlpha && source.color_space == DataColorSpace::Gray)
|| (src_layout == Layout::Rgba && source.color_space == DataColorSpace::Rgb)
{
Some(Box::new(CopyAlphaStage {
src_layout,
dst_layout,
target_color_space: dest.color_space,
_phantom: Default::default(),
}))
} else {
Some(Box::new(InjectAlphaStage {
dst_layout,
target_color_space: dest.color_space,
_phantom: Default::default(),
bit_depth,
}))
};
}
None
}

74
vendor/moxcms/src/conversions/mod.rs vendored Normal file

@@ -0,0 +1,74 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
mod avx;
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
mod avx512;
mod bpc;
mod gray2rgb;
mod gray2rgb_extended;
mod interpolator;
mod katana;
mod lut3x3;
mod lut3x4;
mod lut4;
mod lut_transforms;
mod mab;
mod mab4x3;
mod mba3x4;
mod md_lut;
mod md_luts_factory;
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
mod neon;
mod prelude_lut_xyz_rgb;
mod rgb2gray;
mod rgb2gray_extended;
mod rgb_xyz_factory;
mod rgbxyz;
mod rgbxyz_fixed;
mod rgbxyz_float;
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
mod sse;
mod transform_lut3_to_3;
mod transform_lut3_to_4;
mod transform_lut4_to_3;
mod xyz_lab;
pub(crate) use gray2rgb::{make_gray_to_unfused, make_gray_to_x};
pub(crate) use gray2rgb_extended::{make_gray_to_one_trc_extended, make_gray_to_rgb_extended};
pub(crate) use interpolator::LutBarycentricReduction;
pub(crate) use lut_transforms::make_lut_transform;
pub(crate) use rgb_xyz_factory::{RgbXyzFactory, RgbXyzFactoryOpt};
pub(crate) use rgb2gray::{ToneReproductionRgbToGray, make_rgb_to_gray};
pub(crate) use rgb2gray_extended::make_rgb_to_gray_extended;
pub(crate) use rgbxyz::{TransformMatrixShaper, TransformMatrixShaperOptimized};
pub(crate) use rgbxyz_float::{
TransformShaperFloatInOut, TransformShaperRgbFloat, make_rgb_xyz_rgb_transform_float,
make_rgb_xyz_rgb_transform_float_in_out,
};


@@ -0,0 +1,225 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::cube::CubeNeon;
use crate::conversions::neon::interpolator::NeonVector;
use crate::{CmsError, DataColorSpace, InPlaceStage, InterpolationMethod};
use std::arch::aarch64::*;
pub(crate) struct ACurves3Neon<'a, const DEPTH: usize> {
pub(crate) curve0: Box<[f32; 65536]>,
pub(crate) curve1: Box<[f32; 65536]>,
pub(crate) curve2: Box<[f32; 65536]>,
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 3],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
pub(crate) struct ACurves3OptimizedNeon<'a> {
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 3],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
pub(crate) struct ACurves3InverseNeon<'a, const DEPTH: usize> {
pub(crate) curve0: Box<[f32; 65536]>,
pub(crate) curve1: Box<[f32; 65536]>,
pub(crate) curve2: Box<[f32; 65536]>,
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 3],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
impl<const DEPTH: usize> ACurves3Neon<'_, DEPTH> {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> NeonVector>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let scale_value = (DEPTH - 1) as f32;
for dst in dst.chunks_exact_mut(3) {
let a0 = (dst[0] * scale_value).round().min(scale_value) as u16;
let a1 = (dst[1] * scale_value).round().min(scale_value) as u16;
let a2 = (dst[2] * scale_value).round().min(scale_value) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
let v = fetch(b0, b1, b2).v;
unsafe {
dst[0] = vgetq_lane_f32::<0>(v);
dst[1] = vgetq_lane_f32::<1>(v);
dst[2] = vgetq_lane_f32::<2>(v);
}
}
Ok(())
}
}
impl ACurves3OptimizedNeon<'_> {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> NeonVector>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
for dst in dst.chunks_exact_mut(3) {
let a0 = dst[0];
let a1 = dst[1];
let a2 = dst[2];
let v = fetch(a0, a1, a2).v;
unsafe {
dst[0] = vgetq_lane_f32::<0>(v);
dst[1] = vgetq_lane_f32::<1>(v);
dst[2] = vgetq_lane_f32::<2>(v);
}
}
Ok(())
}
}
impl<const DEPTH: usize> InPlaceStage for ACurves3Neon<'_, DEPTH> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = CubeNeon::new(self.clut, self.grid_size, 3);
// If PCS is LAB or XYZ then linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
Ok(())
}
}
impl InPlaceStage for ACurves3OptimizedNeon<'_> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = CubeNeon::new(self.clut, self.grid_size, 3);
// If PCS is LAB or XYZ then linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
Ok(())
}
}
impl<const DEPTH: usize> ACurves3InverseNeon<'_, DEPTH> {
fn transform_impl<Fetch: Fn(f32, f32, f32) -> NeonVector>(
&self,
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let v_scale_value = unsafe { vdupq_n_f32((DEPTH as u32 - 1u32) as f32) };
unsafe {
for dst in dst.chunks_exact_mut(3) {
let mut v = fetch(dst[0], dst[1], dst[2]).v;
v = vmulq_f32(v, v_scale_value);
v = vminq_f32(v, v_scale_value);
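// vcvtaq rounds to nearest with ties away from zero, matching the
// scalar paths' `round()` before the table lookup.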
let c = vcvtaq_u32_f32(v);
let a0 = vgetq_lane_u32::<0>(c) as u16;
let a1 = vgetq_lane_u32::<1>(c) as u16;
let a2 = vgetq_lane_u32::<2>(c) as u16;
let b0 = self.curve0[a0 as usize];
let b1 = self.curve1[a1 as usize];
let b2 = self.curve2[a2 as usize];
dst[0] = b0;
dst[1] = b1;
dst[2] = b2;
}
}
Ok(())
}
}
impl<const DEPTH: usize> InPlaceStage for ACurves3InverseNeon<'_, DEPTH> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
let lut = CubeNeon::new(self.clut, self.grid_size, 3);
// If PCS is LAB or XYZ then linear interpolation should be used
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(dst, |x, y, z| lut.tetra_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(dst, |x, y, z| lut.pyramid_vec3(x, y, z))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(dst, |x, y, z| lut.prism_vec3(x, y, z))?;
}
InterpolationMethod::Linear => {
self.transform_impl(dst, |x, y, z| lut.trilinear_vec3(x, y, z))?;
}
}
Ok(())
}
}


@@ -0,0 +1,168 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::hypercube::HypercubeNeon;
use crate::conversions::neon::interpolator::NeonVector;
use crate::{CmsError, DataColorSpace, InterpolationMethod, Stage};
use std::arch::aarch64::vgetq_lane_f32;
pub(crate) struct ACurves4x3Neon<'a, const DEPTH: usize> {
pub(crate) curve0: Box<[f32; 65536]>,
pub(crate) curve1: Box<[f32; 65536]>,
pub(crate) curve2: Box<[f32; 65536]>,
pub(crate) curve3: Box<[f32; 65536]>,
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 4],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
pub(crate) struct ACurves4x3NeonOptimizedNeon<'a> {
pub(crate) clut: &'a [f32],
pub(crate) grid_size: [u8; 4],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
impl<const DEPTH: usize> ACurves4x3Neon<'_, DEPTH> {
fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> NeonVector>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let scale_value = (DEPTH - 1) as f32;
assert_eq!(src.len() / 4, dst.len() / 3);
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
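// Quantize each CMYK channel to a curve-table index (round-to-nearest, clamped to DEPTH - 1),
// apply the four A-curves, then sample the 4D CLUT.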
let a0 = (src[0] * scale_value).round().min(scale_value) as u16;
let a1 = (src[1] * scale_value).round().min(scale_value) as u16;
let a2 = (src[2] * scale_value).round().min(scale_value) as u16;
let a3 = (src[3] * scale_value).round().min(scale_value) as u16;
let c = self.curve0[a0 as usize];
let m = self.curve1[a1 as usize];
let y = self.curve2[a2 as usize];
let k = self.curve3[a3 as usize];
let v = fetch(c, m, y, k).v;
unsafe {
dst[0] = vgetq_lane_f32::<0>(v);
dst[1] = vgetq_lane_f32::<1>(v);
dst[2] = vgetq_lane_f32::<2>(v);
}
}
Ok(())
}
}
impl ACurves4x3NeonOptimizedNeon<'_> {
fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> NeonVector>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
assert_eq!(src.len() / 4, dst.len() / 3);
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
let c = src[0];
let m = src[1];
let y = src[2];
let k = src[3];
let v = fetch(c, m, y, k).v;
unsafe {
dst[0] = vgetq_lane_f32::<0>(v);
dst[1] = vgetq_lane_f32::<1>(v);
dst[2] = vgetq_lane_f32::<2>(v);
}
}
Ok(())
}
}
impl<const DEPTH: usize> Stage for ACurves4x3Neon<'_, DEPTH> {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let lut = HypercubeNeon::new(self.clut, self.grid_size, 3);
// If the PCS is Lab or XYZ, quadlinear (linear) interpolation is used.
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
}
}
Ok(())
}
}
impl Stage for ACurves4x3NeonOptimizedNeon<'_> {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let lut = HypercubeNeon::new(self.clut, self.grid_size, 3);
// If the PCS is Lab or XYZ, quadlinear (linear) interpolation is used.
if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
}
}
Ok(())
}
}

View File

@@ -0,0 +1,442 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::interpolator::NeonVector;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use std::arch::aarch64::*;
use std::ops::{Add, Mul, Sub};
/// 3D CLUT NEON helper
///
/// Represents a hexahedron.
pub(crate) struct CubeNeon<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
grid_size: [u8; 3],
}
struct HexahedronFetch3<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
}
trait CubeFetch<T> {
fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
impl CubeFetch<NeonVector> for HexahedronFetch3<'_> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32) -> NeonVector {
let start = (x as u32 * self.x_stride + y as u32 * self.y_stride + z as u32) as usize * 3;
unsafe {
let k = self.array.get_unchecked(start..);
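// Load three consecutive f32 values: lanes 0..1 via a 64-bit load, the third into lane 0
// of the high half (lane 3 stays zero).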
let lo = vld1_f32(k.as_ptr());
let hi = vld1_lane_f32::<0>(k.get_unchecked(2..).as_ptr(), vdup_n_f32(0.));
NeonVector {
v: vcombine_f32(lo, hi),
}
}
}
}
impl<'a> CubeNeon<'a> {
pub(crate) fn new(arr: &'a [f32], grid: [u8; 3], components: usize) -> Self {
// Safety precondition: the array length must equal the full grid volume times the component count.
// Callers must uphold this invariant wherever the table is constructed.
assert_eq!(
grid[0] as usize * grid[1] as usize * grid[2] as usize * components,
arr.len()
);
let y_stride = grid[1] as u32;
let x_stride = y_stride * grid[0] as u32;
CubeNeon {
array: arr,
x_stride,
y_stride,
grid_size: grid,
}
}
#[inline(always)]
fn trilinear<
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyNegAdd<T>
+ FusedMultiplyAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
fetch: impl CubeFetch<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let x_d = T::from(lin_x * scale_x - x as f32);
let y_d = T::from(lin_y * scale_y - y as f32);
let z_d = T::from(lin_z * scale_z - z as f32);
let c000 = fetch.fetch(x, y, z);
let c100 = fetch.fetch(x_n, y, z);
let c010 = fetch.fetch(x, y_n, z);
let c110 = fetch.fetch(x_n, y_n, z);
let c001 = fetch.fetch(x, y, z_n);
let c101 = fetch.fetch(x_n, y, z_n);
let c011 = fetch.fetch(x, y_n, z_n);
let c111 = fetch.fetch(x_n, y_n, z_n);
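// Each neg_mla/mla pair is a lerp: a.neg_mla(a, t).mla(b, t) == a * (1 - t) + b * t.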
let c00 = c000.neg_mla(c000, x_d).mla(c100, x_d);
let c10 = c010.neg_mla(c010, x_d).mla(c110, x_d);
let c01 = c001.neg_mla(c001, x_d).mla(c101, x_d);
let c11 = c011.neg_mla(c011, x_d).mla(c111, x_d);
let c0 = c00.neg_mla(c00, y_d).mla(c10, y_d);
let c1 = c01.neg_mla(c01, y_d).mla(c11, y_d);
c0.neg_mla(c0, z_d).mla(c1, z_d)
}
#[cfg(feature = "options")]
#[inline]
fn pyramid<
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
fetch: impl CubeFetch<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let dr = lin_x * scale_x - x as f32;
let dg = lin_y * scale_y - y as f32;
let db = lin_z * scale_z - z as f32;
let c0 = fetch.fetch(x, y, z);
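// Pyramidal interpolation: the cell is split into three pyramidal sub-volumes; the
// comparisons below pick the one containing the offset (dr, dg, db).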
if dr > db && dg > db {
let x0 = fetch.fetch(x_n, y_n, z_n);
let x1 = fetch.fetch(x_n, y_n, z);
let x2 = fetch.fetch(x_n, y, z);
let x3 = fetch.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = fetch.fetch(x, y, z_n);
let x1 = fetch.fetch(x_n, y_n, z_n);
let x2 = fetch.fetch(x, y_n, z_n);
let x3 = fetch.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dg * db))
} else {
let x0 = fetch.fetch(x, y, z_n);
let x1 = fetch.fetch(x_n, y, z);
let x2 = fetch.fetch(x_n, y, z_n);
let x3 = fetch.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(db * dr))
}
}
#[cfg(feature = "options")]
#[inline]
fn tetra<
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
fetch: impl CubeFetch<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let rx = lin_x * scale_x - x as f32;
let ry = lin_y * scale_y - y as f32;
let rz = lin_z * scale_z - z as f32;
let c0 = fetch.fetch(x, y, z);
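// Tetrahedral interpolation: the cell decomposes into six tetrahedra; the ordering of
// (rx, ry, rz) selects the one containing the sample point.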
let c1;
let c2;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = fetch.fetch(x_n, y, z) - c0;
c2 = fetch.fetch(x_n, y_n, z) - fetch.fetch(x_n, y, z);
c3 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x_n, y_n, z);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = fetch.fetch(x_n, y, z) - c0;
c2 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x_n, y, z_n);
c3 = fetch.fetch(x_n, y, z_n) - fetch.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = fetch.fetch(x_n, y, z_n) - fetch.fetch(x, y, z_n);
c2 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x_n, y, z_n);
c3 = fetch.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = fetch.fetch(x_n, y_n, z) - fetch.fetch(x, y_n, z);
c2 = fetch.fetch(x, y_n, z) - c0;
c3 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x, y_n, z_n);
c2 = fetch.fetch(x, y_n, z) - c0;
c3 = fetch.fetch(x, y_n, z_n) - fetch.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = fetch.fetch(x_n, y_n, z_n) - fetch.fetch(x, y_n, z_n);
c2 = fetch.fetch(x, y_n, z_n) - fetch.fetch(x, y, z_n);
c3 = fetch.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, T::from(rx));
let s1 = s0.mla(c2, T::from(ry));
s1.mla(c3, T::from(rz))
}
#[cfg(feature = "options")]
#[inline]
fn prism<
T: Copy
+ From<f32>
+ Sub<T, Output = T>
+ Mul<T, Output = T>
+ Add<T, Output = T>
+ FusedMultiplyAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
fetch: impl CubeFetch<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let dr = lin_x * scale_x - x as f32;
let dg = lin_y * scale_y - y as f32;
let db = lin_z * scale_z - z as f32;
let c0 = fetch.fetch(x, y, z);
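// Prism interpolation: the cell is split into two triangular prisms across the dr/db diagonal.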
if db >= dr {
let x0 = fetch.fetch(x, y, z_n);
let x1 = fetch.fetch(x_n, y, z_n);
let x2 = fetch.fetch(x, y_n, z);
let x3 = fetch.fetch(x, y_n, z_n);
let x4 = fetch.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
} else {
let x0 = fetch.fetch(x_n, y, z);
let x1 = fetch.fetch(x_n, y, z_n);
let x2 = fetch.fetch(x, y_n, z);
let x3 = fetch.fetch(x_n, y_n, z);
let x4 = fetch.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
}
}
#[inline]
pub(crate) fn trilinear_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32) -> NeonVector {
self.trilinear(
lin_x,
lin_y,
lin_z,
HexahedronFetch3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
},
)
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn prism_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32) -> NeonVector {
self.prism(
lin_x,
lin_y,
lin_z,
HexahedronFetch3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
},
)
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn pyramid_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32) -> NeonVector {
self.pyramid(
lin_x,
lin_y,
lin_z,
HexahedronFetch3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
},
)
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn tetra_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32) -> NeonVector {
self.tetra(
lin_x,
lin_y,
lin_z,
HexahedronFetch3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
},
)
}
}

View File

@@ -0,0 +1,629 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::interpolator::NeonVector;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use crate::nd_array::lerp;
use std::arch::aarch64::{vcombine_f32, vdup_n_f32, vld1_f32, vld1_lane_f32};
use std::ops::{Add, Mul, Sub};
/// 4D CLUT helper.
///
/// Represents a hypercube.
pub(crate) struct HypercubeNeon<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
z_stride: u32,
grid_size: [u8; 4],
}
trait Fetcher4<T> {
fn fetch(&self, x: i32, y: i32, z: i32, w: i32) -> T;
}
struct Fetch4Vec3<'a> {
array: &'a [f32],
x_stride: u32,
y_stride: u32,
z_stride: u32,
}
impl Fetcher4<NeonVector> for Fetch4Vec3<'_> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32, w: i32) -> NeonVector {
let start = (x as u32 * self.x_stride
+ y as u32 * self.y_stride
+ z as u32 * self.z_stride
+ w as u32) as usize
* 3;
unsafe {
let k = self.array.get_unchecked(start..);
let lo = vld1_f32(k.as_ptr());
let hi = vld1_lane_f32::<0>(k.get_unchecked(2..).as_ptr(), vdup_n_f32(0.));
NeonVector {
v: vcombine_f32(lo, hi),
}
}
}
}
impl<'a> HypercubeNeon<'a> {
pub(crate) fn new(arr: &'a [f32], grid: [u8; 4], components: usize) -> Self {
// Safety precondition: the array length must equal the full grid volume times the component count.
// Callers must uphold this invariant wherever the table is constructed.
assert_eq!(
grid[0] as usize * grid[1] as usize * grid[2] as usize * grid[3] as usize * components,
arr.len()
);
let z_stride = grid[2] as u32;
let y_stride = z_stride * grid[1] as u32;
let x_stride = y_stride * grid[0] as u32;
HypercubeNeon {
array: arr,
x_stride,
y_stride,
z_stride,
grid_size: grid,
}
}
#[inline(always)]
fn quadlinear<
T: From<f32>
+ Add<T, Output = T>
+ Mul<T, Output = T>
+ FusedMultiplyAdd<T>
+ Sub<T, Output = T>
+ Copy
+ FusedMultiplyNegAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
r: impl Fetcher4<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let lin_w = lin_w.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let scale_w = (self.grid_size[3] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let w = (lin_w * scale_w).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let w_n = (lin_w * scale_w).ceil() as i32;
let x_d = T::from(lin_x * scale_x - x as f32);
let y_d = T::from(lin_y * scale_y - y as f32);
let z_d = T::from(lin_z * scale_z - z as f32);
let w_d = T::from(lin_w * scale_w - w as f32);
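// Quadlinear interpolation = two trilinear passes, one on the w lattice plane and one on w_n,
// followed by a final lerp along the fourth axis.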
let r_x1 = lerp(r.fetch(x, y, z, w), r.fetch(x_n, y, z, w), x_d);
let r_x2 = lerp(r.fetch(x, y_n, z, w), r.fetch(x_n, y_n, z, w), x_d);
let r_y1 = lerp(r_x1, r_x2, y_d);
let r_x3 = lerp(r.fetch(x, y, z_n, w), r.fetch(x_n, y, z_n, w), x_d);
let r_x4 = lerp(r.fetch(x, y_n, z_n, w), r.fetch(x_n, y_n, z_n, w), x_d);
let r_y2 = lerp(r_x3, r_x4, y_d);
let r_z1 = lerp(r_y1, r_y2, z_d);
let r_x1 = lerp(r.fetch(x, y, z, w_n), r.fetch(x_n, y, z, w_n), x_d);
let r_x2 = lerp(r.fetch(x, y_n, z, w_n), r.fetch(x_n, y_n, z, w_n), x_d);
let r_y1 = lerp(r_x1, r_x2, y_d);
let r_x3 = lerp(r.fetch(x, y, z_n, w_n), r.fetch(x_n, y, z_n, w_n), x_d);
let r_x4 = lerp(r.fetch(x, y_n, z_n, w_n), r.fetch(x_n, y_n, z_n, w_n), x_d);
let r_y2 = lerp(r_x3, r_x4, y_d);
let r_z2 = lerp(r_y1, r_y2, z_d);
lerp(r_z1, r_z2, w_d)
}
#[inline]
pub(crate) fn quadlinear_vec3(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
) -> NeonVector {
self.quadlinear(
lin_x,
lin_y,
lin_z,
lin_w,
Fetch4Vec3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
z_stride: self.z_stride,
},
)
}
#[cfg(feature = "options")]
#[inline(always)]
fn pyramid<
T: From<f32>
+ Add<T, Output = T>
+ Mul<T, Output = T>
+ FusedMultiplyAdd<T>
+ Sub<T, Output = T>
+ Copy
+ FusedMultiplyNegAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
r: impl Fetcher4<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let lin_w = lin_w.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let scale_w = (self.grid_size[3] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let w = (lin_w * scale_w).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let w_n = (lin_w * scale_w).ceil() as i32;
let dr = lin_x * scale_x - x as f32;
let dg = lin_y * scale_y - y as f32;
let db = lin_z * scale_z - z as f32;
let dw = lin_w * scale_w - w as f32;
let c0 = r.fetch(x, y, z, w);
let w0 = if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n, w);
let x1 = r.fetch(x_n, y_n, z, w);
let x2 = r.fetch(x_n, y, z, w);
let x3 = r.fetch(x, y_n, z, w);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n, w);
let x1 = r.fetch(x_n, y_n, z_n, w);
let x2 = r.fetch(x, y_n, z_n, w);
let x3 = r.fetch(x, y_n, z, w);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dg * db))
} else {
let x0 = r.fetch(x, y, z_n, w);
let x1 = r.fetch(x_n, y, z, w);
let x2 = r.fetch(x_n, y, z_n, w);
let x3 = r.fetch(x_n, y_n, z_n, w);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(db * dr))
};
let c0 = r.fetch(x, y, z, w_n);
let w1 = if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n, w_n);
let x1 = r.fetch(x_n, y_n, z, w_n);
let x2 = r.fetch(x_n, y, z, w_n);
let x3 = r.fetch(x, y_n, z, w_n);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n, w_n);
let x1 = r.fetch(x_n, y_n, z_n, w_n);
let x2 = r.fetch(x, y_n, z_n, w_n);
let x3 = r.fetch(x, y_n, z, w_n);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(dg * db))
} else {
let x0 = r.fetch(x, y, z_n, w_n);
let x1 = r.fetch(x_n, y, z, w_n);
let x2 = r.fetch(x_n, y, z_n, w_n);
let x3 = r.fetch(x_n, y_n, z_n, w_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
s2.mla(c4, T::from(db * dr))
};
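// Blend the two 3D pyramid results along the fourth axis: w0 * (1 - dw) + w1 * dw.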
w0.neg_mla(w0, T::from(dw)).mla(w1, T::from(dw))
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn pyramid_vec3(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
) -> NeonVector {
self.pyramid(
lin_x,
lin_y,
lin_z,
lin_w,
Fetch4Vec3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
z_stride: self.z_stride,
},
)
}
#[cfg(feature = "options")]
#[inline(always)]
fn prism<
T: From<f32>
+ Add<T, Output = T>
+ Mul<T, Output = T>
+ FusedMultiplyAdd<T>
+ Sub<T, Output = T>
+ Copy
+ FusedMultiplyNegAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
r: impl Fetcher4<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let lin_w = lin_w.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let scale_w = (self.grid_size[3] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let w = (lin_w * scale_w).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let w_n = (lin_w * scale_w).ceil() as i32;
let dr = lin_x * scale_x - x as f32;
let dg = lin_y * scale_y - y as f32;
let db = lin_z * scale_z - z as f32;
let dw = lin_w * scale_w - w as f32;
let c0 = r.fetch(x, y, z, w);
let w0 = if db >= dr {
let x0 = r.fetch(x, y, z_n, w);
let x1 = r.fetch(x_n, y, z_n, w);
let x2 = r.fetch(x, y_n, z, w);
let x3 = r.fetch(x, y_n, z_n, w);
let x4 = r.fetch(x_n, y_n, z_n, w);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
} else {
let x0 = r.fetch(x_n, y, z, w);
let x1 = r.fetch(x_n, y, z_n, w);
let x2 = r.fetch(x, y_n, z, w);
let x3 = r.fetch(x_n, y_n, z, w);
let x4 = r.fetch(x_n, y_n, z_n, w);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
};
let c0 = r.fetch(x, y, z, w_n);
let w1 = if db >= dr {
let x0 = r.fetch(x, y, z_n, w_n);
let x1 = r.fetch(x_n, y, z_n, w_n);
let x2 = r.fetch(x, y_n, z, w_n);
let x3 = r.fetch(x, y_n, z_n, w_n);
let x4 = r.fetch(x_n, y_n, z_n, w_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
} else {
let x0 = r.fetch(x_n, y, z, w_n);
let x1 = r.fetch(x_n, y, z_n, w_n);
let x2 = r.fetch(x, y_n, z, w_n);
let x3 = r.fetch(x_n, y_n, z, w_n);
let x4 = r.fetch(x_n, y_n, z_n, w_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, T::from(db));
let s1 = s0.mla(c2, T::from(dr));
let s2 = s1.mla(c3, T::from(dg));
let s3 = s2.mla(c4, T::from(dg * db));
s3.mla(c5, T::from(dr * dg))
};
w0.neg_mla(w0, T::from(dw)).mla(w1, T::from(dw))
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn prism_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32, lin_w: f32) -> NeonVector {
self.prism(
lin_x,
lin_y,
lin_z,
lin_w,
Fetch4Vec3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
z_stride: self.z_stride,
},
)
}
#[cfg(feature = "options")]
#[inline(always)]
fn tetra<
T: From<f32>
+ Add<T, Output = T>
+ Mul<T, Output = T>
+ FusedMultiplyAdd<T>
+ Sub<T, Output = T>
+ Copy
+ FusedMultiplyNegAdd<T>,
>(
&self,
lin_x: f32,
lin_y: f32,
lin_z: f32,
lin_w: f32,
r: impl Fetcher4<T>,
) -> T {
let lin_x = lin_x.max(0.0).min(1.0);
let lin_y = lin_y.max(0.0).min(1.0);
let lin_z = lin_z.max(0.0).min(1.0);
let lin_w = lin_w.max(0.0).min(1.0);
let scale_x = (self.grid_size[0] as i32 - 1) as f32;
let scale_y = (self.grid_size[1] as i32 - 1) as f32;
let scale_z = (self.grid_size[2] as i32 - 1) as f32;
let scale_w = (self.grid_size[3] as i32 - 1) as f32;
let x = (lin_x * scale_x).floor() as i32;
let y = (lin_y * scale_y).floor() as i32;
let z = (lin_z * scale_z).floor() as i32;
let w = (lin_w * scale_w).floor() as i32;
let x_n = (lin_x * scale_x).ceil() as i32;
let y_n = (lin_y * scale_y).ceil() as i32;
let z_n = (lin_z * scale_z).ceil() as i32;
let w_n = (lin_w * scale_w).ceil() as i32;
let rx = lin_x * scale_x - x as f32;
let ry = lin_y * scale_y - y as f32;
let rz = lin_z * scale_z - z as f32;
let rw = lin_w * scale_w - w as f32;
let c0 = r.fetch(x, y, z, w);
let c1;
let c2;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z, w) - c0;
c2 = r.fetch(x_n, y_n, z, w) - r.fetch(x_n, y, z, w);
c3 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y_n, z, w);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z, w) - c0;
c2 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y, z_n, w);
c3 = r.fetch(x_n, y, z_n, w) - r.fetch(x_n, y, z, w);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n, w) - r.fetch(x, y, z_n, w);
c2 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y, z_n, w);
c3 = r.fetch(x, y, z_n, w) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z, w) - r.fetch(x, y_n, z, w);
c2 = r.fetch(x, y_n, z, w) - c0;
c3 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y_n, z, w);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x, y_n, z_n, w);
c2 = r.fetch(x, y_n, z, w) - c0;
c3 = r.fetch(x, y_n, z_n, w) - r.fetch(x, y_n, z, w);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x, y_n, z_n, w);
c2 = r.fetch(x, y_n, z_n, w) - r.fetch(x, y, z_n, w);
c3 = r.fetch(x, y, z_n, w) - c0;
}
let s0 = c0.mla(c1, T::from(rx));
let s1 = s0.mla(c2, T::from(ry));
let w0 = s1.mla(c3, T::from(rz));
let c0 = r.fetch(x, y, z, w_n);
let c1;
let c2;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z, w_n) - c0;
c2 = r.fetch(x_n, y_n, z, w_n) - r.fetch(x_n, y, z, w_n);
c3 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y_n, z, w_n);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z, w_n) - c0;
c2 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y, z_n, w_n);
c3 = r.fetch(x_n, y, z_n, w_n) - r.fetch(x_n, y, z, w_n);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n, w_n) - r.fetch(x, y, z_n, w_n);
c2 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y, z_n, w_n);
c3 = r.fetch(x, y, z_n, w_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z, w_n) - r.fetch(x, y_n, z, w_n);
c2 = r.fetch(x, y_n, z, w_n) - c0;
c3 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y_n, z, w_n);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x, y_n, z_n, w_n);
c2 = r.fetch(x, y_n, z, w_n) - c0;
c3 = r.fetch(x, y_n, z_n, w_n) - r.fetch(x, y_n, z, w_n);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x, y_n, z_n, w_n);
c2 = r.fetch(x, y_n, z_n, w_n) - r.fetch(x, y, z_n, w_n);
c3 = r.fetch(x, y, z_n, w_n) - c0;
}
let s0 = c0.mla(c1, T::from(rx));
let s1 = s0.mla(c2, T::from(ry));
let w1 = s1.mla(c3, T::from(rz));
w0.neg_mla(w0, T::from(rw)).mla(w1, T::from(rw))
}
#[cfg(feature = "options")]
#[inline]
pub(crate) fn tetra_vec3(&self, lin_x: f32, lin_y: f32, lin_z: f32, lin_w: f32) -> NeonVector {
self.tetra(
lin_x,
lin_y,
lin_z,
lin_w,
Fetch4Vec3 {
array: self.array,
x_stride: self.x_stride,
y_stride: self.y_stride,
z_stride: self.z_stride,
},
)
}
}

View File

@@ -0,0 +1,905 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#![allow(dead_code)]
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::neon::rgb_xyz::NeonAlignedF32;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
use std::ops::{Add, Mul, Sub};
pub(crate) struct TetrahedralNeon<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [NeonAlignedF32],
}
pub(crate) struct PyramidalNeon<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [NeonAlignedF32],
}
pub(crate) struct TrilinearNeon<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [NeonAlignedF32],
}
pub(crate) struct PyramidalNeonDouble<'a, const GRID_SIZE: usize> {
pub(crate) cube0: &'a [NeonAlignedF32],
pub(crate) cube1: &'a [NeonAlignedF32],
}
pub(crate) struct PrismaticNeonDouble<'a, const GRID_SIZE: usize> {
pub(crate) cube0: &'a [NeonAlignedF32],
pub(crate) cube1: &'a [NeonAlignedF32],
}
pub(crate) struct TrilinearNeonDouble<'a, const GRID_SIZE: usize> {
pub(crate) cube0: &'a [NeonAlignedF32],
pub(crate) cube1: &'a [NeonAlignedF32],
}
pub(crate) struct TetrahedralNeonDouble<'a, const GRID_SIZE: usize> {
pub(crate) cube0: &'a [NeonAlignedF32],
pub(crate) cube1: &'a [NeonAlignedF32],
}
pub(crate) struct PrismaticNeon<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [NeonAlignedF32],
}
trait Fetcher<T> {
fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
struct TetrahedralNeonFetchVector<'a, const GRID_SIZE: usize> {
cube: &'a [NeonAlignedF32],
}
struct TetrahedralNeonFetchVectorDouble<'a, const GRID_SIZE: usize> {
cube0: &'a [NeonAlignedF32],
cube1: &'a [NeonAlignedF32],
}
#[derive(Copy, Clone)]
pub(crate) struct NeonVector {
pub(crate) v: float32x4_t,
}
#[derive(Copy, Clone)]
pub(crate) struct NeonVectorDouble {
pub(crate) v0: float32x4_t,
pub(crate) v1: float32x4_t,
}
impl From<f32> for NeonVector {
#[inline(always)]
fn from(v: f32) -> Self {
NeonVector {
v: unsafe { vdupq_n_f32(v) },
}
}
}
impl From<f32> for NeonVectorDouble {
#[inline(always)]
fn from(v: f32) -> Self {
NeonVectorDouble {
v0: unsafe { vdupq_n_f32(v) },
v1: unsafe { vdupq_n_f32(v) },
}
}
}
impl Sub<NeonVector> for NeonVector {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: NeonVector) -> Self::Output {
NeonVector {
v: unsafe { vsubq_f32(self.v, rhs.v) },
}
}
}
impl Mul<NeonVector> for NeonVector {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: NeonVector) -> Self::Output {
NeonVector {
v: unsafe { vmulq_f32(self.v, rhs.v) },
}
}
}
impl Sub<NeonVectorDouble> for NeonVectorDouble {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: NeonVectorDouble) -> Self::Output {
NeonVectorDouble {
v0: unsafe { vsubq_f32(self.v0, rhs.v0) },
v1: unsafe { vsubq_f32(self.v1, rhs.v1) },
}
}
}
impl Mul<NeonVectorDouble> for NeonVectorDouble {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: NeonVectorDouble) -> Self::Output {
NeonVectorDouble {
v0: unsafe { vmulq_f32(self.v0, rhs.v0) },
v1: unsafe { vmulq_f32(self.v1, rhs.v1) },
}
}
}
impl Add<NeonVector> for NeonVector {
type Output = Self;
#[inline(always)]
fn add(self, rhs: NeonVector) -> Self::Output {
NeonVector {
v: unsafe { vaddq_f32(self.v, rhs.v) },
}
}
}
impl Add<NeonVectorDouble> for NeonVectorDouble {
type Output = Self;
#[inline(always)]
fn add(self, rhs: NeonVectorDouble) -> Self::Output {
NeonVectorDouble {
v0: unsafe { vaddq_f32(self.v0, rhs.v0) },
v1: unsafe { vaddq_f32(self.v1, rhs.v1) },
}
}
}
impl FusedMultiplyAdd<NeonVector> for NeonVector {
#[inline(always)]
fn mla(&self, b: NeonVector, c: NeonVector) -> NeonVector {
NeonVector {
v: unsafe { vfmaq_f32(self.v, b.v, c.v) },
}
}
}
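// vfmsq_f32(a, b, c) computes a - b * c (fused multiply-subtract), which is what
// neg_mla relies on to form a * (1 - t) style terms.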
impl FusedMultiplyNegAdd<NeonVector> for NeonVector {
#[inline(always)]
fn neg_mla(&self, b: NeonVector, c: NeonVector) -> NeonVector {
NeonVector {
v: unsafe { vfmsq_f32(self.v, b.v, c.v) },
}
}
}
impl NeonVectorDouble {
#[inline(always)]
fn neg_mla(&self, b: NeonVectorDouble, c: NeonVectorDouble) -> NeonVectorDouble {
NeonVectorDouble {
v0: unsafe { vfmsq_f32(self.v0, b.v0, c.v0) },
v1: unsafe { vfmsq_f32(self.v1, b.v1, c.v1) },
}
}
#[inline(always)]
fn mla(&self, b: NeonVectorDouble, c: NeonVector) -> NeonVectorDouble {
NeonVectorDouble {
v0: unsafe { vfmaq_f32(self.v0, b.v0, c.v) },
v1: unsafe { vfmaq_f32(self.v1, b.v1, c.v) },
}
}
#[inline(always)]
pub(crate) fn split(self) -> (NeonVector, NeonVector) {
(NeonVector { v: self.v0 }, NeonVector { v: self.v1 })
}
}
impl<const GRID_SIZE: usize> Fetcher<NeonVector> for TetrahedralNeonFetchVector<'_, GRID_SIZE> {
fn fetch(&self, x: i32, y: i32, z: i32) -> NeonVector {
let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
+ y as u32 * GRID_SIZE as u32
+ z as u32) as usize;
let jx = unsafe { self.cube.get_unchecked(offset..) };
NeonVector {
v: unsafe { vld1q_f32(jx.as_ptr() as *const f32) },
}
}
}
impl<const GRID_SIZE: usize> Fetcher<NeonVectorDouble>
for TetrahedralNeonFetchVectorDouble<'_, GRID_SIZE>
{
fn fetch(&self, x: i32, y: i32, z: i32) -> NeonVectorDouble {
let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
+ y as u32 * GRID_SIZE as u32
+ z as u32) as usize;
let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
NeonVectorDouble {
v0: unsafe { vld1q_f32(jx0.as_ptr() as *const f32) },
v1: unsafe { vld1q_f32(jx1.as_ptr() as *const f32) },
}
}
}
pub(crate) trait NeonMdInterpolation<'a, const GRID_SIZE: usize> {
fn new(table: &'a [NeonAlignedF32]) -> Self;
fn inter3_neon<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> NeonVector;
}
pub(crate) trait NeonMdInterpolationDouble<'a, const GRID_SIZE: usize> {
fn new(table0: &'a [NeonAlignedF32], table1: &'a [NeonAlignedF32]) -> Self;
fn inter3_neon<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> (NeonVector, NeonVector);
}
impl<const GRID_SIZE: usize> TetrahedralNeon<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<NeonVector>,
) -> NeonVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
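// Each BarycentricWeight entry precomputes the cell bounds (x, x_n) and the fractional
// weight w for one input bin, so no floor/ceil math is needed per pixel.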
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
let c0 = r.fetch(x, y, z);
let c1;
let c2;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, NeonVector::from(rx));
let s1 = s0.mla(c2, NeonVector::from(ry));
s1.mla(c3, NeonVector::from(rz))
}
}
impl<const GRID_SIZE: usize> TetrahedralNeonDouble<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<NeonVectorDouble>,
) -> (NeonVector, NeonVector) {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
let c0 = r.fetch(x, y, z);
let c1;
let c2;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, NeonVector::from(rx));
let s1 = s0.mla(c2, NeonVector::from(ry));
s1.mla(c3, NeonVector::from(rz)).split()
}
}
macro_rules! define_md_inter_neon {
($interpolator: ident) => {
impl<'a, const GRID_SIZE: usize> NeonMdInterpolation<'a, GRID_SIZE>
for $interpolator<'a, GRID_SIZE>
{
#[inline(always)]
fn new(table: &'a [NeonAlignedF32]) -> Self {
Self { cube: table }
}
#[inline(always)]
fn inter3_neon<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> NeonVector {
self.interpolate(
in_r,
in_g,
in_b,
lut,
TetrahedralNeonFetchVector::<GRID_SIZE> { cube: self.cube },
)
}
}
};
}
macro_rules! define_md_inter_neon_d {
($interpolator: ident) => {
impl<'a, const GRID_SIZE: usize> NeonMdInterpolationDouble<'a, GRID_SIZE>
for $interpolator<'a, GRID_SIZE>
{
#[inline(always)]
fn new(table0: &'a [NeonAlignedF32], table1: &'a [NeonAlignedF32]) -> Self {
Self {
cube0: table0,
cube1: table1,
}
}
#[inline(always)]
fn inter3_neon<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> (NeonVector, NeonVector) {
self.interpolate(
in_r,
in_g,
in_b,
lut,
TetrahedralNeonFetchVectorDouble::<GRID_SIZE> {
cube0: self.cube0,
cube1: self.cube1,
},
)
}
}
};
}
define_md_inter_neon!(TetrahedralNeon);
define_md_inter_neon!(PyramidalNeon);
define_md_inter_neon!(PrismaticNeon);
define_md_inter_neon!(TrilinearNeon);
define_md_inter_neon_d!(PrismaticNeonDouble);
define_md_inter_neon_d!(PyramidalNeonDouble);
define_md_inter_neon_d!(TetrahedralNeonDouble);
define_md_inter_neon_d!(TrilinearNeonDouble);
impl<const GRID_SIZE: usize> PyramidalNeon<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<NeonVector>,
) -> NeonVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n);
let x1 = r.fetch(x_n, y_n, z);
let x2 = r.fetch(x_n, y, z);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, NeonVector::from(db));
let s1 = s0.mla(c2, NeonVector::from(dr));
let s2 = s1.mla(c3, NeonVector::from(dg));
s2.mla(c4, NeonVector::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y_n, z_n);
let x2 = r.fetch(x, y_n, z_n);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, NeonVector::from(db));
let s1 = s0.mla(c2, NeonVector::from(dr));
let s2 = s1.mla(c3, NeonVector::from(dg));
s2.mla(c4, NeonVector::from(dg * db))
} else {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z);
let x2 = r.fetch(x_n, y, z_n);
let x3 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, NeonVector::from(db));
let s1 = s0.mla(c2, NeonVector::from(dr));
let s2 = s1.mla(c3, NeonVector::from(dg));
s2.mla(c4, NeonVector::from(db * dr))
}
}
}
impl<const GRID_SIZE: usize> PyramidalNeonDouble<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<NeonVectorDouble>,
) -> (NeonVector, NeonVector) {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
let w0 = NeonVector::from(db);
let w1 = NeonVector::from(dr);
let w2 = NeonVector::from(dg);
if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n);
let x1 = r.fetch(x_n, y_n, z);
let x2 = r.fetch(x_n, y, z);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let w3 = NeonVector::from(dr * dg);
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y_n, z_n);
let x2 = r.fetch(x, y_n, z_n);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let w3 = NeonVector::from(dg * db);
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
} else {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z);
let x2 = r.fetch(x_n, y, z_n);
let x3 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let w3 = NeonVector::from(db * dr);
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
}
}
}
impl<const GRID_SIZE: usize> PrismaticNeon<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<NeonVector>,
) -> NeonVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
if db > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x, y_n, z_n);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, NeonVector::from(db));
let s1 = s0.mla(c2, NeonVector::from(dr));
let s2 = s1.mla(c3, NeonVector::from(dg));
let s3 = s2.mla(c4, NeonVector::from(dg * db));
s3.mla(c5, NeonVector::from(dr * dg))
} else {
let x0 = r.fetch(x_n, y, z);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x_n, y_n, z);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, NeonVector::from(db));
let s1 = s0.mla(c2, NeonVector::from(dr));
let s2 = s1.mla(c3, NeonVector::from(dg));
let s3 = s2.mla(c4, NeonVector::from(dg * db));
s3.mla(c5, NeonVector::from(dr * dg))
}
}
}
impl<const GRID_SIZE: usize> PrismaticNeonDouble<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
rv: impl Fetcher<NeonVectorDouble>,
) -> (NeonVector, NeonVector) {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = rv.fetch(x, y, z);
let w0 = NeonVector::from(db);
let w1 = NeonVector::from(dr);
let w2 = NeonVector::from(dg);
let w3 = NeonVector::from(dg * db);
let w4 = NeonVector::from(dr * dg);
if db > dr {
let x0 = rv.fetch(x, y, z_n);
let x1 = rv.fetch(x_n, y, z_n);
let x2 = rv.fetch(x, y_n, z);
let x3 = rv.fetch(x, y_n, z_n);
let x4 = rv.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
let s3 = s2.mla(c4, w3);
s3.mla(c5, w4).split()
} else {
let x0 = rv.fetch(x_n, y, z);
let x1 = rv.fetch(x_n, y, z_n);
let x2 = rv.fetch(x, y_n, z);
let x3 = rv.fetch(x_n, y_n, z);
let x4 = rv.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
let s3 = s2.mla(c4, w3);
s3.mla(c5, w4).split()
}
}
}
impl<const GRID_SIZE: usize> TrilinearNeonDouble<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<NeonVectorDouble>,
) -> (NeonVector, NeonVector) {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let w0 = NeonVector::from(dr);
let w1 = NeonVector::from(dg);
let w2 = NeonVector::from(db);
let c000 = r.fetch(x, y, z);
let c100 = r.fetch(x_n, y, z);
let c010 = r.fetch(x, y_n, z);
let c110 = r.fetch(x_n, y_n, z);
let c001 = r.fetch(x, y, z_n);
let c101 = r.fetch(x_n, y, z_n);
let c011 = r.fetch(x, y_n, z_n);
let c111 = r.fetch(x_n, y_n, z_n);
let dx = NeonVectorDouble::from(dr);
let c00 = c000.neg_mla(c000, dx).mla(c100, w0);
let c10 = c010.neg_mla(c010, dx).mla(c110, w0);
let c01 = c001.neg_mla(c001, dx).mla(c101, w0);
let c11 = c011.neg_mla(c011, dx).mla(c111, w0);
let dy = NeonVectorDouble::from(dg);
let c0 = c00.neg_mla(c00, dy).mla(c10, w1);
let c1 = c01.neg_mla(c01, dy).mla(c11, w1);
let dz = NeonVectorDouble::from(db);
c0.neg_mla(c0, dz).mla(c1, w2).split()
}
}
impl<const GRID_SIZE: usize> TrilinearNeon<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<NeonVector>,
) -> NeonVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let w0 = NeonVector::from(dr);
let w1 = NeonVector::from(dg);
let w2 = NeonVector::from(db);
let c000 = r.fetch(x, y, z);
let c100 = r.fetch(x_n, y, z);
let c010 = r.fetch(x, y_n, z);
let c110 = r.fetch(x_n, y_n, z);
let c001 = r.fetch(x, y, z_n);
let c101 = r.fetch(x_n, y, z_n);
let c011 = r.fetch(x, y_n, z_n);
let c111 = r.fetch(x_n, y_n, z_n);
let dx = NeonVector::from(dr);
let c00 = c000.neg_mla(c000, dx).mla(c100, w0);
let c10 = c010.neg_mla(c010, dx).mla(c110, w0);
let c01 = c001.neg_mla(c001, dx).mla(c101, w0);
let c11 = c011.neg_mla(c011, dx).mla(c111, w0);
let dy = NeonVector::from(dg);
let c0 = c00.neg_mla(c00, dy).mla(c10, w1);
let c1 = c01.neg_mla(c01, dy).mla(c11, w1);
let dz = NeonVector::from(db);
c0.neg_mla(c0, dz).mla(c1, w2)
}
}

View File

@@ -0,0 +1,947 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::interpolator::BarycentricWeight;
use crate::math::FusedMultiplyAdd;
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
use std::ops::{Add, Mul, Sub};
#[repr(align(8), C)]
pub(crate) struct NeonAlignedI16x4(pub(crate) [i16; 4]);
#[cfg(feature = "options")]
pub(crate) struct TetrahedralNeonQ0_15<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [NeonAlignedI16x4],
}
#[cfg(feature = "options")]
pub(crate) struct PyramidalNeonQ0_15<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [NeonAlignedI16x4],
}
pub(crate) struct TrilinearNeonQ0_15<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [NeonAlignedI16x4],
}
#[cfg(feature = "options")]
pub(crate) struct PyramidalNeonQ0_15Double<'a, const GRID_SIZE: usize> {
pub(crate) cube0: &'a [NeonAlignedI16x4],
pub(crate) cube1: &'a [NeonAlignedI16x4],
}
#[cfg(feature = "options")]
pub(crate) struct PrismaticNeonQ0_15Double<'a, const GRID_SIZE: usize> {
pub(crate) cube0: &'a [NeonAlignedI16x4],
pub(crate) cube1: &'a [NeonAlignedI16x4],
}
pub(crate) struct TrilinearNeonQ0_15Double<'a, const GRID_SIZE: usize> {
pub(crate) cube0: &'a [NeonAlignedI16x4],
pub(crate) cube1: &'a [NeonAlignedI16x4],
}
#[cfg(feature = "options")]
pub(crate) struct TetrahedralNeonQ0_15Double<'a, const GRID_SIZE: usize> {
pub(crate) cube0: &'a [NeonAlignedI16x4],
pub(crate) cube1: &'a [NeonAlignedI16x4],
}
#[cfg(feature = "options")]
pub(crate) struct PrismaticNeonQ0_15<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [NeonAlignedI16x4],
}
trait Fetcher<T> {
fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
struct TetrahedralNeonQ0_15FetchVector<'a, const GRID_SIZE: usize> {
cube: &'a [NeonAlignedI16x4],
}
struct TetrahedralNeonQ0_15FetchVectorDouble<'a, const GRID_SIZE: usize> {
cube0: &'a [NeonAlignedI16x4],
cube1: &'a [NeonAlignedI16x4],
}
#[derive(Copy, Clone)]
pub(crate) struct NeonVectorQ0_15 {
pub(crate) v: int16x4_t,
}
#[derive(Copy, Clone)]
pub(crate) struct NeonVectorQ0_15Double {
pub(crate) v: int16x8_t,
}
impl From<i16> for NeonVectorQ0_15 {
#[inline(always)]
fn from(v: i16) -> Self {
NeonVectorQ0_15 {
v: unsafe { vdup_n_s16(v) },
}
}
}
impl From<i16> for NeonVectorQ0_15Double {
#[inline(always)]
fn from(v: i16) -> Self {
NeonVectorQ0_15Double {
v: unsafe { vdupq_n_s16(v) },
}
}
}
impl Sub<NeonVectorQ0_15> for NeonVectorQ0_15 {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: NeonVectorQ0_15) -> Self::Output {
NeonVectorQ0_15 {
v: unsafe { vsub_s16(self.v, rhs.v) },
}
}
}
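// Q0.15 product: vqrdmulh_s16 yields (2*a*b + (1 << 15)) >> 16 with
// saturation, i.e. a rounded fixed-point multiply. For example, 0.5 * 0.5:
// (2*16384*16384 + 32768) >> 16 = 8192, which is 0.25 in Q0.15.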
impl Mul<NeonVectorQ0_15> for NeonVectorQ0_15 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: NeonVectorQ0_15) -> Self::Output {
NeonVectorQ0_15 {
v: unsafe { vqrdmulh_s16(self.v, rhs.v) },
}
}
}
impl Sub<NeonVectorQ0_15Double> for NeonVectorQ0_15Double {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: NeonVectorQ0_15Double) -> Self::Output {
NeonVectorQ0_15Double {
v: unsafe { vsubq_s16(self.v, rhs.v) },
}
}
}
impl Mul<NeonVectorQ0_15Double> for NeonVectorQ0_15Double {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: NeonVectorQ0_15Double) -> Self::Output {
NeonVectorQ0_15Double {
v: unsafe { vqrdmulhq_s16(self.v, rhs.v) },
}
}
}
impl Add<NeonVectorQ0_15> for NeonVectorQ0_15 {
type Output = Self;
#[inline(always)]
fn add(self, rhs: NeonVectorQ0_15) -> Self::Output {
NeonVectorQ0_15 {
v: unsafe { vadd_s16(self.v, rhs.v) },
}
}
}
impl Add<NeonVectorQ0_15Double> for NeonVectorQ0_15Double {
type Output = Self;
#[inline(always)]
fn add(self, rhs: NeonVectorQ0_15Double) -> Self::Output {
NeonVectorQ0_15Double {
v: unsafe { vaddq_s16(self.v, rhs.v) },
}
}
}
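// Fused Q0.15 accumulate: mla computes self + b*c (SQRDMLAH) and neg_mla
// computes self - b*c (SQRDMLSH); both need the Armv8.1 "rdm" extension,
// which the factory verifies at runtime before selecting this path.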
impl FusedMultiplyAdd<NeonVectorQ0_15> for NeonVectorQ0_15 {
#[inline(always)]
fn mla(&self, b: NeonVectorQ0_15, c: NeonVectorQ0_15) -> NeonVectorQ0_15 {
NeonVectorQ0_15 {
v: unsafe { vqrdmlah_s16(self.v, b.v, c.v) },
}
}
}
impl NeonVectorQ0_15 {
#[inline(always)]
fn neg_mla(&self, b: NeonVectorQ0_15, c: NeonVectorQ0_15) -> NeonVectorQ0_15 {
NeonVectorQ0_15 {
v: unsafe { vqrdmlsh_s16(self.v, b.v, c.v) },
}
}
}
impl NeonVectorQ0_15Double {
#[inline(always)]
fn neg_mla(&self, b: NeonVectorQ0_15Double, c: NeonVectorQ0_15Double) -> NeonVectorQ0_15Double {
NeonVectorQ0_15Double {
v: unsafe { vqrdmlshq_s16(self.v, b.v, c.v) },
}
}
}
impl NeonVectorQ0_15Double {
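    // The 4-lane Q0.15 weight is broadcast to both halves of the paired vector.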
#[inline(always)]
fn mla(&self, b: NeonVectorQ0_15Double, c: NeonVectorQ0_15) -> NeonVectorQ0_15Double {
NeonVectorQ0_15Double {
v: unsafe { vqrdmlahq_s16(self.v, b.v, vcombine_s16(c.v, c.v)) },
}
}
#[inline(always)]
pub(crate) fn split(self) -> (NeonVectorQ0_15, NeonVectorQ0_15) {
unsafe {
(
NeonVectorQ0_15 {
v: vget_low_s16(self.v),
},
NeonVectorQ0_15 {
v: vget_high_s16(self.v),
},
)
}
}
}
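// Fetches one lattice point from a GRID_SIZE^3 CLUT laid out in row-major
// (x-major) order: offset = x*N^2 + y*N + z.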
impl<const GRID_SIZE: usize> Fetcher<NeonVectorQ0_15>
for TetrahedralNeonQ0_15FetchVector<'_, GRID_SIZE>
{
fn fetch(&self, x: i32, y: i32, z: i32) -> NeonVectorQ0_15 {
let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
+ y as u32 * GRID_SIZE as u32
+ z as u32) as usize;
let jx = unsafe { self.cube.get_unchecked(offset..) };
NeonVectorQ0_15 {
v: unsafe { vld1_s16(jx.as_ptr() as *const i16) },
}
}
}
impl<const GRID_SIZE: usize> Fetcher<NeonVectorQ0_15Double>
for TetrahedralNeonQ0_15FetchVectorDouble<'_, GRID_SIZE>
{
fn fetch(&self, x: i32, y: i32, z: i32) -> NeonVectorQ0_15Double {
let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
+ y as u32 * GRID_SIZE as u32
+ z as u32) as usize;
let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
NeonVectorQ0_15Double {
v: unsafe {
vcombine_s16(
vld1_s16(jx0.as_ptr() as *const i16),
vld1_s16(jx1.as_ptr() as *const i16),
)
},
}
}
}
pub(crate) trait NeonMdInterpolationQ0_15<'a, const GRID_SIZE: usize> {
fn new(table: &'a [NeonAlignedI16x4]) -> Self;
fn inter3_neon<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
) -> NeonVectorQ0_15;
}
pub(crate) trait NeonMdInterpolationQ0_15Double<'a, const GRID_SIZE: usize> {
fn new(table0: &'a [NeonAlignedI16x4], table1: &'a [NeonAlignedI16x4]) -> Self;
fn inter3_neon<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
) -> (NeonVectorQ0_15, NeonVectorQ0_15);
}
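// Tetrahedral interpolation: ordering the fractional parts (rx, ry, rz)
// selects one of the six tetrahedra that partition the lattice cell; the
// result is then accumulated as c0 + c1*rx + c2*ry + c3*rz.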
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralNeonQ0_15<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<NeonVectorQ0_15>,
) -> NeonVectorQ0_15 {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
let c0 = r.fetch(x, y, z);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if rx >= rz {
                //rx >= rz && rz > ry
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, NeonVectorQ0_15::from(rx));
let s1 = s0.mla(c2, NeonVectorQ0_15::from(ry));
s1.mla(c3, NeonVectorQ0_15::from(rz))
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralNeonQ0_15Double<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<NeonVectorQ0_15Double>,
) -> (NeonVectorQ0_15, NeonVectorQ0_15) {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
let c0 = r.fetch(x, y, z);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if rx >= rz {
                //rx >= rz && rz > ry
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, NeonVectorQ0_15::from(rx));
let s1 = s0.mla(c2, NeonVectorQ0_15::from(ry));
s1.mla(c3, NeonVectorQ0_15::from(rz)).split()
}
}
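// Every interpolator kind shares the same fetch-based entry points, so the
// trait impls are stamped out by macro for the single- and double-cube cases.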
macro_rules! define_md_inter_neon {
($interpolator: ident) => {
impl<'a, const GRID_SIZE: usize> NeonMdInterpolationQ0_15<'a, GRID_SIZE>
for $interpolator<'a, GRID_SIZE>
{
#[inline(always)]
fn new(table: &'a [NeonAlignedI16x4]) -> Self {
Self { cube: table }
}
#[inline(always)]
fn inter3_neon<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
) -> NeonVectorQ0_15 {
self.interpolate(
in_r,
in_g,
in_b,
lut,
TetrahedralNeonQ0_15FetchVector::<GRID_SIZE> { cube: self.cube },
)
}
}
};
}
macro_rules! define_md_inter_neon_d {
($interpolator: ident) => {
impl<'a, const GRID_SIZE: usize> NeonMdInterpolationQ0_15Double<'a, GRID_SIZE>
for $interpolator<'a, GRID_SIZE>
{
#[inline(always)]
fn new(table0: &'a [NeonAlignedI16x4], table1: &'a [NeonAlignedI16x4]) -> Self {
Self {
cube0: table0,
cube1: table1,
}
}
#[inline(always)]
fn inter3_neon<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
) -> (NeonVectorQ0_15, NeonVectorQ0_15) {
self.interpolate(
in_r,
in_g,
in_b,
lut,
TetrahedralNeonQ0_15FetchVectorDouble::<GRID_SIZE> {
cube0: self.cube0,
cube1: self.cube1,
},
)
}
}
};
}
#[cfg(feature = "options")]
define_md_inter_neon!(TetrahedralNeonQ0_15);
#[cfg(feature = "options")]
define_md_inter_neon!(PyramidalNeonQ0_15);
#[cfg(feature = "options")]
define_md_inter_neon!(PrismaticNeonQ0_15);
define_md_inter_neon!(TrilinearNeonQ0_15);
#[cfg(feature = "options")]
define_md_inter_neon_d!(PrismaticNeonQ0_15Double);
#[cfg(feature = "options")]
define_md_inter_neon_d!(PyramidalNeonQ0_15Double);
#[cfg(feature = "options")]
define_md_inter_neon_d!(TetrahedralNeonQ0_15Double);
define_md_inter_neon_d!(TrilinearNeonQ0_15Double);
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalNeonQ0_15<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<NeonVectorQ0_15>,
) -> NeonVectorQ0_15 {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
let w0 = NeonVectorQ0_15::from(db);
let w1 = NeonVectorQ0_15::from(dr);
let w2 = NeonVectorQ0_15::from(dg);
if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n);
let x1 = r.fetch(x_n, y_n, z);
let x2 = r.fetch(x_n, y, z);
let x3 = r.fetch(x, y_n, z);
let w3 = w1 * w2;
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3)
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y_n, z_n);
let x2 = r.fetch(x, y_n, z_n);
let x3 = r.fetch(x, y_n, z);
let w3 = w2 * w0;
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3)
} else {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z);
let x2 = r.fetch(x_n, y, z_n);
let x3 = r.fetch(x_n, y_n, z_n);
let w3 = w0 * w1;
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3)
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalNeonQ0_15Double<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<NeonVectorQ0_15Double>,
) -> (NeonVectorQ0_15, NeonVectorQ0_15) {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
let w0 = NeonVectorQ0_15::from(db);
let w1 = NeonVectorQ0_15::from(dr);
let w2 = NeonVectorQ0_15::from(dg);
if dr > db && dg > db {
let w3 = NeonVectorQ0_15::from(dr) * NeonVectorQ0_15::from(dg);
let x0 = r.fetch(x_n, y_n, z_n);
let x1 = r.fetch(x_n, y_n, z);
let x2 = r.fetch(x_n, y, z);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
} else if db > dr && dg > dr {
let w3 = NeonVectorQ0_15::from(dg) * NeonVectorQ0_15::from(db);
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y_n, z_n);
let x2 = r.fetch(x, y_n, z_n);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
} else {
let w3 = NeonVectorQ0_15::from(db) * NeonVectorQ0_15::from(dr);
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z);
let x2 = r.fetch(x_n, y, z_n);
let x3 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
s2.mla(c4, w3).split()
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticNeonQ0_15<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<NeonVectorQ0_15>,
) -> NeonVectorQ0_15 {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
let w0 = NeonVectorQ0_15::from(db);
let w1 = NeonVectorQ0_15::from(dr);
let w2 = NeonVectorQ0_15::from(dg);
if db > dr {
let w3 = w2 * w0;
let w4 = w1 * w2;
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x, y_n, z_n);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
let s3 = s2.mla(c4, w3);
s3.mla(c5, w4)
} else {
let w3 = w2 * w0;
let w4 = w1 * w2;
let x0 = r.fetch(x_n, y, z);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x_n, y_n, z);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
let s3 = s2.mla(c4, w3);
s3.mla(c5, w4)
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticNeonQ0_15Double<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
rv: impl Fetcher<NeonVectorQ0_15Double>,
) -> (NeonVectorQ0_15, NeonVectorQ0_15) {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = rv.fetch(x, y, z);
let w0 = NeonVectorQ0_15::from(db);
let w1 = NeonVectorQ0_15::from(dr);
let w2 = NeonVectorQ0_15::from(dg);
let w3 = NeonVectorQ0_15::from(dg) * NeonVectorQ0_15::from(db);
let w4 = NeonVectorQ0_15::from(dr) * NeonVectorQ0_15::from(dg);
if db > dr {
let x0 = rv.fetch(x, y, z_n);
let x1 = rv.fetch(x_n, y, z_n);
let x2 = rv.fetch(x, y_n, z);
let x3 = rv.fetch(x, y_n, z_n);
let x4 = rv.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
let s3 = s2.mla(c4, w3);
s3.mla(c5, w4).split()
} else {
let x0 = rv.fetch(x_n, y, z);
let x1 = rv.fetch(x_n, y, z_n);
let x2 = rv.fetch(x, y_n, z);
let x3 = rv.fetch(x_n, y_n, z);
let x4 = rv.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, w0);
let s1 = s0.mla(c2, w1);
let s2 = s1.mla(c3, w2);
let s3 = s2.mla(c4, w3);
s3.mla(c5, w4).split()
}
}
}
impl<const GRID_SIZE: usize> TrilinearNeonQ0_15Double<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<NeonVectorQ0_15Double>,
) -> (NeonVectorQ0_15, NeonVectorQ0_15) {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let w0 = NeonVectorQ0_15::from(dr);
let w1 = NeonVectorQ0_15::from(dg);
let w2 = NeonVectorQ0_15::from(db);
let c000 = r.fetch(x, y, z);
let c100 = r.fetch(x_n, y, z);
let c010 = r.fetch(x, y_n, z);
let c110 = r.fetch(x_n, y_n, z);
let c001 = r.fetch(x, y, z_n);
let c101 = r.fetch(x_n, y, z_n);
let c011 = r.fetch(x, y_n, z_n);
let c111 = r.fetch(x_n, y_n, z_n);
let dx = NeonVectorQ0_15Double::from(dr);
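        // Each stage is a Q0.15 lerp: t.neg_mla(t, d).mla(u, d) == t*(1 - d) + u*d.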
let c00 = c000.neg_mla(c000, dx).mla(c100, w0);
let c10 = c010.neg_mla(c010, dx).mla(c110, w0);
let c01 = c001.neg_mla(c001, dx).mla(c101, w0);
let c11 = c011.neg_mla(c011, dx).mla(c111, w0);
let dy = NeonVectorQ0_15Double::from(dg);
let c0 = c00.neg_mla(c00, dy).mla(c10, w1);
let c1 = c01.neg_mla(c01, dy).mla(c11, w1);
let dz = NeonVectorQ0_15Double::from(db);
c0.neg_mla(c0, dz).mla(c1, w2).split()
}
}
impl<const GRID_SIZE: usize> TrilinearNeonQ0_15<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<NeonVectorQ0_15>,
) -> NeonVectorQ0_15 {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let w0 = NeonVectorQ0_15::from(dr);
let w1 = NeonVectorQ0_15::from(dg);
let w2 = NeonVectorQ0_15::from(db);
let c000 = r.fetch(x, y, z);
let c100 = r.fetch(x_n, y, z);
let c010 = r.fetch(x, y_n, z);
let c110 = r.fetch(x_n, y_n, z);
let c001 = r.fetch(x, y, z_n);
let c101 = r.fetch(x_n, y, z_n);
let c011 = r.fetch(x, y_n, z_n);
let c111 = r.fetch(x_n, y_n, z_n);
let dx = NeonVectorQ0_15::from(dr);
let c00 = c000.neg_mla(c000, dx).mla(c100, w0);
let c10 = c010.neg_mla(c010, dx).mla(c110, w0);
let c01 = c001.neg_mla(c001, dx).mla(c101, w0);
let c11 = c011.neg_mla(c011, dx).mla(c111, w0);
let dy = NeonVectorQ0_15::from(dg);
let c0 = c00.neg_mla(c00, dy).mla(c10, w1);
let c1 = c01.neg_mla(c01, dy).mla(c11, w1);
let dz = NeonVectorQ0_15::from(db);
c0.neg_mla(c0, dz).mla(c1, w2)
}
}

321
vendor/moxcms/src/conversions/neon/lut4_to_3.rs vendored Normal File
View File

@@ -0,0 +1,321 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::lut_transforms::Lut4x3Factory;
use crate::conversions::neon::interpolator::*;
use crate::conversions::neon::interpolator_q0_15::NeonAlignedI16x4;
use crate::conversions::neon::lut4_to_3_q0_15::TransformLut4To3NeonQ0_15;
use crate::conversions::neon::rgb_xyz::NeonAlignedF32;
use crate::transform::PointeeSizeExpressible;
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
use std::marker::PhantomData;
struct TransformLut4To3Neon<
T,
U,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
lut: Vec<NeonAlignedF32>,
_phantom: PhantomData<T>,
_phantom1: PhantomData<U>,
interpolation_method: InterpolationMethod,
weights: Box<[BarycentricWeight<f32>; BINS]>,
color_space: DataColorSpace,
is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut4To3Neon<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[allow(unused_unsafe)]
fn transform_chunk<'b, Interpolator: NeonMdInterpolationDouble<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
let grid_size = GRID_SIZE as i32;
let grid_size3 = grid_size * grid_size * grid_size;
let value_scale = unsafe { vdupq_n_f32(((1 << BIT_DEPTH) - 1) as f32) };
let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
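        // 4D strategy: the K channel picks two adjacent 3D CLUT slices (w, w_n),
        // both are sampled in one pass by the *Double interpolator, and the pair
        // is blended by the K fraction t below.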
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[0],
);
let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[1],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[2],
);
let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[3],
);
let k_weights = self.weights[k.as_()];
let w: i32 = k_weights.x;
let w_n: i32 = k_weights.x_n;
let t: f32 = k_weights.w;
let table1 = &self.lut[(w * grid_size3) as usize..];
let table2 = &self.lut[(w_n * grid_size3) as usize..];
let tetrahedral1 = Interpolator::new(table1, table2);
let (a0, b0) = tetrahedral1.inter3_neon(c, m, y, &self.weights);
let (a0, b0) = (a0.v, b0.v);
if T::FINITE {
unsafe {
let t0 = vdupq_n_f32(t);
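                    // Blend the two K slices: hp = a0*(1 - t), then v = hp + b0*t,
                    // via fused multiply-subtract/add.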
let hp = vfmsq_f32(a0, a0, t0);
let mut v = vfmaq_f32(hp, b0, t0);
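                    // Scale to the output range, clamp at the top, and convert
                    // with round-to-nearest (vcvtaq; negatives saturate to zero).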
v = vmulq_f32(v, value_scale);
v = vminq_f32(v, value_scale);
let jvx = vcvtaq_u32_f32(v);
dst[cn.r_i()] = vgetq_lane_u32::<0>(jvx).as_();
dst[cn.g_i()] = vgetq_lane_u32::<1>(jvx).as_();
dst[cn.b_i()] = vgetq_lane_u32::<2>(jvx).as_();
}
} else {
unsafe {
let t0 = vdupq_n_f32(t);
let hp = vfmsq_f32(a0, a0, t0);
let v = vfmaq_f32(hp, b0, t0);
dst[cn.r_i()] = vgetq_lane_f32::<0>(v).as_();
dst[cn.g_i()] = vgetq_lane_f32::<1>(v).as_();
dst[cn.b_i()] = vgetq_lane_f32::<2>(v).as_();
}
}
if channels == 4 {
dst[cn.a_i()] = max_value;
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut4To3Neon<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
if src.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / 4;
let dst_chunks = dst.len() / channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearNeonDouble<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralNeonDouble<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalNeonDouble<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticNeonDouble<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearNeonDouble<GRID_SIZE>>(src, dst);
}
}
}
Ok(())
}
}
pub(crate) struct NeonLut4x3Factory {}
impl Lut4x3Factory for NeonLut4x3Factory {
fn make_transform_4x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Sync + Send>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
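        // Take the Q0.15 fixed-point path only when it is safe: depths below
        // 16 bits fit a signed 16-bit fraction, and SQRDML{A,S}H requires the
        // Armv8.1 "rdm" extension, detected here at runtime.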
if options.prefer_fixed_point
&& BIT_DEPTH < 16
&& std::arch::is_aarch64_feature_detected!("rdm")
{
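            // Finite (integer) samples scale to the full bit depth; float
            // samples are quantized to 14 fractional bits, leaving headroom
            // below the signed Q0.15 limit.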
let q: f32 = if T::FINITE {
((1i32 << BIT_DEPTH as i32) - 1) as f32
} else {
((1i32 << 14i32) - 1) as f32
};
let lut = lut
.chunks_exact(3)
.map(|x| {
NeonAlignedI16x4([
(x[0] * q).round() as i16,
(x[1] * q).round() as i16,
(x[2] * q).round() as i16,
0,
])
})
.collect::<Vec<_>>();
return match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut4To3NeonQ0_15::<
T,
u8,
LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut4To3NeonQ0_15::<
T,
u16,
LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
};
}
let lut = lut
.chunks_exact(3)
.map(|x| NeonAlignedF32([x[0], x[1], x[2], 0f32]))
.collect::<Vec<_>>();
match options.barycentric_weight_scale {
BarycentricWeightScale::Low => {
Box::new(
TransformLut4To3Neon::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
},
)
}
#[cfg(feature = "options")]
BarycentricWeightScale::High => {
Box::new(
TransformLut4To3Neon::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
},
)
}
}
}
}

202
vendor/moxcms/src/conversions/neon/lut4_to_3_q0_15.rs vendored Normal File
View File

@@ -0,0 +1,202 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::neon::interpolator_q0_15::*;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
use std::marker::PhantomData;
pub(crate) struct TransformLut4To3NeonQ0_15<
T,
U,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
pub(crate) lut: Vec<NeonAlignedI16x4>,
pub(crate) _phantom: PhantomData<T>,
pub(crate) _phantom1: PhantomData<U>,
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
pub(crate) color_space: DataColorSpace,
pub(crate) is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut4To3NeonQ0_15<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
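    // Safety: requires the "rdm" target feature; the factory only constructs
    // this transform after is_aarch64_feature_detected!("rdm") succeeds.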
#[allow(unused_unsafe)]
#[target_feature(enable = "rdm")]
unsafe fn transform_chunk<'b, Interpolator: NeonMdInterpolationQ0_15Double<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
unsafe {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
let grid_size = GRID_SIZE as i32;
let grid_size3 = grid_size * grid_size * grid_size;
let f_value_scale = vdupq_n_f32(1. / ((1 << 14i32) - 1) as f32);
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
let v_max_scale = if T::FINITE {
vdup_n_s16(((1i32 << BIT_DEPTH) - 1) as i16)
} else {
vdup_n_s16(((1i32 << 14i32) - 1) as i16)
};
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[0],
);
let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[1],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[2],
);
let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[3],
);
let k_weights = self.weights[k.as_()];
let w: i32 = k_weights.x;
let w_n: i32 = k_weights.x_n;
let t: i16 = k_weights.w;
let table1 = &self.lut[(w * grid_size3) as usize..];
let table2 = &self.lut[(w_n * grid_size3) as usize..];
let tetrahedral1 = Interpolator::new(table1, table2);
let (a0, b0) = tetrahedral1.inter3_neon(c, m, y, &self.weights);
let (a0, b0) = (a0.v, b0.v);
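                // Q0.15 blend of the two K slices: hp = a0 - a0*t, v = hp + b0*t.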
let t0 = vdup_n_s16(t);
let hp = vqrdmlsh_s16(a0, a0, t0);
let mut v = vqrdmlah_s16(hp, b0, t0);
if T::FINITE {
v = vmax_s16(v, vdup_n_s16(0));
v = vmin_s16(v, v_max_scale);
dst[cn.r_i()] = (vget_lane_s16::<0>(v) as u32).as_();
dst[cn.g_i()] = (vget_lane_s16::<1>(v) as u32).as_();
dst[cn.b_i()] = (vget_lane_s16::<2>(v) as u32).as_();
} else {
let o = vcvtq_f32_s32(vmovl_s16(v));
let r = vmulq_f32(o, f_value_scale);
dst[cn.r_i()] = vgetq_lane_f32::<0>(r).as_();
dst[cn.g_i()] = vgetq_lane_f32::<1>(r).as_();
dst[cn.b_i()] = vgetq_lane_f32::<2>(r).as_();
}
if channels == 4 {
dst[cn.a_i()] = max_value;
}
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut4To3NeonQ0_15<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
if src.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / 4;
let dst_chunks = dst.len() / channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearNeonQ0_15Double<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralNeonQ0_15Double<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalNeonQ0_15Double<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticNeonQ0_15Double<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearNeonQ0_15Double<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}

55
vendor/moxcms/src/conversions/neon/mod.rs vendored Normal File
View File

@@ -0,0 +1,55 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
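// NEON (aarch64) acceleration. Modules suffixed _q0_15, _q1_30 and _q2_13 are
// fixed-point counterparts of the float paths.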
mod a_curves3;
mod a_curves4x3;
mod cube;
mod hypercube;
mod interpolator;
mod interpolator_q0_15;
mod lut4_to_3;
mod lut4_to_3_q0_15;
mod preheat_lut4x3;
mod rgb_xyz;
mod rgb_xyz_opt;
mod rgb_xyz_q1_30_opt;
mod rgb_xyz_q2_13;
mod rgb_xyz_q2_13_opt;
mod t_lut3_to_3;
mod t_lut3_to_3_q0_15;
pub(crate) use a_curves3::{ACurves3InverseNeon, ACurves3Neon, ACurves3OptimizedNeon};
pub(crate) use a_curves4x3::{ACurves4x3Neon, ACurves4x3NeonOptimizedNeon};
pub(crate) use lut4_to_3::NeonLut4x3Factory;
pub(crate) use preheat_lut4x3::Lut4x3Neon;
pub(crate) use rgb_xyz::TransformShaperRgbNeon;
pub(crate) use rgb_xyz_opt::TransformShaperRgbOptNeon;
pub(crate) use rgb_xyz_q1_30_opt::TransformShaperQ1_30NeonOpt;
pub(crate) use rgb_xyz_q2_13::TransformShaperQ2_13Neon;
pub(crate) use rgb_xyz_q2_13_opt::TransformShaperQ2_13NeonOpt;
pub(crate) use t_lut3_to_3::NeonLut3x3Factory;

129
vendor/moxcms/src/conversions/neon/preheat_lut4x3.rs vendored Normal File
View File

@@ -0,0 +1,129 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::hypercube::HypercubeNeon;
use crate::conversions::neon::interpolator::NeonVector;
use crate::trc::{lut_interp_linear_float, lut_interp_linear_float_clamped};
use crate::{CmsError, DataColorSpace, InterpolationMethod, Stage};
use std::arch::aarch64::{vdupq_n_f32, vgetq_lane_f32, vmaxq_f32, vminq_f32};
#[derive(Default)]
pub(crate) struct Lut4x3Neon {
pub(crate) linearization: [Vec<f32>; 4],
pub(crate) clut: Vec<f32>,
pub(crate) grid_size: u8,
pub(crate) output: [Vec<f32>; 3],
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) pcs: DataColorSpace,
}
impl Lut4x3Neon {
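    // Pipeline: per-channel input curves -> 4D CLUT fetch -> clamp to [0, 1]
    // -> per-channel output curves.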
fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> NeonVector>(
&self,
src: &[f32],
dst: &mut [f32],
fetch: Fetch,
) -> Result<(), CmsError> {
let linearization_0 = &self.linearization[0];
let linearization_1 = &self.linearization[1];
let linearization_2 = &self.linearization[2];
let linearization_3 = &self.linearization[3];
for (dest, src) in dst.chunks_exact_mut(3).zip(src.chunks_exact(4)) {
debug_assert!(self.grid_size as i32 >= 1);
let linear_x = lut_interp_linear_float(src[0], linearization_0);
let linear_y = lut_interp_linear_float(src[1], linearization_1);
let linear_z = lut_interp_linear_float(src[2], linearization_2);
let linear_w = lut_interp_linear_float(src[3], linearization_3);
unsafe {
let mut v = fetch(linear_x, linear_y, linear_z, linear_w).v;
v = vmaxq_f32(v, vdupq_n_f32(0.));
v = vminq_f32(v, vdupq_n_f32(1.));
let pcs_x =
lut_interp_linear_float_clamped(vgetq_lane_f32::<0>(v), &self.output[0]);
let pcs_y =
lut_interp_linear_float_clamped(vgetq_lane_f32::<1>(v), &self.output[1]);
let pcs_z =
lut_interp_linear_float_clamped(vgetq_lane_f32::<2>(v), &self.output[2]);
dest[0] = pcs_x;
dest[1] = pcs_y;
dest[2] = pcs_z;
}
}
Ok(())
}
}
macro_rules! dispatch_preheat {
($heater: ident) => {
impl Stage for $heater {
fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
let l_tbl = HypercubeNeon::new(
&self.clut,
[
self.grid_size,
self.grid_size,
self.grid_size,
self.grid_size,
],
3,
);
                // If the source PCS is Lab, (quad)linear interpolation should be used
if self.pcs == DataColorSpace::Lab {
return self
.transform_impl(src, dst, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w));
}
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.tetra_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.pyramid_vec3(x, y, z, w))?;
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_impl(src, dst, |x, y, z, w| l_tbl.prism_vec3(x, y, z, w))?
}
InterpolationMethod::Linear => {
self.transform_impl(src, dst, |x, y, z, w| {
l_tbl.quadlinear_vec3(x, y, z, w)
})?
}
}
Ok(())
}
}
};
}
dispatch_preheat!(Lut4x3Neon);

427
vendor/moxcms/src/conversions/neon/rgb_xyz.rs vendored Normal File
View File

@@ -0,0 +1,427 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::TransformMatrixShaper;
use crate::conversions::neon::rgb_xyz_q2_13::{split_by_twos, split_by_twos_mut};
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
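// Aligned scratch for vst1q_u32 stores; the gamma lookups below then read u16
// elements 0, 2 and 4, i.e. the low halves of u32 lanes 0..2 on little-endian
// aarch64.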
#[repr(align(16), C)]
pub(crate) struct NeonAlignedU16(pub(crate) [u16; 8]);
#[repr(align(16), C)]
pub(crate) struct NeonAlignedF32(pub(crate) [f32; 4]);
pub(crate) struct TransformShaperRgbNeon<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> {
pub(crate) profile: TransformMatrixShaper<T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformExecutor<T> for TransformShaperRgbNeon<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary0 = NeonAlignedU16([0; 8]);
let mut temporary1 = NeonAlignedU16([0; 8]);
let mut temporary2 = NeonAlignedU16([0; 8]);
let mut temporary3 = NeonAlignedU16([0; 8]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
let (src_chunks, src_remainder) = split_by_twos(src, src_channels);
let (dst_chunks, dst_remainder) = split_by_twos_mut(dst, dst_channels);
unsafe {
let m0 = vld1q_f32([t.v[0][0], t.v[0][1], t.v[0][2], 0.].as_ptr());
let m1 = vld1q_f32([t.v[1][0], t.v[1][1], t.v[1][2], 0.].as_ptr());
let m2 = vld1q_f32([t.v[2][0], t.v[2][1], t.v[2][2], 0.].as_ptr());
let v_scale = vdupq_n_f32(scale);
let rnd = vdupq_n_f32(0.5);
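            // The hot loop is software-pipelined: the buffer is split into two
            // halves walked in lockstep (four pixels per iteration), and the
            // next iteration's linearization loads are issued before the
            // current gamma-table writes to hide memory latency.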
if !src_chunks.is_empty() {
let (src0, src1) = src_chunks.split_at(src_chunks.len() / 2);
let (dst0, dst1) = dst_chunks.split_at_mut(dst_chunks.len() / 2);
let mut src_iter0 = src0.chunks_exact(src_channels * 2);
let mut src_iter1 = src1.chunks_exact(src_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
let (mut r2, mut g2, mut b2, mut a2);
let (mut r3, mut g3, mut b3, mut a3);
if let (Some(src0), Some(src1)) = (src_iter0.next(), src_iter1.next()) {
let r0p = &self.profile.r_linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.g_linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.b_linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.r_linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.g_linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.b_linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.r_linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.g_linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.b_linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.r_linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.g_linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.b_linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1q_dup_f32(r0p);
g0 = vld1q_dup_f32(g0p);
b0 = vld1q_dup_f32(b0p);
r1 = vld1q_dup_f32(r1p);
g1 = vld1q_dup_f32(g1p);
b1 = vld1q_dup_f32(b1p);
r2 = vld1q_dup_f32(r2p);
g2 = vld1q_dup_f32(g2p);
b2 = vld1q_dup_f32(b2p);
r3 = vld1q_dup_f32(r3p);
g3 = vld1q_dup_f32(g3p);
b3 = vld1q_dup_f32(b3p);
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = vdupq_n_f32(0.);
g0 = vdupq_n_f32(0.);
b0 = vdupq_n_f32(0.);
r1 = vdupq_n_f32(0.);
g1 = vdupq_n_f32(0.);
b1 = vdupq_n_f32(0.);
r2 = vdupq_n_f32(0.);
g2 = vdupq_n_f32(0.);
b2 = vdupq_n_f32(0.);
r3 = vdupq_n_f32(0.);
g3 = vdupq_n_f32(0.);
b3 = vdupq_n_f32(0.);
a0 = max_colors;
a1 = max_colors;
a2 = max_colors;
a3 = max_colors;
}
for (((src0, src1), dst0), dst1) in src_iter0
.zip(src_iter1)
.zip(dst0.chunks_exact_mut(dst_channels * 2))
.zip(dst1.chunks_exact_mut(dst_channels * 2))
{
let v0_0 = vmulq_f32(r0, m0);
let v0_1 = vmulq_f32(r1, m0);
let v0_2 = vmulq_f32(r2, m0);
let v0_3 = vmulq_f32(r3, m0);
let v1_0 = vfmaq_f32(v0_0, g0, m1);
let v1_1 = vfmaq_f32(v0_1, g1, m1);
let v1_2 = vfmaq_f32(v0_2, g2, m1);
let v1_3 = vfmaq_f32(v0_3, g3, m1);
let mut vr0 = vfmaq_f32(v1_0, b0, m2);
let mut vr1 = vfmaq_f32(v1_1, b1, m2);
let mut vr2 = vfmaq_f32(v1_2, b2, m2);
let mut vr3 = vfmaq_f32(v1_3, b3, m2);
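                    // Scale into the gamma LUT range and add 0.5 so the truncating
                    // float->u32 conversion below rounds to nearest.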
vr0 = vfmaq_f32(rnd, vr0, v_scale);
vr1 = vfmaq_f32(rnd, vr1, v_scale);
vr2 = vfmaq_f32(rnd, vr2, v_scale);
vr3 = vfmaq_f32(rnd, vr3, v_scale);
vr0 = vminq_f32(vr0, v_scale);
vr1 = vminq_f32(vr1, v_scale);
vr2 = vminq_f32(vr2, v_scale);
vr3 = vminq_f32(vr3, v_scale);
let zx0 = vcvtq_u32_f32(vr0);
let zx1 = vcvtq_u32_f32(vr1);
let zx2 = vcvtq_u32_f32(vr2);
let zx3 = vcvtq_u32_f32(vr3);
vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx0);
vst1q_u32(temporary1.0.as_mut_ptr() as *mut _, zx1);
vst1q_u32(temporary2.0.as_mut_ptr() as *mut _, zx2);
vst1q_u32(temporary3.0.as_mut_ptr() as *mut _, zx3);
let r0p = &self.profile.r_linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.g_linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.b_linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.r_linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.g_linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.b_linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.r_linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.g_linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.b_linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.r_linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.g_linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.b_linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1q_dup_f32(r0p);
g0 = vld1q_dup_f32(g0p);
b0 = vld1q_dup_f32(b0p);
r1 = vld1q_dup_f32(r1p);
g1 = vld1q_dup_f32(g1p);
b1 = vld1q_dup_f32(b1p);
r2 = vld1q_dup_f32(r2p);
g2 = vld1q_dup_f32(g2p);
b2 = vld1q_dup_f32(b2p);
r3 = vld1q_dup_f32(r3p);
g3 = vld1q_dup_f32(g3p);
b3 = vld1q_dup_f32(b3p);
dst0[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst0[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst0[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.r_gamma[temporary1.0[0] as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.g_gamma[temporary1.0[2] as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.b_gamma[temporary1.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.r_gamma[temporary2.0[0] as usize];
dst1[dst_cn.g_i()] = self.profile.g_gamma[temporary2.0[2] as usize];
dst1[dst_cn.b_i()] = self.profile.b_gamma[temporary2.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.r_gamma[temporary3.0[0] as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.g_gamma[temporary3.0[2] as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.b_gamma[temporary3.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let (Some(dst0), Some(dst1)) = (
dst0.chunks_exact_mut(dst_channels * 2).last(),
dst1.chunks_exact_mut(dst_channels * 2).last(),
) {
let v0_0 = vmulq_f32(r0, m0);
let v0_1 = vmulq_f32(r1, m0);
let v0_2 = vmulq_f32(r2, m0);
let v0_3 = vmulq_f32(r3, m0);
let v1_0 = vfmaq_f32(v0_0, g0, m1);
let v1_1 = vfmaq_f32(v0_1, g1, m1);
let v1_2 = vfmaq_f32(v0_2, g2, m1);
let v1_3 = vfmaq_f32(v0_3, g3, m1);
let mut vr0 = vfmaq_f32(v1_0, b0, m2);
let mut vr1 = vfmaq_f32(v1_1, b1, m2);
let mut vr2 = vfmaq_f32(v1_2, b2, m2);
let mut vr3 = vfmaq_f32(v1_3, b3, m2);
vr0 = vfmaq_f32(rnd, vr0, v_scale);
vr1 = vfmaq_f32(rnd, vr1, v_scale);
vr2 = vfmaq_f32(rnd, vr2, v_scale);
vr3 = vfmaq_f32(rnd, vr3, v_scale);
vr0 = vminq_f32(vr0, v_scale);
vr1 = vminq_f32(vr1, v_scale);
vr2 = vminq_f32(vr2, v_scale);
vr3 = vminq_f32(vr3, v_scale);
let zx0 = vcvtq_u32_f32(vr0);
let zx1 = vcvtq_u32_f32(vr1);
let zx2 = vcvtq_u32_f32(vr2);
let zx3 = vcvtq_u32_f32(vr3);
vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx0);
vst1q_u32(temporary1.0.as_mut_ptr() as *mut _, zx1);
vst1q_u32(temporary2.0.as_mut_ptr() as *mut _, zx2);
vst1q_u32(temporary3.0.as_mut_ptr() as *mut _, zx3);
dst0[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst0[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst0[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.r_gamma[temporary1.0[0] as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.g_gamma[temporary1.0[2] as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.b_gamma[temporary1.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.r_gamma[temporary2.0[0] as usize];
dst1[dst_cn.g_i()] = self.profile.g_gamma[temporary2.0[2] as usize];
dst1[dst_cn.b_i()] = self.profile.b_gamma[temporary2.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.r_gamma[temporary3.0[0] as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.g_gamma[temporary3.0[2] as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.b_gamma[temporary3.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
}
}
for (src, dst) in src_remainder
.chunks_exact(src_channels)
.zip(dst_remainder.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.r_linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.g_linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.b_linear[src[src_cn.b_i()]._as_usize()];
let r = vld1q_dup_f32(rp);
let g = vld1q_dup_f32(gp);
let b = vld1q_dup_f32(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let v0 = vmulq_f32(r, m0);
let v1 = vfmaq_f32(v0, g, m1);
let mut v = vfmaq_f32(v1, b, m2);
v = vfmaq_f32(rnd, v, v_scale);
v = vminq_f32(v, v_scale);
let zx = vcvtq_u32_f32(v);
vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}

View File

@@ -0,0 +1,423 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::rgb_xyz::NeonAlignedU16;
use crate::conversions::neon::rgb_xyz_q2_13::{split_by_twos, split_by_twos_mut};
use crate::conversions::rgbxyz::TransformMatrixShaperOptimized;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
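/// NEON matrix-shaper transform for the "optimized" shaper variant, which
/// shares one linear LUT and one gamma LUT across the R, G and B channels
/// instead of keeping per-channel tables.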
pub(crate) struct TransformShaperRgbOptNeon<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> {
pub(crate) profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformExecutor<T>
for TransformShaperRgbOptNeon<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary0 = NeonAlignedU16([0; 8]);
let mut temporary1 = NeonAlignedU16([0; 8]);
let mut temporary2 = NeonAlignedU16([0; 8]);
let mut temporary3 = NeonAlignedU16([0; 8]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
let (src_chunks, src_remainder) = split_by_twos(src, src_channels);
let (dst_chunks, dst_remainder) = split_by_twos_mut(dst, dst_channels);
unsafe {
let m0 = vld1q_f32([t.v[0][0], t.v[0][1], t.v[0][2], 0.].as_ptr());
let m1 = vld1q_f32([t.v[1][0], t.v[1][1], t.v[1][2], 0.].as_ptr());
let m2 = vld1q_f32([t.v[2][0], t.v[2][1], t.v[2][2], 0.].as_ptr());
let v_scale = vdupq_n_f32(scale);
let rnd = vdupq_n_f32(0.5);
if !src_chunks.is_empty() {
let (src0, src1) = src_chunks.split_at(src_chunks.len() / 2);
let (dst0, dst1) = dst_chunks.split_at_mut(dst_chunks.len() / 2);
let mut src_iter0 = src0.chunks_exact(src_channels * 2);
let mut src_iter1 = src1.chunks_exact(src_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
let (mut r2, mut g2, mut b2, mut a2);
let (mut r3, mut g3, mut b3, mut a3);
if let (Some(src0), Some(src1)) = (src_iter0.next(), src_iter1.next()) {
let r0p = &self.profile.linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1q_dup_f32(r0p);
g0 = vld1q_dup_f32(g0p);
b0 = vld1q_dup_f32(b0p);
r1 = vld1q_dup_f32(r1p);
g1 = vld1q_dup_f32(g1p);
b1 = vld1q_dup_f32(b1p);
r2 = vld1q_dup_f32(r2p);
g2 = vld1q_dup_f32(g2p);
b2 = vld1q_dup_f32(b2p);
r3 = vld1q_dup_f32(r3p);
g3 = vld1q_dup_f32(g3p);
b3 = vld1q_dup_f32(b3p);
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = vdupq_n_f32(0.);
g0 = vdupq_n_f32(0.);
b0 = vdupq_n_f32(0.);
r1 = vdupq_n_f32(0.);
g1 = vdupq_n_f32(0.);
b1 = vdupq_n_f32(0.);
r2 = vdupq_n_f32(0.);
g2 = vdupq_n_f32(0.);
b2 = vdupq_n_f32(0.);
r3 = vdupq_n_f32(0.);
g3 = vdupq_n_f32(0.);
b3 = vdupq_n_f32(0.);
a0 = max_colors;
a1 = max_colors;
a2 = max_colors;
a3 = max_colors;
}
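// Software-pipelined main loop: each iteration runs the matrix math on LUT
// values loaded during the previous iteration, then issues the gather loads
// for the next two pixel pairs so the table lookups overlap the FMA work.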
for (((src0, src1), dst0), dst1) in src_iter0
.zip(src_iter1)
.zip(dst0.chunks_exact_mut(dst_channels * 2))
.zip(dst1.chunks_exact_mut(dst_channels * 2))
{
let v0_0 = vmulq_f32(r0, m0);
let v0_1 = vmulq_f32(r1, m0);
let v0_2 = vmulq_f32(r2, m0);
let v0_3 = vmulq_f32(r3, m0);
let v1_0 = vfmaq_f32(v0_0, g0, m1);
let v1_1 = vfmaq_f32(v0_1, g1, m1);
let v1_2 = vfmaq_f32(v0_2, g2, m1);
let v1_3 = vfmaq_f32(v0_3, g3, m1);
let mut vr0 = vfmaq_f32(v1_0, b0, m2);
let mut vr1 = vfmaq_f32(v1_1, b1, m2);
let mut vr2 = vfmaq_f32(v1_2, b2, m2);
let mut vr3 = vfmaq_f32(v1_3, b3, m2);
vr0 = vfmaq_f32(rnd, vr0, v_scale);
vr1 = vfmaq_f32(rnd, vr1, v_scale);
vr2 = vfmaq_f32(rnd, vr2, v_scale);
vr3 = vfmaq_f32(rnd, vr3, v_scale);
vr0 = vminq_f32(vr0, v_scale);
vr1 = vminq_f32(vr1, v_scale);
vr2 = vminq_f32(vr2, v_scale);
vr3 = vminq_f32(vr3, v_scale);
let zx0 = vcvtq_u32_f32(vr0);
let zx1 = vcvtq_u32_f32(vr1);
let zx2 = vcvtq_u32_f32(vr2);
let zx3 = vcvtq_u32_f32(vr3);
vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx0);
vst1q_u32(temporary1.0.as_mut_ptr() as *mut _, zx1);
vst1q_u32(temporary2.0.as_mut_ptr() as *mut _, zx2);
vst1q_u32(temporary3.0.as_mut_ptr() as *mut _, zx3);
let r0p = &self.profile.linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1q_dup_f32(r0p);
g0 = vld1q_dup_f32(g0p);
b0 = vld1q_dup_f32(b0p);
r1 = vld1q_dup_f32(r1p);
g1 = vld1q_dup_f32(g1p);
b1 = vld1q_dup_f32(b1p);
r2 = vld1q_dup_f32(r2p);
g2 = vld1q_dup_f32(g2p);
b2 = vld1q_dup_f32(b2p);
r3 = vld1q_dup_f32(r3p);
g3 = vld1q_dup_f32(g3p);
b3 = vld1q_dup_f32(b3p);
dst0[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary1.0[0] as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary1.0[2] as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary1.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[temporary2.0[0] as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[temporary2.0[2] as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[temporary2.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary3.0[0] as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary3.0[2] as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary3.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let (Some(dst0), Some(dst1)) = (
dst0.chunks_exact_mut(dst_channels * 2).last(),
dst1.chunks_exact_mut(dst_channels * 2).last(),
) {
let v0_0 = vmulq_f32(r0, m0);
let v0_1 = vmulq_f32(r1, m0);
let v0_2 = vmulq_f32(r2, m0);
let v0_3 = vmulq_f32(r3, m0);
let v1_0 = vfmaq_f32(v0_0, g0, m1);
let v1_1 = vfmaq_f32(v0_1, g1, m1);
let v1_2 = vfmaq_f32(v0_2, g2, m1);
let v1_3 = vfmaq_f32(v0_3, g3, m1);
let mut vr0 = vfmaq_f32(v1_0, b0, m2);
let mut vr1 = vfmaq_f32(v1_1, b1, m2);
let mut vr2 = vfmaq_f32(v1_2, b2, m2);
let mut vr3 = vfmaq_f32(v1_3, b3, m2);
vr0 = vfmaq_f32(rnd, vr0, v_scale);
vr1 = vfmaq_f32(rnd, vr1, v_scale);
vr2 = vfmaq_f32(rnd, vr2, v_scale);
vr3 = vfmaq_f32(rnd, vr3, v_scale);
vr0 = vminq_f32(vr0, v_scale);
vr1 = vminq_f32(vr1, v_scale);
vr2 = vminq_f32(vr2, v_scale);
vr3 = vminq_f32(vr3, v_scale);
let zx0 = vcvtq_u32_f32(vr0);
let zx1 = vcvtq_u32_f32(vr1);
let zx2 = vcvtq_u32_f32(vr2);
let zx3 = vcvtq_u32_f32(vr3);
vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx0);
vst1q_u32(temporary1.0.as_mut_ptr() as *mut _, zx1);
vst1q_u32(temporary2.0.as_mut_ptr() as *mut _, zx2);
vst1q_u32(temporary3.0.as_mut_ptr() as *mut _, zx3);
dst0[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary1.0[0] as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary1.0[2] as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary1.0[4] as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[temporary2.0[0] as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[temporary2.0[2] as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[temporary2.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[temporary3.0[0] as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[temporary3.0[2] as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[temporary3.0[4] as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
}
}
for (src, dst) in src_remainder
.chunks_exact(src_channels)
.zip(dst_remainder.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.linear[src[src_cn.b_i()]._as_usize()];
let r = vld1q_dup_f32(rp);
let g = vld1q_dup_f32(gp);
let b = vld1q_dup_f32(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let v0 = vmulq_f32(r, m0);
let v1 = vfmaq_f32(v0, g, m1);
let mut v = vfmaq_f32(v1, b, m2);
v = vfmaq_f32(rnd, v, v_scale);
v = vminq_f32(v, v_scale);
let zx = vcvtq_u32_f32(v);
vst1q_u32(temporary0.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}

View File

@@ -0,0 +1,437 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::rgb_xyz_q2_13::{split_by_twos, split_by_twos_mut};
use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPointOpt;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
pub(crate) struct TransformShaperQ1_30NeonOpt<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPointOpt<i32, i32, T, LINEAR_CAP>,
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
const PRECISION: i32,
>
TransformShaperQ1_30NeonOpt<
T,
SRC_LAYOUT,
DST_LAYOUT,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
PRECISION,
>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "rdm")]
unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let max_colors: T = ((1 << BIT_DEPTH) - 1).as_();
let (src_chunks, src_remainder) = split_by_twos(src, src_channels);
let (dst_chunks, dst_remainder) = split_by_twos_mut(dst, dst_channels);
unsafe {
let m0 = vld1q_s32([t.v[0][0], t.v[0][1], t.v[0][2], 0].as_ptr());
let m1 = vld1q_s32([t.v[1][0], t.v[1][1], t.v[1][2], 0].as_ptr());
let m2 = vld1q_s32([t.v[2][0], t.v[2][1], t.v[2][2], 0].as_ptr());
let v_max_value = vdup_n_u16((GAMMA_LUT - 1) as u16);
if !src_chunks.is_empty() {
let (src0, src1) = src_chunks.split_at(src_chunks.len() / 2);
let (dst0, dst1) = dst_chunks.split_at_mut(dst_chunks.len() / 2);
let mut src_iter0 = src0.chunks_exact(src_channels * 2);
let mut src_iter1 = src1.chunks_exact(src_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
let (mut r2, mut g2, mut b2, mut a2);
let (mut r3, mut g3, mut b3, mut a3);
if let (Some(src0), Some(src1)) = (src_iter0.next(), src_iter1.next()) {
let r0p = &self.profile.linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1q_dup_s32(r0p);
g0 = vld1q_dup_s32(g0p);
b0 = vld1q_dup_s32(b0p);
r1 = vld1q_dup_s32(r1p);
g1 = vld1q_dup_s32(g1p);
b1 = vld1q_dup_s32(b1p);
r2 = vld1q_dup_s32(r2p);
g2 = vld1q_dup_s32(g2p);
b2 = vld1q_dup_s32(b2p);
r3 = vld1q_dup_s32(r3p);
g3 = vld1q_dup_s32(g3p);
b3 = vld1q_dup_s32(b3p);
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = vdupq_n_s32(0);
g0 = vdupq_n_s32(0);
b0 = vdupq_n_s32(0);
r1 = vdupq_n_s32(0);
g1 = vdupq_n_s32(0);
b1 = vdupq_n_s32(0);
r2 = vdupq_n_s32(0);
g2 = vdupq_n_s32(0);
b2 = vdupq_n_s32(0);
r3 = vdupq_n_s32(0);
g3 = vdupq_n_s32(0);
b3 = vdupq_n_s32(0);
a0 = max_colors;
a1 = max_colors;
a2 = max_colors;
a3 = max_colors;
}
for (((src0, src1), dst0), dst1) in src_iter0
.zip(src_iter1)
.zip(dst0.chunks_exact_mut(dst_channels * 2))
.zip(dst1.chunks_exact_mut(dst_channels * 2))
{
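// `vqrdmulhq_s32`/`vqrdmlahq_s32` (RDM extension) perform a saturating
// rounding doubling multiply-high, roughly (2 * a * b + (1 << 31)) >> 32,
// applying the Q1.30 matrix coefficients without a separate shift step.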
let v0_0 = vqrdmulhq_s32(r0, m0);
let v0_1 = vqrdmulhq_s32(r1, m0);
let v0_2 = vqrdmulhq_s32(r2, m0);
let v0_3 = vqrdmulhq_s32(r3, m0);
let v1_0 = vqrdmlahq_s32(v0_0, g0, m1);
let v1_1 = vqrdmlahq_s32(v0_1, g1, m1);
let v1_2 = vqrdmlahq_s32(v0_2, g2, m1);
let v1_3 = vqrdmlahq_s32(v0_3, g3, m1);
let vr0 = vqrdmlahq_s32(v1_0, b0, m2);
let vr1 = vqrdmlahq_s32(v1_1, b1, m2);
let vr2 = vqrdmlahq_s32(v1_2, b2, m2);
let vr3 = vqrdmlahq_s32(v1_3, b3, m2);
let mut vr0 = vqmovun_s32(vr0);
let mut vr1 = vqmovun_s32(vr1);
let mut vr2 = vqmovun_s32(vr2);
let mut vr3 = vqmovun_s32(vr3);
if BIT_DEPTH != 16 {
vr0 = vmin_u16(vr0, v_max_value);
vr1 = vmin_u16(vr1, v_max_value);
vr2 = vmin_u16(vr2, v_max_value);
vr3 = vmin_u16(vr3, v_max_value);
}
let r0p = &self.profile.linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1q_dup_s32(r0p);
g0 = vld1q_dup_s32(g0p);
b0 = vld1q_dup_s32(b0p);
r1 = vld1q_dup_s32(r1p);
g1 = vld1q_dup_s32(g1p);
b1 = vld1q_dup_s32(b1p);
r2 = vld1q_dup_s32(r2p);
g2 = vld1q_dup_s32(g2p);
b2 = vld1q_dup_s32(b2p);
r3 = vld1q_dup_s32(r3p);
g3 = vld1q_dup_s32(g3p);
b3 = vld1q_dup_s32(b3p);
dst0[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr0) as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr0) as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<0>(vr1) as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<1>(vr1) as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<2>(vr1) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr2) as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr2) as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr2) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<0>(vr3) as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<1>(vr3) as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<2>(vr3) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let (Some(dst0), Some(dst1)) = (
dst0.chunks_exact_mut(dst_channels * 2).last(),
dst1.chunks_exact_mut(dst_channels * 2).last(),
) {
let v0_0 = vqrdmulhq_s32(r0, m0);
let v0_1 = vqrdmulhq_s32(r1, m0);
let v0_2 = vqrdmulhq_s32(r2, m0);
let v0_3 = vqrdmulhq_s32(r3, m0);
let v1_0 = vqrdmlahq_s32(v0_0, g0, m1);
let v1_1 = vqrdmlahq_s32(v0_1, g1, m1);
let v1_2 = vqrdmlahq_s32(v0_2, g2, m1);
let v1_3 = vqrdmlahq_s32(v0_3, g3, m1);
let vr0 = vqrdmlahq_s32(v1_0, b0, m2);
let vr1 = vqrdmlahq_s32(v1_1, b1, m2);
let vr2 = vqrdmlahq_s32(v1_2, b2, m2);
let vr3 = vqrdmlahq_s32(v1_3, b3, m2);
let mut vr0 = vqmovun_s32(vr0);
let mut vr1 = vqmovun_s32(vr1);
let mut vr2 = vqmovun_s32(vr2);
let mut vr3 = vqmovun_s32(vr3);
if BIT_DEPTH != 16 {
vr0 = vmin_u16(vr0, v_max_value);
vr1 = vmin_u16(vr1, v_max_value);
vr2 = vmin_u16(vr2, v_max_value);
vr3 = vmin_u16(vr3, v_max_value);
}
dst0[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr0) as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr0) as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<0>(vr1) as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<1>(vr1) as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<2>(vr1) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr2) as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr2) as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr2) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<0>(vr3) as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<1>(vr3) as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<2>(vr3) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
}
}
for (src, dst) in src_remainder
.chunks_exact(src_channels)
.zip(dst_remainder.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.linear[src[src_cn.b_i()]._as_usize()];
let r = vld1q_dup_s32(rp);
let g = vld1q_dup_s32(gp);
let b = vld1q_dup_s32(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let v0 = vqrdmulhq_s32(r, m0);
let v1 = vqrdmlahq_s32(v0, g, m1);
let v = vqrdmlahq_s32(v1, b, m2);
let mut vr0 = vqmovun_s32(v);
if BIT_DEPTH != 16 {
vr0 = vmin_u16(vr0, v_max_value);
}
dst[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr0) as usize];
dst[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr0) as usize];
dst[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformShaperQ1_30NeonOpt<
T,
SRC_LAYOUT,
DST_LAYOUT,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
PRECISION,
>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
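// Safety: `transform_impl` is compiled with `#[target_feature(enable = "rdm")]`;
// the construction site is expected to have verified RDM support, e.g. via
// `std::arch::is_aarch64_feature_detected!("rdm")`.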
unsafe { self.transform_impl(src, dst) }
}
}

View File

@@ -0,0 +1,412 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPoint;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
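/// Splits `data` so the head covers a whole multiple of four pixels (the
/// kernels drain two pixels from each of two half-slices per iteration,
/// despite the name); the tail is left for the scalar remainder loop.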
#[allow(dead_code)]
#[inline]
pub(crate) fn split_by_twos<T: Copy>(data: &[T], channels: usize) -> (&[T], &[T]) {
let len = data.len() / (channels * 4);
let split_point = len * 4;
data.split_at(split_point * channels)
}
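/// Mutable counterpart of `split_by_twos`, using the same split point.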
#[allow(dead_code)]
#[inline]
pub(crate) fn split_by_twos_mut<T: Copy>(data: &mut [T], channels: usize) -> (&mut [T], &mut [T]) {
let len = data.len() / (channels * 4);
let split_point = len * 4;
data.split_at_mut(split_point * channels)
}
pub(crate) struct TransformShaperQ2_13Neon<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPoint<i16, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformShaperQ2_13Neon<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
let (src_chunks, src_remainder) = split_by_twos(src, src_channels);
let (dst_chunks, dst_remainder) = split_by_twos_mut(dst, dst_channels);
unsafe {
let m0 = vld1_s16([t.v[0][0], t.v[0][1], t.v[0][2], 0].as_ptr());
let m1 = vld1_s16([t.v[1][0], t.v[1][1], t.v[1][2], 0].as_ptr());
let m2 = vld1_s16([t.v[2][0], t.v[2][1], t.v[2][2], 0].as_ptr());
let v_max_value = vdup_n_u16((GAMMA_LUT - 1) as u16);
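// Seeding the widening accumulators with 1 << (PRECISION - 1) makes the later
// `vqshrun_n_s32::<PRECISION>` narrowing shift round to nearest instead of
// truncating.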
let rnd = vdupq_n_s32(1 << (PRECISION - 1));
if !src_chunks.is_empty() {
let (src0, src1) = src_chunks.split_at(src_chunks.len() / 2);
let (dst0, dst1) = dst_chunks.split_at_mut(dst_chunks.len() / 2);
let mut src_iter0 = src0.chunks_exact(src_channels * 2);
let mut src_iter1 = src1.chunks_exact(src_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
let (mut r2, mut g2, mut b2, mut a2);
let (mut r3, mut g3, mut b3, mut a3);
if let (Some(src0), Some(src1)) = (src_iter0.next(), src_iter1.next()) {
let r0p = &self.profile.r_linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.g_linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.b_linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.r_linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.g_linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.b_linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.r_linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.g_linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.b_linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.r_linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.g_linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.b_linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1_dup_s16(r0p);
g0 = vld1_dup_s16(g0p);
b0 = vld1_dup_s16(b0p);
r1 = vld1_dup_s16(r1p);
g1 = vld1_dup_s16(g1p);
b1 = vld1_dup_s16(b1p);
r2 = vld1_dup_s16(r2p);
g2 = vld1_dup_s16(g2p);
b2 = vld1_dup_s16(b2p);
r3 = vld1_dup_s16(r3p);
g3 = vld1_dup_s16(g3p);
b3 = vld1_dup_s16(b3p);
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = vdup_n_s16(0);
g0 = vdup_n_s16(0);
b0 = vdup_n_s16(0);
r1 = vdup_n_s16(0);
g1 = vdup_n_s16(0);
b1 = vdup_n_s16(0);
r2 = vdup_n_s16(0);
g2 = vdup_n_s16(0);
b2 = vdup_n_s16(0);
r3 = vdup_n_s16(0);
g3 = vdup_n_s16(0);
b3 = vdup_n_s16(0);
a0 = max_colors;
a1 = max_colors;
a2 = max_colors;
a3 = max_colors;
}
for (((src0, src1), dst0), dst1) in src_iter0
.zip(src_iter1)
.zip(dst0.chunks_exact_mut(dst_channels * 2))
.zip(dst1.chunks_exact_mut(dst_channels * 2))
{
let v0_0 = vmlal_s16(rnd, r0, m0);
let v0_1 = vmlal_s16(rnd, r1, m0);
let v0_2 = vmlal_s16(rnd, r2, m0);
let v0_3 = vmlal_s16(rnd, r3, m0);
let v1_0 = vmlal_s16(v0_0, g0, m1);
let v1_1 = vmlal_s16(v0_1, g1, m1);
let v1_2 = vmlal_s16(v0_2, g2, m1);
let v1_3 = vmlal_s16(v0_3, g3, m1);
let vr0 = vmlal_s16(v1_0, b0, m2);
let vr1 = vmlal_s16(v1_1, b1, m2);
let vr2 = vmlal_s16(v1_2, b2, m2);
let vr3 = vmlal_s16(v1_3, b3, m2);
let mut vr0 = vqshrun_n_s32::<PRECISION>(vr0);
let mut vr1 = vqshrun_n_s32::<PRECISION>(vr1);
let mut vr2 = vqshrun_n_s32::<PRECISION>(vr2);
let mut vr3 = vqshrun_n_s32::<PRECISION>(vr3);
vr0 = vmin_u16(vr0, v_max_value);
vr1 = vmin_u16(vr1, v_max_value);
vr2 = vmin_u16(vr2, v_max_value);
vr3 = vmin_u16(vr3, v_max_value);
let r0p = &self.profile.r_linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.g_linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.b_linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.r_linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.g_linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.b_linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.r_linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.g_linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.b_linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.r_linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.g_linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.b_linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1_dup_s16(r0p);
g0 = vld1_dup_s16(g0p);
b0 = vld1_dup_s16(b0p);
r1 = vld1_dup_s16(r1p);
g1 = vld1_dup_s16(g1p);
b1 = vld1_dup_s16(b1p);
r2 = vld1_dup_s16(r2p);
g2 = vld1_dup_s16(g2p);
b2 = vld1_dup_s16(b2p);
r3 = vld1_dup_s16(r3p);
g3 = vld1_dup_s16(g3p);
b3 = vld1_dup_s16(b3p);
dst0[dst_cn.r_i()] = self.profile.r_gamma[vget_lane_u16::<0>(vr0) as usize];
dst0[dst_cn.g_i()] = self.profile.g_gamma[vget_lane_u16::<1>(vr0) as usize];
dst0[dst_cn.b_i()] = self.profile.b_gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.r_gamma[vget_lane_u16::<0>(vr1) as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.g_gamma[vget_lane_u16::<1>(vr1) as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.b_gamma[vget_lane_u16::<2>(vr1) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.r_gamma[vget_lane_u16::<0>(vr2) as usize];
dst1[dst_cn.g_i()] = self.profile.g_gamma[vget_lane_u16::<1>(vr2) as usize];
dst1[dst_cn.b_i()] = self.profile.b_gamma[vget_lane_u16::<2>(vr2) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.r_gamma[vget_lane_u16::<0>(vr3) as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.g_gamma[vget_lane_u16::<1>(vr3) as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.b_gamma[vget_lane_u16::<2>(vr3) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let (Some(dst0), Some(dst1)) = (
dst0.chunks_exact_mut(dst_channels * 2).last(),
dst1.chunks_exact_mut(dst_channels * 2).last(),
) {
let v0_0 = vmlal_s16(rnd, r0, m0);
let v0_1 = vmlal_s16(rnd, r1, m0);
let v0_2 = vmlal_s16(rnd, r2, m0);
let v0_3 = vmlal_s16(rnd, r3, m0);
let v1_0 = vmlal_s16(v0_0, g0, m1);
let v1_1 = vmlal_s16(v0_1, g1, m1);
let v1_2 = vmlal_s16(v0_2, g2, m1);
let v1_3 = vmlal_s16(v0_3, g3, m1);
let vr0 = vmlal_s16(v1_0, b0, m2);
let vr1 = vmlal_s16(v1_1, b1, m2);
let vr2 = vmlal_s16(v1_2, b2, m2);
let vr3 = vmlal_s16(v1_3, b3, m2);
let mut vr0 = vqshrun_n_s32::<PRECISION>(vr0);
let mut vr1 = vqshrun_n_s32::<PRECISION>(vr1);
let mut vr2 = vqshrun_n_s32::<PRECISION>(vr2);
let mut vr3 = vqshrun_n_s32::<PRECISION>(vr3);
vr0 = vmin_u16(vr0, v_max_value);
vr1 = vmin_u16(vr1, v_max_value);
vr2 = vmin_u16(vr2, v_max_value);
vr3 = vmin_u16(vr3, v_max_value);
dst0[dst_cn.r_i()] = self.profile.r_gamma[vget_lane_u16::<0>(vr0) as usize];
dst0[dst_cn.g_i()] = self.profile.g_gamma[vget_lane_u16::<1>(vr0) as usize];
dst0[dst_cn.b_i()] = self.profile.b_gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.r_gamma[vget_lane_u16::<0>(vr1) as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.g_gamma[vget_lane_u16::<1>(vr1) as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.b_gamma[vget_lane_u16::<2>(vr1) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.r_gamma[vget_lane_u16::<0>(vr2) as usize];
dst1[dst_cn.g_i()] = self.profile.g_gamma[vget_lane_u16::<1>(vr2) as usize];
dst1[dst_cn.b_i()] = self.profile.b_gamma[vget_lane_u16::<2>(vr2) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.r_gamma[vget_lane_u16::<0>(vr3) as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.g_gamma[vget_lane_u16::<1>(vr3) as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.b_gamma[vget_lane_u16::<2>(vr3) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
}
}
for (src, dst) in src_remainder
.chunks_exact(src_channels)
.zip(dst_remainder.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.r_linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.g_linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.b_linear[src[src_cn.b_i()]._as_usize()];
let r = vld1_dup_s16(rp);
let g = vld1_dup_s16(gp);
let b = vld1_dup_s16(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let v0 = vmlal_s16(rnd, r, m0);
let v1 = vmlal_s16(v0, g, m1);
let v = vmlal_s16(v1, b, m2);
let mut vr0 = vqshrun_n_s32::<PRECISION>(v);
vr0 = vmin_u16(vr0, v_max_value);
dst[dst_cn.r_i()] = self.profile.r_gamma[vget_lane_u16::<0>(vr0) as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[vget_lane_u16::<1>(vr0) as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}

View File

@@ -0,0 +1,397 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::neon::rgb_xyz_q2_13::{split_by_twos, split_by_twos_mut};
use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPointOpt;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
pub(crate) struct TransformShaperQ2_13NeonOpt<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPointOpt<i16, i16, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformShaperQ2_13NeonOpt<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
let (src_chunks, src_remainder) = split_by_twos(src, src_channels);
let (dst_chunks, dst_remainder) = split_by_twos_mut(dst, dst_channels);
unsafe {
let m0 = vld1_s16([t.v[0][0], t.v[0][1], t.v[0][2], 0].as_ptr());
let m1 = vld1_s16([t.v[1][0], t.v[1][1], t.v[1][2], 0].as_ptr());
let m2 = vld1_s16([t.v[2][0], t.v[2][1], t.v[2][2], 0].as_ptr());
let v_max_value = vdup_n_u16((GAMMA_LUT - 1) as u16);
let rnd = vdupq_n_s32(1 << (PRECISION - 1));
if !src_chunks.is_empty() {
let (src0, src1) = src_chunks.split_at(src_chunks.len() / 2);
let (dst0, dst1) = dst_chunks.split_at_mut(dst_chunks.len() / 2);
let mut src_iter0 = src0.chunks_exact(src_channels * 2);
let mut src_iter1 = src1.chunks_exact(src_channels * 2);
let (mut r0, mut g0, mut b0, mut a0);
let (mut r1, mut g1, mut b1, mut a1);
let (mut r2, mut g2, mut b2, mut a2);
let (mut r3, mut g3, mut b3, mut a3);
if let (Some(src0), Some(src1)) = (src_iter0.next(), src_iter1.next()) {
let r0p = &self.profile.linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1_dup_s16(r0p);
g0 = vld1_dup_s16(g0p);
b0 = vld1_dup_s16(b0p);
r1 = vld1_dup_s16(r1p);
g1 = vld1_dup_s16(g1p);
b1 = vld1_dup_s16(b1p);
r2 = vld1_dup_s16(r2p);
g2 = vld1_dup_s16(g2p);
b2 = vld1_dup_s16(b2p);
r3 = vld1_dup_s16(r3p);
g3 = vld1_dup_s16(g3p);
b3 = vld1_dup_s16(b3p);
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
} else {
r0 = vdup_n_s16(0);
g0 = vdup_n_s16(0);
b0 = vdup_n_s16(0);
r1 = vdup_n_s16(0);
g1 = vdup_n_s16(0);
b1 = vdup_n_s16(0);
r2 = vdup_n_s16(0);
g2 = vdup_n_s16(0);
b2 = vdup_n_s16(0);
r3 = vdup_n_s16(0);
g3 = vdup_n_s16(0);
b3 = vdup_n_s16(0);
a0 = max_colors;
a1 = max_colors;
a2 = max_colors;
a3 = max_colors;
}
for (((src0, src1), dst0), dst1) in src_iter0
.zip(src_iter1)
.zip(dst0.chunks_exact_mut(dst_channels * 2))
.zip(dst1.chunks_exact_mut(dst_channels * 2))
{
let v0_0 = vmlal_s16(rnd, r0, m0);
let v0_1 = vmlal_s16(rnd, r1, m0);
let v0_2 = vmlal_s16(rnd, r2, m0);
let v0_3 = vmlal_s16(rnd, r3, m0);
let v1_0 = vmlal_s16(v0_0, g0, m1);
let v1_1 = vmlal_s16(v0_1, g1, m1);
let v1_2 = vmlal_s16(v0_2, g2, m1);
let v1_3 = vmlal_s16(v0_3, g3, m1);
let vr0 = vmlal_s16(v1_0, b0, m2);
let vr1 = vmlal_s16(v1_1, b1, m2);
let vr2 = vmlal_s16(v1_2, b2, m2);
let vr3 = vmlal_s16(v1_3, b3, m2);
let mut vr0 = vqshrun_n_s32::<PRECISION>(vr0);
let mut vr1 = vqshrun_n_s32::<PRECISION>(vr1);
let mut vr2 = vqshrun_n_s32::<PRECISION>(vr2);
let mut vr3 = vqshrun_n_s32::<PRECISION>(vr3);
vr0 = vmin_u16(vr0, v_max_value);
vr1 = vmin_u16(vr1, v_max_value);
vr2 = vmin_u16(vr2, v_max_value);
vr3 = vmin_u16(vr3, v_max_value);
let r0p = &self.profile.linear[src0[src_cn.r_i()]._as_usize()];
let g0p = &self.profile.linear[src0[src_cn.g_i()]._as_usize()];
let b0p = &self.profile.linear[src0[src_cn.b_i()]._as_usize()];
let r1p = &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()];
let g1p = &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()];
let b1p = &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()];
let r2p = &self.profile.linear[src1[src_cn.r_i()]._as_usize()];
let g2p = &self.profile.linear[src1[src_cn.g_i()]._as_usize()];
let b2p = &self.profile.linear[src1[src_cn.b_i()]._as_usize()];
let r3p = &self.profile.linear[src1[src_cn.r_i() + src_channels]._as_usize()];
let g3p = &self.profile.linear[src1[src_cn.g_i() + src_channels]._as_usize()];
let b3p = &self.profile.linear[src1[src_cn.b_i() + src_channels]._as_usize()];
r0 = vld1_dup_s16(r0p);
g0 = vld1_dup_s16(g0p);
b0 = vld1_dup_s16(b0p);
r1 = vld1_dup_s16(r1p);
g1 = vld1_dup_s16(g1p);
b1 = vld1_dup_s16(b1p);
r2 = vld1_dup_s16(r2p);
g2 = vld1_dup_s16(g2p);
b2 = vld1_dup_s16(b2p);
r3 = vld1_dup_s16(r3p);
g3 = vld1_dup_s16(g3p);
b3 = vld1_dup_s16(b3p);
dst0[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr0) as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr0) as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<0>(vr1) as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<1>(vr1) as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<2>(vr1) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr2) as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr2) as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr2) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<0>(vr3) as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<1>(vr3) as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<2>(vr3) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
a0 = if src_channels == 4 {
src0[src_cn.a_i()]
} else {
max_colors
};
a1 = if src_channels == 4 {
src0[src_cn.a_i() + src_channels]
} else {
max_colors
};
a2 = if src_channels == 4 {
src1[src_cn.a_i()]
} else {
max_colors
};
a3 = if src_channels == 4 {
src1[src_cn.a_i() + src_channels]
} else {
max_colors
};
}
if let (Some(dst0), Some(dst1)) = (
dst0.chunks_exact_mut(dst_channels * 2).last(),
dst1.chunks_exact_mut(dst_channels * 2).last(),
) {
let v0_0 = vmlal_s16(rnd, r0, m0);
let v0_1 = vmlal_s16(rnd, r1, m0);
let v0_2 = vmlal_s16(rnd, r2, m0);
let v0_3 = vmlal_s16(rnd, r3, m0);
let v1_0 = vmlal_s16(v0_0, g0, m1);
let v1_1 = vmlal_s16(v0_1, g1, m1);
let v1_2 = vmlal_s16(v0_2, g2, m1);
let v1_3 = vmlal_s16(v0_3, g3, m1);
let vr0 = vmlal_s16(v1_0, b0, m2);
let vr1 = vmlal_s16(v1_1, b1, m2);
let vr2 = vmlal_s16(v1_2, b2, m2);
let vr3 = vmlal_s16(v1_3, b3, m2);
let mut vr0 = vqshrun_n_s32::<PRECISION>(vr0);
let mut vr1 = vqshrun_n_s32::<PRECISION>(vr1);
let mut vr2 = vqshrun_n_s32::<PRECISION>(vr2);
let mut vr3 = vqshrun_n_s32::<PRECISION>(vr3);
vr0 = vmin_u16(vr0, v_max_value);
vr1 = vmin_u16(vr1, v_max_value);
vr2 = vmin_u16(vr2, v_max_value);
vr3 = vmin_u16(vr3, v_max_value);
dst0[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr0) as usize];
dst0[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr0) as usize];
dst0[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i()] = a0;
}
dst0[dst_cn.r_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<0>(vr1) as usize];
dst0[dst_cn.g_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<1>(vr1) as usize];
dst0[dst_cn.b_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<2>(vr1) as usize];
if dst_channels == 4 {
dst0[dst_cn.a_i() + dst_channels] = a1;
}
dst1[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr2) as usize];
dst1[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr2) as usize];
dst1[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr2) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i()] = a2;
}
dst1[dst_cn.r_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<0>(vr3) as usize];
dst1[dst_cn.g_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<1>(vr3) as usize];
dst1[dst_cn.b_i() + dst_channels] =
self.profile.gamma[vget_lane_u16::<2>(vr3) as usize];
if dst_channels == 4 {
dst1[dst_cn.a_i() + dst_channels] = a3;
}
}
}
for (src, dst) in src_remainder
.chunks_exact(src_channels)
.zip(dst_remainder.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.linear[src[src_cn.b_i()]._as_usize()];
let r = vld1_dup_s16(rp);
let g = vld1_dup_s16(gp);
let b = vld1_dup_s16(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let v0 = vmlal_s16(rnd, r, m0);
let v1 = vmlal_s16(v0, g, m1);
let v = vmlal_s16(v1, b, m2);
let mut vr0 = vqshrun_n_s32::<PRECISION>(v);
vr0 = vmin_u16(vr0, v_max_value);
dst[dst_cn.r_i()] = self.profile.gamma[vget_lane_u16::<0>(vr0) as usize];
dst[dst_cn.g_i()] = self.profile.gamma[vget_lane_u16::<1>(vr0) as usize];
dst[dst_cn.b_i()] = self.profile.gamma[vget_lane_u16::<2>(vr0) as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}

View File

@@ -0,0 +1,335 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::lut_transforms::Lut3x3Factory;
use crate::conversions::neon::interpolator::*;
use crate::conversions::neon::interpolator_q0_15::NeonAlignedI16x4;
use crate::conversions::neon::rgb_xyz::NeonAlignedF32;
use crate::conversions::neon::t_lut3_to_3_q0_15::TransformLut3x3NeonQ0_15;
use crate::transform::PointeeSizeExpressible;
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
use std::marker::PhantomData;
struct TransformLut3x3Neon<
T,
U,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
lut: Vec<NeonAlignedF32>,
_phantom: PhantomData<T>,
_phantom1: PhantomData<U>,
interpolation_method: InterpolationMethod,
weights: Box<[BarycentricWeight<f32>; BINS]>,
color_space: DataColorSpace,
is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut3x3Neon<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[inline(always)]
fn transform_chunk<'b, Interpolator: NeonMdInterpolation<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
unsafe {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
let value_scale = vdupq_n_f32(((1 << BIT_DEPTH) - 1) as f32);
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.r_i()],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.g_i()],
);
let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.b_i()],
);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
let tetrahedral = Interpolator::new(&self.lut);
let v = tetrahedral.inter3_neon(x, y, z, &self.weights);
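// For integer sample types the interpolated value is rescaled to the target
// bit depth and rounded half-up (add 0.5, clamp, truncate); float sample
// types pass the result through unscaled.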
if T::FINITE {
let mut r = vfmaq_f32(vdupq_n_f32(0.5f32), v.v, value_scale);
r = vminq_f32(r, value_scale);
let jvx = vcvtq_u32_f32(r);
dst[dst_cn.r_i()] = vgetq_lane_u32::<0>(jvx).as_();
dst[dst_cn.g_i()] = vgetq_lane_u32::<1>(jvx).as_();
dst[dst_cn.b_i()] = vgetq_lane_u32::<2>(jvx).as_();
} else {
dst[dst_cn.r_i()] = vgetq_lane_f32::<0>(v.v).as_();
dst[dst_cn.g_i()] = vgetq_lane_f32::<1>(v.v).as_();
dst[dst_cn.b_i()] = vgetq_lane_f32::<2>(v.v).as_();
}
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut3x3Neon<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / src_channels;
let dst_chunks = dst.len() / dst_channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearNeon<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralNeon<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalNeon<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticNeon<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearNeon<GRID_SIZE>>(src, dst);
}
}
}
Ok(())
}
}
pub(crate) struct NeonLut3x3Factory {}
impl Lut3x3Factory for NeonLut3x3Factory {
fn make_transform_3x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
if options.prefer_fixed_point
&& BIT_DEPTH < 16
&& std::arch::is_aarch64_feature_detected!("rdm")
{
let q: f32 = if T::FINITE {
((1i32 << BIT_DEPTH as i32) - 1) as f32
} else {
((1i32 << 14i32) - 1) as f32
};
let lut = lut
.chunks_exact(3)
.map(|x| {
NeonAlignedI16x4([
(x[0] * q).round() as i16,
(x[1] * q).round() as i16,
(x[2] * q).round() as i16,
0,
])
})
.collect::<Vec<_>>();
return match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x3NeonQ0_15::<
T,
u8,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x3NeonQ0_15::<
T,
u16,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
};
}
let lut = lut
.chunks_exact(3)
.map(|x| NeonAlignedF32([x[0], x[1], x[2], 0f32]))
.collect::<Vec<_>>();
match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x3Neon::<
T,
u8,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x3Neon::<
T,
u16,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
}
}
}
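// Illustrative sketch (not crate code) of the Q0.15-style LUT quantization in
// `make_transform_3x3` above: each normalized LUT entry is scaled into i16.
// For finite sample types the scale is the bit-depth maximum; for float lanes
// it is 2^14 - 1, which leaves headroom for the saturating-doubling "rdm"
// NEON multiplies.
#[allow(dead_code)]
fn quantize_lut_entry(x: f32, q: f32) -> i16 {
    (x * q).round() as i16
}
// e.g. quantize_lut_entry(0.5, ((1i32 << 14) - 1) as f32) == 8192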


@@ -0,0 +1,219 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::neon::interpolator_q0_15::*;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
use num_traits::AsPrimitive;
use std::arch::aarch64::*;
use std::marker::PhantomData;
pub(crate) struct TransformLut3x3NeonQ0_15<
T,
U,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
pub(crate) lut: Vec<NeonAlignedI16x4>,
pub(crate) _phantom: PhantomData<T>,
pub(crate) _phantom1: PhantomData<U>,
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
pub(crate) color_space: DataColorSpace,
pub(crate) is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
>
TransformLut3x3NeonQ0_15<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[allow(unused_unsafe)]
#[target_feature(enable = "rdm")]
unsafe fn transform_chunk<'b, Interpolator: NeonMdInterpolationQ0_15<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
unsafe {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
let f_value_scale = vdupq_n_f32(1. / ((1 << 14i32) - 1) as f32);
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
let v_max_scale = if T::FINITE {
vdup_n_s16(((1i32 << BIT_DEPTH) - 1) as i16)
} else {
vdup_n_s16(((1i32 << 14i32) - 1) as i16)
};
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.r_i()],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.g_i()],
);
let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.b_i()],
);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
let tetrahedral = Interpolator::new(&self.lut);
let v = tetrahedral.inter3_neon(x, y, z, &self.weights);
if T::FINITE {
let mut o = vmax_s16(v.v, vdup_n_s16(0));
o = vmin_s16(o, v_max_scale);
dst[dst_cn.r_i()] = (vget_lane_s16::<0>(o) as u32).as_();
dst[dst_cn.g_i()] = (vget_lane_s16::<1>(o) as u32).as_();
dst[dst_cn.b_i()] = (vget_lane_s16::<2>(o) as u32).as_();
} else {
let o = vcvtq_f32_s32(vmovl_s16(v.v));
let r = vmulq_f32(o, f_value_scale);
dst[dst_cn.r_i()] = vgetq_lane_f32::<0>(r).as_();
dst[dst_cn.g_i()] = vgetq_lane_f32::<1>(r).as_();
dst[dst_cn.b_i()] = vgetq_lane_f32::<2>(r).as_();
}
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut3x3NeonQ0_15<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / src_channels;
let dst_chunks = dst.len() / dst_channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearNeonQ0_15<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralNeonQ0_15<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalNeonQ0_15<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticNeonQ0_15<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearNeonQ0_15<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}
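// Note on the `#[target_feature(enable = "rdm")]` functions above: callers
// must verify SQRDMLAH support at runtime before dispatching into this
// executor, as the NEON factory does. A minimal sketch of that check:
#[allow(dead_code)]
#[cfg(target_arch = "aarch64")]
fn have_rdm() -> bool {
    std::arch::is_aarch64_feature_detected!("rdm")
}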


@@ -0,0 +1,327 @@
/*
* // Copyright (c) Radzivon Bartoshyk 4/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::lut3x4::create_lut3_samples;
use crate::mlaf::mlaf;
use crate::trc::ToneCurveEvaluator;
use crate::{
CmsError, ColorProfile, GammaLutInterpolate, InPlaceStage, Matrix3f, PointeeSizeExpressible,
RenderingIntent, Rgb, TransformOptions, filmlike_clip,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
pub(crate) struct XyzToRgbStage<T: Clone> {
pub(crate) r_gamma: Box<[T; 65536]>,
pub(crate) g_gamma: Box<[T; 65536]>,
pub(crate) b_gamma: Box<[T; 65536]>,
pub(crate) matrices: Vec<Matrix3f>,
pub(crate) intent: RenderingIntent,
pub(crate) bit_depth: usize,
pub(crate) gamma_lut: usize,
}
impl<T: Clone + AsPrimitive<f32>> InPlaceStage for XyzToRgbStage<T> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
assert!(self.bit_depth > 0);
if !self.matrices.is_empty() {
let m = self.matrices[0];
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(x * m.v[0][0], y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(x * m.v[1][0], y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(x * m.v[2][0], y, m.v[2][1]), z, m.v[2][2]);
}
}
for m in self.matrices.iter().skip(1) {
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(x * m.v[0][0], y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(x * m.v[1][0], y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(x * m.v[2][0], y, m.v[2][1]), z, m.v[2][2]);
}
}
let max_colors = (1 << self.bit_depth) - 1;
let color_scale = 1f32 / max_colors as f32;
let lut_cap = (self.gamma_lut - 1) as f32;
if self.intent != RenderingIntent::AbsoluteColorimetric {
for dst in dst.chunks_exact_mut(3) {
let mut rgb = Rgb::new(dst[0], dst[1], dst[2]);
if rgb.is_out_of_gamut() {
rgb = filmlike_clip(rgb);
}
let r = mlaf(0.5f32, rgb.r, lut_cap).min(lut_cap).max(0f32) as u16;
let g = mlaf(0.5f32, rgb.g, lut_cap).min(lut_cap).max(0f32) as u16;
let b = mlaf(0.5f32, rgb.b, lut_cap).min(lut_cap).max(0f32) as u16;
dst[0] = self.r_gamma[r as usize].as_() * color_scale;
dst[1] = self.g_gamma[g as usize].as_() * color_scale;
dst[2] = self.b_gamma[b as usize].as_() * color_scale;
}
} else {
for dst in dst.chunks_exact_mut(3) {
let rgb = Rgb::new(dst[0], dst[1], dst[2]);
let r = mlaf(0.5f32, rgb.r, lut_cap).min(lut_cap).max(0f32) as u16;
let g = mlaf(0.5f32, rgb.g, lut_cap).min(lut_cap).max(0f32) as u16;
let b = mlaf(0.5f32, rgb.b, lut_cap).min(lut_cap).max(0f32) as u16;
dst[0] = self.r_gamma[r as usize].as_() * color_scale;
dst[1] = self.g_gamma[g as usize].as_() * color_scale;
dst[2] = self.b_gamma[b as usize].as_() * color_scale;
}
}
Ok(())
}
}
pub(crate) struct XyzToRgbStageExtended<T: Clone> {
pub(crate) gamma_evaluator: Box<dyn ToneCurveEvaluator>,
pub(crate) matrices: Vec<Matrix3f>,
pub(crate) phantom_data: PhantomData<T>,
}
impl<T: Clone + AsPrimitive<f32>> InPlaceStage for XyzToRgbStageExtended<T> {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
if !self.matrices.is_empty() {
let m = self.matrices[0];
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(x * m.v[0][0], y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(x * m.v[1][0], y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(x * m.v[2][0], y, m.v[2][1]), z, m.v[2][2]);
}
}
for m in self.matrices.iter().skip(1) {
for dst in dst.chunks_exact_mut(3) {
let x = dst[0];
let y = dst[1];
let z = dst[2];
dst[0] = mlaf(mlaf(x * m.v[0][0], y, m.v[0][1]), z, m.v[0][2]);
dst[1] = mlaf(mlaf(x * m.v[1][0], y, m.v[1][1]), z, m.v[1][2]);
dst[2] = mlaf(mlaf(x * m.v[2][0], y, m.v[2][1]), z, m.v[2][2]);
}
}
for dst in dst.chunks_exact_mut(3) {
let mut rgb = Rgb::new(dst[0], dst[1], dst[2]);
rgb = self.gamma_evaluator.evaluate_tristimulus(rgb);
dst[0] = rgb.r.as_();
dst[1] = rgb.g.as_();
dst[2] = rgb.b.as_();
}
Ok(())
}
}
struct RgbLinearizationStage<T: Clone, const LINEAR_CAP: usize, const SAMPLES: usize> {
r_lin: Box<[f32; LINEAR_CAP]>,
g_lin: Box<[f32; LINEAR_CAP]>,
b_lin: Box<[f32; LINEAR_CAP]>,
_phantom: PhantomData<T>,
bit_depth: usize,
}
impl<
T: Clone + AsPrimitive<usize> + PointeeSizeExpressible,
const LINEAR_CAP: usize,
const SAMPLES: usize,
> RgbLinearizationStage<T, LINEAR_CAP, SAMPLES>
{
fn transform(&self, src: &[T], dst: &mut [f32]) -> Result<(), CmsError> {
if src.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % 3 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let scale = if T::FINITE {
((1 << self.bit_depth) - 1) as f32 / (SAMPLES as f32 - 1f32)
} else {
(T::NOT_FINITE_LINEAR_TABLE_SIZE - 1) as f32 / (SAMPLES as f32 - 1f32)
};
let capped_value = if T::FINITE {
(1 << self.bit_depth) - 1
} else {
T::NOT_FINITE_LINEAR_TABLE_SIZE - 1
};
for (src, dst) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
let j_r = src[0].as_() as f32 * scale;
let j_g = src[1].as_() as f32 * scale;
let j_b = src[2].as_() as f32 * scale;
dst[0] = self.r_lin[(j_r.round().max(0.0).min(capped_value as f32) as u16) as usize];
dst[1] = self.g_lin[(j_g.round().max(0.0).min(capped_value as f32) as u16) as usize];
dst[2] = self.b_lin[(j_b.round().max(0.0).min(capped_value as f32) as u16) as usize];
}
Ok(())
}
}
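// Scalar sketch of the lookup in `RgbLinearizationStage::transform` above: a
// grid sample is rescaled from SAMPLES space into the linear table's index
// space, rounded, and clamped before indexing.
#[allow(dead_code)]
fn lin_index(sample: f32, scale: f32, cap: usize) -> usize {
    (sample * scale).round().clamp(0.0, cap as f32) as usize
}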
pub(crate) fn create_rgb_lin_lut<
T: Copy + Default + AsPrimitive<f32> + Send + Sync + AsPrimitive<usize> + PointeeSizeExpressible,
const BIT_DEPTH: usize,
const LINEAR_CAP: usize,
const GRID_SIZE: usize,
>(
source: &ColorProfile,
opts: TransformOptions,
) -> Result<Vec<f32>, CmsError>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
let lut_origins = create_lut3_samples::<T, GRID_SIZE>();
let lin_r =
source.build_r_linearize_table::<T, LINEAR_CAP, BIT_DEPTH>(opts.allow_use_cicp_transfer)?;
let lin_g =
source.build_g_linearize_table::<T, LINEAR_CAP, BIT_DEPTH>(opts.allow_use_cicp_transfer)?;
let lin_b =
source.build_b_linearize_table::<T, LINEAR_CAP, BIT_DEPTH>(opts.allow_use_cicp_transfer)?;
let lin_stage = RgbLinearizationStage::<T, LINEAR_CAP, GRID_SIZE> {
r_lin: lin_r,
g_lin: lin_g,
b_lin: lin_b,
_phantom: PhantomData,
bit_depth: BIT_DEPTH,
};
let mut lut = vec![0f32; lut_origins.len()];
lin_stage.transform(&lut_origins, &mut lut)?;
let xyz_to_rgb = source.rgb_to_xyz_matrix();
let matrices = vec![
xyz_to_rgb.to_f32(),
Matrix3f {
v: [
[32768.0 / 65535.0, 0.0, 0.0],
[0.0, 32768.0 / 65535.0, 0.0],
[0.0, 0.0, 32768.0 / 65535.0],
],
},
];
let matrix_stage = crate::conversions::lut_transforms::MatrixStage { matrices };
matrix_stage.transform(&mut lut)?;
Ok(lut)
}
pub(crate) fn prepare_inverse_lut_rgb_xyz<
T: Copy
+ Default
+ AsPrimitive<f32>
+ Send
+ Sync
+ AsPrimitive<usize>
+ PointeeSizeExpressible
+ GammaLutInterpolate,
const BIT_DEPTH: usize,
const GAMMA_LUT: usize,
>(
dest: &ColorProfile,
lut: &mut [f32],
options: TransformOptions,
) -> Result<(), CmsError>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
{
if !T::FINITE {
if let Some(extended_gamma) = dest.try_extended_gamma_evaluator() {
let xyz_to_rgb = dest.rgb_to_xyz_matrix().inverse();
let mut matrices = vec![Matrix3f {
v: [
[65535.0 / 32768.0, 0.0, 0.0],
[0.0, 65535.0 / 32768.0, 0.0],
[0.0, 0.0, 65535.0 / 32768.0],
],
}];
matrices.push(xyz_to_rgb.to_f32());
let xyz_to_rgb_stage = XyzToRgbStageExtended::<T> {
gamma_evaluator: extended_gamma,
matrices,
phantom_data: PhantomData,
};
xyz_to_rgb_stage.transform(lut)?;
return Ok(());
}
}
let gamma_map_r = dest.build_gamma_table::<T, 65536, GAMMA_LUT, BIT_DEPTH>(
&dest.red_trc,
options.allow_use_cicp_transfer,
)?;
let gamma_map_g = dest.build_gamma_table::<T, 65536, GAMMA_LUT, BIT_DEPTH>(
&dest.green_trc,
options.allow_use_cicp_transfer,
)?;
let gamma_map_b = dest.build_gamma_table::<T, 65536, GAMMA_LUT, BIT_DEPTH>(
&dest.blue_trc,
options.allow_use_cicp_transfer,
)?;
let xyz_to_rgb = dest.rgb_to_xyz_matrix().inverse();
let mut matrices = vec![Matrix3f {
v: [
[65535.0 / 32768.0, 0.0, 0.0],
[0.0, 65535.0 / 32768.0, 0.0],
[0.0, 0.0, 65535.0 / 32768.0],
],
}];
matrices.push(xyz_to_rgb.to_f32());
let xyz_to_rgb_stage = XyzToRgbStage::<T> {
r_gamma: gamma_map_r,
g_gamma: gamma_map_g,
b_gamma: gamma_map_b,
matrices,
intent: options.rendering_intent,
gamma_lut: GAMMA_LUT,
bit_depth: BIT_DEPTH,
};
xyz_to_rgb_stage.transform(lut)?;
Ok(())
}
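// Dependency-free sketch of the per-pixel 3x3 matrix step used by the stages
// above; the crate's `mlaf` is a fused multiply-add, so `f32::mul_add` stands
// in for it here. The 32768/65535 diagonal matrices appear to fold the PCS
// XYZ integer encoding scale into the same pass.
#[allow(dead_code)]
fn apply_matrix3(m: [[f32; 3]; 3], v: [f32; 3]) -> [f32; 3] {
    let [x, y, z] = v;
    // Each row computes x*r0 + y*r1 + z*r2 with fused multiply-adds.
    let row = |r: [f32; 3]| z.mul_add(r[2], y.mul_add(r[1], x * r[0]));
    [row(m[0]), row(m[1]), row(m[2])]
}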


@@ -0,0 +1,190 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::mlaf::mlaf;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor, Vector3f};
use num_traits::AsPrimitive;
#[derive(Clone)]
pub(crate) struct ToneReproductionRgbToGray<T, const BUCKET: usize> {
pub(crate) r_linear: Box<[f32; BUCKET]>,
pub(crate) g_linear: Box<[f32; BUCKET]>,
pub(crate) b_linear: Box<[f32; BUCKET]>,
pub(crate) gray_gamma: Box<[T; 65536]>,
}
#[derive(Clone)]
struct TransformRgbToGrayExecutor<
T,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const BUCKET: usize,
const GAMMA_LUT: usize,
> {
trc_box: ToneReproductionRgbToGray<T, BUCKET>,
weights: Vector3f,
bit_depth: usize,
}
pub(crate) fn make_rgb_to_gray<
T: Copy + Default + PointeeSizeExpressible + Send + Sync + 'static,
const BUCKET: usize,
const BIT_DEPTH: usize,
const GAMMA_LUT: usize,
>(
src_layout: Layout,
dst_layout: Layout,
trc: ToneReproductionRgbToGray<T, BUCKET>,
weights: Vector3f,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
u32: AsPrimitive<T>,
{
match src_layout {
Layout::Rgb => match dst_layout {
Layout::Rgb => unreachable!(),
Layout::Rgba => unreachable!(),
Layout::Gray => Box::new(TransformRgbToGrayExecutor::<
T,
{ Layout::Rgb as u8 },
{ Layout::Gray as u8 },
BUCKET,
GAMMA_LUT,
> {
trc_box: trc,
weights,
bit_depth: BIT_DEPTH,
}),
Layout::GrayAlpha => Box::new(TransformRgbToGrayExecutor::<
T,
{ Layout::Rgb as u8 },
{ Layout::GrayAlpha as u8 },
BUCKET,
GAMMA_LUT,
> {
trc_box: trc,
weights,
bit_depth: BIT_DEPTH,
}),
_ => unreachable!(),
},
Layout::Rgba => match dst_layout {
Layout::Rgb => unreachable!(),
Layout::Rgba => unreachable!(),
Layout::Gray => Box::new(TransformRgbToGrayExecutor::<
T,
{ Layout::Rgba as u8 },
{ Layout::Gray as u8 },
BUCKET,
GAMMA_LUT,
> {
trc_box: trc,
weights,
bit_depth: BIT_DEPTH,
}),
Layout::GrayAlpha => Box::new(TransformRgbToGrayExecutor::<
T,
{ Layout::Rgba as u8 },
{ Layout::GrayAlpha as u8 },
BUCKET,
GAMMA_LUT,
> {
trc_box: trc,
weights,
bit_depth: BIT_DEPTH,
}),
_ => unreachable!(),
},
Layout::Gray => unreachable!(),
Layout::GrayAlpha => unreachable!(),
_ => unreachable!(),
}
}
impl<
T: Copy + Default + PointeeSizeExpressible + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const BUCKET: usize,
const GAMMA_LUT: usize,
> TransformExecutor<T> for TransformRgbToGrayExecutor<T, SRC_LAYOUT, DST_LAYOUT, BUCKET, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let scale_value = (GAMMA_LUT - 1) as f32;
let max_value = ((1u32 << self.bit_depth) - 1).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = self.trc_box.r_linear[src[src_cn.r_i()]._as_usize()];
let g = self.trc_box.g_linear[src[src_cn.g_i()]._as_usize()];
let b = self.trc_box.b_linear[src[src_cn.b_i()]._as_usize()];
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
let grey = mlaf(
0.5,
mlaf(
mlaf(self.weights.v[0] * r, self.weights.v[1], g),
self.weights.v[2],
b,
)
.min(1.)
.max(0.),
scale_value,
);
dst[0] = self.trc_box.gray_gamma[(grey as u16) as usize];
if dst_channels == 2 {
dst[1] = a;
}
}
Ok(())
}
}
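// Scalar sketch of the gray reduction above: linearized R, G, B are combined
// with the profile's luminance weights, clamped to [0, 1], and mapped into
// the gamma LUT domain with a +0.5 rounding bias.
#[allow(dead_code)]
fn gray_lut_index(rgb: [f32; 3], w: [f32; 3], gamma_lut: usize) -> usize {
    let y = (w[0] * rgb[0] + w[1] * rgb[1] + w[2] * rgb[2]).clamp(0.0, 1.0);
    (y * (gamma_lut - 1) as f32 + 0.5) as u16 as usize
}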


@@ -0,0 +1,181 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::mlaf::mlaf;
use crate::transform::PointeeSizeExpressible;
use crate::trc::ToneCurveEvaluator;
use crate::{CmsError, Layout, Rgb, TransformExecutor, Vector3f};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
struct TransformRgbToGrayExtendedExecutor<T, const SRC_LAYOUT: u8, const DST_LAYOUT: u8> {
linear_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
gamma_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
weights: Vector3f,
_phantom: PhantomData<T>,
bit_depth: usize,
}
pub(crate) fn make_rgb_to_gray_extended<
T: Copy + Default + PointeeSizeExpressible + Send + Sync + 'static + AsPrimitive<f32>,
>(
src_layout: Layout,
dst_layout: Layout,
linear_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
gamma_eval: Box<dyn ToneCurveEvaluator + Send + Sync>,
weights: Vector3f,
bit_depth: usize,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
match src_layout {
Layout::Rgb => match dst_layout {
Layout::Rgb => unreachable!(),
Layout::Rgba => unreachable!(),
Layout::Gray => Box::new(TransformRgbToGrayExtendedExecutor::<
T,
{ Layout::Rgb as u8 },
{ Layout::Gray as u8 },
> {
linear_eval,
gamma_eval,
weights,
_phantom: PhantomData,
bit_depth,
}),
Layout::GrayAlpha => Box::new(TransformRgbToGrayExtendedExecutor::<
T,
{ Layout::Rgb as u8 },
{ Layout::GrayAlpha as u8 },
> {
linear_eval,
gamma_eval,
weights,
_phantom: PhantomData,
bit_depth,
}),
_ => unreachable!(),
},
Layout::Rgba => match dst_layout {
Layout::Rgb => unreachable!(),
Layout::Rgba => unreachable!(),
Layout::Gray => Box::new(TransformRgbToGrayExtendedExecutor::<
T,
{ Layout::Rgba as u8 },
{ Layout::Gray as u8 },
> {
linear_eval,
gamma_eval,
weights,
_phantom: PhantomData,
bit_depth,
}),
Layout::GrayAlpha => Box::new(TransformRgbToGrayExtendedExecutor::<
T,
{ Layout::Rgba as u8 },
{ Layout::GrayAlpha as u8 },
> {
linear_eval,
gamma_eval,
weights,
_phantom: PhantomData,
bit_depth,
}),
_ => unreachable!(),
},
Layout::Gray => unreachable!(),
Layout::GrayAlpha => unreachable!(),
_ => unreachable!(),
}
}
impl<
T: Copy + Default + PointeeSizeExpressible + 'static + AsPrimitive<f32>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
> TransformExecutor<T> for TransformRgbToGrayExtendedExecutor<T, SRC_LAYOUT, DST_LAYOUT>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let max_value = ((1u32 << self.bit_depth) - 1).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let in_tristimulus = Rgb::<f32>::new(
src[src_cn.r_i()].as_(),
src[src_cn.g_i()].as_(),
src[src_cn.b_i()].as_(),
);
let lin_tristimulus = self.linear_eval.evaluate_tristimulus(in_tristimulus);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
let grey = mlaf(
mlaf(
self.weights.v[0] * lin_tristimulus.r,
self.weights.v[1],
lin_tristimulus.g,
),
self.weights.v[2],
lin_tristimulus.b,
)
.min(1.)
.max(0.);
let gamma_value = self.gamma_eval.evaluate_value(grey);
dst[0] = gamma_value.as_();
if dst_channels == 2 {
dst[1] = a;
}
}
Ok(())
}
}
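// Contrast with the LUT-based executor above: here gray stays in normalized
// f32 and the tone-curve evaluator produces the encoded value directly, so
// no +0.5 LUT bias is applied. A stand-in sketch with a closure in place of
// the crate's `ToneCurveEvaluator` trait object:
#[allow(dead_code)]
fn gray_extended(rgb: [f32; 3], w: [f32; 3], gamma: impl Fn(f32) -> f32) -> f32 {
    let y = (w[0] * rgb[0] + w[1] * rgb[1] + w[2] * rgb[2]).clamp(0.0, 1.0);
    gamma(y)
}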


@@ -0,0 +1,437 @@
/*
* // Copyright (c) Radzivon Bartoshyk 4/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::TransformMatrixShaper;
use crate::conversions::rgbxyz::{
TransformMatrixShaperOptimized, make_rgb_xyz_rgb_transform, make_rgb_xyz_rgb_transform_opt,
};
use crate::conversions::rgbxyz_fixed::{make_rgb_xyz_q2_13, make_rgb_xyz_q2_13_opt};
use crate::{CmsError, Layout, TransformExecutor, TransformOptions};
use num_traits::AsPrimitive;
const FIXED_POINT_SCALE: i32 = 13; // Q2.13
pub(crate) trait RgbXyzFactory<T: Clone + AsPrimitive<usize> + Default> {
fn make_transform<const LINEAR_CAP: usize, const GAMMA_LUT: usize, const BIT_DEPTH: usize>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaper<T, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>;
}
pub(crate) trait RgbXyzFactoryOpt<T: Clone + AsPrimitive<usize> + Default> {
fn make_optimized_transform<
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>;
}
impl RgbXyzFactory<u16> for u16 {
fn make_transform<const LINEAR_CAP: usize, const GAMMA_LUT: usize, const BIT_DEPTH: usize>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaper<u16, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<u16> + Send + Sync>, CmsError> {
if BIT_DEPTH < 16 && transform_options.prefer_fixed_point {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_avx2;
if std::arch::is_x86_feature_detected!("avx2") {
return make_rgb_xyz_q2_13_transform_avx2::<
u16,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_sse_41;
if std::arch::is_x86_feature_detected!("sse4.1") {
return make_rgb_xyz_q2_13_transform_sse_41::<
u16,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
{
return make_rgb_xyz_q2_13::<
u16,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
make_rgb_xyz_rgb_transform::<u16, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
)
}
}
impl RgbXyzFactory<f32> for f32 {
fn make_transform<const LINEAR_CAP: usize, const GAMMA_LUT: usize, const BIT_DEPTH: usize>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaper<f32, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<f32> + Send + Sync>, CmsError> {
if transform_options.prefer_fixed_point {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_avx2;
if std::arch::is_x86_feature_detected!("avx2") {
return make_rgb_xyz_q2_13_transform_avx2::<
f32,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_sse_41;
if std::arch::is_x86_feature_detected!("sse4.1") {
return make_rgb_xyz_q2_13_transform_sse_41::<
f32,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
{
return make_rgb_xyz_q2_13::<
f32,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
make_rgb_xyz_rgb_transform::<f32, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
)
}
}
impl RgbXyzFactory<f64> for f64 {
fn make_transform<const LINEAR_CAP: usize, const GAMMA_LUT: usize, const BIT_DEPTH: usize>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaper<f64, LINEAR_CAP>,
_: TransformOptions,
) -> Result<Box<dyn TransformExecutor<f64> + Send + Sync>, CmsError> {
make_rgb_xyz_rgb_transform::<f64, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
)
}
}
impl RgbXyzFactory<u8> for u8 {
fn make_transform<const LINEAR_CAP: usize, const GAMMA_LUT: usize, const BIT_DEPTH: usize>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaper<u8, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<u8> + Send + Sync>, CmsError> {
if transform_options.prefer_fixed_point {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_avx2;
if std::arch::is_x86_feature_detected!("avx2") {
return make_rgb_xyz_q2_13_transform_avx2::<
u8,
LINEAR_CAP,
GAMMA_LUT,
8,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_sse_41;
if std::arch::is_x86_feature_detected!("sse4.1") {
return make_rgb_xyz_q2_13_transform_sse_41::<
u8,
LINEAR_CAP,
GAMMA_LUT,
8,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
make_rgb_xyz_q2_13::<u8, LINEAR_CAP, GAMMA_LUT, 8, FIXED_POINT_SCALE>(
src_layout, dst_layout, profile,
)
} else {
make_rgb_xyz_rgb_transform::<u8, LINEAR_CAP, GAMMA_LUT, 8>(
src_layout, dst_layout, profile,
)
}
}
}
// Optimized factories
impl RgbXyzFactoryOpt<u16> for u16 {
fn make_optimized_transform<
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaperOptimized<u16, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<u16> + Send + Sync>, CmsError> {
if BIT_DEPTH >= 12 && transform_options.prefer_fixed_point {
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
{
if std::arch::is_aarch64_feature_detected!("rdm") {
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q1_30_opt;
return make_rgb_xyz_q1_30_opt::<u16, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH, 30>(
src_layout, dst_layout, profile,
);
}
}
}
if BIT_DEPTH < 16 && transform_options.prefer_fixed_point {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_avx2_opt;
if std::arch::is_x86_feature_detected!("avx2") {
return make_rgb_xyz_q2_13_transform_avx2_opt::<
u16,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_sse_41_opt;
if std::arch::is_x86_feature_detected!("sse4.1") {
return make_rgb_xyz_q2_13_transform_sse_41_opt::<
u16,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
{
return make_rgb_xyz_q2_13_opt::<
u16,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
make_rgb_xyz_rgb_transform_opt::<u16, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
)
}
}
impl RgbXyzFactoryOpt<f32> for f32 {
fn make_optimized_transform<
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaperOptimized<f32, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<f32> + Send + Sync>, CmsError> {
if transform_options.prefer_fixed_point {
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_avx2_opt;
if std::arch::is_x86_feature_detected!("avx2") {
return make_rgb_xyz_q2_13_transform_avx2_opt::<
f32,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_sse_41_opt;
if std::arch::is_x86_feature_detected!("sse4.1") {
return make_rgb_xyz_q2_13_transform_sse_41_opt::<
f32,
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
{
return if std::arch::is_aarch64_feature_detected!("rdm") {
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q1_30_opt;
make_rgb_xyz_q1_30_opt::<f32, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH, 30>(
src_layout, dst_layout, profile,
)
} else {
make_rgb_xyz_q2_13_opt::<f32, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH, FIXED_POINT_SCALE>(
src_layout, dst_layout, profile,
)
};
}
}
make_rgb_xyz_rgb_transform_opt::<f32, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
)
}
}
impl RgbXyzFactoryOpt<f64> for f64 {
fn make_optimized_transform<
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaperOptimized<f64, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<f64> + Send + Sync>, CmsError> {
if transform_options.prefer_fixed_point {
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
{
if std::arch::is_aarch64_feature_detected!("rdm") {
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q1_30_opt;
return make_rgb_xyz_q1_30_opt::<f64, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH, 30>(
src_layout, dst_layout, profile,
);
}
}
}
make_rgb_xyz_rgb_transform_opt::<f64, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
)
}
}
impl RgbXyzFactoryOpt<u8> for u8 {
fn make_optimized_transform<
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaperOptimized<u8, LINEAR_CAP>,
transform_options: TransformOptions,
) -> Result<Box<dyn TransformExecutor<u8> + Send + Sync>, CmsError> {
if transform_options.prefer_fixed_point {
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_avx512_opt;
if std::arch::is_x86_feature_detected!("avx512bw")
&& std::arch::is_x86_feature_detected!("avx512vl")
{
return make_rgb_xyz_q2_13_transform_avx512_opt::<
u8,
LINEAR_CAP,
GAMMA_LUT,
8,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_avx2_opt;
if std::arch::is_x86_feature_detected!("avx2") {
return make_rgb_xyz_q2_13_transform_avx2_opt::<
u8,
LINEAR_CAP,
GAMMA_LUT,
8,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
{
use crate::conversions::rgbxyz_fixed::make_rgb_xyz_q2_13_transform_sse_41_opt;
if std::arch::is_x86_feature_detected!("sse4.1") {
return make_rgb_xyz_q2_13_transform_sse_41_opt::<
u8,
LINEAR_CAP,
GAMMA_LUT,
8,
FIXED_POINT_SCALE,
>(src_layout, dst_layout, profile);
}
}
make_rgb_xyz_q2_13_opt::<u8, LINEAR_CAP, GAMMA_LUT, 8, FIXED_POINT_SCALE>(
src_layout, dst_layout, profile,
)
} else {
make_rgb_xyz_rgb_transform_opt::<u8, LINEAR_CAP, GAMMA_LUT, 8>(
src_layout, dst_layout, profile,
)
}
}
}
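// The factories above all follow one dispatch pattern: prefer the widest
// fixed-point SIMD path detected at runtime, then fall back to the float
// path. A minimal standalone sketch of that pattern (labels hypothetical):
#[allow(dead_code)]
fn pick_path() -> &'static str {
    #[cfg(target_arch = "x86_64")]
    {
        if std::arch::is_x86_feature_detected!("avx2") {
            return "avx2";
        }
        if std::arch::is_x86_feature_detected!("sse4.1") {
            return "sse4.1";
        }
    }
    "scalar"
}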

701
vendor/moxcms/src/conversions/rgbxyz.rs vendored Normal file

@@ -0,0 +1,701 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::{CmsError, Layout, Matrix3, Matrix3f, TransformExecutor};
use num_traits::AsPrimitive;
pub(crate) struct TransformMatrixShaper<T: Clone, const BUCKET: usize> {
pub(crate) r_linear: Box<[f32; BUCKET]>,
pub(crate) g_linear: Box<[f32; BUCKET]>,
pub(crate) b_linear: Box<[f32; BUCKET]>,
pub(crate) r_gamma: Box<[T; 65536]>,
pub(crate) g_gamma: Box<[T; 65536]>,
pub(crate) b_gamma: Box<[T; 65536]>,
pub(crate) adaptation_matrix: Matrix3f,
}
/// Low-memory-footprint optimized routine for matrix shaper profiles whose
/// gamma and linear curves are identical across all channels.
pub(crate) struct TransformMatrixShaperOptimized<T: Clone, const BUCKET: usize> {
pub(crate) linear: Box<[f32; BUCKET]>,
pub(crate) gamma: Box<[T; 65536]>,
pub(crate) adaptation_matrix: Matrix3f,
}
impl<T: Clone + PointeeSizeExpressible, const BUCKET: usize> TransformMatrixShaper<T, BUCKET> {
pub(crate) fn to_q2_13_n<
R: Copy + 'static + Default,
const PRECISION: i32,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
&self,
) -> TransformMatrixShaperFixedPoint<R, T, BUCKET>
where
f32: AsPrimitive<R>,
{
let linear_scale = if T::FINITE {
let lut_scale = (GAMMA_LUT - 1) as f32 / ((1 << BIT_DEPTH) - 1) as f32;
((1 << BIT_DEPTH) - 1) as f32 * lut_scale
} else {
let lut_scale = (GAMMA_LUT - 1) as f32 / (T::NOT_FINITE_LINEAR_TABLE_SIZE - 1) as f32;
(T::NOT_FINITE_LINEAR_TABLE_SIZE - 1) as f32 * lut_scale
};
let mut new_box_r = Box::new([R::default(); BUCKET]);
let mut new_box_g = Box::new([R::default(); BUCKET]);
let mut new_box_b = Box::new([R::default(); BUCKET]);
for (dst, &src) in new_box_r.iter_mut().zip(self.r_linear.iter()) {
*dst = (src * linear_scale).round().as_();
}
for (dst, &src) in new_box_g.iter_mut().zip(self.g_linear.iter()) {
*dst = (src * linear_scale).round().as_();
}
for (dst, &src) in new_box_b.iter_mut().zip(self.b_linear.iter()) {
*dst = (src * linear_scale).round().as_();
}
let scale: f32 = (1i32 << PRECISION) as f32;
let source_matrix = self.adaptation_matrix;
let mut dst_matrix = Matrix3::<i16> { v: [[0i16; 3]; 3] };
for i in 0..3 {
for j in 0..3 {
dst_matrix.v[i][j] = (source_matrix.v[i][j] * scale) as i16;
}
}
TransformMatrixShaperFixedPoint {
r_linear: new_box_r,
g_linear: new_box_g,
b_linear: new_box_b,
r_gamma: self.r_gamma.clone(),
g_gamma: self.g_gamma.clone(),
b_gamma: self.b_gamma.clone(),
adaptation_matrix: dst_matrix,
}
}
}
impl<T: Clone + PointeeSizeExpressible, const BUCKET: usize>
TransformMatrixShaperOptimized<T, BUCKET>
{
pub(crate) fn to_q2_13_n<
R: Copy + 'static + Default,
const PRECISION: i32,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
&self,
) -> TransformMatrixShaperFixedPointOpt<R, i16, T, BUCKET>
where
f32: AsPrimitive<R>,
{
let linear_scale = if T::FINITE {
let lut_scale = (GAMMA_LUT - 1) as f32 / ((1 << BIT_DEPTH) - 1) as f32;
((1 << BIT_DEPTH) - 1) as f32 * lut_scale
} else {
let lut_scale = (GAMMA_LUT - 1) as f32 / (T::NOT_FINITE_LINEAR_TABLE_SIZE - 1) as f32;
(T::NOT_FINITE_LINEAR_TABLE_SIZE - 1) as f32 * lut_scale
};
let mut new_box_linear = Box::new([R::default(); BUCKET]);
for (dst, src) in new_box_linear.iter_mut().zip(self.linear.iter()) {
*dst = (*src * linear_scale).round().as_();
}
let scale: f32 = (1i32 << PRECISION) as f32;
let source_matrix = self.adaptation_matrix;
let mut dst_matrix = Matrix3::<i16> {
v: [[i16::default(); 3]; 3],
};
for i in 0..3 {
for j in 0..3 {
dst_matrix.v[i][j] = (source_matrix.v[i][j] * scale) as i16;
}
}
TransformMatrixShaperFixedPointOpt {
linear: new_box_linear,
gamma: self.gamma.clone(),
adaptation_matrix: dst_matrix,
}
}
#[allow(dead_code)]
pub(crate) fn to_q1_30_n<
R: Copy + 'static + Default,
const PRECISION: i32,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
&self,
) -> TransformMatrixShaperFixedPointOpt<R, i32, T, BUCKET>
where
f32: AsPrimitive<R>,
f64: AsPrimitive<R>,
{
// It is important to scale by one extra bit to compensate for vqrdmlah's Q0.31 doubling, since we are using Q1.30 here.
let table_size = if T::FINITE {
(1 << BIT_DEPTH) - 1
} else {
T::NOT_FINITE_LINEAR_TABLE_SIZE - 1
};
let ext_bp = if T::FINITE {
BIT_DEPTH as u32 + 1
} else {
let bp = (T::NOT_FINITE_LINEAR_TABLE_SIZE - 1).count_ones();
bp + 1
};
let linear_scale = {
let lut_scale = (GAMMA_LUT - 1) as f64 / table_size as f64;
((1u32 << ext_bp) - 1) as f64 * lut_scale
};
let mut new_box_linear = Box::new([R::default(); BUCKET]);
for (dst, &src) in new_box_linear.iter_mut().zip(self.linear.iter()) {
*dst = (src as f64 * linear_scale).round().as_();
}
let scale: f64 = (1i64 << PRECISION) as f64;
let source_matrix = self.adaptation_matrix;
let mut dst_matrix = Matrix3::<i32> {
v: [[i32::default(); 3]; 3],
};
for i in 0..3 {
for j in 0..3 {
dst_matrix.v[i][j] = (source_matrix.v[i][j] as f64 * scale) as i32;
}
}
TransformMatrixShaperFixedPointOpt {
linear: new_box_linear,
gamma: self.gamma.clone(),
adaptation_matrix: dst_matrix,
}
}
}
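// Worked example for the Q1.30 comment in `to_q1_30_n` above (illustrative):
// a matrix coefficient is quantized as c * 2^30, so c = 0.25 becomes
// 268_435_456; the extra bit in the linear-table scale offsets the halving
// that the saturating-doubling multiplies imply for Q0.31 operands.
#[allow(dead_code)]
fn to_q1_30(c: f64) -> i32 {
    (c * (1i64 << 30) as f64) as i32
}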
#[allow(unused)]
struct TransformMatrixShaperScalar<
T: Clone,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
> {
pub(crate) profile: TransformMatrixShaper<T, LINEAR_CAP>,
}
#[allow(unused)]
struct TransformMatrixShaperOptScalar<
T: Clone,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
> {
pub(crate) profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
}
#[cfg(any(
any(target_arch = "x86", target_arch = "x86_64"),
all(target_arch = "aarch64", target_feature = "neon")
))]
#[allow(unused)]
macro_rules! create_rgb_xyz_dependant_executor {
($dep_name: ident, $dependant: ident, $shaper: ident) => {
pub(crate) fn $dep_name<
T: Clone + Send + Sync + Default + PointeeSizeExpressible + Copy + 'static,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: $shaper<T, LINEAR_CAP>,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
u32: AsPrimitive<T>,
{
if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgba) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
> {
profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgba) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
> {
profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgb) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
> {
profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgb) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
> {
profile,
bit_depth: BIT_DEPTH,
}));
}
Err(CmsError::UnsupportedProfileConnection)
}
};
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
use crate::conversions::sse::{TransformShaperRgbOptSse, TransformShaperRgbSse};
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
use crate::conversions::avx::{TransformShaperRgbAvx, TransformShaperRgbOptAvx};
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
create_rgb_xyz_dependant_executor!(
make_rgb_xyz_rgb_transform_sse_41,
TransformShaperRgbSse,
TransformMatrixShaper
);
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
create_rgb_xyz_dependant_executor!(
make_rgb_xyz_rgb_transform_sse_41_opt,
TransformShaperRgbOptSse,
TransformMatrixShaperOptimized
);
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
create_rgb_xyz_dependant_executor!(
make_rgb_xyz_rgb_transform_avx2,
TransformShaperRgbAvx,
TransformMatrixShaper
);
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
create_rgb_xyz_dependant_executor!(
make_rgb_xyz_rgb_transform_avx2_opt,
TransformShaperRgbOptAvx,
TransformMatrixShaperOptimized
);
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
use crate::conversions::avx512::TransformShaperRgbOptAvx512;
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
create_rgb_xyz_dependant_executor!(
make_rgb_xyz_rgb_transform_avx512_opt,
TransformShaperRgbOptAvx512,
TransformMatrixShaperOptimized
);
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
pub(crate) fn make_rgb_xyz_rgb_transform<
T: Clone + Send + Sync + PointeeSizeExpressible + 'static + Copy + Default,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaper<T, LINEAR_CAP>,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
u32: AsPrimitive<T>,
{
#[cfg(all(feature = "avx", target_arch = "x86_64"))]
if std::arch::is_x86_feature_detected!("avx2") {
return make_rgb_xyz_rgb_transform_avx2::<T, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
);
}
#[cfg(all(feature = "sse", any(target_arch = "x86", target_arch = "x86_64")))]
if std::arch::is_x86_feature_detected!("sse4.1") {
return make_rgb_xyz_rgb_transform_sse_41::<T, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
);
}
if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgba) {
return Ok(Box::new(TransformMatrixShaperScalar::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgba) {
return Ok(Box::new(TransformMatrixShaperScalar::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgb) {
return Ok(Box::new(TransformMatrixShaperScalar::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgb) {
return Ok(Box::new(TransformMatrixShaperScalar::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
> {
profile,
}));
}
Err(CmsError::UnsupportedProfileConnection)
}
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
pub(crate) fn make_rgb_xyz_rgb_transform_opt<
T: Clone + Send + Sync + PointeeSizeExpressible + 'static + Copy + Default,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
u32: AsPrimitive<T>,
{
#[cfg(all(feature = "avx512", target_arch = "x86_64"))]
if std::arch::is_x86_feature_detected!("avx512bw")
&& std::arch::is_x86_feature_detected!("avx512vl")
&& std::arch::is_x86_feature_detected!("fma")
{
return make_rgb_xyz_rgb_transform_avx512_opt::<T, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
);
}
#[cfg(all(feature = "avx", target_arch = "x86_64"))]
if std::arch::is_x86_feature_detected!("avx2") {
return make_rgb_xyz_rgb_transform_avx2_opt::<T, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
);
}
#[cfg(all(feature = "sse", any(target_arch = "x86", target_arch = "x86_64")))]
if std::arch::is_x86_feature_detected!("sse4.1") {
return make_rgb_xyz_rgb_transform_sse_41_opt::<T, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>(
src_layout, dst_layout, profile,
);
}
if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgba) {
return Ok(Box::new(TransformMatrixShaperOptScalar::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgba) {
return Ok(Box::new(TransformMatrixShaperOptScalar::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgb) {
return Ok(Box::new(TransformMatrixShaperOptScalar::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgb) {
return Ok(Box::new(TransformMatrixShaperOptScalar::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
> {
profile,
}));
}
Err(CmsError::UnsupportedProfileConnection)
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
use crate::conversions::neon::{TransformShaperRgbNeon, TransformShaperRgbOptNeon};
use crate::conversions::rgbxyz_fixed::{
TransformMatrixShaperFixedPoint, TransformMatrixShaperFixedPointOpt,
};
use crate::transform::PointeeSizeExpressible;
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
create_rgb_xyz_dependant_executor!(
make_rgb_xyz_rgb_transform,
TransformShaperRgbNeon,
TransformMatrixShaper
);
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
create_rgb_xyz_dependant_executor!(
make_rgb_xyz_rgb_transform_opt,
TransformShaperRgbOptNeon,
TransformMatrixShaperOptimized
);
#[allow(unused)]
impl<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
> TransformExecutor<T>
for TransformMatrixShaperScalar<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
use crate::mlaf::mlaf;
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let transform = self.profile.adaptation_matrix;
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << BIT_DEPTH) - 1).as_();
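        // Per pixel: linearize each channel through its LUT, apply the 3x3
        // adaptation matrix with fused multiply-adds, clamp to [0, 1], and
        // scale to a gamma-LUT index; the outer mlaf adds 0.5 so the later
        // `as u16` truncation rounds to the nearest index.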
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = self.profile.r_linear[src[src_cn.r_i()]._as_usize()];
let g = self.profile.g_linear[src[src_cn.g_i()]._as_usize()];
let b = self.profile.b_linear[src[src_cn.b_i()]._as_usize()];
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let new_r = mlaf(
0.5f32,
mlaf(
mlaf(r * transform.v[0][0], g, transform.v[0][1]),
b,
transform.v[0][2],
)
.max(0f32)
.min(1f32),
scale,
);
let new_g = mlaf(
0.5f32,
mlaf(
mlaf(r * transform.v[1][0], g, transform.v[1][1]),
b,
transform.v[1][2],
)
.max(0f32)
.min(1f32),
scale,
);
let new_b = mlaf(
0.5f32,
mlaf(
mlaf(r * transform.v[2][0], g, transform.v[2][1]),
b,
transform.v[2][2],
)
.max(0f32)
.min(1f32),
scale,
);
dst[dst_cn.r_i()] = self.profile.r_gamma[(new_r as u16) as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[(new_g as u16) as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[(new_b as u16) as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
Ok(())
}
}
#[allow(unused)]
impl<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
> TransformExecutor<T>
for TransformMatrixShaperOptScalar<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
use crate::mlaf::mlaf;
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let transform = self.profile.adaptation_matrix;
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << BIT_DEPTH) - 1).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = self.profile.linear[src[src_cn.r_i()]._as_usize()];
let g = self.profile.linear[src[src_cn.g_i()]._as_usize()];
let b = self.profile.linear[src[src_cn.b_i()]._as_usize()];
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let new_r = mlaf(
0.5f32,
mlaf(
mlaf(r * transform.v[0][0], g, transform.v[0][1]),
b,
transform.v[0][2],
)
.max(0f32)
.min(1f32),
scale,
);
let new_g = mlaf(
0.5f32,
mlaf(
mlaf(r * transform.v[1][0], g, transform.v[1][1]),
b,
transform.v[1][2],
)
.max(0f32)
.min(1f32),
scale,
);
let new_b = mlaf(
0.5f32,
mlaf(
mlaf(r * transform.v[2][0], g, transform.v[2][1]),
b,
transform.v[2][2],
)
.max(0f32)
.min(1f32),
scale,
);
dst[dst_cn.r_i()] = self.profile.gamma[(new_r as u16) as usize];
dst[dst_cn.g_i()] = self.profile.gamma[(new_g as u16) as usize];
dst[dst_cn.b_i()] = self.profile.gamma[(new_b as u16) as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
Ok(())
}
}


@@ -0,0 +1,487 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::Layout;
use crate::conversions::TransformMatrixShaper;
use crate::matrix::Matrix3;
use crate::{CmsError, TransformExecutor};
use num_traits::AsPrimitive;
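// A Q2.13 value packs a real x as round(x * 2^13) into an i16: sign bit, two
// integer bits and thirteen fractional bits, covering [-4.0, 4.0) in steps
// of 2^-13.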
/// Fixed point conversion Q2.13
pub(crate) struct TransformMatrixShaperFixedPoint<R, T, const LINEAR_CAP: usize> {
pub(crate) r_linear: Box<[R; LINEAR_CAP]>,
pub(crate) g_linear: Box<[R; LINEAR_CAP]>,
pub(crate) b_linear: Box<[R; LINEAR_CAP]>,
pub(crate) r_gamma: Box<[T; 65536]>,
pub(crate) g_gamma: Box<[T; 65536]>,
pub(crate) b_gamma: Box<[T; 65536]>,
pub(crate) adaptation_matrix: Matrix3<i16>,
}
/// Fixed point conversion Q2.13
///
/// Optimized routine for matrix shapers whose channels all share the same curves.
pub(crate) struct TransformMatrixShaperFixedPointOpt<R, W, T, const LINEAR_CAP: usize> {
pub(crate) linear: Box<[R; LINEAR_CAP]>,
pub(crate) gamma: Box<[T; 65536]>,
pub(crate) adaptation_matrix: Matrix3<W>,
}
#[allow(unused)]
struct TransformMatrixShaperQ2_13<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPoint<i16, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
#[allow(unused)]
struct TransformMatrixShaperQ2_13Optimized<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPointOpt<i16, i16, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
#[allow(unused)]
impl<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformMatrixShaperQ2_13<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let transform = self.profile.adaptation_matrix;
let max_colors: T = ((1 << self.bit_depth as u32) - 1u32).as_();
        let rnd: i32 = 1i32 << (PRECISION - 1);
let v_gamma_max = GAMMA_LUT as i32 - 1;
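        // `rnd` is half of one fixed-point step, so `(acc + rnd) >> PRECISION`
        // rounds to nearest rather than truncating; results are clamped into
        // the gamma LUT index range.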
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = self.profile.r_linear[src[src_cn.r_i()]._as_usize()];
let g = self.profile.g_linear[src[src_cn.g_i()]._as_usize()];
let b = self.profile.b_linear[src[src_cn.b_i()]._as_usize()];
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let new_r = r as i32 * transform.v[0][0] as i32
+ g as i32 * transform.v[0][1] as i32
+ b as i32 * transform.v[0][2] as i32
+ rnd;
let r_q2_13 = (new_r >> PRECISION).min(v_gamma_max).max(0) as u16;
let new_g = r as i32 * transform.v[1][0] as i32
+ g as i32 * transform.v[1][1] as i32
+ b as i32 * transform.v[1][2] as i32
+ rnd;
let g_q2_13 = (new_g >> PRECISION).min(v_gamma_max).max(0) as u16;
let new_b = r as i32 * transform.v[2][0] as i32
+ g as i32 * transform.v[2][1] as i32
+ b as i32 * transform.v[2][2] as i32
+ rnd;
let b_q2_13 = (new_b >> PRECISION).min(v_gamma_max).max(0) as u16;
dst[dst_cn.r_i()] = self.profile.r_gamma[r_q2_13 as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[g_q2_13 as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[b_q2_13 as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
Ok(())
}
}
#[allow(unused)]
impl<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformMatrixShaperQ2_13Optimized<
T,
SRC_LAYOUT,
DST_LAYOUT,
LINEAR_CAP,
GAMMA_LUT,
PRECISION,
>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let transform = self.profile.adaptation_matrix;
let max_colors: T = ((1 << self.bit_depth as u32) - 1u32).as_();
        let rnd: i32 = 1i32 << (PRECISION - 1);
let v_gamma_max = GAMMA_LUT as i32 - 1;
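        // Same rounding-and-clamping scheme as the per-channel executor above,
        // but with one shared linear LUT and one shared gamma LUT.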
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = self.profile.linear[src[src_cn.r_i()]._as_usize()];
let g = self.profile.linear[src[src_cn.g_i()]._as_usize()];
let b = self.profile.linear[src[src_cn.b_i()]._as_usize()];
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let new_r = r as i32 * transform.v[0][0] as i32
+ g as i32 * transform.v[0][1] as i32
+ b as i32 * transform.v[0][2] as i32
+ rnd;
let r_q2_13 = (new_r >> PRECISION).min(v_gamma_max).max(0) as u16;
let new_g = r as i32 * transform.v[1][0] as i32
+ g as i32 * transform.v[1][1] as i32
+ b as i32 * transform.v[1][2] as i32
+ rnd;
let g_q2_13 = (new_g >> PRECISION).min(v_gamma_max).max(0) as u16;
let new_b = r as i32 * transform.v[2][0] as i32
+ g as i32 * transform.v[2][1] as i32
+ b as i32 * transform.v[2][2] as i32
+ rnd;
let b_q2_13 = (new_b >> PRECISION).min(v_gamma_max).max(0) as u16;
dst[dst_cn.r_i()] = self.profile.gamma[r_q2_13 as usize];
dst[dst_cn.g_i()] = self.profile.gamma[g_q2_13 as usize];
dst[dst_cn.b_i()] = self.profile.gamma[b_q2_13 as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
Ok(())
}
}
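// Generates a constructor that converts the floating-point profile into its
// fixed-point form and monomorphizes the executor over the four supported
// RGB/RGBA layout pairs.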
macro_rules! create_rgb_xyz_dependant_q2_13_executor {
($dep_name: ident, $dependant: ident, $resolution: ident, $shaper: ident) => {
pub(crate) fn $dep_name<
T: Clone + Send + Sync + AsPrimitive<usize> + Default + PointeeSizeExpressible,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
const PRECISION: i32,
>(
src_layout: Layout,
dst_layout: Layout,
profile: $shaper<T, LINEAR_CAP>,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
u32: AsPrimitive<T>,
{
let q2_13_profile =
profile.to_q2_13_n::<$resolution, PRECISION, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>();
if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgba) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
PRECISION,
> {
profile: q2_13_profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgba) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
PRECISION,
> {
profile: q2_13_profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgb) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
PRECISION,
> {
profile: q2_13_profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgb) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
PRECISION,
> {
profile: q2_13_profile,
bit_depth: BIT_DEPTH,
}));
}
Err(CmsError::UnsupportedProfileConnection)
}
};
}
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
macro_rules! create_rgb_xyz_dependant_q1_30_executor {
($dep_name: ident, $dependant: ident, $resolution: ident, $shaper: ident) => {
pub(crate) fn $dep_name<
T: Clone + Send + Sync + AsPrimitive<usize> + Default + PointeeSizeExpressible,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const BIT_DEPTH: usize,
const PRECISION: i32,
>(
src_layout: Layout,
dst_layout: Layout,
profile: $shaper<T, LINEAR_CAP>,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
u32: AsPrimitive<T>,
{
let q1_30_profile =
profile.to_q1_30_n::<$resolution, PRECISION, LINEAR_CAP, GAMMA_LUT, BIT_DEPTH>();
if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgba) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
PRECISION,
> {
profile: q1_30_profile,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgba) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
PRECISION,
> {
profile: q1_30_profile,
}));
} else if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgb) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
PRECISION,
> {
profile: q1_30_profile,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgb) {
return Ok(Box::new($dependant::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
GAMMA_LUT,
BIT_DEPTH,
PRECISION,
> {
profile: q1_30_profile,
}));
}
Err(CmsError::UnsupportedProfileConnection)
}
};
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
use crate::conversions::neon::{
TransformShaperQ1_30NeonOpt, TransformShaperQ2_13Neon, TransformShaperQ2_13NeonOpt,
};
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13,
TransformShaperQ2_13Neon,
i16,
TransformMatrixShaper
);
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13_opt,
TransformShaperQ2_13NeonOpt,
i16,
TransformMatrixShaperOptimized
);
#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "neon"))]
create_rgb_xyz_dependant_q1_30_executor!(
make_rgb_xyz_q1_30_opt,
TransformShaperQ1_30NeonOpt,
i32,
TransformMatrixShaperOptimized
);
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13,
TransformMatrixShaperQ2_13,
i16,
TransformMatrixShaper
);
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon", feature = "neon")))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13_opt,
TransformMatrixShaperQ2_13Optimized,
i16,
TransformMatrixShaperOptimized
);
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
use crate::conversions::sse::{TransformShaperQ2_13OptSse, TransformShaperQ2_13Sse};
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13_transform_sse_41,
TransformShaperQ2_13Sse,
i32,
TransformMatrixShaper
);
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "sse"))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13_transform_sse_41_opt,
TransformShaperQ2_13OptSse,
i32,
TransformMatrixShaperOptimized
);
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
use crate::conversions::avx::{TransformShaperRgbQ2_13Avx, TransformShaperRgbQ2_13OptAvx};
use crate::conversions::rgbxyz::TransformMatrixShaperOptimized;
use crate::transform::PointeeSizeExpressible;
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13_transform_avx2,
TransformShaperRgbQ2_13Avx,
i32,
TransformMatrixShaper
);
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13_transform_avx2_opt,
TransformShaperRgbQ2_13OptAvx,
i32,
TransformMatrixShaperOptimized
);
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
use crate::conversions::avx512::TransformShaperRgbQ2_13OptAvx512;
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
create_rgb_xyz_dependant_q2_13_executor!(
make_rgb_xyz_q2_13_transform_avx512_opt,
TransformShaperRgbQ2_13OptAvx512,
i32,
TransformMatrixShaperOptimized
);


@@ -0,0 +1,332 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::trc::ToneCurveEvaluator;
use crate::{CmsError, Layout, Matrix3f, PointeeSizeExpressible, Rgb, TransformExecutor};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
pub(crate) struct TransformShaperRgbFloat<T: Clone, const BUCKET: usize> {
pub(crate) r_linear: Box<[f32; BUCKET]>,
pub(crate) g_linear: Box<[f32; BUCKET]>,
pub(crate) b_linear: Box<[f32; BUCKET]>,
pub(crate) gamma_evaluator: Box<dyn ToneCurveEvaluator + Send + Sync>,
pub(crate) adaptation_matrix: Matrix3f,
pub(crate) phantom_data: PhantomData<T>,
}
pub(crate) struct TransformShaperFloatInOut<T: Clone> {
pub(crate) linear_evaluator: Box<dyn ToneCurveEvaluator + Send + Sync>,
pub(crate) gamma_evaluator: Box<dyn ToneCurveEvaluator + Send + Sync>,
pub(crate) adaptation_matrix: Matrix3f,
pub(crate) phantom_data: PhantomData<T>,
}
struct TransformShaperFloatScalar<
T: Clone,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const BIT_DEPTH: usize,
> {
pub(crate) profile: TransformShaperRgbFloat<T, LINEAR_CAP>,
}
struct TransformShaperRgbFloatInOut<T: Clone, const SRC_LAYOUT: u8, const DST_LAYOUT: u8> {
pub(crate) profile: TransformShaperFloatInOut<T>,
pub(crate) bit_depth: usize,
}
pub(crate) fn make_rgb_xyz_rgb_transform_float<
T: Clone + Send + Sync + PointeeSizeExpressible + 'static + Copy + Default,
const LINEAR_CAP: usize,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformShaperRgbFloat<T, LINEAR_CAP>,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgba) {
return Ok(Box::new(TransformShaperFloatScalar::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgba) {
return Ok(Box::new(TransformShaperFloatScalar::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgba as u8 },
LINEAR_CAP,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgb) {
return Ok(Box::new(TransformShaperFloatScalar::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
BIT_DEPTH,
> {
profile,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgb) {
return Ok(Box::new(TransformShaperFloatScalar::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgb as u8 },
LINEAR_CAP,
BIT_DEPTH,
> {
profile,
}));
}
Err(CmsError::UnsupportedProfileConnection)
}
pub(crate) fn make_rgb_xyz_rgb_transform_float_in_out<
T: Clone + Send + Sync + PointeeSizeExpressible + 'static + Copy + Default + AsPrimitive<f32>,
const BIT_DEPTH: usize,
>(
src_layout: Layout,
dst_layout: Layout,
profile: TransformShaperFloatInOut<T>,
) -> Result<Box<dyn TransformExecutor<T> + Send + Sync>, CmsError>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgba) {
return Ok(Box::new(TransformShaperRgbFloatInOut::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgba as u8 },
> {
profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgba) {
return Ok(Box::new(TransformShaperRgbFloatInOut::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgba as u8 },
> {
profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgba) && (dst_layout == Layout::Rgb) {
return Ok(Box::new(TransformShaperRgbFloatInOut::<
T,
{ Layout::Rgba as u8 },
{ Layout::Rgb as u8 },
> {
profile,
bit_depth: BIT_DEPTH,
}));
} else if (src_layout == Layout::Rgb) && (dst_layout == Layout::Rgb) {
return Ok(Box::new(TransformShaperRgbFloatInOut::<
T,
{ Layout::Rgb as u8 },
{ Layout::Rgb as u8 },
> {
profile,
bit_depth: BIT_DEPTH,
}));
}
Err(CmsError::UnsupportedProfileConnection)
}
impl<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const BIT_DEPTH: usize,
> TransformExecutor<T>
for TransformShaperFloatScalar<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, BIT_DEPTH>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
use crate::mlaf::mlaf;
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let transform = self.profile.adaptation_matrix;
let max_colors: T = ((1 << BIT_DEPTH) - 1).as_();
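        // Like the LUT-based scalar path, but gamma is applied through a
        // boxed tone-curve evaluator and no clamping happens at this stage.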
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let r = self.profile.r_linear[src[src_cn.r_i()]._as_usize()];
let g = self.profile.g_linear[src[src_cn.g_i()]._as_usize()];
let b = self.profile.b_linear[src[src_cn.b_i()]._as_usize()];
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let new_r = mlaf(
mlaf(r * transform.v[0][0], g, transform.v[0][1]),
b,
transform.v[0][2],
);
let new_g = mlaf(
mlaf(r * transform.v[1][0], g, transform.v[1][1]),
b,
transform.v[1][2],
);
let new_b = mlaf(
mlaf(r * transform.v[2][0], g, transform.v[2][1]),
b,
transform.v[2][2],
);
let mut rgb = Rgb::new(new_r, new_g, new_b);
rgb = self.profile.gamma_evaluator.evaluate_tristimulus(rgb);
dst[dst_cn.r_i()] = rgb.r.as_();
dst[dst_cn.g_i()] = rgb.g.as_();
dst[dst_cn.b_i()] = rgb.b.as_();
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
Ok(())
}
}
impl<
T: Clone + PointeeSizeExpressible + Copy + Default + 'static + AsPrimitive<f32>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
> TransformExecutor<T> for TransformShaperRgbFloatInOut<T, SRC_LAYOUT, DST_LAYOUT>
where
u32: AsPrimitive<T>,
f32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
use crate::mlaf::mlaf;
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let transform = self.profile.adaptation_matrix;
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
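        // Here both linearization and gamma go through boxed tone-curve
        // evaluators instead of lookup tables.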
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let mut src_rgb = Rgb::new(
src[src_cn.r_i()].as_(),
src[src_cn.g_i()].as_(),
src[src_cn.b_i()].as_(),
);
src_rgb = self.profile.linear_evaluator.evaluate_tristimulus(src_rgb);
let r = src_rgb.r;
let g = src_rgb.g;
let b = src_rgb.b;
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
let new_r = mlaf(
mlaf(r * transform.v[0][0], g, transform.v[0][1]),
b,
transform.v[0][2],
);
let new_g = mlaf(
mlaf(r * transform.v[1][0], g, transform.v[1][1]),
b,
transform.v[1][2],
);
let new_b = mlaf(
mlaf(r * transform.v[2][0], g, transform.v[2][1]),
b,
transform.v[2][2],
);
let mut rgb = Rgb::new(new_r, new_g, new_b);
rgb = self.profile.gamma_evaluator.evaluate_tristimulus(rgb);
dst[dst_cn.r_i()] = rgb.r.as_();
dst[dst_cn.g_i()] = rgb.g.as_();
dst[dst_cn.b_i()] = rgb.b.as_();
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
Ok(())
}
}


@@ -0,0 +1,457 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::interpolator::BarycentricWeight;
use crate::math::FusedMultiplyAdd;
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};
#[repr(align(16), C)]
pub(crate) struct SseAlignedF32(pub(crate) [f32; 4]);
#[cfg(feature = "options")]
pub(crate) struct TetrahedralSse<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [SseAlignedF32],
}
#[cfg(feature = "options")]
pub(crate) struct PyramidalSse<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [SseAlignedF32],
}
#[cfg(feature = "options")]
pub(crate) struct PrismaticSse<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [SseAlignedF32],
}
pub(crate) struct TrilinearSse<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [SseAlignedF32],
}
trait Fetcher<T> {
fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
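// Thin newtype over `__m128` so the interpolators below can use ordinary
// `+`, `-`, `*` operators plus a fused multiply-add helper on SIMD lanes.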
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct SseVector {
pub(crate) v: __m128,
}
impl From<f32> for SseVector {
#[inline(always)]
fn from(v: f32) -> Self {
SseVector {
v: unsafe { _mm_set1_ps(v) },
}
}
}
impl Sub<SseVector> for SseVector {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: SseVector) -> Self::Output {
SseVector {
v: unsafe { _mm_sub_ps(self.v, rhs.v) },
}
}
}
impl Add<SseVector> for SseVector {
type Output = Self;
#[inline(always)]
fn add(self, rhs: SseVector) -> Self::Output {
SseVector {
v: unsafe { _mm_add_ps(self.v, rhs.v) },
}
}
}
impl Mul<SseVector> for SseVector {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: SseVector) -> Self::Output {
SseVector {
v: unsafe { _mm_mul_ps(self.v, rhs.v) },
}
}
}
impl FusedMultiplyAdd<SseVector> for SseVector {
#[inline(always)]
fn mla(&self, b: SseVector, c: SseVector) -> SseVector {
SseVector {
v: unsafe { _mm_add_ps(self.v, _mm_mul_ps(b.v, c.v)) },
}
}
}
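// The fetcher flattens (x, y, z) into a row-major index x*N^2 + y*N + z; the
// aligned `_mm_load_ps` is sound because `SseAlignedF32` is 16-byte aligned.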
struct TetrahedralSseFetchVector<'a, const GRID_SIZE: usize> {
cube: &'a [SseAlignedF32],
}
impl<const GRID_SIZE: usize> Fetcher<SseVector> for TetrahedralSseFetchVector<'_, GRID_SIZE> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32) -> SseVector {
let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
+ y as u32 * GRID_SIZE as u32
+ z as u32) as usize;
let jx = unsafe { self.cube.get_unchecked(offset..) };
SseVector {
v: unsafe { _mm_load_ps(jx.as_ptr() as *const _) },
}
}
}
pub(crate) trait SseMdInterpolation<'a, const GRID_SIZE: usize> {
fn new(table: &'a [SseAlignedF32]) -> Self;
fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> SseVector;
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralSse<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<SseVector>,
) -> SseVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
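        // The ordering of the fractional weights (rx, ry, rz) selects one of
        // the six tetrahedra that tile the grid cell; the result is c0 plus
        // three weighted edge differences.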
let c0 = r.fetch(x, y, z);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, SseVector::from(rx));
let s1 = s0.mla(c2, SseVector::from(ry));
s1.mla(c3, SseVector::from(rz))
}
}
macro_rules! define_inter_sse {
($interpolator: ident) => {
impl<'a, const GRID_SIZE: usize> SseMdInterpolation<'a, GRID_SIZE>
for $interpolator<'a, GRID_SIZE>
{
#[inline]
fn new(table: &'a [SseAlignedF32]) -> Self {
Self { cube: table }
}
#[inline(always)]
fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
) -> SseVector {
self.interpolate(
in_r,
in_g,
in_b,
lut,
TetrahedralSseFetchVector::<GRID_SIZE> { cube: self.cube },
)
}
}
};
}
#[cfg(feature = "options")]
define_inter_sse!(TetrahedralSse);
#[cfg(feature = "options")]
define_inter_sse!(PyramidalSse);
#[cfg(feature = "options")]
define_inter_sse!(PrismaticSse);
define_inter_sse!(TrilinearSse);
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalSse<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<SseVector>,
) -> SseVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n);
let x1 = r.fetch(x_n, y_n, z);
let x2 = r.fetch(x_n, y, z);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
s2.mla(c4, SseVector::from(dr * dg))
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y_n, z_n);
let x2 = r.fetch(x, y_n, z_n);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
s2.mla(c4, SseVector::from(dg * db))
} else {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z);
let x2 = r.fetch(x_n, y, z_n);
let x3 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
s2.mla(c4, SseVector::from(db * dr))
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticSse<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<SseVector>,
) -> SseVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
if db > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x, y_n, z_n);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
let s3 = s2.mla(c4, SseVector::from(dg * db));
s3.mla(c5, SseVector::from(dr * dg))
} else {
let x0 = r.fetch(x_n, y, z);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x_n, y_n, z);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
let s3 = s2.mla(c4, SseVector::from(dg * db));
s3.mla(c5, SseVector::from(dr * dg))
}
}
}
impl<const GRID_SIZE: usize> TrilinearSse<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<f32>; BINS],
r: impl Fetcher<SseVector>,
) -> SseVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
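        // Classic trilinear interpolation: lerp the eight cube corners along
        // x, then y, then z, using (1 - w) for the lower corner each time.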
let w0 = SseVector::from(dr);
let w1 = SseVector::from(dg);
let w2 = SseVector::from(db);
let c000 = r.fetch(x, y, z);
let c100 = r.fetch(x_n, y, z);
let c010 = r.fetch(x, y_n, z);
let c110 = r.fetch(x_n, y_n, z);
let c001 = r.fetch(x, y, z_n);
let c101 = r.fetch(x_n, y, z_n);
let c011 = r.fetch(x, y_n, z_n);
let c111 = r.fetch(x_n, y_n, z_n);
let dx = SseVector::from(1.0 - dr);
let c00 = (c000 * dx).mla(c100, w0);
let c10 = (c010 * dx).mla(c110, w0);
let c01 = (c001 * dx).mla(c101, w0);
let c11 = (c011 * dx).mla(c111, w0);
let dy = SseVector::from(1.0 - dg);
let c0 = (c00 * dy).mla(c10, w1);
let c1 = (c01 * dy).mla(c11, w1);
let dz = SseVector::from(1.0 - db);
(c0 * dz).mla(c1, w2)
}
}


@@ -0,0 +1,456 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::interpolator::BarycentricWeight;
use crate::math::FusedMultiplyAdd;
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};
#[repr(align(8), C)]
pub(crate) struct SseAlignedI16x4(pub(crate) [i16; 4]);
#[cfg(feature = "options")]
pub(crate) struct TetrahedralSseQ0_15<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [SseAlignedI16x4],
}
#[cfg(feature = "options")]
pub(crate) struct PyramidalSseQ0_15<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [SseAlignedI16x4],
}
#[cfg(feature = "options")]
pub(crate) struct PrismaticSseQ0_15<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [SseAlignedI16x4],
}
pub(crate) struct TrilinearSseQ0_15<'a, const GRID_SIZE: usize> {
pub(crate) cube: &'a [SseAlignedI16x4],
}
trait Fetcher<T> {
fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct SseVector {
pub(crate) v: __m128i,
}
impl From<i16> for SseVector {
#[inline(always)]
fn from(v: i16) -> Self {
SseVector {
v: unsafe { _mm_set1_epi16(v) },
}
}
}
impl Sub<SseVector> for SseVector {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: SseVector) -> Self::Output {
SseVector {
v: unsafe { _mm_sub_epi16(self.v, rhs.v) },
}
}
}
impl Add<SseVector> for SseVector {
type Output = Self;
#[inline(always)]
fn add(self, rhs: SseVector) -> Self::Output {
SseVector {
v: unsafe { _mm_add_epi16(self.v, rhs.v) },
}
}
}
impl Mul<SseVector> for SseVector {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: SseVector) -> Self::Output {
SseVector {
v: unsafe { _mm_mulhrs_epi16(self.v, rhs.v) },
}
}
}
impl FusedMultiplyAdd<SseVector> for SseVector {
#[inline(always)]
fn mla(&self, b: SseVector, c: SseVector) -> SseVector {
SseVector {
v: unsafe { _mm_add_epi16(self.v, _mm_mulhrs_epi16(b.v, c.v)) },
}
}
}
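// `Mul` lowers to `_mm_mulhrs_epi16`, i.e. (a * b + 2^14) >> 15 per lane,
// which is a rounded Q0.15 fixed-point multiply; `mla` fuses it with an add.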
struct TetrahedralSseQ0_15FetchVector<'a, const GRID_SIZE: usize> {
cube: &'a [SseAlignedI16x4],
}
impl<const GRID_SIZE: usize> Fetcher<SseVector> for TetrahedralSseQ0_15FetchVector<'_, GRID_SIZE> {
#[inline(always)]
fn fetch(&self, x: i32, y: i32, z: i32) -> SseVector {
let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
+ y as u32 * GRID_SIZE as u32
+ z as u32) as usize;
let jx = unsafe { self.cube.get_unchecked(offset..) };
SseVector {
v: unsafe { _mm_loadu_si64(jx.as_ptr() as *const _) },
}
}
}
pub(crate) trait SseMdInterpolationQ0_15<'a, const GRID_SIZE: usize> {
fn new(table: &'a [SseAlignedI16x4]) -> Self;
fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
) -> SseVector;
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralSseQ0_15<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<SseVector>,
) -> SseVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let rx = lut_r.w;
let ry = lut_g.w;
let rz = lut_b.w;
let c0 = r.fetch(x, y, z);
let c2;
let c1;
let c3;
if rx >= ry {
if ry >= rz {
//rx >= ry && ry >= rz
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if rx >= rz {
//rx >= rz && rz >= ry
c1 = r.fetch(x_n, y, z) - c0;
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
} else {
//rz > rx && rx >= ry
c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
} else if rx >= rz {
//ry > rx && rx >= rz
c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
} else if ry >= rz {
//ry >= rz && rz > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z) - c0;
c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
} else {
//rz > ry && ry > rx
c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
c3 = r.fetch(x, y, z_n) - c0;
}
let s0 = c0.mla(c1, SseVector::from(rx));
let s1 = s0.mla(c2, SseVector::from(ry));
s1.mla(c3, SseVector::from(rz))
}
}
macro_rules! define_inter_sse {
($interpolator: ident) => {
impl<'a, const GRID_SIZE: usize> SseMdInterpolationQ0_15<'a, GRID_SIZE>
for $interpolator<'a, GRID_SIZE>
{
#[inline]
fn new(table: &'a [SseAlignedI16x4]) -> Self {
Self { cube: table }
}
#[inline(always)]
fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
) -> SseVector {
self.interpolate(
in_r,
in_g,
in_b,
lut,
TetrahedralSseQ0_15FetchVector::<GRID_SIZE> { cube: self.cube },
)
}
}
};
}
#[cfg(feature = "options")]
define_inter_sse!(TetrahedralSseQ0_15);
#[cfg(feature = "options")]
define_inter_sse!(PyramidalSseQ0_15);
#[cfg(feature = "options")]
define_inter_sse!(PrismaticSseQ0_15);
define_inter_sse!(TrilinearSseQ0_15);
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalSseQ0_15<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<SseVector>,
) -> SseVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
if dr > db && dg > db {
let x0 = r.fetch(x_n, y_n, z_n);
let x1 = r.fetch(x_n, y_n, z);
let x2 = r.fetch(x_n, y, z);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - x1;
let c2 = x2 - c0;
let c3 = x3 - c0;
let c4 = c0 - x3 - x2 + x1;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
s2.mla(c4, SseVector::from(dr) * SseVector::from(dg))
} else if db > dr && dg > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y_n, z_n);
let x2 = r.fetch(x, y_n, z_n);
let x3 = r.fetch(x, y_n, z);
let c1 = x0 - c0;
let c2 = x1 - x2;
let c3 = x3 - c0;
let c4 = c0 - x3 - x0 + x2;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
s2.mla(c4, SseVector::from(dg) * SseVector::from(db))
} else {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z);
let x2 = r.fetch(x_n, y, z_n);
let x3 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - c0;
let c3 = x3 - x2;
let c4 = c0 - x1 - x0 + x2;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
s2.mla(c4, SseVector::from(db) * SseVector::from(dr))
}
}
}
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticSseQ0_15<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<SseVector>,
) -> SseVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
let c0 = r.fetch(x, y, z);
if db > dr {
let x0 = r.fetch(x, y, z_n);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x, y_n, z_n);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x0 - c0;
let c2 = x1 - x0;
let c3 = x2 - c0;
let c4 = c0 - x2 - x0 + x3;
let c5 = x0 - x3 - x1 + x4;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
let s3 = s2.mla(c4, SseVector::from(dg) * SseVector::from(db));
s3.mla(c5, SseVector::from(dr) * SseVector::from(dg))
} else {
let x0 = r.fetch(x_n, y, z);
let x1 = r.fetch(x_n, y, z_n);
let x2 = r.fetch(x, y_n, z);
let x3 = r.fetch(x_n, y_n, z);
let x4 = r.fetch(x_n, y_n, z_n);
let c1 = x1 - x0;
let c2 = x0 - c0;
let c3 = x2 - c0;
let c4 = x0 - x3 - x1 + x4;
let c5 = c0 - x2 - x0 + x3;
let s0 = c0.mla(c1, SseVector::from(db));
let s1 = s0.mla(c2, SseVector::from(dr));
let s2 = s1.mla(c3, SseVector::from(dg));
let s3 = s2.mla(c4, SseVector::from(dg) * SseVector::from(db));
s3.mla(c5, SseVector::from(dr) * SseVector::from(dg))
}
}
}
impl<const GRID_SIZE: usize> TrilinearSseQ0_15<'_, GRID_SIZE> {
#[inline(always)]
fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
&self,
in_r: U,
in_g: U,
in_b: U,
lut: &[BarycentricWeight<i16>; BINS],
r: impl Fetcher<SseVector>,
) -> SseVector {
let lut_r = lut[in_r.as_()];
let lut_g = lut[in_g.as_()];
let lut_b = lut[in_b.as_()];
let x: i32 = lut_r.x;
let y: i32 = lut_g.x;
let z: i32 = lut_b.x;
let x_n: i32 = lut_r.x_n;
let y_n: i32 = lut_g.x_n;
let z_n: i32 = lut_b.x_n;
let dr = lut_r.w;
let dg = lut_g.w;
let db = lut_b.w;
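        // Weights are Q0.15, so the complementary weight (1 - w) becomes
        // Q_MAX - w in fixed point.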
const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;
let q_max = SseVector::from(Q_MAX);
let w0 = SseVector::from(dr);
let w1 = SseVector::from(dg);
let w2 = SseVector::from(db);
let dx = q_max - SseVector::from(dr);
let dy = q_max - SseVector::from(dg);
let dz = q_max - SseVector::from(db);
let c000 = r.fetch(x, y, z);
let c100 = r.fetch(x_n, y, z);
let c010 = r.fetch(x, y_n, z);
let c110 = r.fetch(x_n, y_n, z);
let c001 = r.fetch(x, y, z_n);
let c101 = r.fetch(x_n, y, z_n);
let c011 = r.fetch(x, y_n, z_n);
let c111 = r.fetch(x_n, y_n, z_n);
let c00 = (c000 * dx).mla(c100, w0);
let c10 = (c010 * dx).mla(c110, w0);
let c01 = (c001 * dx).mla(c101, w0);
let c11 = (c011 * dx).mla(c111, w0);
let c0 = (c00 * dy).mla(c10, w1);
let c1 = (c01 * dy).mla(c11, w1);
(c0 * dz).mla(c1, w2)
}
}


@@ -0,0 +1,330 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::lut_transforms::Lut4x3Factory;
use crate::conversions::sse::interpolator::*;
use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
use crate::conversions::sse::lut4_to_3_q0_15::TransformLut4To3SseQ0_15;
use crate::transform::PointeeSizeExpressible;
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::marker::PhantomData;
struct TransformLut4To3Sse<
T,
U,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
lut: Vec<SseAlignedF32>,
_phantom: PhantomData<T>,
_phantom1: PhantomData<U>,
interpolation_method: InterpolationMethod,
weights: Box<[BarycentricWeight<f32>; BINS]>,
color_space: DataColorSpace,
is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[allow(unused_unsafe)]
#[target_feature(enable = "sse4.1")]
unsafe fn transform_chunk<'b, Interpolator: SseMdInterpolation<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
let grid_size = GRID_SIZE as i32;
let grid_size3 = grid_size * grid_size * grid_size;
let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[0],
);
let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[1],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[2],
);
let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[3],
);
let k_weights = self.weights[k.as_()];
let w: i32 = k_weights.x;
let w_n: i32 = k_weights.x_n;
let t: f32 = k_weights.w;
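            // The 4D (CMYK) LUT is sampled as two 3D slices along the K axis:
            // each slice is interpolated on its own, then the two results are
            // blended linearly by the K fraction `t`.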
let table1 = &self.lut[(w * grid_size3) as usize..];
let table2 = &self.lut[(w_n * grid_size3) as usize..];
let tetrahedral1 = Interpolator::new(table1);
let tetrahedral2 = Interpolator::new(table2);
let a0 = tetrahedral1.inter3_sse(c, m, y, &self.weights).v;
let b0 = tetrahedral2.inter3_sse(c, m, y, &self.weights).v;
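            // Finite (integer) outputs are clamped and scaled to
            // [0, 2^BIT_DEPTH - 1]; float outputs keep the raw lane values.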
if T::FINITE {
unsafe {
let t0 = _mm_set1_ps(t);
let ones = _mm_set1_ps(1f32);
let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
let mut v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
v = _mm_max_ps(v, _mm_setzero_ps());
v = _mm_mul_ps(v, value_scale);
v = _mm_min_ps(v, value_scale);
let jvz = _mm_cvtps_epi32(v);
let x = _mm_extract_epi32::<0>(jvz);
let y = _mm_extract_epi32::<1>(jvz);
let z = _mm_extract_epi32::<2>(jvz);
dst[cn.r_i()] = (x as u32).as_();
dst[cn.g_i()] = (y as u32).as_();
dst[cn.b_i()] = (z as u32).as_();
}
} else {
unsafe {
let t0 = _mm_set1_ps(t);
let ones = _mm_set1_ps(1f32);
let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
let v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
}
}
if channels == 4 {
dst[cn.a_i()] = max_value;
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
if src.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / 4;
let dst_chunks = dst.len() / channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralSse<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalSse<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticSse<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}
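/// Factory producing the SSE4.1 CMYK -> RGB (4x3) LUT executors, choosing a
/// fixed-point or f32 path from `TransformOptions`.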
pub(crate) struct SseLut4x3Factory {}
impl Lut4x3Factory for SseLut4x3Factory {
fn make_transform_4x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Sync + Send>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
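        // Fixed-point path: quantize the f32 LUT to i16 entries, scaled by
        // 2^BIT_DEPTH - 1 for integer samples or 2^14 - 1 for float samples,
        // and hand off to the Q0.15 transform.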
if options.prefer_fixed_point && BIT_DEPTH < 16 {
let q: f32 = if T::FINITE {
((1i32 << BIT_DEPTH as i32) - 1) as f32
} else {
((1i32 << 14i32) - 1) as f32
};
let lut = lut
.chunks_exact(3)
.map(|x| {
SseAlignedI16x4([
(x[0] * q).round() as i16,
(x[1] * q).round() as i16,
(x[2] * q).round() as i16,
0,
])
})
.collect::<Vec<_>>();
return match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut4To3SseQ0_15::<
T,
u8,
LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
_phantom: PhantomData,
_phantom1: PhantomData,
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut4To3SseQ0_15::<
T,
u16,
LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
_phantom: PhantomData,
_phantom1: PhantomData,
color_space,
is_linear,
}),
};
}
let lut = lut
.chunks_exact(3)
.map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
.collect::<Vec<_>>();
match options.barycentric_weight_scale {
BarycentricWeightScale::Low => {
Box::new(
TransformLut4To3Sse::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
},
)
}
#[cfg(feature = "options")]
BarycentricWeightScale::High => {
Box::new(
TransformLut4To3Sse::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
},
)
}
}
}
}

@@ -0,0 +1,212 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::sse::interpolator_q0_15::*;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::marker::PhantomData;
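/// CMYK -> RGB 4D LUT transform executing in Q0.15 fixed point with SSE4.1.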
pub(crate) struct TransformLut4To3SseQ0_15<
T,
U,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
pub(crate) lut: Vec<SseAlignedI16x4>,
pub(crate) _phantom: PhantomData<T>,
pub(crate) _phantom1: PhantomData<U>,
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
pub(crate) color_space: DataColorSpace,
pub(crate) is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut4To3SseQ0_15<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
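    /// Fixed-point variant of the 4D LUT walk: slice blending and clamping
    /// happen on i16 lanes, keeping intermediates in Q15.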
#[allow(unused_unsafe)]
#[target_feature(enable = "sse4.1")]
unsafe fn transform_chunk<'b, Interpolator: SseMdInterpolationQ0_15<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
unsafe {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
let grid_size = GRID_SIZE as i32;
let grid_size3 = grid_size * grid_size * grid_size;
let f_value_scale = _mm_set1_ps(1. / ((1 << 14i32) - 1) as f32);
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
let v_max_scale = if T::FINITE {
_mm_set1_epi16(((1i32 << BIT_DEPTH) - 1) as i16)
} else {
_mm_set1_epi16(((1i32 << 14i32) - 1) as i16)
};
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[0],
);
let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[1],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[2],
);
let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[3],
);
let k_weights = self.weights[k.as_()];
let w: i32 = k_weights.x;
let w_n: i32 = k_weights.x_n;
const Q: i16 = ((1i32 << 15) - 1) as i16;
let t: i16 = k_weights.w;
let t_n: i16 = Q - t;
let table1 = &self.lut[(w * grid_size3) as usize..];
let table2 = &self.lut[(w_n * grid_size3) as usize..];
let tetrahedral1 = Interpolator::new(table1);
let tetrahedral2 = Interpolator::new(table2);
let a0 = tetrahedral1.inter3_sse(c, m, y, &self.weights).v;
let b0 = tetrahedral2.inter3_sse(c, m, y, &self.weights).v;
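                // Q15 blend of the two K slices: `_mm_mulhrs_epi16` is a
                // rounded (a * b) >> 15, so v ~= (a0 * t_n + b0 * t) / Q.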
let hp = _mm_mulhrs_epi16(_mm_set1_epi16(t_n), a0);
let v = _mm_add_epi16(hp, _mm_mulhrs_epi16(b0, _mm_set1_epi16(t)));
if T::FINITE {
let mut o = _mm_max_epi16(v, _mm_setzero_si128());
o = _mm_min_epi16(o, v_max_scale);
let x = _mm_extract_epi16::<0>(o);
let y = _mm_extract_epi16::<1>(o);
let z = _mm_extract_epi16::<2>(o);
dst[cn.r_i()] = (x as u32).as_();
dst[cn.g_i()] = (y as u32).as_();
dst[cn.b_i()] = (z as u32).as_();
} else {
let mut r = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(v));
r = _mm_mul_ps(r, f_value_scale);
dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(r) as u32).as_();
dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(r) as u32).as_();
dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(r) as u32).as_();
}
if channels == 4 {
dst[cn.a_i()] = max_value;
}
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut4To3SseQ0_15<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
if src.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / 4;
let dst_chunks = dst.len() / channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearSseQ0_15<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralSseQ0_15<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalSseQ0_15<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticSseQ0_15<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearSseQ0_15<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}

@@ -0,0 +1,45 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
mod interpolator;
mod interpolator_q0_15;
mod lut4_to_3;
mod lut4_to_3_q0_15;
mod rgb_xyz;
mod rgb_xyz_opt;
mod rgb_xyz_q2_13;
mod rgb_xyz_q2_13_opt;
mod t_lut3_to_3;
mod t_lut3_to_3_q0_15;
pub(crate) use lut4_to_3::SseLut4x3Factory;
pub(crate) use rgb_xyz::TransformShaperRgbSse;
pub(crate) use rgb_xyz_opt::TransformShaperRgbOptSse;
pub(crate) use rgb_xyz_q2_13::TransformShaperQ2_13Sse;
pub(crate) use rgb_xyz_q2_13_opt::TransformShaperQ2_13OptSse;
pub(crate) use t_lut3_to_3::SseLut3x3Factory;

@@ -0,0 +1,154 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::TransformMatrixShaper;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[repr(align(16), C)]
pub(crate) struct SseAlignedU16(pub(crate) [u16; 8]);
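/// Matrix-shaper transform: linearizes each channel through its own LUT,
/// applies a 3x3 adaptation matrix in f32 SSE lanes, then encodes through
/// per-channel gamma LUTs.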
pub(crate) struct TransformShaperRgbSse<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> {
pub(crate) profile: TransformMatrixShaper<T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformShaperRgbSse<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "sse4.1")]
unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary = SseAlignedU16([0; 8]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
unsafe {
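            // Each linearized channel is broadcast and multiplied with one
            // column of the (transposed) matrix; the lane-wise sum is the
            // matrix product.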
let m0 = _mm_setr_ps(t.v[0][0], t.v[0][1], t.v[0][2], 0f32);
let m1 = _mm_setr_ps(t.v[1][0], t.v[1][1], t.v[1][2], 0f32);
let m2 = _mm_setr_ps(t.v[2][0], t.v[2][1], t.v[2][2], 0f32);
let zeros = _mm_setzero_ps();
let v_scale = _mm_set1_ps(scale);
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.r_linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.g_linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.b_linear[src[src_cn.b_i()]._as_usize()];
let mut r = _mm_load_ss(rp);
let mut g = _mm_load_ss(gp);
let mut b = _mm_load_ss(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
r = _mm_shuffle_ps::<0>(r, r);
g = _mm_shuffle_ps::<0>(g, g);
b = _mm_shuffle_ps::<0>(b, b);
let v0 = _mm_mul_ps(r, m0);
let v1 = _mm_mul_ps(g, m1);
let v2 = _mm_mul_ps(b, m2);
let mut v = _mm_add_ps(_mm_add_ps(v0, v1), v2);
v = _mm_max_ps(v, zeros);
v = _mm_mul_ps(v, v_scale);
v = _mm_min_ps(v, v_scale);
let zx = _mm_cvtps_epi32(v);
_mm_store_si128(temporary.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformExecutor<T> for TransformShaperRgbSse<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl(src, dst) }
}
}

@@ -0,0 +1,153 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::TransformMatrixShaperOptimized;
use crate::conversions::sse::rgb_xyz::SseAlignedU16;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
pub(crate) struct TransformShaperRgbOptSse<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> {
pub(crate) profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformShaperRgbOptSse<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "sse4.1")]
unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary = SseAlignedU16([0; 8]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let scale = (GAMMA_LUT - 1) as f32;
let max_colors: T = ((1 << self.bit_depth) - 1).as_();
unsafe {
let m0 = _mm_setr_ps(t.v[0][0], t.v[0][1], t.v[0][2], 0f32);
let m1 = _mm_setr_ps(t.v[1][0], t.v[1][1], t.v[1][2], 0f32);
let m2 = _mm_setr_ps(t.v[2][0], t.v[2][1], t.v[2][2], 0f32);
let zeros = _mm_setzero_ps();
let v_scale = _mm_set1_ps(scale);
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.linear[src[src_cn.b_i()]._as_usize()];
let mut r = _mm_load_ss(rp);
let mut g = _mm_load_ss(gp);
let mut b = _mm_load_ss(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
r = _mm_shuffle_ps::<0>(r, r);
g = _mm_shuffle_ps::<0>(g, g);
b = _mm_shuffle_ps::<0>(b, b);
let v0 = _mm_mul_ps(r, m0);
let v1 = _mm_mul_ps(g, m1);
let v2 = _mm_mul_ps(b, m2);
let mut v = _mm_add_ps(_mm_add_ps(v0, v1), v2);
v = _mm_max_ps(v, zeros);
v = _mm_mul_ps(v, v_scale);
v = _mm_min_ps(v, v_scale);
let zx = _mm_cvtps_epi32(v);
_mm_store_si128(temporary.0.as_mut_ptr() as *mut _, zx);
dst[dst_cn.r_i()] = self.profile.gamma[temporary.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
> TransformExecutor<T>
for TransformShaperRgbOptSse<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl(src, dst) }
}
}

@@ -0,0 +1,167 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPoint;
use crate::conversions::sse::rgb_xyz::SseAlignedU16;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
pub(crate) struct TransformShaperQ2_13Sse<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPoint<i32, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
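/// Loads a single `i32` into the low lane of an `__m128i` (upper lanes are
/// zeroed) by reusing the scalar `_mm_load_ss` float load.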
#[inline(always)]
pub(crate) unsafe fn _xmm_load_epi32(f: &i32) -> __m128i {
let float_ref: &f32 = unsafe { &*(f as *const i32 as *const f32) };
unsafe { _mm_castps_si128(_mm_load_ss(float_ref)) }
}
impl<
T: Copy + PointeeSizeExpressible + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformShaperQ2_13Sse<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "sse4.1")]
unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary = SseAlignedU16([0; 8]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let max_colors = ((1 << self.bit_depth) - 1).as_();
unsafe {
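            // Pack (r, g) and (b, rounding bias) into adjacent 16-bit lanes
            // so each `_mm_madd_epi16` yields r*m + g*m' (and b*m'' + rnd*1)
            // per 32-bit lane; the arithmetic shift below removes the Q2.13
            // scale.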
let m0 = _mm_setr_epi16(
t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
);
let m2 = _mm_setr_epi16(t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0);
let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
let rnd = _mm_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
let v_max_value = _mm_set1_epi32(GAMMA_LUT as i32 - 1);
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.r_linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.g_linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.b_linear[src[src_cn.b_i()]._as_usize()];
let mut r = _xmm_load_epi32(rp);
let mut g = _xmm_load_epi32(gp);
let mut b = _xmm_load_epi32(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
r = _mm_shuffle_epi32::<0>(r);
g = _mm_shuffle_epi32::<0>(g);
b = _mm_shuffle_epi32::<0>(b);
g = _mm_slli_epi32::<16>(g);
let zrg0 = _mm_or_si128(r, g);
let zbz0 = _mm_or_si128(b, rnd);
let v0 = _mm_madd_epi16(zrg0, m0);
let v1 = _mm_madd_epi16(zbz0, m2);
let mut v = _mm_add_epi32(v0, v1);
v = _mm_srai_epi32::<PRECISION>(v);
v = _mm_max_epi32(v, _mm_setzero_si128());
v = _mm_min_epi32(v, v_max_value);
_mm_store_si128(temporary.0.as_mut_ptr() as *mut _, v);
dst[dst_cn.r_i()] = self.profile.r_gamma[temporary.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.g_gamma[temporary.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.b_gamma[temporary.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformShaperQ2_13Sse<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl(src, dst) }
}
}

@@ -0,0 +1,162 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPointOpt;
use crate::conversions::sse::rgb_xyz::SseAlignedU16;
use crate::conversions::sse::rgb_xyz_q2_13::_xmm_load_epi32;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, Layout, TransformExecutor};
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
pub(crate) struct TransformShaperQ2_13OptSse<
T: Copy,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> {
pub(crate) profile: TransformMatrixShaperFixedPointOpt<i32, i16, T, LINEAR_CAP>,
pub(crate) bit_depth: usize,
}
impl<
T: Copy + PointeeSizeExpressible + 'static,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformShaperQ2_13OptSse<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
#[target_feature(enable = "sse4.1")]
unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let dst_cn = Layout::from(DST_LAYOUT);
let src_channels = src_cn.channels();
let dst_channels = dst_cn.channels();
let mut temporary = SseAlignedU16([0; 8]);
if src.len() / src_channels != dst.len() / dst_channels {
return Err(CmsError::LaneSizeMismatch);
}
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let t = self.profile.adaptation_matrix.transpose();
let max_colors = ((1 << self.bit_depth) - 1).as_();
unsafe {
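            // Same packed Q2.13 madd scheme as `TransformShaperQ2_13Sse`, but
            // with a single linear table and gamma table shared by all three
            // channels.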
let m0 = _mm_setr_epi16(
t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
);
let m2 = _mm_setr_epi16(t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0);
let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
let rnd = _mm_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
let v_max_value = _mm_set1_epi32(GAMMA_LUT as i32 - 1);
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let rp = &self.profile.linear[src[src_cn.r_i()]._as_usize()];
let gp = &self.profile.linear[src[src_cn.g_i()]._as_usize()];
let bp = &self.profile.linear[src[src_cn.b_i()]._as_usize()];
let mut r = _xmm_load_epi32(rp);
let mut g = _xmm_load_epi32(gp);
let mut b = _xmm_load_epi32(bp);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_colors
};
r = _mm_shuffle_epi32::<0>(r);
g = _mm_shuffle_epi32::<0>(g);
b = _mm_shuffle_epi32::<0>(b);
g = _mm_slli_epi32::<16>(g);
let zrg0 = _mm_or_si128(r, g);
let zbz0 = _mm_or_si128(b, rnd);
let v0 = _mm_madd_epi16(zrg0, m0);
let v1 = _mm_madd_epi16(zbz0, m2);
let mut v = _mm_add_epi32(v0, v1);
v = _mm_srai_epi32::<PRECISION>(v);
v = _mm_max_epi32(v, _mm_setzero_si128());
v = _mm_min_epi32(v, v_max_value);
_mm_store_si128(temporary.0.as_mut_ptr() as *mut _, v);
dst[dst_cn.r_i()] = self.profile.gamma[temporary.0[0] as usize];
dst[dst_cn.g_i()] = self.profile.gamma[temporary.0[2] as usize];
dst[dst_cn.b_i()] = self.profile.gamma[temporary.0[4] as usize];
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
Ok(())
}
}
impl<
T: Copy + PointeeSizeExpressible + 'static + Default,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const LINEAR_CAP: usize,
const GAMMA_LUT: usize,
const PRECISION: i32,
> TransformExecutor<T>
for TransformShaperQ2_13OptSse<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
where
u32: AsPrimitive<T>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
unsafe { self.transform_impl(src, dst) }
}
}

@@ -0,0 +1,343 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::lut_transforms::Lut3x3Factory;
use crate::conversions::sse::interpolator::*;
use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
use crate::conversions::sse::t_lut3_to_3_q0_15::TransformLut3x3SseQ0_15;
use crate::transform::PointeeSizeExpressible;
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::marker::PhantomData;
struct TransformLut3x3Sse<
T,
U,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
lut: Vec<SseAlignedF32>,
_phantom: PhantomData<T>,
_phantom2: PhantomData<U>,
interpolation_method: InterpolationMethod,
weights: Box<[BarycentricWeight<f32>; BINS]>,
color_space: DataColorSpace,
is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut3x3Sse<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
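    /// Transforms one RGB(A) pixel per iteration through the 3D LUT; alpha is
    /// forwarded from the source or set to the bit-depth maximum when absent.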
#[allow(unused_unsafe)]
#[target_feature(enable = "sse4.1")]
unsafe fn transform_chunk<'b, Interpolator: SseMdInterpolation<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.r_i()],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.g_i()],
);
let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.b_i()],
);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
let tetrahedral = Interpolator::new(&self.lut);
let v = tetrahedral.inter3_sse(x, y, z, &self.weights);
if T::FINITE {
unsafe {
let mut r = _mm_mul_ps(v.v, value_scale);
r = _mm_max_ps(r, _mm_setzero_ps());
r = _mm_min_ps(r, value_scale);
let jvz = _mm_cvtps_epi32(r);
let x = _mm_extract_epi32::<0>(jvz);
let y = _mm_extract_epi32::<1>(jvz);
let z = _mm_extract_epi32::<2>(jvz);
dst[dst_cn.r_i()] = (x as u32).as_();
dst[dst_cn.g_i()] = (y as u32).as_();
dst[dst_cn.b_i()] = (z as u32).as_();
}
} else {
unsafe {
dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
}
}
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut3x3Sse<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / src_channels;
let dst_chunks = dst.len() / dst_channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralSse<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalSse<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticSse<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}
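/// Factory producing the SSE4.1 3x3 LUT executors, choosing a fixed-point or
/// f32 path from `TransformOptions`.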
pub(crate) struct SseLut3x3Factory {}
impl Lut3x3Factory for SseLut3x3Factory {
fn make_transform_3x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Sync + Send>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
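        // Fixed-point path: same i16 quantization as in `SseLut4x3Factory`.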
if options.prefer_fixed_point && BIT_DEPTH < 16 {
let q: f32 = if T::FINITE {
((1i32 << BIT_DEPTH as i32) - 1) as f32
} else {
((1i32 << 14i32) - 1) as f32
};
let lut = lut
.chunks_exact(3)
.map(|x| {
SseAlignedI16x4([
(x[0] * q).round() as i16,
(x[1] * q).round() as i16,
(x[2] * q).round() as i16,
0,
])
})
.collect::<Vec<_>>();
return match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x3SseQ0_15::<
T,
u8,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom2: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x3SseQ0_15::<
T,
u16,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom2: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
};
}
let lut = lut
.chunks_exact(3)
.map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
.collect::<Vec<_>>();
match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x3Sse::<
T,
u8,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom2: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x3Sse::<
T,
u16,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom2: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
}
}
}

@@ -0,0 +1,225 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::BarycentricWeight;
use crate::conversions::sse::interpolator_q0_15::*;
use crate::transform::PointeeSizeExpressible;
use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
use num_traits::AsPrimitive;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::marker::PhantomData;
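/// RGB(A) -> RGB(A) 3D LUT transform executing in Q0.15 fixed point with
/// SSE4.1.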
pub(crate) struct TransformLut3x3SseQ0_15<
T,
U,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
pub(crate) lut: Vec<SseAlignedI16x4>,
pub(crate) _phantom: PhantomData<T>,
pub(crate) _phantom2: PhantomData<U>,
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
pub(crate) color_space: DataColorSpace,
pub(crate) is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
>
TransformLut3x3SseQ0_15<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
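    /// Fixed-point variant of the 3D LUT walk: integer outputs are clamped on
    /// i16 lanes, float outputs are rescaled from Q14 by 1 / (2^14 - 1).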
#[allow(unused_unsafe)]
#[target_feature(enable = "sse4.1")]
unsafe fn transform_chunk<'b, Interpolator: SseMdInterpolationQ0_15<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
unsafe {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
let f_value_scale = _mm_set1_ps(1. / ((1 << 14i32) - 1) as f32);
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
let v_max_scale = if T::FINITE {
_mm_set1_epi16(((1i32 << BIT_DEPTH) - 1) as i16)
} else {
_mm_set1_epi16(((1i32 << 14i32) - 1) as i16)
};
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.r_i()],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.g_i()],
);
let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.b_i()],
);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
let tetrahedral = Interpolator::new(&self.lut);
let v = tetrahedral.inter3_sse(x, y, z, &self.weights);
if T::FINITE {
let mut o = _mm_max_epi16(v.v, _mm_setzero_si128());
o = _mm_min_epi16(o, v_max_scale);
let x = _mm_extract_epi16::<0>(o);
let y = _mm_extract_epi16::<1>(o);
let z = _mm_extract_epi16::<2>(o);
dst[dst_cn.r_i()] = (x as u32).as_();
dst[dst_cn.g_i()] = (y as u32).as_();
dst[dst_cn.b_i()] = (z as u32).as_();
} else {
let mut r = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(v.v));
r = _mm_mul_ps(r, f_value_scale);
dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(r) as u32).as_();
dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(r) as u32).as_();
dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(r) as u32).as_();
}
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut3x3SseQ0_15<
T,
U,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
BINS,
BARYCENTRIC_BINS,
>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / src_channels;
let dst_chunks = dst.len() / dst_channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
unsafe {
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
self.transform_chunk::<TrilinearSseQ0_15<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
self.transform_chunk::<TetrahedralSseQ0_15<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
self.transform_chunk::<PyramidalSseQ0_15<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
self.transform_chunk::<PrismaticSseQ0_15<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
self.transform_chunk::<TrilinearSseQ0_15<GRID_SIZE>>(src, dst);
}
}
}
}
Ok(())
}
}

@@ -0,0 +1,261 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#![allow(dead_code)]
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::{BarycentricWeight, MultidimensionalInterpolation};
use crate::conversions::lut_transforms::Lut3x3Factory;
use crate::transform::PointeeSizeExpressible;
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
pub(crate) struct TransformLut3x3<
T,
U,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
pub(crate) lut: Vec<f32>,
pub(crate) _phantom: PhantomData<T>,
pub(crate) _phantom1: PhantomData<U>,
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) weights: Box<[BarycentricWeight<f32>; BINS]>,
pub(crate) color_space: DataColorSpace,
pub(crate) is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut3x3<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
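    /// Portable scalar 3D LUT walk backing `DefaultLut3x3Factory`.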
#[inline(always)]
fn transform_chunk<'b, Tetrahedral: MultidimensionalInterpolation<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
let value_scale = ((1 << BIT_DEPTH) - 1) as f32;
let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
for (src, dst) in src
.chunks_exact(src_channels)
.zip(dst.chunks_exact_mut(dst_channels))
{
let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.r_i()],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.g_i()],
);
let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[src_cn.b_i()],
);
let a = if src_channels == 4 {
src[src_cn.a_i()]
} else {
max_value
};
let tetrahedral = Tetrahedral::new(&self.lut);
let v = tetrahedral.inter3(x, y, z, &self.weights);
if T::FINITE {
let r = v * value_scale + 0.5;
dst[dst_cn.r_i()] = r.v[0].min(value_scale).max(0.).as_();
dst[dst_cn.g_i()] = r.v[1].min(value_scale).max(0.).as_();
dst[dst_cn.b_i()] = r.v[2].min(value_scale).max(0.).as_();
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
} else {
dst[dst_cn.r_i()] = v.v[0].as_();
dst[dst_cn.g_i()] = v.v[1].as_();
dst[dst_cn.b_i()] = v.v[2].as_();
if dst_channels == 4 {
dst[dst_cn.a_i()] = a;
}
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut3x3<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let src_cn = Layout::from(SRC_LAYOUT);
let src_channels = src_cn.channels();
let dst_cn = Layout::from(DST_LAYOUT);
let dst_channels = dst_cn.channels();
if src.len() % src_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % dst_channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / src_channels;
let dst_chunks = dst.len() / dst_channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
use crate::conversions::interpolator::Trilinear;
self.transform_chunk::<Trilinear<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
use crate::conversions::interpolator::Tetrahedral;
self.transform_chunk::<Tetrahedral<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
use crate::conversions::interpolator::Pyramidal;
self.transform_chunk::<Pyramidal<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
use crate::conversions::interpolator::Prismatic;
self.transform_chunk::<Prismatic<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
use crate::conversions::interpolator::Trilinear;
self.transform_chunk::<Trilinear<GRID_SIZE>>(src, dst);
}
}
}
Ok(())
}
}
pub(crate) struct DefaultLut3x3Factory {}
impl Lut3x3Factory for DefaultLut3x3Factory {
fn make_transform_3x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const SRC_LAYOUT: u8,
const DST_LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Send + Sync>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
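        // The portable path always interpolates in f32; only the barycentric
        // weight scale is selected here.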
match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x3::<
T,
u8,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x3::<
T,
u16,
SRC_LAYOUT,
DST_LAYOUT,
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
}
}
}

@@ -0,0 +1,269 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::LutBarycentricReduction;
use crate::conversions::interpolator::{BarycentricWeight, MultidimensionalInterpolation};
use crate::transform::PointeeSizeExpressible;
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
TransformExecutor, TransformOptions,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
pub(crate) struct TransformLut3x4<
T,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
pub(crate) lut: Vec<f32>,
pub(crate) _phantom: PhantomData<T>,
pub(crate) _phantom1: PhantomData<U>,
pub(crate) interpolation_method: InterpolationMethod,
pub(crate) weights: Box<[BarycentricWeight<f32>; BINS]>,
pub(crate) color_space: DataColorSpace,
pub(crate) is_linear: bool,
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut3x4<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[inline(always)]
fn transform_chunk<'b, Tetrahedral: MultidimensionalInterpolation<'b, GRID_SIZE>>(
&'b self,
src: &[T],
dst: &mut [T],
) {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
let value_scale = ((1 << BIT_DEPTH) - 1) as f32;
for (src, dst) in src.chunks_exact(channels).zip(dst.chunks_exact_mut(4)) {
let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[cn.r_i()],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[cn.g_i()],
);
let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[cn.b_i()],
);
let tetrahedral = Tetrahedral::new(&self.lut);
let v = tetrahedral.inter4(x, y, z, &self.weights);
if T::FINITE {
let r = v * value_scale + 0.5;
dst[0] = r.v[0].min(value_scale).max(0.).as_();
dst[1] = r.v[1].min(value_scale).max(0.).as_();
dst[2] = r.v[2].min(value_scale).max(0.).as_();
dst[3] = r.v[3].min(value_scale).max(0.).as_();
} else {
dst[0] = v.v[0].as_();
dst[1] = v.v[1].as_();
dst[2] = v.v[2].as_();
dst[3] = v.v[3].as_();
}
}
}
}
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut3x4<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
if src.len() % channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / channels;
let dst_chunks = dst.len() / 4;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
use crate::conversions::interpolator::Trilinear;
self.transform_chunk::<Trilinear<GRID_SIZE>>(src, dst);
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
use crate::conversions::interpolator::Tetrahedral;
self.transform_chunk::<Tetrahedral<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
use crate::conversions::interpolator::Pyramidal;
self.transform_chunk::<Pyramidal<GRID_SIZE>>(src, dst);
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
use crate::conversions::interpolator::Prismatic;
self.transform_chunk::<Prismatic<GRID_SIZE>>(src, dst);
}
InterpolationMethod::Linear => {
use crate::conversions::interpolator::Trilinear;
self.transform_chunk::<Trilinear<GRID_SIZE>>(src, dst);
}
}
}
Ok(())
}
}
pub(crate) fn make_transform_3x4<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
layout: Layout,
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Sync + Send>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
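// `layout` and the barycentric weight scale are runtime values; each
// combination below is monomorphized into a concrete `TransformLut3x4`.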
match layout {
Layout::Rgb => match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x4::<
T,
u8,
{ Layout::Rgb as u8 },
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x4::<
T,
u16,
{ Layout::Rgb as u8 },
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
},
Layout::Rgba => match options.barycentric_weight_scale {
BarycentricWeightScale::Low => Box::new(TransformLut3x4::<
T,
u8,
{ Layout::Rgba as u8 },
GRID_SIZE,
BIT_DEPTH,
256,
256,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
}),
#[cfg(feature = "options")]
BarycentricWeightScale::High => Box::new(TransformLut3x4::<
T,
u16,
{ Layout::Rgba as u8 },
GRID_SIZE,
BIT_DEPTH,
65536,
65536,
> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
}),
},
_ => unimplemented!(),
}
}


@@ -0,0 +1,316 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::conversions::interpolator::*;
use crate::conversions::lut_transforms::Lut4x3Factory;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd, m_clamp};
use crate::{
BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
PointeeSizeExpressible, TransformExecutor, TransformOptions, Vector3f,
};
use num_traits::AsPrimitive;
use std::marker::PhantomData;
pub(crate) trait Vector3fCmykLerp {
fn interpolate(a: Vector3f, b: Vector3f, t: f32, scale: f32) -> Vector3f;
}
#[allow(unused)]
#[derive(Copy, Clone, Default)]
struct DefaultVector3fLerp;
impl Vector3fCmykLerp for DefaultVector3fLerp {
#[inline(always)]
fn interpolate(a: Vector3f, b: Vector3f, t: f32, scale: f32) -> Vector3f {
let t = Vector3f::from(t);
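// Linear interpolation a * (1 - t) + b * t via fused multiply-adds, then
// scaled to the bit-depth range with round-to-nearest (+0.5) and clamped.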
let inter = a.neg_mla(a, t).mla(b, t);
let mut new_vec = Vector3f::from(0.5).mla(inter, Vector3f::from(scale));
new_vec.v[0] = m_clamp(new_vec.v[0], 0.0, scale);
new_vec.v[1] = m_clamp(new_vec.v[1], 0.0, scale);
new_vec.v[2] = m_clamp(new_vec.v[2], 0.0, scale);
new_vec
}
}
#[allow(unused)]
#[derive(Copy, Clone, Default)]
pub(crate) struct NonFiniteVector3fLerp;
impl Vector3fCmykLerp for NonFiniteVector3fLerp {
#[inline(always)]
fn interpolate(a: Vector3f, b: Vector3f, t: f32, _: f32) -> Vector3f {
let t = Vector3f::from(t);
a.neg_mla(a, t).mla(b, t)
}
}
#[allow(unused)]
#[derive(Copy, Clone, Default)]
pub(crate) struct NonFiniteVector3fLerpUnbound;
impl Vector3fCmykLerp for NonFiniteVector3fLerpUnbound {
#[inline(always)]
fn interpolate(a: Vector3f, b: Vector3f, t: f32, _: f32) -> Vector3f {
let t = Vector3f::from(t);
a.neg_mla(a, t).mla(b, t)
}
}
#[allow(unused)]
struct TransformLut4To3<
T,
U,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> {
lut: Vec<f32>,
_phantom: PhantomData<T>,
_phantom1: PhantomData<U>,
interpolation_method: InterpolationMethod,
weights: Box<[BarycentricWeight<f32>; BINS]>,
color_space: DataColorSpace,
is_linear: bool,
}
#[allow(unused)]
impl<
T: Copy + AsPrimitive<f32> + Default,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformLut4To3<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
#[inline(always)]
fn transform_chunk<
'k,
Tetrahedral: MultidimensionalInterpolation<'k, GRID_SIZE>,
Interpolation: Vector3fCmykLerp,
>(
&'k self,
src: &[T],
dst: &mut [T],
) {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
let grid_size = GRID_SIZE as i32;
let grid_size3 = grid_size * grid_size * grid_size;
let value_scale = ((1 << BIT_DEPTH) - 1) as f32;
let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[0],
);
let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[1],
);
let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[2],
);
let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
src[3],
);
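// The K channel selects two adjacent CMY sub-grids (indices w and w_n) along
// the fourth LUT axis; each is interpolated in 3D and the two results are
// lerped by the fractional weight t.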
let k_weights = self.weights[k.as_()];
let w: i32 = k_weights.x;
let w_n: i32 = k_weights.x_n;
let t: f32 = k_weights.w;
let table1 = &self.lut[(w * grid_size3 * 3) as usize..];
let table2 = &self.lut[(w_n * grid_size3 * 3) as usize..];
let tetrahedral1 = Tetrahedral::new(table1);
let tetrahedral2 = Tetrahedral::new(table2);
let r1 = tetrahedral1.inter3(c, m, y, &self.weights);
let r2 = tetrahedral2.inter3(c, m, y, &self.weights);
let r = Interpolation::interpolate(r1, r2, t, value_scale);
dst[cn.r_i()] = r.v[0].as_();
dst[cn.g_i()] = r.v[1].as_();
dst[cn.b_i()] = r.v[2].as_();
if channels == 4 {
dst[cn.a_i()] = max_value;
}
}
}
}
#[allow(unused)]
impl<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
U: AsPrimitive<usize>,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
const BINS: usize,
const BARYCENTRIC_BINS: usize,
> TransformExecutor<T>
for TransformLut4To3<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, U>,
{
fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
let cn = Layout::from(LAYOUT);
let channels = cn.channels();
if src.len() % 4 != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
if dst.len() % channels != 0 {
return Err(CmsError::LaneMultipleOfChannels);
}
let src_chunks = src.len() / 4;
let dst_chunks = dst.len() / channels;
if src_chunks != dst_chunks {
return Err(CmsError::LaneSizeMismatch);
}
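// Integer sample types (T::FINITE) use the clamping, rounding lerp; floating
// point samples take the unbounded variant so out-of-range values survive.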
if self.color_space == DataColorSpace::Lab
|| (self.is_linear && self.color_space == DataColorSpace::Rgb)
|| self.color_space == DataColorSpace::Xyz
{
if T::FINITE {
self.transform_chunk::<Trilinear<GRID_SIZE>, DefaultVector3fLerp>(src, dst);
} else {
self.transform_chunk::<Trilinear<GRID_SIZE>, NonFiniteVector3fLerp>(src, dst);
}
} else {
match self.interpolation_method {
#[cfg(feature = "options")]
InterpolationMethod::Tetrahedral => {
if T::FINITE {
self.transform_chunk::<Tetrahedral<GRID_SIZE>, DefaultVector3fLerp>(
src, dst,
);
} else {
self.transform_chunk::<Tetrahedral<GRID_SIZE>, NonFiniteVector3fLerp>(
src, dst,
);
}
}
#[cfg(feature = "options")]
InterpolationMethod::Pyramid => {
if T::FINITE {
self.transform_chunk::<Pyramidal<GRID_SIZE>, DefaultVector3fLerp>(src, dst);
} else {
self.transform_chunk::<Pyramidal<GRID_SIZE>, NonFiniteVector3fLerp>(
src, dst,
);
}
}
#[cfg(feature = "options")]
InterpolationMethod::Prism => {
if T::FINITE {
self.transform_chunk::<Prismatic<GRID_SIZE>, DefaultVector3fLerp>(src, dst);
} else {
self.transform_chunk::<Prismatic<GRID_SIZE>, NonFiniteVector3fLerp>(
src, dst,
);
}
}
InterpolationMethod::Linear => {
if T::FINITE {
self.transform_chunk::<Trilinear<GRID_SIZE>, DefaultVector3fLerp>(src, dst);
} else {
self.transform_chunk::<Trilinear<GRID_SIZE>, NonFiniteVector3fLerp>(
src, dst,
);
}
}
}
}
Ok(())
}
}
#[allow(dead_code)]
pub(crate) struct DefaultLut4x3Factory {}
#[allow(dead_code)]
impl Lut4x3Factory for DefaultLut4x3Factory {
fn make_transform_4x3<
T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
const LAYOUT: u8,
const GRID_SIZE: usize,
const BIT_DEPTH: usize,
>(
lut: Vec<f32>,
options: TransformOptions,
color_space: DataColorSpace,
is_linear: bool,
) -> Box<dyn TransformExecutor<T> + Sync + Send>
where
f32: AsPrimitive<T>,
u32: AsPrimitive<T>,
(): LutBarycentricReduction<T, u8>,
(): LutBarycentricReduction<T, u16>,
{
match options.barycentric_weight_scale {
BarycentricWeightScale::Low => {
Box::new(
TransformLut4To3::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
color_space,
is_linear,
},
)
}
#[cfg(feature = "options")]
BarycentricWeightScale::High => {
Box::new(
TransformLut4To3::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
lut,
_phantom: PhantomData,
_phantom1: PhantomData,
interpolation_method: options.interpolation_method,
weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
color_space,
is_linear,
},
)
}
}
}
}


@@ -0,0 +1,61 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::{CmsError, InPlaceStage, Lab, Xyz};
#[derive(Default)]
pub(crate) struct StageLabToXyz {}
impl InPlaceStage for StageLabToXyz {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
for dst in dst.chunks_exact_mut(3) {
let lab = Lab::new(dst[0], dst[1], dst[2]);
let xyz = lab.to_pcs_xyz();
dst[0] = xyz.x;
dst[1] = xyz.y;
dst[2] = xyz.z;
}
Ok(())
}
}
#[derive(Default)]
pub(crate) struct StageXyzToLab {}
impl InPlaceStage for StageXyzToLab {
fn transform(&self, dst: &mut [f32]) -> Result<(), CmsError> {
for dst in dst.chunks_exact_mut(3) {
let xyz = Xyz::new(dst[0], dst[1], dst[2]);
let lab = Lab::from_pcs_xyz(xyz);
dst[0] = lab.l;
dst[1] = lab.a;
dst[2] = lab.b;
}
Ok(())
}
}

154
vendor/moxcms/src/dat.rs vendored Normal file

@@ -0,0 +1,154 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::CmsError;
use crate::writer::write_u16_be;
use std::time::{SystemTime, UNIX_EPOCH};
#[repr(C)]
#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Default)]
pub struct ColorDateTime {
pub year: u16,
pub month: u16,
pub day_of_the_month: u16,
pub hours: u16,
pub minutes: u16,
pub seconds: u16,
}
fn is_leap(year: i32) -> bool {
(year % 4 == 0 && year % 100 != 0) || (year % 400 == 0)
}
fn days_in_month(year: i32, month: i32) -> i32 {
match month {
1 => 31,
2 => {
if is_leap(year) {
29
} else {
28
}
}
3 => 31,
4 => 30,
5 => 31,
6 => 30,
7 => 31,
8 => 31,
9 => 30,
10 => 31,
11 => 30,
12 => 31,
_ => unreachable!("Unknown month"),
}
}
impl ColorDateTime {
/// Parses a 12-byte big-endian slice into a `ColorDateTime`
pub fn new_from_slice(slice: &[u8]) -> Result<ColorDateTime, CmsError> {
if slice.len() != 12 {
return Err(CmsError::InvalidProfile);
}
let year = u16::from_be_bytes([slice[0], slice[1]]);
let month = u16::from_be_bytes([slice[2], slice[3]]);
let day_of_the_month = u16::from_be_bytes([slice[4], slice[5]]);
let hours = u16::from_be_bytes([slice[6], slice[7]]);
let minutes = u16::from_be_bytes([slice[8], slice[9]]);
let seconds = u16::from_be_bytes([slice[10], slice[11]]);
Ok(ColorDateTime {
year,
month,
day_of_the_month,
hours,
minutes,
seconds,
})
}
/// Creates a new `ColorDateTime` from the current system time (UTC)
pub fn now() -> Self {
let now = match SystemTime::now().duration_since(UNIX_EPOCH) {
Ok(v) => v,
Err(_) => return Self::default(),
};
let mut days = (now.as_secs() / 86_400) as i64;
let secs_of_day = (now.as_secs() % 86_400) as i64;
let mut year = 1970;
loop {
let year_days = if is_leap(year) { 366 } else { 365 };
if days >= year_days {
days -= year_days;
year += 1;
} else {
break;
}
}
let mut month = 1;
loop {
let mdays = days_in_month(year, month);
if days >= mdays as i64 {
days -= mdays as i64;
month += 1;
} else {
break;
}
}
let day = days + 1; // convert the zero-based day offset to a 1-based day of month
let hour = secs_of_day / 3600;
let min = (secs_of_day % 3600) / 60;
let sec = secs_of_day % 60;
Self {
year: year as u16,
month: month as u16,
day_of_the_month: day as u16,
hours: hour as u16,
minutes: min as u16,
seconds: sec as u16,
}
}
#[inline]
pub(crate) fn encode(&self, into: &mut Vec<u8>) {
let year = self.year;
let month = self.month;
let day_of_the_month = self.day_of_the_month;
let hours = self.hours;
let minutes = self.minutes;
let seconds = self.seconds;
write_u16_be(into, year);
write_u16_be(into, month);
write_u16_be(into, day_of_the_month);
write_u16_be(into, hours);
write_u16_be(into, minutes);
write_u16_be(into, seconds);
}
}
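#[cfg(test)]
mod date_time_tests {
use super::*;
// Hedged sanity check: `encode` emits six big-endian u16 fields (12 bytes),
// which `new_from_slice` should parse back verbatim.
#[test]
fn test_encode_round_trip() {
let dt = ColorDateTime {
year: 2025,
month: 3,
day_of_the_month: 14,
hours: 9,
minutes: 26,
seconds: 53,
};
let mut buf = Vec::new();
dt.encode(&mut buf);
assert_eq!(buf.len(), 12);
assert_eq!(ColorDateTime::new_from_slice(&buf), Ok(dt));
}
}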

541
vendor/moxcms/src/defaults.rs vendored Normal file

@@ -0,0 +1,541 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::chad::BRADFORD_D;
use crate::cicp::create_rec709_parametric;
use crate::trc::{ToneReprCurve, curve_from_gamma};
use crate::{
CicpColorPrimaries, CicpProfile, ColorPrimaries, ColorProfile, DataColorSpace,
LocalizableString, Matrix3d, MatrixCoefficients, ProfileClass, ProfileText, RenderingIntent,
TransferCharacteristics, XyY,
};
use pxfm::{copysignk, exp, floor, pow};
/// From lcms: `cmsWhitePointFromTemp`
/// tempK must be >= 4000. and <= 25000.
/// Invalid values of tempK will return
/// (x,y,Y) = (-1.0, -1.0, -1.0)
/// similar to argyll: `icx_DTEMP2XYZ()`
const fn white_point_from_temperature(temp_k: i32) -> XyY {
let mut white_point = XyY {
x: 0.,
y: 0.,
yb: 0.,
};
// No optimization provided.
let temp_k = temp_k as f64;
let temp_k2 = temp_k * temp_k; // square
let temp_k3 = temp_k2 * temp_k; // cube
// For correlated color temperature (T) between 4000K and 7000K:
let x = if temp_k > 4000.0 && temp_k <= 7000.0 {
-4.6070 * (1E9 / temp_k3) + 2.9678 * (1E6 / temp_k2) + 0.09911 * (1E3 / temp_k) + 0.244063
} else if temp_k > 7000.0 && temp_k <= 25000.0 {
// or for correlated color temperature (T) between 7000K and 25000K:
-2.0064 * (1E9 / temp_k3) + 1.9018 * (1E6 / temp_k2) + 0.24748 * (1E3 / temp_k) + 0.237040
} else {
// Invalid tempK
white_point.x = -1.0;
white_point.y = -1.0;
white_point.yb = -1.0;
debug_assert!(false, "invalid temp");
return white_point;
};
// Obtain y(x)
let y = -3.000 * (x * x) + 2.870 * x - 0.275;
// wave factors (not used, but here for futures extensions)
// let M1 = (-1.3515 - 1.7703*x + 5.9114 *y)/(0.0241 + 0.2562*x - 0.7341*y);
// let M2 = (0.0300 - 31.4424*x + 30.0717*y)/(0.0241 + 0.2562*x - 0.7341*y);
// Fill white_point struct
white_point.x = x;
white_point.y = y;
white_point.yb = 1.0;
white_point
}
pub const WHITE_POINT_D50: XyY = white_point_from_temperature(5003);
pub const WHITE_POINT_D60: XyY = white_point_from_temperature(6000);
pub const WHITE_POINT_D65: XyY = white_point_from_temperature(6504);
pub const WHITE_POINT_DCI_P3: XyY = white_point_from_temperature(6300);
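// Hedged sanity check: at 6504 K the daylight-locus fit above should land
// near the canonical D65 chromaticity of roughly (0.3127, 0.3290).
#[cfg(test)]
mod white_point_tests {
use super::*;
#[test]
fn test_d65_matches_canonical_chromaticity() {
let wp = white_point_from_temperature(6504);
assert!((wp.x - 0.3127).abs() < 2e-3, "x = {}", wp.x);
assert!((wp.y - 0.3290).abs() < 2e-3, "y = {}", wp.y);
}
}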
// https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2100-2-201807-I!!PDF-F.pdf
// Perceptual Quantization / SMPTE standard ST.2084
#[inline]
const fn pq_curve(x: f64) -> f64 {
const M1: f64 = 2610.0 / 16384.0;
const M2: f64 = (2523.0 / 4096.0) * 128.0;
const C1: f64 = 3424.0 / 4096.0;
const C2: f64 = (2413.0 / 4096.0) * 32.0;
const C3: f64 = (2392.0 / 4096.0) * 32.0;
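// ST 2084 EOTF: y = (max(x^(1/m2) - c1, 0) / (c2 - c3 * x^(1/m2)))^(1/m1).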
if x == 0.0 {
return 0.0;
}
let sign = x;
let x = x.abs();
let xpo = pow(x, 1.0 / M2);
let num = (xpo - C1).max(0.0);
let den = C2 - C3 * xpo;
let res = pow(num / den, 1.0 / M1);
copysignk(res, sign)
}
pub(crate) const fn build_trc_table_pq() -> [u16; 4096] {
let mut table = [0u16; 4096];
const NUM_ENTRIES: usize = 4096;
let mut i = 0usize;
while i < NUM_ENTRIES {
let x: f64 = i as f64 / (NUM_ENTRIES - 1) as f64;
let y: f64 = pq_curve(x);
let mut output: f64 = y * 65535.0 + 0.5;
if output > 65535.0 {
output = 65535.0
}
if output < 0.0 {
output = 0.0
}
table[i] = floor(output) as u16;
i += 1;
}
table
}
pub(crate) const fn build_trc_table_hlg() -> [u16; 4096] {
let mut table = [0u16; 4096];
const NUM_ENTRIES: usize = 4096;
let mut i = 0usize;
while i < NUM_ENTRIES {
let x: f64 = i as f64 / (NUM_ENTRIES - 1) as f64;
let y: f64 = hlg_curve(x);
let mut output: f64 = y * 65535.0 + 0.5;
if output > 65535.0 {
output = 65535.0
}
if output < 0.0 {
output = 0.0
}
table[i] = floor(output) as u16;
i += 1;
}
table
}
// https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2100-2-201807-I!!PDF-F.pdf
// Hybrid Log-Gamma
const fn hlg_curve(x: f64) -> f64 {
const BETA: f64 = 0.04;
const RA: f64 = 5.591816309728916; // 1.0 / A where A = 0.17883277
const B: f64 = 0.28466892; // 1.0 - 4.0 * A
const C: f64 = 0.5599107295; // 0.5 - A * ln(4 * A), with A = 0.17883277
let e = (x * (1.0 - BETA) + BETA).max(0.0);
if e == 0.0 {
return 0.0;
}
let sign = e.abs();
let res = if e <= 0.5 {
e * e / 3.0
} else {
(exp((e - C) * RA) + B) / 12.0
};
copysignk(res, sign)
}
/// Perceptual Quantizer Lookup table
pub const PQ_LUT_TABLE: [u16; 4096] = build_trc_table_pq();
/// Hybrid Log Gamma Lookup table
pub const HLG_LUT_TABLE: [u16; 4096] = build_trc_table_hlg();
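// Usage sketch (hedged; `pq_to_linear_u16` is a hypothetical helper): both
// tables map a normalized [0, 1] encoded value to a linear output scaled to
// the u16 range via a nearest-entry lookup:
//
// fn pq_to_linear_u16(encoded: f32) -> u16 {
//     let idx = (encoded.clamp(0.0, 1.0) * 4095.0) as usize;
//     PQ_LUT_TABLE[idx.min(4095)]
// }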
impl ColorProfile {
const SRGB_COLORANTS: Matrix3d =
ColorProfile::colorants_matrix(WHITE_POINT_D65, ColorPrimaries::BT_709);
const DISPLAY_P3_COLORANTS: Matrix3d =
ColorProfile::colorants_matrix(WHITE_POINT_D65, ColorPrimaries::SMPTE_432);
const ADOBE_RGB_COLORANTS: Matrix3d =
ColorProfile::colorants_matrix(WHITE_POINT_D65, ColorPrimaries::ADOBE_RGB);
const DCI_P3_COLORANTS: Matrix3d =
ColorProfile::colorants_matrix(WHITE_POINT_DCI_P3, ColorPrimaries::DCI_P3);
const PRO_PHOTO_RGB_COLORANTS: Matrix3d =
ColorProfile::colorants_matrix(WHITE_POINT_D50, ColorPrimaries::PRO_PHOTO_RGB);
const BT2020_COLORANTS: Matrix3d =
ColorProfile::colorants_matrix(WHITE_POINT_D65, ColorPrimaries::BT_2020);
const ACES_2065_1_COLORANTS: Matrix3d =
ColorProfile::colorants_matrix(WHITE_POINT_D60, ColorPrimaries::ACES_2065_1);
const ACES_CG_COLORANTS: Matrix3d =
ColorProfile::colorants_matrix(WHITE_POINT_D60, ColorPrimaries::ACES_CG);
#[inline]
fn basic_rgb_profile() -> ColorProfile {
ColorProfile {
profile_class: ProfileClass::DisplayDevice,
rendering_intent: RenderingIntent::Perceptual,
color_space: DataColorSpace::Rgb,
pcs: DataColorSpace::Xyz,
chromatic_adaptation: Some(BRADFORD_D),
white_point: WHITE_POINT_D50.to_xyzd(),
..Default::default()
}
}
/// Creates new profile from CICP
pub fn new_from_cicp(cicp_color_primaries: CicpProfile) -> ColorProfile {
let mut basic = ColorProfile::basic_rgb_profile();
basic.update_rgb_colorimetry_from_cicp(cicp_color_primaries);
basic
}
/// Creates new sRGB profile
pub fn new_srgb() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::SRGB_COLORANTS);
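// sRGB transfer function (IEC 61966-2.1) in ICC parametric (g, a, b, c, d)
// order: gamma 2.4, linear-segment slope 1/12.92, threshold 0.04045.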
let curve =
ToneReprCurve::Parametric(vec![2.4, 1. / 1.055, 0.055 / 1.055, 1. / 12.92, 0.04045]);
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D65.to_xyzd());
profile.cicp = Some(CicpProfile {
color_primaries: CicpColorPrimaries::Bt709,
transfer_characteristics: TransferCharacteristics::Srgb,
matrix_coefficients: MatrixCoefficients::Bt709,
full_range: false,
});
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"sRGB IEC61966-2.1".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new Adobe RGB profile
pub fn new_adobe_rgb() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::ADOBE_RGB_COLORANTS);
let curve = curve_from_gamma(2.19921875f32);
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D65.to_xyzd());
profile.white_point = WHITE_POINT_D50.to_xyzd();
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Adobe RGB 1998".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new Display P3 profile
pub fn new_display_p3() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::DISPLAY_P3_COLORANTS);
let curve =
ToneReprCurve::Parametric(vec![2.4, 1. / 1.055, 0.055 / 1.055, 1. / 12.92, 0.04045]);
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D65.to_xyzd());
profile.cicp = Some(CicpProfile {
color_primaries: CicpColorPrimaries::Smpte431,
transfer_characteristics: TransferCharacteristics::Srgb,
matrix_coefficients: MatrixCoefficients::Bt709,
full_range: false,
});
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Display P3".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new Display P3 PQ profile
pub fn new_display_p3_pq() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::DISPLAY_P3_COLORANTS);
let curve = ToneReprCurve::Lut(PQ_LUT_TABLE.to_vec());
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D65.to_xyzd());
profile.cicp = Some(CicpProfile {
color_primaries: CicpColorPrimaries::Smpte431,
transfer_characteristics: TransferCharacteristics::Smpte2084,
matrix_coefficients: MatrixCoefficients::Bt709,
full_range: false,
});
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Display P3 PQ".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new DCI P3 profile
pub fn new_dci_p3() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::DCI_P3_COLORANTS);
let curve = curve_from_gamma(2.6f32);
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_DCI_P3.to_xyzd());
profile.cicp = Some(CicpProfile {
color_primaries: CicpColorPrimaries::Smpte432,
transfer_characteristics: TransferCharacteristics::Srgb,
matrix_coefficients: MatrixCoefficients::Bt709,
full_range: false,
});
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"DCI P3".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new ProPhoto RGB profile
pub fn new_pro_photo_rgb() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::PRO_PHOTO_RGB_COLORANTS);
let curve = curve_from_gamma(1.8f32);
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D50.to_xyzd());
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"ProPhoto RGB".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new Bt.2020 profile
pub fn new_bt2020() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::BT2020_COLORANTS);
let curve = ToneReprCurve::Parametric(create_rec709_parametric().to_vec());
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D65.to_xyzd());
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Rec.2020".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new Bt.2020 PQ profile
pub fn new_bt2020_pq() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::BT2020_COLORANTS);
let curve = ToneReprCurve::Lut(PQ_LUT_TABLE.to_vec());
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D65.to_xyzd());
profile.cicp = Some(CicpProfile {
color_primaries: CicpColorPrimaries::Bt2020,
transfer_characteristics: TransferCharacteristics::Smpte2084,
matrix_coefficients: MatrixCoefficients::Bt709,
full_range: false,
});
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Rec.2020 PQ".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new Bt.2020 HLG profile
pub fn new_bt2020_hlg() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::BT2020_COLORANTS);
let curve = ToneReprCurve::Lut(HLG_LUT_TABLE.to_vec());
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D65.to_xyzd());
profile.cicp = Some(CicpProfile {
color_primaries: CicpColorPrimaries::Bt2020,
transfer_characteristics: TransferCharacteristics::Hlg,
matrix_coefficients: MatrixCoefficients::Bt709,
full_range: false,
});
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Rec.2020 HLG".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new Monochrome profile
pub fn new_gray_with_gamma(gamma: f32) -> ColorProfile {
ColorProfile {
gray_trc: Some(curve_from_gamma(gamma)),
profile_class: ProfileClass::DisplayDevice,
rendering_intent: RenderingIntent::Perceptual,
color_space: DataColorSpace::Gray,
media_white_point: Some(WHITE_POINT_D65.to_xyzd()),
white_point: WHITE_POINT_D50.to_xyzd(),
chromatic_adaptation: Some(BRADFORD_D),
copyright: Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)])),
..Default::default()
}
}
/// Creates new ACES 2065-1/AP0 profile
pub fn new_aces_aces_2065_1_linear() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::ACES_2065_1_COLORANTS);
let curve = ToneReprCurve::Lut(vec![]);
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D60.to_xyzd());
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"ACES 2065-1".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
/// Creates new ACEScg profile
pub fn new_aces_cg_linear() -> ColorProfile {
let mut profile = ColorProfile::basic_rgb_profile();
profile.update_colorants(ColorProfile::ACES_CG_COLORANTS);
let curve = ToneReprCurve::Lut(vec![]);
profile.red_trc = Some(curve.clone());
profile.blue_trc = Some(curve.clone());
profile.green_trc = Some(curve);
profile.media_white_point = Some(WHITE_POINT_D60.to_xyzd());
profile.description = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"ACEScg/AP1".to_string(),
)]));
profile.copyright = Some(ProfileText::Localizable(vec![LocalizableString::new(
"en".to_string(),
"US".to_string(),
"Public Domain".to_string(),
)]));
profile
}
}

359
vendor/moxcms/src/dt_ucs.rs vendored Normal file

@@ -0,0 +1,359 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::Xyz;
use crate::mlaf::mlaf;
use pxfm::{f_atan2f, f_powf, f_sincosf};
/// Darktable UCS JCH (Darktable Uniform Color Space)
#[derive(Copy, Clone, PartialOrd, PartialEq, Debug)]
pub struct DtUchJch {
pub j: f32,
pub c: f32,
pub h: f32,
}
/// Darktable UCS HSB (Darktable Uniform Color Space)
#[derive(Copy, Clone, PartialOrd, PartialEq, Debug)]
pub struct DtUchHsb {
pub h: f32,
pub s: f32,
pub b: f32,
}
/// Darktable UCS HCB (Darktable Uniform Color Space)
#[derive(Copy, Clone, PartialOrd, PartialEq, Debug)]
pub struct DtUchHcb {
pub h: f32,
pub c: f32,
pub b: f32,
}
const DT_UCS_L_STAR_RANGE: f32 = 2.098883786377;
#[inline]
fn y_to_dt_ucs_l_star(y: f32) -> f32 {
let y_hat = f_powf(y, 0.631651345306265);
DT_UCS_L_STAR_RANGE * y_hat / (y_hat + 1.12426773749357)
}
#[inline]
fn dt_ucs_l_star_to_y(x: f32) -> f32 {
f_powf(
1.12426773749357 * x / (DT_UCS_L_STAR_RANGE - x),
1.5831518565279648,
)
}
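// Hedged sanity check: the exponents 0.631651345306265 and 1.5831518565279648
// are reciprocals, so the L* mapping should round-trip within f32 noise.
#[cfg(test)]
mod l_star_tests {
use super::*;
#[test]
fn test_l_star_round_trip() {
let y = 0.18f32;
let y_rev = dt_ucs_l_star_to_y(y_to_dt_ucs_l_star(y));
assert!((y - y_rev).abs() < 1e-4, "Expected {y}, got {y_rev}");
}
}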
const L_WHITE: f32 = 0.98805060;
#[inline]
fn dt_ucs_luv_to_ucs_jch(
l_star: f32,
l_white: f32,
u_star_prime: f32,
v_star_prime: f32,
) -> DtUchJch {
let m2: f32 = mlaf(u_star_prime * u_star_prime, v_star_prime, v_star_prime); // square of colorfulness M
// should be jch.j = powf(L_star / L_white, cz), but we only treat the case cz = 1
let j = l_star / l_white;
let c =
15.932993652962535 * f_powf(l_star, 0.6523997524738018) * f_powf(m2, 0.6007557017508491)
/ l_white;
let h = f_atan2f(v_star_prime, u_star_prime);
DtUchJch::new(j, c, h)
}
#[inline]
fn dt_ucs_xy_to_uv(x: f32, y: f32) -> (f32, f32) {
const X_C: [f32; 3] = [-0.783941002840055, 0.745273540913283, 0.318707282433486];
const Y_C: [f32; 3] = [0.277512987809202, -0.205375866083878, 2.16743692732158];
const BIAS: [f32; 3] = [0.153836578598858, -0.165478376301988, 0.291320554395942];
let mut u_c = mlaf(mlaf(BIAS[0], Y_C[0], y), X_C[0], x);
let mut v_c = mlaf(mlaf(BIAS[1], Y_C[1], y), X_C[1], x);
let d_c = mlaf(mlaf(BIAS[2], Y_C[2], y), X_C[2], x);
// Keep |d_c| away from zero before dividing; the guard needs the smallest
// positive float (C's FLT_MIN, i.e. `f32::MIN_POSITIVE`), not Rust's
// `f32::MIN`, which is the most negative finite value and makes both
// clamps no-ops.
let div = if d_c >= 0.0 {
d_c.max(f32::MIN_POSITIVE)
} else {
d_c.min(-f32::MIN_POSITIVE)
};
u_c /= div;
v_c /= div;
const STAR_C: [f32; 2] = [1.39656225667, 1.4513954287];
const STAR_HF_C: [f32; 2] = [1.49217352929, 1.52488637914];
let u_star = STAR_C[0] * u_c / (u_c.abs() + STAR_HF_C[0]);
let v_star = STAR_C[1] * v_c / (v_c.abs() + STAR_HF_C[1]);
// The following is equivalent to a 2D matrix product
let u_star_prime = mlaf(-1.124983854323892 * u_star, -0.980483721769325, v_star);
let v_star_prime = mlaf(1.86323315098672 * u_star, 1.971853092390862, v_star);
(u_star_prime, v_star_prime)
}
impl DtUchJch {
#[inline]
pub fn new(j: f32, c: f32, h: f32) -> DtUchJch {
DtUchJch { j, c, h }
}
#[inline]
pub fn from_xyz(xyz: Xyz) -> DtUchJch {
DtUchJch::from_xyy(xyz.to_xyy())
}
#[inline]
pub fn to_xyz(&self) -> Xyz {
let xyy = self.to_xyy();
Xyz::from_xyy(xyy)
}
#[inline]
pub fn from_xyy(xyy: [f32; 3]) -> DtUchJch {
let l_star = y_to_dt_ucs_l_star(xyy[2]);
// let l_white = y_to_dt_ucs_l_star(1.);
let (u_star_prime, v_star_prime) = dt_ucs_xy_to_uv(xyy[0], xyy[1]);
dt_ucs_luv_to_ucs_jch(l_star, L_WHITE, u_star_prime, v_star_prime)
}
#[inline]
pub fn to_xyy(&self) -> [f32; 3] {
// let l_white: f32 = y_to_dt_ucs_l_star(1.0);
let l_star = (self.j * L_WHITE).max(0.0).min(2.09885);
let m = if l_star != 0. {
f_powf(
self.c * L_WHITE / (15.932993652962535 * f_powf(l_star, 0.6523997524738018)),
0.8322850678616855,
)
} else {
0.
};
let sin_cos_h = f_sincosf(self.h);
let u_star_prime = m * sin_cos_h.1;
let v_star_prime = m * sin_cos_h.0;
// The following is equivalent to a 2D matrix product
let u_star = mlaf(
-5.037522385190711 * u_star_prime,
-2.504856328185843,
v_star_prime,
);
let v_star = mlaf(
4.760029407436461 * u_star_prime,
2.874012963239247,
v_star_prime,
);
const F: [f32; 2] = [1.39656225667, 1.4513954287];
const HF: [f32; 2] = [1.49217352929, 1.52488637914];
let u_c = -HF[0] * u_star / (u_star.abs() - F[0]);
let v_c = -HF[1] * v_star / (v_star.abs() - F[1]);
const U_C: [f32; 3] = [0.167171472114775, -0.150959086409163, 0.940254742367256];
const V_C: [f32; 3] = [0.141299802443708, -0.155185060382272, 1.000000000000000];
const BIAS: [f32; 3] = [
-0.00801531300850582,
-0.00843312433578007,
-0.0256325967652889,
];
let mut x = mlaf(mlaf(BIAS[0], V_C[0], v_c), U_C[0], u_c);
let mut y = mlaf(mlaf(BIAS[1], V_C[1], v_c), U_C[1], u_c);
let d = mlaf(mlaf(BIAS[2], V_C[2], v_c), U_C[2], u_c);
// Same zero-denominator guard as in `dt_ucs_xy_to_uv`: clamp with the
// smallest positive float rather than Rust's `f32::MIN`.
let div = if d >= 0.0 {
d.max(f32::MIN_POSITIVE)
} else {
d.min(-f32::MIN_POSITIVE)
};
x /= div;
y /= div;
let yb = dt_ucs_l_star_to_y(l_star);
[x, y, yb]
}
}
impl DtUchHsb {
#[inline]
pub fn new(h: f32, s: f32, b: f32) -> DtUchHsb {
DtUchHsb { h, s, b }
}
#[inline]
pub fn from_jch(jch: DtUchJch) -> DtUchHsb {
let b = jch.j * (f_powf(jch.c, 1.33654221029386) + 1.);
let s = if b > 0. { jch.c / b } else { 0. };
let h = jch.h;
DtUchHsb::new(h, s, b)
}
#[inline]
pub fn to_jch(&self) -> DtUchJch {
let h = self.h;
let c = self.s * self.b;
let j = self.b / (f_powf(c, 1.33654221029386) + 1.);
DtUchJch::new(j, c, h)
}
}
impl DtUchHcb {
#[inline]
pub fn new(h: f32, c: f32, b: f32) -> DtUchHcb {
DtUchHcb { h, c, b }
}
#[inline]
pub fn from_jch(jch: DtUchJch) -> DtUchHcb {
let b = jch.j * (f_powf(jch.c, 1.33654221029386) + 1.);
let c = jch.c;
let h = jch.h;
DtUchHcb::new(h, c, b)
}
#[inline]
pub fn to_jch(&self) -> DtUchJch {
let h = self.h;
let c = self.c;
let j = self.b / (f_powf(self.c, 1.33654221029386) + 1.);
DtUchJch::new(j, c, h)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_darktable_ucs_jch() {
let xyy = [0.4, 0.2, 0.5];
let ucs = DtUchJch::from_xyy(xyy);
let xyy_rev = ucs.to_xyy();
assert!(
(xyy[0] - xyy_rev[0]).abs() < 1e-5,
"Expected {}, got {}",
xyy[0],
xyy_rev[0]
);
assert!(
(xyy[1] - xyy_rev[1]).abs() < 1e-5,
"Expected {}, got {}",
xyy[1],
xyy_rev[1]
);
assert!(
(xyy[2] - xyy_rev[2]).abs() < 1e-5,
"Expected {}, got {}",
xyy[2],
xyy_rev[2]
);
}
#[test]
fn test_darktable_hsb() {
let jch = DtUchJch::new(0.3, 0.6, 0.4);
let hsb = DtUchHsb::from_jch(jch);
let r_jch = hsb.to_jch();
assert!(
(r_jch.j - jch.j).abs() < 1e-5,
"Expected {}, got {}",
jch.j,
r_jch.j
);
assert!(
(r_jch.c - jch.c).abs() < 1e-5,
"Expected {}, got {}",
jch.c,
r_jch.c
);
assert!(
(r_jch.h - jch.h).abs() < 1e-5,
"Expected {}, got {}",
jch.h,
r_jch.h
);
}
#[test]
fn test_darktable_hcb() {
let jch = DtUchJch::new(0.3, 0.6, 0.4);
let hcb = DtUchHcb::from_jch(jch);
let r_jch = hcb.to_jch();
assert!(
(r_jch.j - jch.j).abs() < 1e-5,
"Expected {}, got {}",
jch.j,
r_jch.j
);
assert!(
(r_jch.c - jch.c).abs() < 1e-5,
"Expected {}, got {}",
jch.c,
r_jch.c
);
assert!(
(r_jch.h - jch.h).abs() < 1e-5,
"Expected {}, got {}",
jch.h,
r_jch.h
);
}
#[test]
fn test_darktable_ucs_jch_from_xyz() {
let xyz = Xyz::new(0.4, 0.2, 0.5);
let ucs = DtUchJch::from_xyz(xyz);
let xyz_rev = ucs.to_xyz();
assert!(
(xyz.x - xyz_rev.x).abs() < 1e-5,
"Expected {}, got {}",
xyz.x,
xyz_rev.x
);
assert!(
(xyz.y - xyz_rev.y).abs() < 1e-5,
"Expected {}, got {}",
xyz.y,
xyz_rev.y
);
assert!(
(xyz.z - xyz_rev.z).abs() < 1e-5,
"Expected {}, got {}",
xyz.z,
xyz_rev.z
);
}
}

122
vendor/moxcms/src/err.rs vendored Normal file

@@ -0,0 +1,122 @@
/*
* // Copyright (c) Radzivon Bartoshyk 2/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::RenderingIntent;
use std::error::Error;
use std::fmt::Display;
#[derive(Debug, Copy, Clone, PartialOrd, PartialEq)]
pub struct MalformedSize {
pub size: usize,
pub expected: usize,
}
#[derive(Debug, Clone, PartialOrd, PartialEq)]
pub enum CmsError {
LaneSizeMismatch,
LaneMultipleOfChannels,
InvalidProfile,
InvalidTrcCurve,
InvalidCicp,
CurveLutIsTooLarge,
ParametricCurveZeroDivision,
InvalidRenderingIntent,
DivisionByZero,
UnsupportedColorPrimaries(u8),
UnsupportedTrc(u8),
InvalidLayout,
UnsupportedProfileConnection,
BuildTransferFunction,
UnsupportedChannelConfiguration,
UnknownTag(u32),
UnknownTagTypeDefinition(u32),
UnsupportedLutRenderingIntent(RenderingIntent),
InvalidAtoBLut,
OverflowingError,
LUTTablesInvalidKind,
MalformedClut(MalformedSize),
MalformedCurveLutTable(MalformedSize),
InvalidInksCountForProfile,
MalformedTrcCurve(String),
}
impl Display for CmsError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
CmsError::LaneSizeMismatch => f.write_str("Lanes length must match"),
CmsError::LaneMultipleOfChannels => {
f.write_str("Lane length must not be multiple of channel count")
}
CmsError::InvalidProfile => f.write_str("Invalid ICC profile"),
CmsError::InvalidCicp => {
f.write_str("Invalid Code Independent point (CICP) in ICC profile")
}
CmsError::InvalidTrcCurve => f.write_str("Invalid TRC curve"),
CmsError::CurveLutIsTooLarge => f.write_str("Curve Lut is too large"),
CmsError::ParametricCurveZeroDivision => {
f.write_str("Parametric Curve definition causes division by zero")
}
CmsError::InvalidRenderingIntent => f.write_str("Invalid rendering intent"),
CmsError::DivisionByZero => f.write_str("Division by zero"),
CmsError::UnsupportedColorPrimaries(value) => {
f.write_fmt(format_args!("Unsupported color primaries, {value}"))
}
CmsError::UnsupportedTrc(value) => f.write_fmt(format_args!("Unsupported TRC {value}")),
CmsError::InvalidLayout => f.write_str("Invalid layout"),
CmsError::UnsupportedProfileConnection => f.write_str("Unsupported profile connection"),
CmsError::BuildTransferFunction => f.write_str("Can't reconstruct transfer function"),
CmsError::UnsupportedChannelConfiguration => {
f.write_str("Can't reconstruct channel configuration")
}
CmsError::UnknownTag(t) => f.write_fmt(format_args!("Unknown tag: {t}")),
CmsError::UnknownTagTypeDefinition(t) => {
f.write_fmt(format_args!("Unknown tag type definition: {t}"))
}
CmsError::UnsupportedLutRenderingIntent(intent) => f.write_fmt(format_args!(
"Can't find LUT for rendering intent: {intent:?}"
)),
CmsError::InvalidAtoBLut => f.write_str("Invalid A to B Lut"),
CmsError::OverflowingError => {
f.write_str("Overflowing was happen, that is not allowed")
}
CmsError::LUTTablesInvalidKind => f.write_str("All LUT curves must have the same kind"),
CmsError::MalformedClut(size) => {
f.write_fmt(format_args!("Invalid CLUT size: {size:?}"))
}
CmsError::MalformedCurveLutTable(size) => {
f.write_fmt(format_args!("Malformed curve LUT size: {size:?}"))
}
CmsError::InvalidInksCountForProfile => {
f.write_str("Invalid inks count for profile was provided")
}
CmsError::MalformedTrcCurve(str) => f.write_str(str),
}
}
}
impl Error for CmsError {}

1078
vendor/moxcms/src/gamma.rs vendored Normal file

File diff suppressed because it is too large

66
vendor/moxcms/src/gamut.rs vendored Normal file

@@ -0,0 +1,66 @@
/*
* // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::Rgb;
#[inline]
fn filmlike_clip_rgb_tone(r: &mut f32, g: &mut f32, b: &mut f32, l: f32) {
let new_r = r.min(l);
let new_b = b.min(l);
let new_g = new_b + ((new_r - new_b) * (*g - *b) / (*r - *b));
*r = new_r;
*g = new_g;
*b = new_b;
}
/// Softly clips out-of-range values with an S-curve-like channel re-interpolation
///
/// Works only on highlights; negative values pass through unchanged
#[inline]
pub fn filmlike_clip(rgb: Rgb<f32>) -> Rgb<f32> {
const L: f32 = 1.;
let mut rgb = rgb;
if rgb.r >= rgb.g {
if rgb.g > rgb.b {
filmlike_clip_rgb_tone(&mut rgb.r, &mut rgb.g, &mut rgb.b, L);
} else if rgb.b > rgb.r {
filmlike_clip_rgb_tone(&mut rgb.b, &mut rgb.r, &mut rgb.g, L);
} else if rgb.b > rgb.g {
filmlike_clip_rgb_tone(&mut rgb.r, &mut rgb.b, &mut rgb.g, L);
} else {
// g == b in this branch; assign the result, clamping every channel to L.
rgb = Rgb::new(rgb.r.min(L), rgb.g.min(L), rgb.b.min(L));
}
} else if rgb.r >= rgb.b {
filmlike_clip_rgb_tone(&mut rgb.g, &mut rgb.r, &mut rgb.b, L);
} else if rgb.b > rgb.g {
filmlike_clip_rgb_tone(&mut rgb.b, &mut rgb.g, &mut rgb.r, L);
} else {
filmlike_clip_rgb_tone(&mut rgb.g, &mut rgb.b, &mut rgb.r, L);
}
rgb
}
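// A minimal usage sketch, not part of the vendored source: an out-of-gamut
// highlight is compressed toward white while the channel ordering is kept.
// It assumes only the `Rgb<f32>` type and the `filmlike_clip` defined above.
#[cfg(test)]
mod filmlike_clip_sketch {
    use super::filmlike_clip;
    use crate::Rgb;

    #[test]
    fn clips_highlight_and_preserves_order() {
        // One channel blown past the 1.0 highlight limit.
        let clipped = filmlike_clip(Rgb::new(1.4f32, 0.9, 0.3));
        // The maximum channel is clamped to 1.0 and the middle channel is
        // re-interpolated, so r >= g >= b still holds afterwards.
        assert!(clipped.r <= 1.0);
        assert!(clipped.r >= clipped.g && clipped.g >= clipped.b);
    }
}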

223
vendor/moxcms/src/helpers.rs vendored Normal file
View File

@@ -0,0 +1,223 @@
/*
* // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
* //
* // Redistribution and use in source and binary forms, with or without modification,
* // are permitted provided that the following conditions are met:
* //
* // 1. Redistributions of source code must retain the above copyright notice, this
* // list of conditions and the following disclaimer.
* //
* // 2. Redistributions in binary form must reproduce the above copyright notice,
* // this list of conditions and the following disclaimer in the documentation
* // and/or other materials provided with the distribution.
* //
* // 3. Neither the name of the copyright holder nor the names of its
* // contributors may be used to endorse or promote products derived from
* // this software without specific prior written permission.
* //
* // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::matan::{
does_curve_have_discontinuity, is_curve_ascending, is_curve_degenerated, is_curve_descending,
is_curve_linear8, is_curve_linear16, is_curve_monotonic,
};
use crate::reader::{
s15_fixed16_number_to_double, uint8_number_to_float_fast, uint16_number_to_float_fast,
};
use crate::{CmsError, LutStore, Matrix3d, ToneReprCurve, Vector3d};
impl LutStore {
pub fn to_clut_f32(&self) -> Vec<f32> {
match self {
LutStore::Store8(store) => store
.iter()
.map(|x| uint8_number_to_float_fast(*x))
.collect(),
LutStore::Store16(store) => store
.iter()
.map(|x| uint16_number_to_float_fast(*x as u32))
.collect(),
}
}
pub(crate) fn is_degenerated(&self, entries: usize, channel: usize) -> bool {
let start = entries * channel;
let end = start + entries;
match &self {
LutStore::Store8(v) => is_curve_degenerated(&v[start..end]),
LutStore::Store16(v) => is_curve_degenerated(&v[start..end]),
}
}
pub(crate) fn is_monotonic(&self, entries: usize, channel: usize) -> bool {
let start = entries * channel;
let end = start + entries;
match &self {
LutStore::Store8(v) => is_curve_monotonic(&v[start..end]),
LutStore::Store16(v) => is_curve_monotonic(&v[start..end]),
}
}
pub(crate) fn have_discontinuities(&self, entries: usize, channel: usize) -> bool {
let start = entries * channel;
let end = start + entries;
match &self {
LutStore::Store8(v) => does_curve_have_discontinuity(&v[start..end]),
LutStore::Store16(v) => does_curve_have_discontinuity(&v[start..end]),
}
}
#[allow(dead_code)]
pub(crate) fn is_linear(&self, entries: usize, channel: usize) -> bool {
let start = entries * channel;
let end = start + entries;
match &self {
LutStore::Store8(v) => is_curve_linear8(&v[start..end]),
LutStore::Store16(v) => is_curve_linear16(&v[start..end]),
}
}
#[allow(dead_code)]
pub(crate) fn is_descending(&self, entries: usize, channel: usize) -> bool {
let start = entries * channel;
let end = start + entries;
match &self {
LutStore::Store8(v) => is_curve_descending(&v[start..end]),
LutStore::Store16(v) => is_curve_descending(&v[start..end]),
}
}
#[allow(dead_code)]
pub(crate) fn is_ascending(&self, entries: usize, channel: usize) -> bool {
let start = entries * channel;
let end = start + entries;
match &self {
LutStore::Store8(v) => is_curve_ascending(&v[start..end]),
LutStore::Store16(v) => is_curve_ascending(&v[start..end]),
}
}
}
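// A layout sketch, not part of the vendored source (hypothetical values, not
// from any real profile): the predicates above all share one slicing
// convention, where per-channel curves are stored back to back, so channel
// `c` occupies `entries * c .. entries * (c + 1)` of the flat table.
#[cfg(test)]
mod lut_layout_sketch {
    #[test]
    fn channel_slicing() {
        // A 3-channel, 4-entry 8-bit LUT stored flat.
        let flat: [u8; 12] = [
            0, 85, 170, 255, // channel 0: ascending
            128, 128, 128, 128, // channel 1: constant (degenerate)
            255, 170, 85, 0, // channel 2: descending
        ];
        let entries = 4;
        let channel = 1;
        let start = entries * channel;
        let curve = &flat[start..start + entries];
        assert!(curve.iter().all(|&v| v == curve[0])); // no variation: degenerate
    }
}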
impl ToneReprCurve {
pub(crate) fn is_linear(&self) -> bool {
match &self {
ToneReprCurve::Lut(lut) => {
if lut.is_empty() {
return true;
}
if lut.len() == 1 {
let gamma = 1. / crate::trc::u8_fixed_8number_to_float(lut[0]);
if (gamma - 1.).abs() < 1e-4 {
return true;
}
}
is_curve_linear16(lut)
}
ToneReprCurve::Parametric(parametric) => {
if parametric.is_empty() {
return true;
}
if parametric.len() == 1 && parametric[0] == 1. {
return true;
}
false
}
}
}
pub(crate) fn is_monotonic(&self) -> bool {
match &self {
ToneReprCurve::Lut(lut) => is_curve_monotonic(lut),
ToneReprCurve::Parametric(_) => true,
}
}
pub(crate) fn is_degenerated(&self) -> bool {
match &self {
ToneReprCurve::Lut(lut) => is_curve_degenerated(lut),
ToneReprCurve::Parametric(_) => false,
}
}
pub(crate) fn have_discontinuities(&self) -> bool {
match &self {
ToneReprCurve::Lut(lut) => does_curve_have_discontinuity(lut),
ToneReprCurve::Parametric(_) => false,
}
}
}
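// A decoding sketch, not part of the vendored source: the single-entry LUT
// case in `is_linear` relies on the ICC u8Fixed8 encoding, where one 16-bit
// value holds a gamma exponent (high byte integer part, low byte fraction).
// `decode_u8_fixed_8` is a hypothetical stand-in for
// `crate::trc::u8_fixed_8number_to_float`, assuming it divides by 256, which
// is what the comparison against gamma 1.0 above implies.
#[cfg(test)]
mod u8_fixed_8_sketch {
    fn decode_u8_fixed_8(x: u16) -> f32 {
        x as f32 / 256.0
    }

    #[test]
    fn gamma_decoding() {
        assert_eq!(decode_u8_fixed_8(0x0100), 1.0); // gamma 1.0 -> linear curve
        assert_eq!(decode_u8_fixed_8(0x0266), 2.3984375); // roughly gamma 2.4
    }
}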
pub(crate) fn read_matrix_3d(arr: &[u8]) -> Result<Matrix3d, CmsError> {
    // A 3x3 matrix tag holds nine big-endian s15Fixed16 entries: 9 * 4 = 36 bytes.
    if arr.len() < 36 {
        return Err(CmsError::InvalidProfile);
    }
    let m_tag = &arr[..36];
let e00 = i32::from_be_bytes([m_tag[0], m_tag[1], m_tag[2], m_tag[3]]);
let e01 = i32::from_be_bytes([m_tag[4], m_tag[5], m_tag[6], m_tag[7]]);
let e02 = i32::from_be_bytes([m_tag[8], m_tag[9], m_tag[10], m_tag[11]]);
let e10 = i32::from_be_bytes([m_tag[12], m_tag[13], m_tag[14], m_tag[15]]);
let e11 = i32::from_be_bytes([m_tag[16], m_tag[17], m_tag[18], m_tag[19]]);
let e12 = i32::from_be_bytes([m_tag[20], m_tag[21], m_tag[22], m_tag[23]]);
let e20 = i32::from_be_bytes([m_tag[24], m_tag[25], m_tag[26], m_tag[27]]);
let e21 = i32::from_be_bytes([m_tag[28], m_tag[29], m_tag[30], m_tag[31]]);
let e22 = i32::from_be_bytes([m_tag[32], m_tag[33], m_tag[34], m_tag[35]]);
Ok(Matrix3d {
v: [
[
s15_fixed16_number_to_double(e00),
s15_fixed16_number_to_double(e01),
s15_fixed16_number_to_double(e02),
],
[
s15_fixed16_number_to_double(e10),
s15_fixed16_number_to_double(e11),
s15_fixed16_number_to_double(e12),
],
[
s15_fixed16_number_to_double(e20),
s15_fixed16_number_to_double(e21),
s15_fixed16_number_to_double(e22),
],
],
})
}
pub(crate) fn read_vector_3d(arr: &[u8]) -> Result<Vector3d, CmsError> {
    // A vector tag holds three big-endian s15Fixed16 entries: 3 * 4 = 12 bytes.
    if arr.len() < 12 {
        return Err(CmsError::InvalidProfile);
    }
    let m_tag = &arr[..12];
let b0 = i32::from_be_bytes([m_tag[0], m_tag[1], m_tag[2], m_tag[3]]);
let b1 = i32::from_be_bytes([m_tag[4], m_tag[5], m_tag[6], m_tag[7]]);
let b2 = i32::from_be_bytes([m_tag[8], m_tag[9], m_tag[10], m_tag[11]]);
Ok(Vector3d {
v: [
s15_fixed16_number_to_double(b0),
s15_fixed16_number_to_double(b1),
s15_fixed16_number_to_double(b2),
],
})
}
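// A decoding sketch, not part of the vendored source: both readers above
// consume ICC s15Fixed16 values, a big-endian i32 whose low 16 bits are the
// fraction, i.e. value = raw / 65536. `decode_s15_fixed16` is a hypothetical
// stand-in for `crate::reader::s15_fixed16_number_to_double`.
#[cfg(test)]
mod s15_fixed16_sketch {
    fn decode_s15_fixed16(x: i32) -> f64 {
        x as f64 / 65536.0
    }

    #[test]
    fn fixed_point_decoding() {
        // 1.0 is 0x0001_0000, stored big-endian in the tag data.
        let raw = i32::from_be_bytes([0x00, 0x01, 0x00, 0x00]);
        assert_eq!(decode_s15_fixed16(raw), 1.0);
        // -0.5 is -32768 in raw fixed-point form.
        assert_eq!(decode_s15_fixed16(-32768), -0.5);
    }
}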

Some files were not shown because too many files have changed in this diff